]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/librbd/fsx.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / test / librbd / fsx.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=8 smarttab
3 /*
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserverd.
5 *
6 * File: fsx.cc
7 * Author: Avadis Tevanian, Jr.
8 *
9 * File system exerciser.
10 *
11 * Rewritten 8/98 by Conrad Minshall.
12 *
13 * Small changes to work under Linux -- davej.
14 *
15 * Checks for mmap last-page zero fill.
16 */
17
18 #include <sys/types.h>
19 #include <unistd.h>
20 #include <limits.h>
21 #include <time.h>
22 #include <strings.h>
23 #include <sys/file.h>
24 #include <sys/stat.h>
25 #include <sys/mman.h>
26 #include <linux/fs.h>
27 #include <sys/ioctl.h>
28 #ifdef HAVE_ERR_H
29 #include <err.h>
30 #endif
31 #include <signal.h>
32 #include <stdbool.h>
33 #include <stddef.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <stdarg.h>
38 #include <assert.h>
39 #include <errno.h>
40 #include <math.h>
41 #include <fcntl.h>
42 #include <random>
43
44 #include "include/intarith.h"
45 #include "include/krbd.h"
46 #include "include/rados/librados.h"
47 #include "include/rados/librados.hpp"
48 #include "include/rbd/librbd.h"
49 #include "include/rbd/librbd.hpp"
50 #include "common/Cond.h"
51 #include "common/SubProcess.h"
52 #include "common/safe_io.h"
53 #include "journal/Journaler.h"
54 #include "journal/ReplayEntry.h"
55 #include "journal/ReplayHandler.h"
56 #include "journal/Settings.h"
57
58 #include <boost/scope_exit.hpp>
59
60 #define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
61
62 /*
63 * A log entry is an operation and a bunch of arguments.
64 */
65
66 struct log_entry {
67 int operation;
68 int args[3];
69 };
70
71 #define LOGSIZE 1000
72
73 struct log_entry oplog[LOGSIZE]; /* the log */
74 int logptr = 0; /* current position in log */
75 int logcount = 0; /* total ops */
76
77 /*
78 * The operation matrix is complex due to conditional execution of different
79 * features. Hence when we come to deciding what operation to run, we need to
80 * be careful in how we select the different operations. The active operations
81 * are mapped to numbers as follows:
82 *
83 * lite !lite
84 * READ: 0 0
85 * WRITE: 1 1
86 * MAPREAD: 2 2
87 * MAPWRITE: 3 3
88 * TRUNCATE: - 4
89 * FALLOCATE: - 5
90 * PUNCH HOLE: - 6
91 * WRITESAME: - 7
92 *
93 * When mapped read/writes are disabled, they are simply converted to normal
94 * reads and writes. When fallocate/fpunch calls are disabled, they are
95 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
96 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
97 * operation modifier rather than an operation in itself.
98 *
99 * Because of the "lite" version, we also need to have different "maximum
100 * operation" defines to allow the ops to be selected correctly based on the
101 * mode being run.
102 */
103
104 /* common operations */
105 #define OP_READ 0
106 #define OP_WRITE 1
107 #define OP_MAPREAD 2
108 #define OP_MAPWRITE 3
109 #define OP_MAX_LITE 4
110
111 /* !lite operations */
112 #define OP_TRUNCATE 4
113 #define OP_FALLOCATE 5
114 #define OP_PUNCH_HOLE 6
115 #define OP_WRITESAME 7
116 /* rbd-specific operations */
117 #define OP_CLONE 8
118 #define OP_FLATTEN 9
119 #define OP_MAX_FULL 10
120
121 /* operation modifiers */
122 #define OP_CLOSEOPEN 100
123 #define OP_SKIPPED 101
124
125 #undef PAGE_SIZE
126 #define PAGE_SIZE getpagesize()
127 #undef PAGE_MASK
128 #define PAGE_MASK (PAGE_SIZE - 1)
129
130
131 char *original_buf; /* a pointer to the original data */
132 char *good_buf; /* a pointer to the correct data */
133 char *temp_buf; /* a pointer to the current data */
134
135 char dirpath[1024];
136
137 off_t file_size = 0;
138 off_t biggest = 0;
139 unsigned long testcalls = 0; /* calls to function "test" */
140
141 unsigned long simulatedopcount = 0; /* -b flag */
142 int closeprob = 0; /* -c flag */
143 int debug = 0; /* -d flag */
144 unsigned long debugstart = 0; /* -D flag */
145 int flush_enabled = 0; /* -f flag */
146 int holebdy = 1; /* -h flag */
147 bool journal_replay = false; /* -j flah */
148 int keep_on_success = 0; /* -k flag */
149 int do_fsync = 0; /* -y flag */
150 unsigned long maxfilelen = 256 * 1024; /* -l flag */
151 int sizechecks = 1; /* -n flag disables them */
152 int maxoplen = 64 * 1024; /* -o flag */
153 int quiet = 0; /* -q flag */
154 unsigned long progressinterval = 0; /* -p flag */
155 int readbdy = 1; /* -r flag */
156 int style = 0; /* -s flag */
157 int prealloc = 0; /* -x flag */
158 int truncbdy = 1; /* -t flag */
159 int writebdy = 1; /* -w flag */
160 long monitorstart = -1; /* -m flag */
161 long monitorend = -1; /* -m flag */
162 int lite = 0; /* -L flag */
163 long numops = -1; /* -N flag */
164 int randomoplen = 1; /* -O flag disables it */
165 int seed = 1; /* -S flag */
166 int mapped_writes = 0; /* -W flag disables */
167 int fallocate_calls = 0; /* -F flag disables */
168 int punch_hole_calls = 1; /* -H flag disables */
169 int clone_calls = 1; /* -C flag disables */
170 int randomize_striping = 1; /* -U flag disables */
171 int randomize_parent_overlap = 1;
172 int mapped_reads = 0; /* -R flag disables it */
173 int fsxgoodfd = 0;
174 int o_direct = 0; /* -Z flag */
175
176 int num_clones = 0;
177
178 int page_size;
179 int page_mask;
180 int mmap_mask;
181
182 FILE * fsxlogf = NULL;
183 int badoff = -1;
184 int closeopen = 0;
185
/*
 * Write "fsx: [<formatted message>: ]<strerror(code)>\n" to stderr.
 * The formatted part is skipped entirely when fmt is NULL.
 */
void
vwarnc(int code, const char *fmt, va_list ap)
{
	fputs("fsx: ", stderr);
	if (fmt) {
		vfprintf(stderr, fmt, ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(code));
}
195
196 void
197 warn(const char * fmt, ...) {
198 va_list ap;
199 va_start(ap, fmt);
200 vwarnc(errno, fmt, ap);
201 va_end(ap);
202 }
203
204 #define BUF_SIZE 1024
205
206 void
207 prt(const char *fmt, ...)
208 {
209 va_list args;
210 char buffer[BUF_SIZE];
211
212 va_start(args, fmt);
213 vsnprintf(buffer, BUF_SIZE, fmt, args);
214 va_end(args);
215 fprintf(stdout, "%s", buffer);
216 if (fsxlogf)
217 fprintf(fsxlogf, "%s", buffer);
218 }
219
220 void
221 prterr(const char *prefix)
222 {
223 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(errno));
224 }
225
226 void
227 prterrcode(const char *prefix, int code)
228 {
229 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(-code));
230 }
231
/*
 * Report "<msg>: <strerror(-err)>" on stderr only (the fsx log is not
 * touched); err is a negative errno-style value.
 */
void
simple_err(const char *msg, int err)
{
	const char *reason = strerror(-err);

	fprintf(stderr, "%s: %s\n", msg, reason);
}
237
238 /*
239 * random
240 */
/* Process-wide Mersenne Twister engine behind get_random(). */
std::mt19937 random_generator;

/* Return the next value produced by random_generator. */
uint_fast32_t
get_random(void)
{
	const uint_fast32_t value = random_generator();

	return value;
}
248
249 void replay_imagename(char *buf, size_t len, int clones);
250
251 namespace {
252
253 static const std::string JOURNAL_CLIENT_ID("fsx");
254
255 struct ReplayHandler : public journal::ReplayHandler {
256 journal::Journaler *journaler;
257 journal::Journaler *replay_journaler;
258 Context *on_finish;
259
260 ReplayHandler(journal::Journaler *journaler,
261 journal::Journaler *replay_journaler, Context *on_finish)
262 : journaler(journaler), replay_journaler(replay_journaler),
263 on_finish(on_finish) {
264 }
265
266 void get() override {
267 }
268 void put() override {
269 }
270
271 void handle_entries_available() override {
272 while (true) {
273 journal::ReplayEntry replay_entry;
274 if (!journaler->try_pop_front(&replay_entry)) {
275 return;
276 }
277
278 replay_journaler->append(0, replay_entry.get_data());
279 }
280 }
281
282 void handle_complete(int r) override {
283 on_finish->complete(r);
284 }
285 };
286
287 int get_image_id(librados::IoCtx &io_ctx, const char *image_name,
288 std::string *image_id) {
289 librbd::RBD rbd;
290 librbd::Image image;
291 int r = rbd.open(io_ctx, image, image_name);
292 if (r < 0) {
293 simple_err("failed to open image", r);
294 return r;
295 }
296
297 rbd_image_info_t info;
298 r = image.stat(info, sizeof(info));
299 if (r < 0) {
300 simple_err("failed to stat image", r);
301 return r;
302 }
303
304 *image_id = std::string(&info.block_name_prefix[strlen(RBD_DATA_PREFIX)]);
305 return 0;
306 }
307
308 int register_journal(rados_ioctx_t ioctx, const char *image_name) {
309 librados::IoCtx io_ctx;
310 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
311
312 std::string image_id;
313 int r = get_image_id(io_ctx, image_name, &image_id);
314 if (r < 0) {
315 return r;
316 }
317
318 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
319 r = journaler.register_client(bufferlist());
320 if (r < 0) {
321 simple_err("failed to register journal client", r);
322 return r;
323 }
324 return 0;
325 }
326
327 int unregister_journal(rados_ioctx_t ioctx, const char *image_name) {
328 librados::IoCtx io_ctx;
329 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
330
331 std::string image_id;
332 int r = get_image_id(io_ctx, image_name, &image_id);
333 if (r < 0) {
334 return r;
335 }
336
337 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
338 r = journaler.unregister_client();
339 if (r < 0) {
340 simple_err("failed to unregister journal client", r);
341 return r;
342 }
343 return 0;
344 }
345
346 int create_replay_image(rados_ioctx_t ioctx, int order,
347 uint64_t stripe_unit, int stripe_count,
348 const char *replay_image_name,
349 const char *last_replay_image_name) {
350 librados::IoCtx io_ctx;
351 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
352
353 int r;
354 librbd::RBD rbd;
355 if (last_replay_image_name == nullptr) {
356 r = rbd.create2(io_ctx, replay_image_name, 0,
357 RBD_FEATURES_ALL, &order);
358 } else {
359 r = rbd.clone2(io_ctx, last_replay_image_name, "snap",
360 io_ctx, replay_image_name, RBD_FEATURES_ALL,
361 &order, stripe_unit, stripe_count);
362 }
363
364 if (r < 0) {
365 simple_err("failed to create replay image", r);
366 return r;
367 }
368
369 return 0;
370 }
371
372 int replay_journal(rados_ioctx_t ioctx, const char *image_name,
373 const char *replay_image_name) {
374 librados::IoCtx io_ctx;
375 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
376
377 std::string image_id;
378 int r = get_image_id(io_ctx, image_name, &image_id);
379 if (r < 0) {
380 return r;
381 }
382
383 std::string replay_image_id;
384 r = get_image_id(io_ctx, replay_image_name, &replay_image_id);
385 if (r < 0) {
386 return r;
387 }
388
389 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {});
390 C_SaferCond init_ctx;
391 journaler.init(&init_ctx);
392 BOOST_SCOPE_EXIT_ALL( (&journaler) ) {
393 journaler.shut_down();
394 };
395
396 r = init_ctx.wait();
397 if (r < 0) {
398 simple_err("failed to initialize journal", r);
399 return r;
400 }
401
402 journal::Journaler replay_journaler(io_ctx, replay_image_id, "", {});
403
404 C_SaferCond replay_init_ctx;
405 replay_journaler.init(&replay_init_ctx);
406 BOOST_SCOPE_EXIT_ALL( (&replay_journaler) ) {
407 replay_journaler.shut_down();
408 };
409
410 r = replay_init_ctx.wait();
411 if (r < 0) {
412 simple_err("failed to initialize replay journal", r);
413 return r;
414 }
415
416 replay_journaler.start_append(0, 0, 0);
417
418 C_SaferCond replay_ctx;
419 ReplayHandler replay_handler(&journaler, &replay_journaler,
420 &replay_ctx);
421
422 // copy journal events from source image to replay image
423 journaler.start_replay(&replay_handler);
424 r = replay_ctx.wait();
425
426 journaler.stop_replay();
427
428 C_SaferCond stop_ctx;
429 replay_journaler.stop_append(&stop_ctx);
430 int stop_r = stop_ctx.wait();
431 if (r == 0 && stop_r < 0) {
432 r = stop_r;
433 }
434
435 if (r < 0) {
436 simple_err("failed to replay journal", r);
437 return r;
438 }
439
440 librbd::RBD rbd;
441 librbd::Image image;
442 r = rbd.open(io_ctx, image, replay_image_name);
443 if (r < 0) {
444 simple_err("failed to open replay image", r);
445 return r;
446 }
447
448 // perform an IO op to initiate the journal replay
449 bufferlist bl;
450 r = static_cast<ssize_t>(image.write(0, 0, bl));
451 if (r < 0) {
452 simple_err("failed to write to replay image", r);
453 return r;
454 }
455 return 0;
456 }
457
458 int finalize_journal(rados_ioctx_t ioctx, const char *imagename, int clones,
459 int order, uint64_t stripe_unit, int stripe_count) {
460 char replayimagename[1024];
461 replay_imagename(replayimagename, sizeof(replayimagename), clones);
462
463 char lastreplayimagename[1024];
464 if (clones > 0) {
465 replay_imagename(lastreplayimagename,
466 sizeof(lastreplayimagename), clones - 1);
467 }
468
469 int ret = create_replay_image(ioctx, order, stripe_unit,
470 stripe_count, replayimagename,
471 clones > 0 ? lastreplayimagename :
472 nullptr);
473 if (ret < 0) {
474 exit(EXIT_FAILURE);
475 }
476
477 ret = replay_journal(ioctx, imagename, replayimagename);
478 if (ret < 0) {
479 exit(EXIT_FAILURE);
480 }
481 return 0;
482 }
483
484 } // anonymous namespace
485
486 /*
487 * rbd
488 */
489
490 struct rbd_ctx {
491 const char *name; /* image name */
492 rbd_image_t image; /* image handle */
493 const char *krbd_name; /* image /dev/rbd<id> name */ /* reused for nbd test */
494 int krbd_fd; /* image /dev/rbd<id> fd */ /* reused for nbd test */
495 };
496
497 #define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
498
/*
 * Backend dispatch table: one function pointer per image operation.
 * Tables are initialized positionally, so member order matters.
 * writesame is NULL in backends that do not provide it.
 */
struct rbd_operations {
	/* lifecycle */
	int (*open)(const char *name, struct rbd_ctx *ctx);
	int (*close)(struct rbd_ctx *ctx);

	/* data path */
	ssize_t (*read)(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf);
	ssize_t (*write)(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf);
	int (*flush)(struct rbd_ctx *ctx);
	int (*discard)(struct rbd_ctx *ctx, uint64_t off, uint64_t len);

	/* size management */
	int (*get_size)(struct rbd_ctx *ctx, uint64_t *size);
	int (*resize)(struct rbd_ctx *ctx, uint64_t size);

	/* rbd-specific operations */
	int (*clone)(struct rbd_ctx *ctx, const char *src_snapname,
		     const char *dst_imagename, int *order, int stripe_unit,
		     int stripe_count);
	int (*flatten)(struct rbd_ctx *ctx);

	ssize_t (*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			     const char *buf, size_t data_len);
};
515
516 char *pool; /* name of the pool our test image is in */
517 char *iname; /* name of our test image */
518 rados_t cluster; /* handle for our test cluster */
519 rados_ioctx_t ioctx; /* handle for our test pool */
520 struct krbd_ctx *krbd; /* handle for libkrbd */
521 bool skip_partial_discard; /* rbd_skip_partial_discard config value*/
522
523 /*
524 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
525 * attempt to do error handling is made in these handlers.
526 */
527
528 int
529 __librbd_open(const char *name, struct rbd_ctx *ctx)
530 {
531 rbd_image_t image;
532 int ret;
533
534 assert(!ctx->name && !ctx->image &&
535 !ctx->krbd_name && ctx->krbd_fd < 0);
536
537 ret = rbd_open(ioctx, name, &image, NULL);
538 if (ret < 0) {
539 prt("rbd_open(%s) failed\n", name);
540 return ret;
541 }
542
543 ctx->name = strdup(name);
544 ctx->image = image;
545 ctx->krbd_name = NULL;
546 ctx->krbd_fd = -1;
547
548 return 0;
549 }
550
551 int
552 librbd_open(const char *name, struct rbd_ctx *ctx)
553 {
554 return __librbd_open(name, ctx);
555 }
556
557 int
558 __librbd_close(struct rbd_ctx *ctx)
559 {
560 int ret;
561
562 assert(ctx->name && ctx->image);
563
564 ret = rbd_close(ctx->image);
565 if (ret < 0) {
566 prt("rbd_close(%s) failed\n", ctx->name);
567 return ret;
568 }
569
570 free((void *)ctx->name);
571
572 ctx->name = NULL;
573 ctx->image = NULL;
574
575 return 0;
576 }
577
578 int
579 librbd_close(struct rbd_ctx *ctx)
580 {
581 return __librbd_close(ctx);
582 }
583
584 int
585 librbd_verify_object_map(struct rbd_ctx *ctx)
586 {
587 int n;
588 uint64_t flags;
589 n = rbd_get_flags(ctx->image, &flags);
590 if (n < 0) {
591 prt("rbd_get_flags() failed\n");
592 return n;
593 }
594
595 if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
596 prt("rbd_get_flags() indicates object map is invalid\n");
597 return -EINVAL;
598 }
599 return 0;
600 }
601
602 ssize_t
603 librbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
604 {
605 ssize_t n;
606
607 n = rbd_read(ctx->image, off, len, buf);
608 if (n < 0)
609 prt("rbd_read(%llu, %zu) failed\n", off, len);
610
611 return n;
612 }
613
614 ssize_t
615 librbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
616 {
617 ssize_t n;
618 int ret;
619
620 n = rbd_write(ctx->image, off, len, buf);
621 if (n < 0) {
622 prt("rbd_write(%llu, %zu) failed\n", off, len);
623 return n;
624 }
625
626 ret = librbd_verify_object_map(ctx);
627 if (ret < 0) {
628 return ret;
629 }
630 return n;
631 }
632
633 int
634 librbd_flush(struct rbd_ctx *ctx)
635 {
636 int ret;
637
638 ret = rbd_flush(ctx->image);
639 if (ret < 0) {
640 prt("rbd_flush failed\n");
641 return ret;
642 }
643
644 return librbd_verify_object_map(ctx);
645 }
646
647 int
648 librbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
649 {
650 int ret;
651
652 ret = rbd_discard(ctx->image, off, len);
653 if (ret < 0) {
654 prt("rbd_discard(%llu, %llu) failed\n", off, len);
655 return ret;
656 }
657
658 return librbd_verify_object_map(ctx);
659 }
660
661 ssize_t
662 librbd_writesame(struct rbd_ctx *ctx, uint64_t off, size_t len,
663 const char *buf, size_t data_len)
664 {
665 ssize_t n;
666 int ret;
667
668 n = rbd_writesame(ctx->image, off, len, buf, data_len, 0);
669 if (n < 0) {
670 prt("rbd_writesame(%llu, %zu) failed\n", off, len);
671 return n;
672 }
673
674 ret = librbd_verify_object_map(ctx);
675 if (ret < 0) {
676 return ret;
677 }
678 return n;
679 }
680
681 int
682 librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
683 {
684 rbd_image_info_t info;
685 int ret;
686
687 ret = rbd_stat(ctx->image, &info, sizeof(info));
688 if (ret < 0) {
689 prt("rbd_stat failed\n");
690 return ret;
691 }
692
693 *size = info.size;
694
695 return 0;
696 }
697
698 int
699 __librbd_resize(struct rbd_ctx *ctx, uint64_t size)
700 {
701 int ret;
702
703 ret = rbd_resize(ctx->image, size);
704 if (ret < 0) {
705 prt("rbd_resize(%llu) failed\n", size);
706 return ret;
707 }
708
709 return librbd_verify_object_map(ctx);
710 }
711
712 int
713 librbd_resize(struct rbd_ctx *ctx, uint64_t size)
714 {
715 return __librbd_resize(ctx, size);
716 }
717
718 int
719 __librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
720 const char *dst_imagename, int *order, int stripe_unit,
721 int stripe_count, bool krbd)
722 {
723 int ret;
724
725 ret = rbd_snap_create(ctx->image, src_snapname);
726 if (ret < 0) {
727 prt("rbd_snap_create(%s@%s) failed\n", ctx->name,
728 src_snapname);
729 return ret;
730 }
731
732 ret = rbd_snap_protect(ctx->image, src_snapname);
733 if (ret < 0) {
734 prt("rbd_snap_protect(%s@%s) failed\n", ctx->name,
735 src_snapname);
736 return ret;
737 }
738
739 uint64_t features = RBD_FEATURES_ALL;
740 if (krbd) {
741 features &= ~(RBD_FEATURE_OBJECT_MAP |
742 RBD_FEATURE_FAST_DIFF |
743 RBD_FEATURE_DEEP_FLATTEN |
744 RBD_FEATURE_JOURNALING);
745 }
746 ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
747 dst_imagename, features, order,
748 stripe_unit, stripe_count);
749 if (ret < 0) {
750 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx->name,
751 src_snapname, dst_imagename);
752 return ret;
753 }
754
755 return 0;
756 }
757
758 int
759 librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
760 const char *dst_imagename, int *order, int stripe_unit,
761 int stripe_count)
762 {
763 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
764 stripe_unit, stripe_count, false);
765 }
766
767 int
768 __librbd_flatten(struct rbd_ctx *ctx)
769 {
770 int ret;
771
772 ret = rbd_flatten(ctx->image);
773 if (ret < 0) {
774 prt("rbd_flatten failed\n");
775 return ret;
776 }
777
778 return librbd_verify_object_map(ctx);
779 }
780
781 int
782 librbd_flatten(struct rbd_ctx *ctx)
783 {
784 return __librbd_flatten(ctx);
785 }
786
787 const struct rbd_operations librbd_operations = {
788 librbd_open,
789 librbd_close,
790 librbd_read,
791 librbd_write,
792 librbd_flush,
793 librbd_discard,
794 librbd_get_size,
795 librbd_resize,
796 librbd_clone,
797 librbd_flatten,
798 librbd_writesame,
799 };
800
801 int
802 krbd_open(const char *name, struct rbd_ctx *ctx)
803 {
804 char *devnode;
805 int fd;
806 int ret;
807
808 ret = __librbd_open(name, ctx);
809 if (ret < 0)
810 return ret;
811
812 ret = krbd_map(krbd, pool, name, "", "", &devnode);
813 if (ret < 0) {
814 prt("krbd_map(%s) failed\n", name);
815 return ret;
816 }
817
818 fd = open(devnode, O_RDWR | o_direct);
819 if (fd < 0) {
820 ret = -errno;
821 prt("open(%s) failed\n", devnode);
822 return ret;
823 }
824
825 ctx->krbd_name = devnode;
826 ctx->krbd_fd = fd;
827
828 return 0;
829 }
830
831 int
832 krbd_close(struct rbd_ctx *ctx)
833 {
834 int ret;
835
836 assert(ctx->krbd_name && ctx->krbd_fd >= 0);
837
838 if (close(ctx->krbd_fd) < 0) {
839 ret = -errno;
840 prt("close(%s) failed\n", ctx->krbd_name);
841 return ret;
842 }
843
844 ret = krbd_unmap(krbd, ctx->krbd_name, "");
845 if (ret < 0) {
846 prt("krbd_unmap(%s) failed\n", ctx->krbd_name);
847 return ret;
848 }
849
850 free((void *)ctx->krbd_name);
851
852 ctx->krbd_name = NULL;
853 ctx->krbd_fd = -1;
854
855 return __librbd_close(ctx);
856 }
857
858 ssize_t
859 krbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
860 {
861 ssize_t n;
862
863 n = pread(ctx->krbd_fd, buf, len, off);
864 if (n < 0) {
865 n = -errno;
866 prt("pread(%llu, %zu) failed\n", off, len);
867 return n;
868 }
869
870 return n;
871 }
872
873 ssize_t
874 krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
875 {
876 ssize_t n;
877
878 n = pwrite(ctx->krbd_fd, buf, len, off);
879 if (n < 0) {
880 n = -errno;
881 prt("pwrite(%llu, %zu) failed\n", off, len);
882 return n;
883 }
884
885 return n;
886 }
887
888 int
889 __krbd_flush(struct rbd_ctx *ctx, bool invalidate)
890 {
891 int ret;
892
893 if (o_direct)
894 return 0;
895
896 /*
897 * BLKFLSBUF will sync the filesystem on top of the device (we
898 * don't care about that here, since we write directly to it),
899 * write out any dirty buffers and invalidate the buffer cache.
900 * It won't do a hardware cache flush.
901 *
902 * fsync() will write out any dirty buffers and do a hardware
903 * cache flush (which we don't care about either, because for
904 * krbd it's a noop). It won't try to empty the buffer cache
905 * nor poke the filesystem before writing out.
906 *
907 * Given that, for our purposes, fsync is a flush, while
908 * BLKFLSBUF is a flush+invalidate.
909 */
910 if (invalidate)
911 ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
912 else
913 ret = fsync(ctx->krbd_fd);
914 if (ret < 0) {
915 ret = -errno;
916 prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
917 return ret;
918 }
919
920 return 0;
921 }
922
923 int
924 krbd_flush(struct rbd_ctx *ctx)
925 {
926 return __krbd_flush(ctx, false);
927 }
928
929 int
930 krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
931 {
932 uint64_t range[2] = { off, len };
933 int ret;
934
935 /*
936 * BLKDISCARD goes straight to disk and doesn't do anything
937 * about dirty buffers. This means we need to flush so that
938 *
939 * write 0..3M
940 * discard 1..2M
941 *
942 * results in "data 0000 data" rather than "data data data" on
943 * disk and invalidate so that
944 *
945 * discard 1..2M
946 * read 0..3M
947 *
948 * returns "data 0000 data" rather than "data data data" in
949 * case 1..2M was cached.
950 */
951 ret = __krbd_flush(ctx, true);
952 if (ret < 0)
953 return ret;
954
955 /*
956 * off and len must be 512-byte aligned, otherwise BLKDISCARD
957 * will fail with -EINVAL. This means that -K (enable krbd
958 * mode) requires -h 512 or similar.
959 */
960 if (ioctl(ctx->krbd_fd, BLKDISCARD, &range) < 0) {
961 ret = -errno;
962 prt("BLKDISCARD(%llu, %llu) failed\n", off, len);
963 return ret;
964 }
965
966 return 0;
967 }
968
969 int
970 krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
971 {
972 uint64_t bytes;
973
974 if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
975 int ret = -errno;
976 prt("BLKGETSIZE64 failed\n");
977 return ret;
978 }
979
980 *size = bytes;
981
982 return 0;
983 }
984
985 int
986 krbd_resize(struct rbd_ctx *ctx, uint64_t size)
987 {
988 int ret;
989
990 assert(size % truncbdy == 0);
991
992 /*
993 * When krbd detects a size change, it calls revalidate_disk(),
994 * which ends up calling invalidate_bdev(), which invalidates
995 * clean pages and does nothing about dirty pages beyond the
996 * new size. The preceding cache flush makes sure those pages
997 * are invalidated, which is what we need on shrink so that
998 *
999 * write 0..1M
1000 * resize 0
1001 * resize 2M
1002 * read 0..2M
1003 *
1004 * returns "0000 0000" rather than "data 0000".
1005 */
1006 ret = __krbd_flush(ctx, false);
1007 if (ret < 0)
1008 return ret;
1009
1010 return __librbd_resize(ctx, size);
1011 }
1012
1013 int
1014 krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1015 const char *dst_imagename, int *order, int stripe_unit,
1016 int stripe_count)
1017 {
1018 int ret;
1019
1020 ret = __krbd_flush(ctx, false);
1021 if (ret < 0)
1022 return ret;
1023
1024 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1025 stripe_unit, stripe_count, true);
1026 }
1027
1028 int
1029 krbd_flatten(struct rbd_ctx *ctx)
1030 {
1031 int ret;
1032
1033 ret = __krbd_flush(ctx, false);
1034 if (ret < 0)
1035 return ret;
1036
1037 return __librbd_flatten(ctx);
1038 }
1039
1040 const struct rbd_operations krbd_operations = {
1041 krbd_open,
1042 krbd_close,
1043 krbd_read,
1044 krbd_write,
1045 krbd_flush,
1046 krbd_discard,
1047 krbd_get_size,
1048 krbd_resize,
1049 krbd_clone,
1050 krbd_flatten,
1051 NULL,
1052 };
1053
1054 int
1055 nbd_open(const char *name, struct rbd_ctx *ctx)
1056 {
1057 int r;
1058 int fd;
1059 char dev[4096];
1060 char *devnode;
1061
1062 SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE,
1063 SubProcess::KEEP);
1064 process.add_cmd_arg("map");
1065 std::string img;
1066 img.append(pool);
1067 img.append("/");
1068 img.append(name);
1069 process.add_cmd_arg(img.c_str());
1070
1071 r = __librbd_open(name, ctx);
1072 if (r < 0)
1073 return r;
1074
1075 r = process.spawn();
1076 if (r < 0) {
1077 prt("nbd_open failed to run rbd-nbd error: %s\n", process.err().c_str());
1078 return r;
1079 }
1080 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1081 if (r < 0) {
1082 prt("nbd_open failed to get nbd device path\n");
1083 return r;
1084 }
1085 for (int i = 0; i < r; ++i)
1086 if (dev[i] == 10 || dev[i] == 13)
1087 dev[i] = 0;
1088 dev[r] = 0;
1089 r = process.join();
1090 if (r) {
1091 prt("rbd-nbd failed with error: %s", process.err().c_str());
1092 return -EINVAL;
1093 }
1094
1095 devnode = strdup(dev);
1096 if (!devnode)
1097 return -ENOMEM;
1098
1099 fd = open(devnode, O_RDWR | o_direct);
1100 if (fd < 0) {
1101 r = -errno;
1102 prt("open(%s) failed\n", devnode);
1103 return r;
1104 }
1105
1106 ctx->krbd_name = devnode;
1107 ctx->krbd_fd = fd;
1108
1109 return 0;
1110 }
1111
1112 int
1113 nbd_close(struct rbd_ctx *ctx)
1114 {
1115 int r;
1116
1117 assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1118
1119 if (close(ctx->krbd_fd) < 0) {
1120 r = -errno;
1121 prt("close(%s) failed\n", ctx->krbd_name);
1122 return r;
1123 }
1124
1125 SubProcess process("rbd-nbd");
1126 process.add_cmd_arg("unmap");
1127 process.add_cmd_arg(ctx->krbd_name);
1128
1129 r = process.spawn();
1130 if (r < 0) {
1131 prt("nbd_close failed to run rbd-nbd error: %s\n", process.err().c_str());
1132 return r;
1133 }
1134 r = process.join();
1135 if (r) {
1136 prt("rbd-nbd failed with error: %d", process.err().c_str());
1137 return -EINVAL;
1138 }
1139
1140 free((void *)ctx->krbd_name);
1141
1142 ctx->krbd_name = NULL;
1143 ctx->krbd_fd = -1;
1144
1145 return __librbd_close(ctx);
1146 }
1147
1148 int
1149 nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1150 const char *dst_imagename, int *order, int stripe_unit,
1151 int stripe_count)
1152 {
1153 int ret;
1154
1155 ret = __krbd_flush(ctx, false);
1156 if (ret < 0)
1157 return ret;
1158
1159 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
1160 stripe_unit, stripe_count, false);
1161 }
1162
1163 const struct rbd_operations nbd_operations = {
1164 nbd_open,
1165 nbd_close,
1166 krbd_read,
1167 krbd_write,
1168 krbd_flush,
1169 krbd_discard,
1170 krbd_get_size,
1171 krbd_resize,
1172 nbd_clone,
1173 krbd_flatten,
1174 NULL,
1175 };
1176
1177 struct rbd_ctx ctx = RBD_CTX_INIT;
1178 const struct rbd_operations *ops = &librbd_operations;
1179
1180 static bool rbd_image_has_parent(struct rbd_ctx *ctx)
1181 {
1182 int ret;
1183
1184 ret = rbd_get_parent_info(ctx->image, NULL, 0, NULL, 0, NULL, 0);
1185 if (ret < 0 && ret != -ENOENT) {
1186 prterrcode("rbd_get_parent_info", ret);
1187 exit(1);
1188 }
1189
1190 return !ret;
1191 }
1192
1193 /*
1194 * fsx
1195 */
1196
1197 void
1198 log4(int operation, int arg0, int arg1, int arg2)
1199 {
1200 struct log_entry *le;
1201
1202 le = &oplog[logptr];
1203 le->operation = operation;
1204 if (closeopen)
1205 le->operation = ~ le->operation;
1206 le->args[0] = arg0;
1207 le->args[1] = arg1;
1208 le->args[2] = arg2;
1209 logptr++;
1210 logcount++;
1211 if (logptr >= LOGSIZE)
1212 logptr = 0;
1213 }
1214
1215 void
1216 logdump(void)
1217 {
1218 int i, count, down;
1219 struct log_entry *lp;
1220 const char *falloc_type[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};
1221
1222 prt("LOG DUMP (%d total operations):\n", logcount);
1223 if (logcount < LOGSIZE) {
1224 i = 0;
1225 count = logcount;
1226 } else {
1227 i = logptr;
1228 count = LOGSIZE;
1229 }
1230 for ( ; count > 0; count--) {
1231 int opnum;
1232
1233 opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
1234 prt("%d(%3d mod 256): ", opnum, opnum%256);
1235 lp = &oplog[i];
1236 if ((closeopen = lp->operation < 0))
1237 lp->operation = ~ lp->operation;
1238
1239 switch (lp->operation) {
1240 case OP_MAPREAD:
1241 prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
1242 lp->args[0], lp->args[0] + lp->args[1] - 1,
1243 lp->args[1]);
1244 if (badoff >= lp->args[0] && badoff <
1245 lp->args[0] + lp->args[1])
1246 prt("\t***RRRR***");
1247 break;
1248 case OP_MAPWRITE:
1249 prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
1250 lp->args[0], lp->args[0] + lp->args[1] - 1,
1251 lp->args[1]);
1252 if (badoff >= lp->args[0] && badoff <
1253 lp->args[0] + lp->args[1])
1254 prt("\t******WWWW");
1255 break;
1256 case OP_READ:
1257 prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
1258 lp->args[0], lp->args[0] + lp->args[1] - 1,
1259 lp->args[1]);
1260 if (badoff >= lp->args[0] &&
1261 badoff < lp->args[0] + lp->args[1])
1262 prt("\t***RRRR***");
1263 break;
1264 case OP_WRITE:
1265 prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1266 lp->args[0], lp->args[0] + lp->args[1] - 1,
1267 lp->args[1]);
1268 if (lp->args[0] > lp->args[2])
1269 prt(" HOLE");
1270 else if (lp->args[0] + lp->args[1] > lp->args[2])
1271 prt(" EXTEND");
1272 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1273 badoff < lp->args[0] + lp->args[1])
1274 prt("\t***WWWW");
1275 break;
1276 case OP_TRUNCATE:
1277 down = lp->args[0] < lp->args[1];
1278 prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
1279 down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
1280 if (badoff >= lp->args[!down] &&
1281 badoff < lp->args[!!down])
1282 prt("\t******WWWW");
1283 break;
1284 case OP_FALLOCATE:
1285 /* 0: offset 1: length 2: where alloced */
1286 prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
1287 lp->args[0], lp->args[0] + lp->args[1],
1288 lp->args[1], falloc_type[lp->args[2]]);
1289 if (badoff >= lp->args[0] &&
1290 badoff < lp->args[0] + lp->args[1])
1291 prt("\t******FFFF");
1292 break;
1293 case OP_PUNCH_HOLE:
1294 prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
1295 lp->args[0], lp->args[0] + lp->args[1] - 1,
1296 lp->args[1]);
1297 if (badoff >= lp->args[0] && badoff <
1298 lp->args[0] + lp->args[1])
1299 prt("\t******PPPP");
1300 break;
1301 case OP_WRITESAME:
1302 prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
1303 lp->args[0], lp->args[0] + lp->args[1] - 1,
1304 lp->args[1], lp->args[2]);
1305 if (badoff >= lp->args[0] &&
1306 badoff < lp->args[0] + lp->args[1])
1307 prt("\t***WSWSWSWS");
1308 break;
1309 case OP_CLONE:
1310 prt("CLONE");
1311 break;
1312 case OP_FLATTEN:
1313 prt("FLATTEN");
1314 break;
1315 case OP_SKIPPED:
1316 prt("SKIPPED (no operation)");
1317 break;
1318 default:
1319 prt("BOGUS LOG ENTRY (operation code = %d)!",
1320 lp->operation);
1321 }
1322 if (closeopen)
1323 prt("\n\t\tCLOSE/OPEN");
1324 prt("\n");
1325 i++;
1326 if (i == LOGSIZE)
1327 i = 0;
1328 }
1329 }
1330
1331 void
1332 save_buffer(char *buffer, off_t bufferlength, int fd)
1333 {
1334 off_t ret;
1335 ssize_t byteswritten;
1336
1337 if (fd <= 0 || bufferlength == 0)
1338 return;
1339
1340 if (bufferlength > SSIZE_MAX) {
1341 prt("fsx flaw: overflow in save_buffer\n");
1342 exit(67);
1343 }
1344
1345 ret = lseek(fd, (off_t)0, SEEK_SET);
1346 if (ret == (off_t)-1)
1347 prterr("save_buffer: lseek 0");
1348
1349 byteswritten = write(fd, buffer, (size_t)bufferlength);
1350 if (byteswritten != bufferlength) {
1351 if (byteswritten == -1)
1352 prterr("save_buffer write");
1353 else
1354 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1355 (unsigned)byteswritten,
1356 (unsigned long long)bufferlength);
1357 }
1358 }
1359
1360
1361 void
1362 report_failure(int status)
1363 {
1364 logdump();
1365
1366 if (fsxgoodfd) {
1367 if (good_buf) {
1368 save_buffer(good_buf, file_size, fsxgoodfd);
1369 prt("Correct content saved for comparison\n");
1370 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1371 iname, iname);
1372 }
1373 close(fsxgoodfd);
1374 }
1375 sleep(3); // so the log can flush to disk. KLUDGEY!
1376 exit(status);
1377 }
1378
/*
 * Build a 16-bit value from the two bytes at cp: the byte at cp supplies
 * the high 8 bits and the byte at cp + 1 the low 8 bits.  cp is expanded
 * twice, and the byte following cp is always read.
 */
#define short_at(cp) ((unsigned short)((*((unsigned char *)(cp)) << 8) | \
			*(((unsigned char *)(cp)) + 1)))
1381
1382 int
1383 fsxcmp(char *good_buf, char *temp_buf, unsigned size)
1384 {
1385 if (!skip_partial_discard) {
1386 return memcmp(good_buf, temp_buf, size);
1387 }
1388
1389 for (unsigned i = 0; i < size; i++) {
1390 if (good_buf[i] != temp_buf[i] && good_buf[i] != 0) {
1391 return good_buf[i] - temp_buf[i];
1392 }
1393 }
1394 return 0;
1395 }
1396
1397 void
1398 check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
1399 {
1400 if (fsxcmp(good_buf + offset, temp_buf, size) != 0) {
1401 unsigned i = 0;
1402 unsigned n = 0;
1403
1404 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1405 offset, size, iname);
1406 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1407 while (size > 0) {
1408 unsigned char c = good_buf[offset];
1409 unsigned char t = temp_buf[i];
1410 if (c != t) {
1411 if (n < 16) {
1412 unsigned bad = short_at(&temp_buf[i]);
1413 prt("0x%5x\t0x%04x\t0x%04x", offset,
1414 short_at(&good_buf[offset]), bad);
1415 unsigned op = temp_buf[(offset & 1) ? i+1 : i];
1416 prt("\t0x%5x\n", n);
1417 if (op)
1418 prt("operation# (mod 256) for "
1419 "the bad data may be %u\n",
1420 ((unsigned)op & 0xff));
1421 else
1422 prt("operation# (mod 256) for "
1423 "the bad data unknown, check"
1424 " HOLE and EXTEND ops\n");
1425 }
1426 n++;
1427 badoff = offset;
1428 }
1429 offset++;
1430 i++;
1431 size--;
1432 }
1433 report_failure(110);
1434 }
1435 }
1436
1437
1438 void
1439 check_size(void)
1440 {
1441 uint64_t size;
1442 int ret;
1443
1444 ret = ops->get_size(&ctx, &size);
1445 if (ret < 0)
1446 prterrcode("check_size: ops->get_size", ret);
1447
1448 if ((uint64_t)file_size != size) {
1449 prt("Size error: expected 0x%llx stat 0x%llx\n",
1450 (unsigned long long)file_size,
1451 (unsigned long long)size);
1452 report_failure(120);
1453 }
1454 }
1455
/* Image size used by check_trunc_hack(): 200 << 9 = 102400 bytes. */
#define TRUNC_HACK_SIZE (200ULL << 9)	/* 512-byte aligned for krbd */
1457
1458 void
1459 check_trunc_hack(void)
1460 {
1461 uint64_t size;
1462 int ret;
1463
1464 ret = ops->resize(&ctx, 0ULL);
1465 if (ret < 0)
1466 prterrcode("check_trunc_hack: ops->resize pre", ret);
1467
1468 ret = ops->resize(&ctx, TRUNC_HACK_SIZE);
1469 if (ret < 0)
1470 prterrcode("check_trunc_hack: ops->resize actual", ret);
1471
1472 ret = ops->get_size(&ctx, &size);
1473 if (ret < 0)
1474 prterrcode("check_trunc_hack: ops->get_size", ret);
1475
1476 if (size != TRUNC_HACK_SIZE) {
1477 prt("no extend on truncate! not posix!\n");
1478 exit(130);
1479 }
1480
1481 ret = ops->resize(&ctx, 0ULL);
1482 if (ret < 0)
1483 prterrcode("check_trunc_hack: ops->resize post", ret);
1484 }
1485
/*
 * Set up the cluster connection and create the test image.
 *
 * Steps, in order: create and configure the rados handle (environment,
 * then config file), connect, create the krbd handle, create the pool
 * (an already existing pool is accepted), create the ioctx, create the
 * image iname with size 0, register the journal when journal_replay is
 * set, and read the rbd_skip_partial_discard option into
 * skip_partial_discard.
 *
 * Returns 0 on success, leaving cluster, krbd and ioctx set up.  On
 * failure returns the negative error code of the failing call after
 * tearing down what had been set up so far.
 */
int
create_image()
{
	int r;
	int order = 0;
	char buf[32];

	r = rados_create(&cluster, NULL);
	if (r < 0) {
		simple_err("Could not create cluster handle", r);
		return r;
	}
	/* The return value of the environment parse is not checked. */
	rados_conf_parse_env(cluster, NULL);
	r = rados_conf_read_file(cluster, NULL);
	if (r < 0) {
		simple_err("Error reading ceph config file", r);
		goto failed_shutdown;
	}
	r = rados_connect(cluster);
	if (r < 0) {
		simple_err("Error connecting to cluster", r);
		goto failed_shutdown;
	}
	r = krbd_create_from_context(rados_cct(cluster), &krbd);
	if (r < 0) {
		simple_err("Could not create libkrbd handle", r);
		goto failed_shutdown;
	}

	r = rados_pool_create(cluster, pool);
	if (r < 0 && r != -EEXIST) {
		simple_err("Error creating pool", r);
		goto failed_krbd;
	}
	r = rados_ioctx_create(cluster, pool, &ioctx);
	if (r < 0) {
		simple_err("Error creating ioctx", r);
		goto failed_krbd;
	}
	/* Request image features only when cloning or journaling needs them. */
	if (clone_calls || journal_replay) {
		uint64_t features = 0;
		if (clone_calls) {
			features |= RBD_FEATURE_LAYERING;
		}
		if (journal_replay) {
			features |= (RBD_FEATURE_EXCLUSIVE_LOCK |
				     RBD_FEATURE_JOURNALING);
		}
		r = rbd_create2(ioctx, iname, 0, features, &order);
	} else {
		r = rbd_create(ioctx, iname, 0, &order);
	}
	if (r < 0) {
		simple_err("Error creating image", r);
		goto failed_open;
	}

	if (journal_replay) {
		r = register_journal(ioctx, iname);
		if (r < 0) {
			/* no message is printed here for this failure */
			goto failed_open;
		}
	}

	r = rados_conf_get(cluster, "rbd_skip_partial_discard", buf,
			   sizeof(buf));
	if (r < 0) {
		simple_err("Could not get rbd_skip_partial_discard value", r);
		goto failed_open;
	}
	skip_partial_discard = (strcmp(buf, "true") == 0);

	return 0;

	/*
	 * Cleanup ladder: each label releases one resource and falls through
	 * to the next.  NOTE(review): the image itself is not removed when a
	 * step after its creation fails - confirm this is intended.
	 */
failed_open:
	rados_ioctx_destroy(ioctx);
failed_krbd:
	krbd_destroy(krbd);
failed_shutdown:
	rados_shutdown(cluster);
	return r;
}
1568
1569 void
1570 doflush(unsigned offset, unsigned size)
1571 {
1572 int ret;
1573
1574 if (o_direct)
1575 return;
1576
1577 ret = ops->flush(&ctx);
1578 if (ret < 0)
1579 prterrcode("doflush: ops->flush", ret);
1580 }
1581
1582 void
1583 doread(unsigned offset, unsigned size)
1584 {
1585 int ret;
1586
1587 offset -= offset % readbdy;
1588 if (o_direct)
1589 size -= size % readbdy;
1590 if (size == 0) {
1591 if (!quiet && testcalls > simulatedopcount && !o_direct)
1592 prt("skipping zero size read\n");
1593 log4(OP_SKIPPED, OP_READ, offset, size);
1594 return;
1595 }
1596 if (size + offset > file_size) {
1597 if (!quiet && testcalls > simulatedopcount)
1598 prt("skipping seek/read past end of file\n");
1599 log4(OP_SKIPPED, OP_READ, offset, size);
1600 return;
1601 }
1602
1603 log4(OP_READ, offset, size, 0);
1604
1605 if (testcalls <= simulatedopcount)
1606 return;
1607
1608 if (!quiet &&
1609 ((progressinterval && testcalls % progressinterval == 0) ||
1610 (debug &&
1611 (monitorstart == -1 ||
1612 (static_cast<long>(offset + size) > monitorstart &&
1613 (monitorend == -1 ||
1614 static_cast<long>(offset) <= monitorend))))))
1615 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
1616 offset, offset + size - 1, size);
1617
1618 ret = ops->read(&ctx, offset, size, temp_buf);
1619 if (ret != (int)size) {
1620 if (ret < 0)
1621 prterrcode("doread: ops->read", ret);
1622 else
1623 prt("short read: 0x%x bytes instead of 0x%x\n",
1624 ret, size);
1625 report_failure(141);
1626 }
1627
1628 check_buffers(good_buf, temp_buf, offset, size);
1629 }
1630
1631
1632 void
1633 check_eofpage(char *s, unsigned offset, char *p, int size)
1634 {
1635 unsigned long last_page, should_be_zero;
1636
1637 if (offset + size <= (file_size & ~page_mask))
1638 return;
1639 /*
1640 * we landed in the last page of the file
1641 * test to make sure the VM system provided 0's
1642 * beyond the true end of the file mapping
1643 * (as required by mmap def in 1996 posix 1003.1)
1644 */
1645 last_page = ((unsigned long)p + (offset & page_mask) + size) & ~page_mask;
1646
1647 for (should_be_zero = last_page + (file_size & page_mask);
1648 should_be_zero < last_page + page_size;
1649 should_be_zero++)
1650 if (*(char *)should_be_zero) {
1651 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
1652 s, file_size - 1, should_be_zero & page_mask,
1653 short_at(should_be_zero));
1654 report_failure(205);
1655 }
1656 }
1657
1658
1659 void
1660 gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
1661 {
1662 while (size--) {
1663 good_buf[offset] = testcalls % 256;
1664 if (offset % 2)
1665 good_buf[offset] += original_buf[offset];
1666 offset++;
1667 }
1668 }
1669
1670
1671 void
1672 dowrite(unsigned offset, unsigned size)
1673 {
1674 ssize_t ret;
1675 off_t newsize;
1676
1677 offset -= offset % writebdy;
1678 if (o_direct)
1679 size -= size % writebdy;
1680 if (size == 0) {
1681 if (!quiet && testcalls > simulatedopcount && !o_direct)
1682 prt("skipping zero size write\n");
1683 log4(OP_SKIPPED, OP_WRITE, offset, size);
1684 return;
1685 }
1686
1687 log4(OP_WRITE, offset, size, file_size);
1688
1689 gendata(original_buf, good_buf, offset, size);
1690 if (file_size < offset + size) {
1691 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
1692 if (file_size < newsize)
1693 memset(good_buf + file_size, '\0', newsize - file_size);
1694 file_size = newsize;
1695 if (lite) {
1696 warn("Lite file size bug in fsx!");
1697 report_failure(149);
1698 }
1699 ret = ops->resize(&ctx, newsize);
1700 if (ret < 0) {
1701 prterrcode("dowrite: ops->resize", ret);
1702 report_failure(150);
1703 }
1704 }
1705
1706 if (testcalls <= simulatedopcount)
1707 return;
1708
1709 if (!quiet &&
1710 ((progressinterval && testcalls % progressinterval == 0) ||
1711 (debug &&
1712 (monitorstart == -1 ||
1713 (static_cast<long>(offset + size) > monitorstart &&
1714 (monitorend == -1 ||
1715 static_cast<long>(offset) <= monitorend))))))
1716 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
1717 offset, offset + size - 1, size);
1718
1719 ret = ops->write(&ctx, offset, size, good_buf + offset);
1720 if (ret != (ssize_t)size) {
1721 if (ret < 0)
1722 prterrcode("dowrite: ops->write", ret);
1723 else
1724 prt("short write: 0x%x bytes instead of 0x%x\n",
1725 ret, size);
1726 report_failure(151);
1727 }
1728
1729 if (flush_enabled)
1730 doflush(offset, size);
1731 }
1732
1733
1734 void
1735 dotruncate(unsigned size)
1736 {
1737 int oldsize = file_size;
1738 int ret;
1739
1740 size -= size % truncbdy;
1741 if (size > biggest) {
1742 biggest = size;
1743 if (!quiet && testcalls > simulatedopcount)
1744 prt("truncating to largest ever: 0x%x\n", size);
1745 }
1746
1747 log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
1748
1749 if (size > file_size)
1750 memset(good_buf + file_size, '\0', size - file_size);
1751 else if (size < file_size)
1752 memset(good_buf + size, '\0', file_size - size);
1753 file_size = size;
1754
1755 if (testcalls <= simulatedopcount)
1756 return;
1757
1758 if ((progressinterval && testcalls % progressinterval == 0) ||
1759 (debug && (monitorstart == -1 || monitorend == -1 ||
1760 (long)size <= monitorend)))
1761 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, oldsize, size);
1762
1763 ret = ops->resize(&ctx, size);
1764 if (ret < 0) {
1765 prterrcode("dotruncate: ops->resize", ret);
1766 report_failure(160);
1767 }
1768 }
1769
1770 void
1771 do_punch_hole(unsigned offset, unsigned length)
1772 {
1773 unsigned end_offset;
1774 int max_offset = 0;
1775 int max_len = 0;
1776 int ret;
1777
1778 offset -= offset % holebdy;
1779 length -= length % holebdy;
1780 if (length == 0) {
1781 if (!quiet && testcalls > simulatedopcount)
1782 prt("skipping zero length punch hole\n");
1783 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
1784 return;
1785 }
1786
1787 if (file_size <= (loff_t)offset) {
1788 if (!quiet && testcalls > simulatedopcount)
1789 prt("skipping hole punch off the end of the file\n");
1790 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
1791 return;
1792 }
1793
1794 end_offset = offset + length;
1795
1796 log4(OP_PUNCH_HOLE, offset, length, 0);
1797
1798 if (testcalls <= simulatedopcount)
1799 return;
1800
1801 if ((progressinterval && testcalls % progressinterval == 0) ||
1802 (debug && (monitorstart == -1 || monitorend == -1 ||
1803 (long)end_offset <= monitorend))) {
1804 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls,
1805 offset, offset+length, length);
1806 }
1807
1808 ret = ops->discard(&ctx, (unsigned long long)offset,
1809 (unsigned long long)length);
1810 if (ret < 0) {
1811 prterrcode("do_punch_hole: ops->discard", ret);
1812 report_failure(161);
1813 }
1814
1815 max_offset = offset < file_size ? offset : file_size;
1816 max_len = max_offset + length <= file_size ? length :
1817 file_size - max_offset;
1818 memset(good_buf + max_offset, '\0', max_len);
1819 }
1820
1821 unsigned get_data_size(unsigned size)
1822 {
1823 unsigned i;
1824 unsigned hint;
1825 unsigned max = sqrt((double)size) + 1;
1826 unsigned good = 1;
1827 unsigned curr = good;
1828
1829 hint = get_random() % max;
1830
1831 for (i = 1; i < max && curr < hint; i++) {
1832 if (size % i == 0) {
1833 good = curr;
1834 curr = i;
1835 }
1836 }
1837
1838 if (curr == hint)
1839 good = curr;
1840
1841 return good;
1842 }
1843
1844 void
1845 dowritesame(unsigned offset, unsigned size)
1846 {
1847 ssize_t ret;
1848 off_t newsize;
1849 unsigned buf_off;
1850 unsigned data_size;
1851 int n;
1852
1853 offset -= offset % writebdy;
1854 if (o_direct)
1855 size -= size % writebdy;
1856 if (size == 0) {
1857 if (!quiet && testcalls > simulatedopcount && !o_direct)
1858 prt("skipping zero size writesame\n");
1859 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
1860 return;
1861 }
1862
1863 data_size = get_data_size(size);
1864
1865 log4(OP_WRITESAME, offset, size, data_size);
1866
1867 gendata(original_buf, good_buf, offset, data_size);
1868 if (file_size < offset + size) {
1869 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
1870 if (file_size < newsize)
1871 memset(good_buf + file_size, '\0', newsize - file_size);
1872 file_size = newsize;
1873 if (lite) {
1874 warn("Lite file size bug in fsx!");
1875 report_failure(162);
1876 }
1877 ret = ops->resize(&ctx, newsize);
1878 if (ret < 0) {
1879 prterrcode("dowritesame: ops->resize", ret);
1880 report_failure(163);
1881 }
1882 }
1883
1884 for (n = size / data_size, buf_off = data_size; n > 1; n--) {
1885 memcpy(good_buf + offset + buf_off, good_buf + offset, data_size);
1886 buf_off += data_size;
1887 }
1888
1889 if (testcalls <= simulatedopcount)
1890 return;
1891
1892 if (!quiet &&
1893 ((progressinterval && testcalls % progressinterval == 0) ||
1894 (debug &&
1895 (monitorstart == -1 ||
1896 (static_cast<long>(offset + size) > monitorstart &&
1897 (monitorend == -1 ||
1898 static_cast<long>(offset) <= monitorend))))))
1899 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls,
1900 offset, offset + size - 1, data_size, size);
1901
1902 ret = ops->writesame(&ctx, offset, size, good_buf + offset, data_size);
1903 if (ret != (ssize_t)size) {
1904 if (ret < 0)
1905 prterrcode("dowritesame: ops->writesame", ret);
1906 else
1907 prt("short writesame: 0x%x bytes instead of 0x%x\n",
1908 ret, size);
1909 report_failure(164);
1910 }
1911
1912 if (flush_enabled)
1913 doflush(offset, size);
1914 }
1915
1916 void clone_filename(char *buf, size_t len, int clones)
1917 {
1918 snprintf(buf, len, "%s/fsx-%s-parent%d",
1919 dirpath, iname, clones);
1920 }
1921
1922 void clone_imagename(char *buf, size_t len, int clones)
1923 {
1924 if (clones > 0)
1925 snprintf(buf, len, "%s-clone%d", iname, clones);
1926 else
1927 strncpy(buf, iname, len);
1928 buf[len - 1] = '\0';
1929 }
1930
1931 void replay_imagename(char *buf, size_t len, int clones)
1932 {
1933 clone_imagename(buf, len, clones);
1934 strncat(buf, "-replay", len - strlen(buf));
1935 buf[len - 1] = '\0';
1936 }
1937
1938 void check_clone(int clonenum, bool replay_image);
1939
/*
 * Clone the currently open image and switch the test over to the clone.
 *
 * Steps, in order:
 *  - log the operation and bump num_clones;
 *  - optionally draw random striping parameters;
 *  - create the clone through ops->clone();
 *  - when randomize_parent_overlap is set and the current image has a
 *    parent, randomly shrink it, flatten it, or leave it alone;
 *  - save the expected content of the current image to a local file;
 *  - close the current image, handle the journals when journal_replay is
 *    set, and open the new clone in ctx;
 *  - from the second clone on, verify the image two generations back
 *    against its saved file.
 *
 * Any failure terminates the process through exit().
 */
void
do_clone()
{
	char filename[1024];
	char imagename[1024];
	char lastimagename[1024];
	int ret, fd;
	int order = 0, stripe_unit = 0, stripe_count = 0;
	/* number of good_buf bytes saved below; lowered if the image is shrunk */
	uint64_t newsize = file_size;

	log4(OP_CLONE, 0, 0, 0);
	++num_clones;

	if (randomize_striping) {
		order = 18 + get_random() % 8;
		stripe_unit = 1ull << (order - 1 - (get_random() % 8));
		stripe_count = 2 + get_random() % 14;
	}

	prt("%lu clone\t%d order %d su %d sc %d\n", testcalls, num_clones,
	    order, stripe_unit, stripe_count);

	clone_imagename(imagename, sizeof(imagename), num_clones);
	clone_imagename(lastimagename, sizeof(lastimagename),
			num_clones - 1);
	/* the image open in ctx must be the previous generation */
	assert(strcmp(lastimagename, ctx.name) == 0);

	ret = ops->clone(&ctx, "snap", imagename, &order, stripe_unit,
			 stripe_count);
	if (ret < 0) {
		prterrcode("do_clone: ops->clone", ret);
		exit(165);
	}

	/* ctx still refers to the image that was just cloned from */
	if (randomize_parent_overlap && rbd_image_has_parent(&ctx)) {
		int rand = get_random() % 16 + 1;	// [1..16]

		if (rand < 13) {
			uint64_t overlap;

			ret = rbd_get_overlap(ctx.image, &overlap);
			if (ret < 0) {
				prterrcode("do_clone: rbd_get_overlap", ret);
				exit(1);
			}

			if (rand < 10) {	// 9/16
				newsize = overlap * ((double)rand / 10);
				newsize -= newsize % truncbdy;
			} else {		// 3/16
				newsize = 0;
			}

			assert(newsize != (uint64_t)file_size);
			prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
			    ctx.name, file_size, overlap, newsize);

			ret = ops->resize(&ctx, newsize);
			if (ret < 0) {
				prterrcode("do_clone: ops->resize", ret);
				exit(1);
			}
		} else if (rand < 15) {		// 2/16
			prt("flattening image %s\n", ctx.name);

			ret = ops->flatten(&ctx);
			if (ret < 0) {
				prterrcode("do_clone: ops->flatten", ret);
				exit(1);
			}
		} else {			// 2/16
			prt("leaving image %s intact\n", ctx.name);
		}
	}

	/* Save the expected content of the image that is about to be closed. */
	clone_filename(filename, sizeof(filename), num_clones);
	if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
		simple_err("do_clone: open", -errno);
		exit(162);
	}
	save_buffer(good_buf, newsize, fd);
	if ((ret = close(fd)) < 0) {
		simple_err("do_clone: close", -errno);
		exit(163);
	}

	/*
	 * Close parent.
	 */
	if ((ret = ops->close(&ctx)) < 0) {
		prterrcode("do_clone: ops->close", ret);
		exit(174);
	}

	if (journal_replay) {
		ret = finalize_journal(ioctx, lastimagename, num_clones - 1,
				       order, stripe_unit, stripe_count);
		if (ret < 0) {
			exit(EXIT_FAILURE);
		}

		ret = register_journal(ioctx, imagename);
		if (ret < 0) {
			exit(EXIT_FAILURE);
		}
	}

	/*
	 * Open freshly made clone.
	 */
	if ((ret = ops->open(imagename, &ctx)) < 0) {
		prterrcode("do_clone: ops->open", ret);
		exit(166);
	}

	/* Verify the image two generations back against its saved file. */
	if (num_clones > 1) {
		if (journal_replay) {
			check_clone(num_clones - 2, true);
		}
		check_clone(num_clones - 2, false);
	}
}
2062
2063 void
2064 check_clone(int clonenum, bool replay_image)
2065 {
2066 char filename[128];
2067 char imagename[128];
2068 int ret, fd;
2069 struct rbd_ctx cur_ctx = RBD_CTX_INIT;
2070 struct stat file_info;
2071 char *good_buf, *temp_buf;
2072
2073 if (replay_image) {
2074 replay_imagename(imagename, sizeof(imagename), clonenum);
2075 } else {
2076 clone_imagename(imagename, sizeof(imagename), clonenum);
2077 }
2078
2079 if ((ret = ops->open(imagename, &cur_ctx)) < 0) {
2080 prterrcode("check_clone: ops->open", ret);
2081 exit(167);
2082 }
2083
2084 clone_filename(filename, sizeof(filename), clonenum + 1);
2085 if ((fd = open(filename, O_RDONLY)) < 0) {
2086 simple_err("check_clone: open", -errno);
2087 exit(168);
2088 }
2089
2090 prt("checking clone #%d, image %s against file %s\n",
2091 clonenum, imagename, filename);
2092 if ((ret = fstat(fd, &file_info)) < 0) {
2093 simple_err("check_clone: fstat", -errno);
2094 exit(169);
2095 }
2096
2097 good_buf = NULL;
2098 ret = posix_memalign((void **)&good_buf,
2099 MAX(writebdy, (int)sizeof(void *)),
2100 file_info.st_size);
2101 if (ret > 0) {
2102 prterrcode("check_clone: posix_memalign(good_buf)", -ret);
2103 exit(96);
2104 }
2105
2106 temp_buf = NULL;
2107 ret = posix_memalign((void **)&temp_buf,
2108 MAX(readbdy, (int)sizeof(void *)),
2109 file_info.st_size);
2110 if (ret > 0) {
2111 prterrcode("check_clone: posix_memalign(temp_buf)", -ret);
2112 exit(97);
2113 }
2114
2115 if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) {
2116 simple_err("check_clone: pread", -errno);
2117 exit(170);
2118 }
2119 if ((ret = ops->read(&cur_ctx, 0, file_info.st_size, temp_buf)) < 0) {
2120 prterrcode("check_clone: ops->read", ret);
2121 exit(171);
2122 }
2123 close(fd);
2124 if ((ret = ops->close(&cur_ctx)) < 0) {
2125 prterrcode("check_clone: ops->close", ret);
2126 exit(174);
2127 }
2128 check_buffers(good_buf, temp_buf, 0, file_info.st_size);
2129
2130 if (!replay_image) {
2131 unlink(filename);
2132 }
2133
2134 free(good_buf);
2135 free(temp_buf);
2136 }
2137
2138 void
2139 writefileimage()
2140 {
2141 ssize_t ret;
2142
2143 ret = ops->write(&ctx, 0, file_size, good_buf);
2144 if (ret != file_size) {
2145 if (ret < 0)
2146 prterrcode("writefileimage: ops->write", ret);
2147 else
2148 prt("short write: 0x%x bytes instead of 0x%llx\n",
2149 ret, (unsigned long long)file_size);
2150 report_failure(172);
2151 }
2152
2153 if (!lite) {
2154 ret = ops->resize(&ctx, file_size);
2155 if (ret < 0) {
2156 prterrcode("writefileimage: ops->resize", ret);
2157 report_failure(173);
2158 }
2159 }
2160 }
2161
2162 void
2163 do_flatten()
2164 {
2165 int ret;
2166
2167 if (!rbd_image_has_parent(&ctx)) {
2168 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2169 return;
2170 }
2171 log4(OP_FLATTEN, 0, 0, 0);
2172 prt("%lu flatten\n", testcalls);
2173
2174 ret = ops->flatten(&ctx);
2175 if (ret < 0) {
2176 prterrcode("writefileimage: ops->flatten", ret);
2177 exit(177);
2178 }
2179 }
2180
2181 void
2182 docloseopen(void)
2183 {
2184 char *name;
2185 int ret;
2186
2187 if (testcalls <= simulatedopcount)
2188 return;
2189
2190 name = strdup(ctx.name);
2191
2192 if (debug)
2193 prt("%lu close/open\n", testcalls);
2194
2195 ret = ops->close(&ctx);
2196 if (ret < 0) {
2197 prterrcode("docloseopen: ops->close", ret);
2198 report_failure(180);
2199 }
2200
2201 ret = ops->open(name, &ctx);
2202 if (ret < 0) {
2203 prterrcode("docloseopen: ops->open", ret);
2204 report_failure(181);
2205 }
2206
2207 free(name);
2208 }
2209
/*
 * Clamp an (off, len) pair to a region of size bytes: off is reduced
 * modulo size (or set to 0 when size is 0), then len is shortened so that
 * off + len does not exceed size.  off and len are modified in place, the
 * arguments may be expanded more than once, and the bounds comparison is
 * done on values converted to unsigned.
 */
#define TRIM_OFF_LEN(off, len, size)	\
do {					\
	if (size)			\
		(off) %= (size);	\
	else				\
		(off) = 0;		\
	if ((unsigned)(off) + (unsigned)(len) > (unsigned)(size))	\
		(len) = (size) - (off);	\
} while (0)
2219
/*
 * Run one test iteration: draw a random operation with random offset and
 * size, filter out operations that are disabled, execute it, and finish
 * with the optional size check and close/open.
 *
 * The number and order of get_random() calls determine the operation
 * sequence for a given seed, so they must not be reordered.
 */
void
test(void)
{
	unsigned long offset;
	unsigned long size = maxoplen;
	unsigned long rv = get_random();
	unsigned long op;

	/* Once the simulated operations are done, write their result out. */
	if (simulatedopcount > 0 && testcalls == simulatedopcount)
		writefileimage();

	testcalls++;

	if (closeprob)
		closeopen = (rv >> 3) < (1u << 28) / (unsigned)closeprob;

	if (debugstart > 0 && testcalls >= debugstart)
		debug = 1;

	if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
		prt("%lu...\n", testcalls);

	offset = get_random();
	if (randomoplen)
		size = get_random() % (maxoplen + 1);

	/* calculate appropriate op to run */
	if (lite)
		op = rv % OP_MAX_LITE;
	else
		op = rv % OP_MAX_FULL;

	/* First pass: substitute or skip operations that are disabled. */
	switch (op) {
	case OP_MAPREAD:
		if (!mapped_reads)
			op = OP_READ;
		break;
	case OP_MAPWRITE:
		if (!mapped_writes)
			op = OP_WRITE;
		break;
	case OP_FALLOCATE:
		if (!fallocate_calls) {
			log4(OP_SKIPPED, OP_FALLOCATE, offset, size);
			goto out;
		}
		break;
	case OP_PUNCH_HOLE:
		if (!punch_hole_calls) {
			log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, size);
			goto out;
		}
		break;
	case OP_CLONE:
		/* clone, 8% chance */
		if (!clone_calls || file_size == 0 || get_random() % 100 >= 8) {
			log4(OP_SKIPPED, OP_CLONE, 0, 0);
			goto out;
		}
		break;
	case OP_FLATTEN:
		/* flatten four times as rarely as clone, 2% chance */
		if (get_random() % 100 >= 2) {
			log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
			goto out;
		}
		break;
	case OP_WRITESAME:
		/* writesame not implemented */
		if (!ops->writesame) {
			log4(OP_SKIPPED, OP_WRITESAME, offset, size);
			goto out;
		}
	}

	/*
	 * Second pass: execute the operation.
	 * NOTE(review): there is no OP_FALLOCATE case below, so with
	 * fallocate_calls enabled that operation would reach the default
	 * branch - confirm fallocate_calls is always 0 here.
	 */
	switch (op) {
	case OP_READ:
		TRIM_OFF_LEN(offset, size, file_size);
		doread(offset, size);
		break;

	case OP_WRITE:
		TRIM_OFF_LEN(offset, size, maxfilelen);
		dowrite(offset, size);
		break;

	case OP_MAPREAD:
		/* only reached with mapped_reads set; exits immediately */
		TRIM_OFF_LEN(offset, size, file_size);
		exit(183);
		break;

	case OP_MAPWRITE:
		/* only reached with mapped_writes set; exits immediately */
		TRIM_OFF_LEN(offset, size, maxfilelen);
		exit(182);
		break;

	case OP_TRUNCATE:
		if (!style)
			size = get_random() % maxfilelen;
		dotruncate(size);
		break;

	case OP_PUNCH_HOLE:
		TRIM_OFF_LEN(offset, size, file_size);
		do_punch_hole(offset, size);
		break;

	case OP_WRITESAME:
		TRIM_OFF_LEN(offset, size, maxfilelen);
		dowritesame(offset, size);
		break;

	case OP_CLONE:
		do_clone();
		break;

	case OP_FLATTEN:
		do_flatten();
		break;

	default:
		prterr("test: unknown operation");
		report_failure(42);
		break;
	}

out:
	/* Skipped operations also get the size check and close/open. */
	if (sizechecks && testcalls > simulatedopcount)
		check_size();
	if (closeopen)
		docloseopen();
}
2352
2353
2354 void
2355 cleanup(int sig)
2356 {
2357 if (sig)
2358 prt("signal %d\n", sig);
2359 prt("testcalls = %lu\n", testcalls);
2360 exit(sig);
2361 }
2362
2363
/*
 * Print the command line help to stdout and exit with status 89.
 *
 * The option summary lists -M, which the option descriptions below it
 * document.  Each help line is a separate string literal with explicit
 * "\t" and "\n" escapes, so the output does not depend on how this source
 * file is indented.
 */
void
usage(void)
{
	fprintf(stdout, "usage: %s",
		"fsx [-dfjknqxyACFHKLMORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] pname iname\n"
		"\t-b opnum: beginning operation number (default 1)\n"
		"\t-c P: 1 in P chance of file close+open at each op (default infinity)\n"
		"\t-d: debug output for all operations\n"
		"\t-f: flush and invalidate cache after I/O\n"
		"\t-h holebdy: 4096 would make discards page aligned (default 1)\n"
		"\t-j: journal replay stress test\n"
		"\t-k: keep data on success (default 0)\n"
		"\t-l flen: the upper bound on file size (default 262144)\n"
		"\t-m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n"
		"\t-n: no verifications of file size\n"
		"\t-o oplen: the upper bound on operation size (default 65536)\n"
		"\t-p progressinterval: debug output at specified operation interval\n"
		"\t-q: quieter operation\n"
		"\t-r readbdy: 4096 would make reads page aligned (default 1)\n"
		"\t-s style: 1 gives smaller truncates (default 0)\n"
		"\t-t truncbdy: 4096 would make truncates page aligned (default 1)\n"
		"\t-w writebdy: 4096 would make writes page aligned (default 1)\n"
		"\t-x: preallocate file space before starting, XFS only (default 0)\n"
		"\t-y: synchronize changes to a file\n"
		"\t-C: do not use clone calls\n"
		"\t-D startingop: debug output starting at specified operation\n"
#ifdef FALLOCATE
		"\t-F: Do not use fallocate (preallocation) calls\n"
#endif
		"\t-H: do not use punch hole calls\n"
		"\t-K: enable krbd mode (use -t and -h too)\n"
		"\t-M: enable rbd-nbd mode (use -t and -h too)\n"
		"\t-L: fsxLite - no file creations & no file size changes\n"
		"\t-N numops: total # operations to do (default infinity)\n"
		"\t-O: use oplen (see -o flag) for every op (default random)\n"
		"\t-P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n"
		"\t-R: read() system calls only (mapped reads disabled)\n"
		"\t-S seed: for random # generator (default 1) 0 gets timestamp\n"
		"\t-U: disable randomized striping\n"
		"\t-W: mapped write operations DISabled\n"
		"\t-Z: O_DIRECT (use -R, -W, -r and -w too)\n"
		"\tpoolname: this is REQUIRED (no default)\n"
		"\timagename: this is REQUIRED (no default)\n");
	exit(89);
}
2410
2411
/*
 * Parse a number with an optional one-letter size suffix.
 *
 * s: text to parse; the base is auto-detected by strtol() (base 0).
 * e: receives a pointer to the first character that was not consumed,
 *    i.e. the character after the number and, if present, its suffix.
 *
 * Suffixes (either case): b = x512, k = x1024, m = x1024*1024, w = x4.
 *
 * Returns the resulting value.  A value that does not fit in an int,
 * before or after the suffix is applied, is clamped to INT_MAX or INT_MIN
 * instead of being truncated or overflowing.
 */
int
getnum(char *s, char **e)
{
	long long val;
	long long mult = 1;

	*e = NULL;
	val = strtol(s, e, 0);

	/* clamp first so that the multiplication below cannot overflow */
	if (val > INT_MAX)
		val = INT_MAX;
	else if (val < INT_MIN)
		val = INT_MIN;

	if (*e) {
		switch (**e) {
		case 'b':
		case 'B':
			mult = 512;
			break;
		case 'k':
		case 'K':
			mult = 1024;
			break;
		case 'm':
		case 'M':
			mult = 1024 * 1024;
			break;
		case 'w':
		case 'W':
			mult = 4;
			break;
		default:
			break;
		}
		/* step over the suffix only when one was recognised */
		if (mult != 1)
			*e = *e + 1;
	}

	val *= mult;
	if (val > INT_MAX)
		val = INT_MAX;
	else if (val < INT_MIN)
		val = INT_MIN;

	return (int)val;
}
2444
2445 void
2446 test_fallocate()
2447 {
2448 #ifdef FALLOCATE
2449 if (!lite && fallocate_calls) {
2450 if (fallocate(fd, 0, 0, 1) && errno == EOPNOTSUPP) {
2451 if(!quiet)
2452 warn("main: filesystem does not support fallocate, disabling\n");
2453 fallocate_calls = 0;
2454 } else {
2455 ftruncate(fd, 0);
2456 }
2457 }
2458 #else /* ! FALLOCATE */
2459 fallocate_calls = 0;
2460 #endif
2461
2462 }
2463
2464 void remove_image(rados_ioctx_t ioctx, char *imagename, bool remove_snap,
2465 bool unregister) {
2466 rbd_image_t image;
2467 char errmsg[128];
2468 int ret;
2469
2470 if ((ret = rbd_open(ioctx, imagename, &image, NULL)) < 0) {
2471 sprintf(errmsg, "rbd_open %s", imagename);
2472 prterrcode(errmsg, ret);
2473 report_failure(101);
2474 }
2475 if (remove_snap) {
2476 if ((ret = rbd_snap_unprotect(image, "snap")) < 0) {
2477 sprintf(errmsg, "rbd_snap_unprotect %s@snap",
2478 imagename);
2479 prterrcode(errmsg, ret);
2480 report_failure(102);
2481 }
2482 if ((ret = rbd_snap_remove(image, "snap")) < 0) {
2483 sprintf(errmsg, "rbd_snap_remove %s@snap",
2484 imagename);
2485 prterrcode(errmsg, ret);
2486 report_failure(103);
2487 }
2488 }
2489 if ((ret = rbd_close(image)) < 0) {
2490 sprintf(errmsg, "rbd_close %s", imagename);
2491 prterrcode(errmsg, ret);
2492 report_failure(104);
2493 }
2494
2495 if (unregister &&
2496 (ret = unregister_journal(ioctx, imagename)) < 0) {
2497 report_failure(105);
2498 }
2499
2500 if ((ret = rbd_remove(ioctx, imagename)) < 0) {
2501 sprintf(errmsg, "rbd_remove %s", imagename);
2502 prterrcode(errmsg, ret);
2503 report_failure(106);
2504 }
2505 }
2506
2507 int
2508 main(int argc, char **argv)
2509 {
2510 int i, style, ch, ret;
2511 char *endp;
2512 char goodfile[1024];
2513 char logfile[1024];
2514
2515 goodfile[0] = 0;
2516 logfile[0] = 0;
2517
2518 page_size = getpagesize();
2519 page_mask = page_size - 1;
2520 mmap_mask = page_mask;
2521
2522 setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
2523
2524 while ((ch = getopt(argc, argv, "b:c:dfh:jkl:m:no:p:qr:s:t:w:xyCD:FHKMLN:OP:RS:UWZ"))
2525 != EOF)
2526 switch (ch) {
2527 case 'b':
2528 simulatedopcount = getnum(optarg, &endp);
2529 if (!quiet)
2530 fprintf(stdout, "Will begin at operation %lu\n",
2531 simulatedopcount);
2532 if (simulatedopcount == 0)
2533 usage();
2534 simulatedopcount -= 1;
2535 break;
2536 case 'c':
2537 closeprob = getnum(optarg, &endp);
2538 if (!quiet)
2539 fprintf(stdout,
2540 "Chance of close/open is 1 in %d\n",
2541 closeprob);
2542 if (closeprob <= 0)
2543 usage();
2544 break;
2545 case 'd':
2546 debug = 1;
2547 break;
2548 case 'f':
2549 flush_enabled = 1;
2550 break;
2551 case 'h':
2552 holebdy = getnum(optarg, &endp);
2553 if (holebdy <= 0)
2554 usage();
2555 break;
2556 case 'j':
2557 journal_replay = true;
2558 break;
2559 case 'k':
2560 keep_on_success = 1;
2561 break;
2562 case 'l':
2563 {
2564 int _num = getnum(optarg, &endp);
2565 if (_num <= 0)
2566 usage();
2567 maxfilelen = _num;
2568 }
2569 break;
2570 case 'm':
2571 monitorstart = getnum(optarg, &endp);
2572 if (monitorstart < 0)
2573 usage();
2574 if (!endp || *endp++ != ':')
2575 usage();
2576 monitorend = getnum(endp, &endp);
2577 if (monitorend < 0)
2578 usage();
2579 if (monitorend == 0)
2580 monitorend = -1; /* aka infinity */
2581 debug = 1;
2582 break;
2583 case 'n':
2584 sizechecks = 0;
2585 break;
2586 case 'o':
2587 maxoplen = getnum(optarg, &endp);
2588 if (maxoplen <= 0)
2589 usage();
2590 break;
2591 case 'p':
2592 progressinterval = getnum(optarg, &endp);
2593 if (progressinterval == 0)
2594 usage();
2595 break;
2596 case 'q':
2597 quiet = 1;
2598 break;
2599 case 'r':
2600 readbdy = getnum(optarg, &endp);
2601 if (readbdy <= 0)
2602 usage();
2603 break;
2604 case 's':
2605 style = getnum(optarg, &endp);
2606 if (style < 0 || style > 1)
2607 usage();
2608 break;
2609 case 't':
2610 truncbdy = getnum(optarg, &endp);
2611 if (truncbdy <= 0)
2612 usage();
2613 break;
2614 case 'w':
2615 writebdy = getnum(optarg, &endp);
2616 if (writebdy <= 0)
2617 usage();
2618 break;
2619 case 'x':
2620 prealloc = 1;
2621 break;
2622 case 'y':
2623 do_fsync = 1;
2624 break;
2625 case 'C':
2626 clone_calls = 0;
2627 break;
2628 case 'D':
2629 debugstart = getnum(optarg, &endp);
2630 if (debugstart < 1)
2631 usage();
2632 break;
2633 case 'F':
2634 fallocate_calls = 0;
2635 break;
2636 case 'H':
2637 punch_hole_calls = 0;
2638 break;
2639 case 'K':
2640 prt("krbd mode enabled\n");
2641 ops = &krbd_operations;
2642 break;
2643 case 'M':
2644 prt("rbd-nbd mode enabled\n");
2645 ops = &nbd_operations;
2646 break;
2647 case 'L':
2648 prt("lite mode not supported for rbd\n");
2649 exit(1);
2650 break;
2651 case 'N':
2652 numops = getnum(optarg, &endp);
2653 if (numops < 0)
2654 usage();
2655 break;
2656 case 'O':
2657 randomoplen = 0;
2658 break;
2659 case 'P':
2660 strncpy(dirpath, optarg, sizeof(dirpath)-1);
2661 dirpath[sizeof(dirpath)-1] = '\0';
2662 strncpy(goodfile, dirpath, sizeof(goodfile)-1);
2663 goodfile[sizeof(goodfile)-1] = '\0';
2664 if (strlen(goodfile) < sizeof(goodfile)-2) {
2665 strcat(goodfile, "/");
2666 } else {
2667 prt("file name to long\n");
2668 exit(1);
2669 }
2670 strncpy(logfile, dirpath, sizeof(logfile)-1);
2671 logfile[sizeof(logfile)-1] = '\0';
2672 if (strlen(logfile) < sizeof(logfile)-2) {
2673 strcat(logfile, "/");
2674 } else {
2675 prt("file path to long\n");
2676 exit(1);
2677 }
2678 break;
2679 case 'R':
2680 mapped_reads = 0;
2681 if (!quiet)
2682 fprintf(stdout, "mapped reads DISABLED\n");
2683 break;
2684 case 'S':
2685 seed = getnum(optarg, &endp);
2686 if (seed == 0)
2687 seed = time(0) % 10000;
2688 if (!quiet)
2689 fprintf(stdout, "Seed set to %d\n", seed);
2690 if (seed < 0)
2691 usage();
2692 break;
2693 case 'U':
2694 randomize_striping = 0;
2695 break;
2696 case 'W':
2697 mapped_writes = 0;
2698 if (!quiet)
2699 fprintf(stdout, "mapped writes DISABLED\n");
2700 break;
2701 case 'Z':
2702 o_direct = O_DIRECT;
2703 break;
2704 default:
2705 usage();
2706 /* NOTREACHED */
2707 }
2708 argc -= optind;
2709 argv += optind;
2710 if (argc != 2)
2711 usage();
2712 pool = argv[0];
2713 iname = argv[1];
2714
2715 signal(SIGHUP, cleanup);
2716 signal(SIGINT, cleanup);
2717 signal(SIGPIPE, cleanup);
2718 signal(SIGALRM, cleanup);
2719 signal(SIGTERM, cleanup);
2720 signal(SIGXCPU, cleanup);
2721 signal(SIGXFSZ, cleanup);
2722 signal(SIGVTALRM, cleanup);
2723 signal(SIGUSR1, cleanup);
2724 signal(SIGUSR2, cleanup);
2725
2726 random_generator.seed(seed);
2727
2728 ret = create_image();
2729 if (ret < 0) {
2730 prterrcode(iname, ret);
2731 exit(90);
2732 }
2733 ret = ops->open(iname, &ctx);
2734 if (ret < 0) {
2735 simple_err("Error opening image", ret);
2736 exit(91);
2737 }
2738 if (!dirpath[0])
2739 strcat(dirpath, ".");
2740 strncat(goodfile, iname, 256);
2741 strcat (goodfile, ".fsxgood");
2742 fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
2743 if (fsxgoodfd < 0) {
2744 prterr(goodfile);
2745 exit(92);
2746 }
2747 strncat(logfile, iname, 256);
2748 strcat (logfile, ".fsxlog");
2749 fsxlogf = fopen(logfile, "w");
2750 if (fsxlogf == NULL) {
2751 prterr(logfile);
2752 exit(93);
2753 }
2754
2755 original_buf = (char *) malloc(maxfilelen);
2756 for (i = 0; i < (int)maxfilelen; i++)
2757 original_buf[i] = get_random() % 256;
2758
2759 ret = posix_memalign((void **)&good_buf,
2760 MAX(writebdy, (int)sizeof(void *)), maxfilelen);
2761 if (ret > 0) {
2762 if (ret == EINVAL)
2763 prt("writebdy is not a suitable power of two\n");
2764 else
2765 prterrcode("main: posix_memalign(good_buf)", -ret);
2766 exit(94);
2767 }
2768 memset(good_buf, '\0', maxfilelen);
2769
2770 ret = posix_memalign((void **)&temp_buf,
2771 MAX(readbdy, (int)sizeof(void *)), maxfilelen);
2772 if (ret > 0) {
2773 if (ret == EINVAL)
2774 prt("readbdy is not a suitable power of two\n");
2775 else
2776 prterrcode("main: posix_memalign(temp_buf)", -ret);
2777 exit(95);
2778 }
2779 memset(temp_buf, '\0', maxfilelen);
2780
2781 if (lite) { /* zero entire existing file */
2782 ssize_t written;
2783
2784 written = ops->write(&ctx, 0, (size_t)maxfilelen, good_buf);
2785 if (written != (ssize_t)maxfilelen) {
2786 if (written < 0) {
2787 prterrcode(iname, written);
2788 warn("main: error on write");
2789 } else
2790 warn("main: short write, 0x%x bytes instead "
2791 "of 0x%lx\n",
2792 (unsigned)written,
2793 maxfilelen);
2794 exit(98);
2795 }
2796 } else
2797 check_trunc_hack();
2798
2799 //test_fallocate();
2800
2801 while (numops == -1 || numops--)
2802 test();
2803
2804 ret = ops->close(&ctx);
2805 if (ret < 0) {
2806 prterrcode("ops->close", ret);
2807 report_failure(99);
2808 }
2809
2810 if (journal_replay) {
2811 char imagename[1024];
2812 clone_imagename(imagename, sizeof(imagename), num_clones);
2813 ret = finalize_journal(ioctx, imagename, num_clones, 0, 0, 0);
2814 if (ret < 0) {
2815 report_failure(100);
2816 }
2817 }
2818
2819 if (num_clones > 0) {
2820 if (journal_replay) {
2821 check_clone(num_clones - 1, true);
2822 }
2823 check_clone(num_clones - 1, false);
2824 }
2825
2826 if (!keep_on_success) {
2827 while (num_clones >= 0) {
2828 static bool remove_snap = false;
2829
2830 if (journal_replay) {
2831 char replayimagename[1024];
2832 replay_imagename(replayimagename,
2833 sizeof(replayimagename),
2834 num_clones);
2835 remove_image(ioctx, replayimagename,
2836 remove_snap,
2837 false);
2838 }
2839
2840 char clonename[128];
2841 clone_imagename(clonename, 128, num_clones);
2842 remove_image(ioctx, clonename, remove_snap,
2843 journal_replay);
2844
2845 remove_snap = true;
2846 num_clones--;
2847 }
2848 }
2849
2850 prt("All operations completed A-OK!\n");
2851 fclose(fsxlogf);
2852
2853 rados_ioctx_destroy(ioctx);
2854 krbd_destroy(krbd);
2855 rados_shutdown(cluster);
2856
2857 free(original_buf);
2858 free(good_buf);
2859 free(temp_buf);
2860
2861 exit(0);
2862 return 0;
2863 }