]> git.proxmox.com Git - ceph.git/blame - ceph/src/test/librbd/fsx.cc
update ceph source to reef 18.2.1
[ceph.git] / ceph / src / test / librbd / fsx.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:8; indent-tabs-mode:t -*-
2// vim: ts=8 sw=8 smarttab
3/*
4 * Copyright (C) 1991, NeXT Computer, Inc. All Rights Reserverd.
5 *
6 * File: fsx.cc
7 * Author: Avadis Tevanian, Jr.
8 *
9 * File system exerciser.
10 *
11 * Rewritten 8/98 by Conrad Minshall.
12 *
13 * Small changes to work under Linux -- davej.
14 *
15 * Checks for mmap last-page zero fill.
16 */
17
18#include <sys/types.h>
19#include <unistd.h>
11fdf7f2 20#include <getopt.h>
7c673cae 21#include <limits.h>
7c673cae 22#include <strings.h>
11fdf7f2
TL
23#if defined(__FreeBSD__)
24#include <sys/disk.h>
25#endif
7c673cae
FG
26#include <sys/file.h>
27#include <sys/stat.h>
f67539c2 28#ifndef _WIN32
7c673cae 29#include <sys/mman.h>
f67539c2
TL
30#include <sys/ioctl.h>
31#endif
11fdf7f2 32#if defined(__linux__)
7c673cae 33#include <linux/fs.h>
11fdf7f2 34#endif
7c673cae
FG
35#ifdef HAVE_ERR_H
36#include <err.h>
37#endif
38#include <signal.h>
7c673cae
FG
39#include <stddef.h>
40#include <stdio.h>
41#include <stdlib.h>
42#include <string.h>
43#include <stdarg.h>
44#include <assert.h>
45#include <errno.h>
46#include <math.h>
47#include <fcntl.h>
48#include <random>
49
11fdf7f2 50#include "include/compat.h"
7c673cae 51#include "include/intarith.h"
11fdf7f2 52#if defined(WITH_KRBD)
7c673cae 53#include "include/krbd.h"
11fdf7f2 54#endif
7c673cae
FG
55#include "include/rados/librados.h"
56#include "include/rados/librados.hpp"
57#include "include/rbd/librbd.h"
58#include "include/rbd/librbd.hpp"
59#include "common/Cond.h"
60#include "common/SubProcess.h"
61#include "common/safe_io.h"
62#include "journal/Journaler.h"
63#include "journal/ReplayEntry.h"
64#include "journal/ReplayHandler.h"
65#include "journal/Settings.h"
66
67#include <boost/scope_exit.hpp>
68
69#define NUMPRINTCOLUMNS 32 /* # columns of data to print on each line */
70
71/*
72 * A log entry is an operation and a bunch of arguments.
73 */
74
/* One slot of the operation log: an operation plus three integer arguments. */
struct log_entry {
	int	operation;	/* which operation was logged */
	int	args[3];	/* arguments recorded with the operation */
};
79
80#define LOGSIZE 1000
81
82struct log_entry oplog[LOGSIZE]; /* the log */
83int logptr = 0; /* current position in log */
84int logcount = 0; /* total ops */
85
86/*
87 * The operation matrix is complex due to conditional execution of different
88 * features. Hence when we come to deciding what operation to run, we need to
89 * be careful in how we select the different operations. The active operations
90 * are mapped to numbers as follows:
91 *
92 * lite !lite
93 * READ: 0 0
94 * WRITE: 1 1
95 * MAPREAD: 2 2
96 * MAPWRITE: 3 3
97 * TRUNCATE: - 4
98 * FALLOCATE: - 5
99 * PUNCH HOLE: - 6
100 * WRITESAME: - 7
c07f9fc5 101 * COMPAREANDWRITE: - 8
7c673cae
FG
102 *
103 * When mapped read/writes are disabled, they are simply converted to normal
104 * reads and writes. When fallocate/fpunch calls are disabled, they are
105 * converted to OP_SKIPPED. Hence OP_SKIPPED needs to have a number higher than
106 * the operation selection matrix, as does the OP_CLOSEOPEN which is an
107 * operation modifier rather than an operation in itself.
108 *
109 * Because of the "lite" version, we also need to have different "maximum
110 * operation" defines to allow the ops to be selected correctly based on the
111 * mode being run.
112 */
113
114/* common operations */
115#define OP_READ 0
116#define OP_WRITE 1
117#define OP_MAPREAD 2
118#define OP_MAPWRITE 3
119#define OP_MAX_LITE 4
120
121/* !lite operations */
122#define OP_TRUNCATE 4
123#define OP_FALLOCATE 5
124#define OP_PUNCH_HOLE 6
125#define OP_WRITESAME 7
c07f9fc5 126#define OP_COMPARE_AND_WRITE 8
7c673cae 127/* rbd-specific operations */
c07f9fc5
FG
128#define OP_CLONE 9
129#define OP_FLATTEN 10
130#define OP_MAX_FULL 11
7c673cae
FG
131
132/* operation modifiers */
133#define OP_CLOSEOPEN 100
134#define OP_SKIPPED 101
135
136#undef PAGE_SIZE
f67539c2 137#define PAGE_SIZE get_page_size()
7c673cae
FG
138#undef PAGE_MASK
139#define PAGE_MASK (PAGE_SIZE - 1)
140
141
142char *original_buf; /* a pointer to the original data */
143char *good_buf; /* a pointer to the correct data */
144char *temp_buf; /* a pointer to the current data */
145
146char dirpath[1024];
147
148off_t file_size = 0;
149off_t biggest = 0;
150unsigned long testcalls = 0; /* calls to function "test" */
151
11fdf7f2
TL
152const char* cluster_name = "ceph"; /* --cluster optional */
153const char* client_id = "admin"; /* --id optional */
154
7c673cae
FG
155unsigned long simulatedopcount = 0; /* -b flag */
156int closeprob = 0; /* -c flag */
157int debug = 0; /* -d flag */
158unsigned long debugstart = 0; /* -D flag */
159int flush_enabled = 0; /* -f flag */
11fdf7f2 160int deep_copy = 0; /* -g flag */
7c673cae
FG
161int holebdy = 1; /* -h flag */
162bool journal_replay = false; /* -j flag */
163int keep_on_success = 0; /* -k flag */
164int do_fsync = 0; /* -y flag */
165unsigned long maxfilelen = 256 * 1024; /* -l flag */
166int sizechecks = 1; /* -n flag disables them */
167int maxoplen = 64 * 1024; /* -o flag */
168int quiet = 0; /* -q flag */
169unsigned long progressinterval = 0; /* -p flag */
170int readbdy = 1; /* -r flag */
171int style = 0; /* -s flag */
172int prealloc = 0; /* -x flag */
173int truncbdy = 1; /* -t flag */
174int writebdy = 1; /* -w flag */
175long monitorstart = -1; /* -m flag */
176long monitorend = -1; /* -m flag */
177int lite = 0; /* -L flag */
178long numops = -1; /* -N flag */
179int randomoplen = 1; /* -O flag disables it */
180int seed = 1; /* -S flag */
181int mapped_writes = 0; /* -W flag disables */
182int fallocate_calls = 0; /* -F flag disables */
183int punch_hole_calls = 1; /* -H flag disables */
184int clone_calls = 1; /* -C flag disables */
185int randomize_striping = 1; /* -U flag disables */
186int randomize_parent_overlap = 1;
187int mapped_reads = 0; /* -R flag disables it */
188int fsxgoodfd = 0;
189int o_direct = 0; /* -Z flag */
190
191int num_clones = 0;
192
193int page_size;
194int page_mask;
195int mmap_mask;
196
197FILE * fsxlogf = NULL;
198int badoff = -1;
199int closeopen = 0;
200
/*
 * Write a warning to stderr in the form
 *   "fsx: <formatted message>: <strerror(code)>\n"
 * The formatted message (and its ": " separator) is omitted when fmt is NULL.
 */
void
vwarnc(int code, const char *fmt, va_list ap) {
	fputs("fsx: ", stderr);
	if (fmt) {
		vfprintf(stderr, fmt, ap);
		fputs(": ", stderr);
	}
	fprintf(stderr, "%s\n", strerror(code));
}
210
211void
212warn(const char * fmt, ...) {
213 va_list ap;
214 va_start(ap, fmt);
215 vwarnc(errno, fmt, ap);
216 va_end(ap);
217}
218
219#define BUF_SIZE 1024
220
221void
222prt(const char *fmt, ...)
223{
224 va_list args;
225 char buffer[BUF_SIZE];
226
227 va_start(args, fmt);
228 vsnprintf(buffer, BUF_SIZE, fmt, args);
229 va_end(args);
230 fprintf(stdout, "%s", buffer);
231 if (fsxlogf)
232 fprintf(fsxlogf, "%s", buffer);
233}
234
235void
236prterr(const char *prefix)
237{
238 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(errno));
239}
240
241void
242prterrcode(const char *prefix, int code)
243{
244 prt("%s%s%s\n", prefix, prefix ? ": " : "", strerror(-code));
245}
246
/*
 * Report "<msg>: <error text>" on stderr.  err is negated before the
 * strerror() lookup, so callers pass a negative errno value.
 */
void
simple_err(const char *msg, int err)
{
	const char *reason = strerror(-err);

	fprintf(stderr, "%s: %s\n", msg, reason);
}
252
253/*
254 * random
255 */
256std::mt19937 random_generator;
257
258uint_fast32_t
259get_random(void)
260{
261 return random_generator();
262}
263
11fdf7f2 264int get_features(uint64_t* features);
7c673cae
FG
265void replay_imagename(char *buf, size_t len, int clones);
266
267namespace {
268
269static const std::string JOURNAL_CLIENT_ID("fsx");
270
271struct ReplayHandler : public journal::ReplayHandler {
272 journal::Journaler *journaler;
273 journal::Journaler *replay_journaler;
274 Context *on_finish;
275
276 ReplayHandler(journal::Journaler *journaler,
277 journal::Journaler *replay_journaler, Context *on_finish)
278 : journaler(journaler), replay_journaler(replay_journaler),
279 on_finish(on_finish) {
280 }
281
7c673cae
FG
282 void handle_entries_available() override {
283 while (true) {
284 journal::ReplayEntry replay_entry;
285 if (!journaler->try_pop_front(&replay_entry)) {
286 return;
287 }
288
289 replay_journaler->append(0, replay_entry.get_data());
290 }
291 }
292
293 void handle_complete(int r) override {
294 on_finish->complete(r);
295 }
296};
297
298int get_image_id(librados::IoCtx &io_ctx, const char *image_name,
299 std::string *image_id) {
300 librbd::RBD rbd;
301 librbd::Image image;
302 int r = rbd.open(io_ctx, image, image_name);
303 if (r < 0) {
304 simple_err("failed to open image", r);
305 return r;
306 }
307
308 rbd_image_info_t info;
309 r = image.stat(info, sizeof(info));
310 if (r < 0) {
311 simple_err("failed to stat image", r);
312 return r;
313 }
314
315 *image_id = std::string(&info.block_name_prefix[strlen(RBD_DATA_PREFIX)]);
316 return 0;
317}
318
319int register_journal(rados_ioctx_t ioctx, const char *image_name) {
320 librados::IoCtx io_ctx;
321 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
322
323 std::string image_id;
324 int r = get_image_id(io_ctx, image_name, &image_id);
325 if (r < 0) {
326 return r;
327 }
328
9f95a23c
TL
329 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {},
330 nullptr);
7c673cae
FG
331 r = journaler.register_client(bufferlist());
332 if (r < 0) {
333 simple_err("failed to register journal client", r);
334 return r;
335 }
336 return 0;
337}
338
339int unregister_journal(rados_ioctx_t ioctx, const char *image_name) {
340 librados::IoCtx io_ctx;
341 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
342
343 std::string image_id;
344 int r = get_image_id(io_ctx, image_name, &image_id);
345 if (r < 0) {
346 return r;
347 }
348
9f95a23c
TL
349 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {},
350 nullptr);
7c673cae
FG
351 r = journaler.unregister_client();
352 if (r < 0) {
353 simple_err("failed to unregister journal client", r);
354 return r;
355 }
356 return 0;
357}
358
359int create_replay_image(rados_ioctx_t ioctx, int order,
360 uint64_t stripe_unit, int stripe_count,
361 const char *replay_image_name,
362 const char *last_replay_image_name) {
363 librados::IoCtx io_ctx;
364 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
365
11fdf7f2
TL
366 uint64_t features;
367 int r = get_features(&features);
368 if (r < 0) {
369 return r;
370 }
371
7c673cae
FG
372 librbd::RBD rbd;
373 if (last_replay_image_name == nullptr) {
11fdf7f2 374 r = rbd.create2(io_ctx, replay_image_name, 0, features, &order);
7c673cae
FG
375 } else {
376 r = rbd.clone2(io_ctx, last_replay_image_name, "snap",
11fdf7f2
TL
377 io_ctx, replay_image_name, features, &order,
378 stripe_unit, stripe_count);
7c673cae
FG
379 }
380
381 if (r < 0) {
382 simple_err("failed to create replay image", r);
383 return r;
384 }
385
386 return 0;
387}
388
389int replay_journal(rados_ioctx_t ioctx, const char *image_name,
390 const char *replay_image_name) {
391 librados::IoCtx io_ctx;
392 librados::IoCtx::from_rados_ioctx_t(ioctx, io_ctx);
393
394 std::string image_id;
395 int r = get_image_id(io_ctx, image_name, &image_id);
396 if (r < 0) {
397 return r;
398 }
399
400 std::string replay_image_id;
401 r = get_image_id(io_ctx, replay_image_name, &replay_image_id);
402 if (r < 0) {
403 return r;
404 }
405
9f95a23c
TL
406 journal::Journaler journaler(io_ctx, image_id, JOURNAL_CLIENT_ID, {},
407 nullptr);
7c673cae
FG
408 C_SaferCond init_ctx;
409 journaler.init(&init_ctx);
410 BOOST_SCOPE_EXIT_ALL( (&journaler) ) {
411 journaler.shut_down();
412 };
413
414 r = init_ctx.wait();
415 if (r < 0) {
416 simple_err("failed to initialize journal", r);
417 return r;
418 }
419
9f95a23c
TL
420 journal::Journaler replay_journaler(io_ctx, replay_image_id, "", {},
421 nullptr);
7c673cae
FG
422
423 C_SaferCond replay_init_ctx;
424 replay_journaler.init(&replay_init_ctx);
425 BOOST_SCOPE_EXIT_ALL( (&replay_journaler) ) {
426 replay_journaler.shut_down();
427 };
428
429 r = replay_init_ctx.wait();
430 if (r < 0) {
431 simple_err("failed to initialize replay journal", r);
432 return r;
433 }
434
494da23a 435 replay_journaler.start_append(0);
7c673cae
FG
436
437 C_SaferCond replay_ctx;
438 ReplayHandler replay_handler(&journaler, &replay_journaler,
439 &replay_ctx);
440
441 // copy journal events from source image to replay image
442 journaler.start_replay(&replay_handler);
443 r = replay_ctx.wait();
444
445 journaler.stop_replay();
446
447 C_SaferCond stop_ctx;
448 replay_journaler.stop_append(&stop_ctx);
449 int stop_r = stop_ctx.wait();
450 if (r == 0 && stop_r < 0) {
451 r = stop_r;
452 }
453
454 if (r < 0) {
455 simple_err("failed to replay journal", r);
456 return r;
457 }
458
459 librbd::RBD rbd;
460 librbd::Image image;
461 r = rbd.open(io_ctx, image, replay_image_name);
462 if (r < 0) {
463 simple_err("failed to open replay image", r);
464 return r;
465 }
466
467 // perform an IO op to initiate the journal replay
468 bufferlist bl;
469 r = static_cast<ssize_t>(image.write(0, 0, bl));
470 if (r < 0) {
471 simple_err("failed to write to replay image", r);
472 return r;
473 }
474 return 0;
475}
476
477int finalize_journal(rados_ioctx_t ioctx, const char *imagename, int clones,
478 int order, uint64_t stripe_unit, int stripe_count) {
479 char replayimagename[1024];
480 replay_imagename(replayimagename, sizeof(replayimagename), clones);
481
482 char lastreplayimagename[1024];
483 if (clones > 0) {
484 replay_imagename(lastreplayimagename,
485 sizeof(lastreplayimagename), clones - 1);
486 }
487
488 int ret = create_replay_image(ioctx, order, stripe_unit,
489 stripe_count, replayimagename,
490 clones > 0 ? lastreplayimagename :
491 nullptr);
492 if (ret < 0) {
493 exit(EXIT_FAILURE);
494 }
495
496 ret = replay_journal(ioctx, imagename, replayimagename);
497 if (ret < 0) {
498 exit(EXIT_FAILURE);
499 }
500 return 0;
501}
502
503} // anonymous namespace
504
505/*
506 * rbd
507 */
508
509struct rbd_ctx {
510 const char *name; /* image name */
511 rbd_image_t image; /* image handle */
512 const char *krbd_name; /* image /dev/rbd<id> name */ /* reused for nbd test */
513 int krbd_fd; /* image /dev/rbd<id> fd */ /* reused for nbd test */
514};
515
516#define RBD_CTX_INIT (struct rbd_ctx) { NULL, NULL, NULL, -1}
517
/*
 * Table of image operations implemented by one backend.
 * Member order matters: the backend tables are initialized positionally.
 */
struct rbd_operations {
	/* open / close */
	int (*open)(const char *name, struct rbd_ctx *ctx);
	int (*close)(struct rbd_ctx *ctx);

	/* data path */
	ssize_t (*read)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			char *buf);
	ssize_t (*write)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			 const char *buf);
	int (*flush)(struct rbd_ctx *ctx);
	int (*discard)(struct rbd_ctx *ctx, uint64_t off, uint64_t len);

	/* size management */
	int (*get_size)(struct rbd_ctx *ctx, uint64_t *size);
	int (*resize)(struct rbd_ctx *ctx, uint64_t size);

	/* rbd-specific operations */
	int (*clone)(struct rbd_ctx *ctx, const char *src_snapname,
		     const char *dst_imagename, int *order, int stripe_unit,
		     int stripe_count);
	int (*flatten)(struct rbd_ctx *ctx);

	/* extended writes */
	ssize_t (*writesame)(struct rbd_ctx *ctx, uint64_t off, size_t len,
			     const char *buf, size_t data_len);
	ssize_t (*compare_and_write)(struct rbd_ctx *ctx, uint64_t off,
				     size_t len, const char *cmp_buf,
				     const char *buf);
};
536
537char *pool; /* name of the pool our test image is in */
538char *iname; /* name of our test image */
539rados_t cluster; /* handle for our test cluster */
540rados_ioctx_t ioctx; /* handle for our test pool */
11fdf7f2 541#if defined(WITH_KRBD)
7c673cae 542struct krbd_ctx *krbd; /* handle for libkrbd */
11fdf7f2 543#endif
7c673cae
FG
544bool skip_partial_discard; /* rbd_skip_partial_discard config value*/
545
11fdf7f2
TL
546int get_features(uint64_t* features) {
547 char buf[1024];
548 int r = rados_conf_get(cluster, "rbd_default_features", buf,
549 sizeof(buf));
550 if (r < 0) {
551 simple_err("Could not get rbd_default_features value", r);
552 return r;
553 }
554
555 *features = strtol(buf, NULL, 0);
556
557 if (clone_calls) {
558 *features |= RBD_FEATURE_LAYERING;
559 }
560 if (journal_replay) {
561 *features |= (RBD_FEATURE_EXCLUSIVE_LOCK |
562 RBD_FEATURE_JOURNALING);
563 }
564 return 0;
565}
566
7c673cae
FG
567/*
568 * librbd/krbd rbd_operations handlers. Given the rest of fsx.c, no
569 * attempt to do error handling is made in these handlers.
570 */
571
572int
573__librbd_open(const char *name, struct rbd_ctx *ctx)
574{
575 rbd_image_t image;
576 int ret;
577
11fdf7f2 578 ceph_assert(!ctx->name && !ctx->image &&
7c673cae
FG
579 !ctx->krbd_name && ctx->krbd_fd < 0);
580
581 ret = rbd_open(ioctx, name, &image, NULL);
582 if (ret < 0) {
583 prt("rbd_open(%s) failed\n", name);
584 return ret;
585 }
586
587 ctx->name = strdup(name);
588 ctx->image = image;
589 ctx->krbd_name = NULL;
590 ctx->krbd_fd = -1;
591
592 return 0;
593}
594
595int
596librbd_open(const char *name, struct rbd_ctx *ctx)
597{
598 return __librbd_open(name, ctx);
599}
600
601int
602__librbd_close(struct rbd_ctx *ctx)
603{
604 int ret;
605
11fdf7f2 606 ceph_assert(ctx->name && ctx->image);
7c673cae
FG
607
608 ret = rbd_close(ctx->image);
609 if (ret < 0) {
610 prt("rbd_close(%s) failed\n", ctx->name);
611 return ret;
612 }
613
614 free((void *)ctx->name);
615
616 ctx->name = NULL;
617 ctx->image = NULL;
618
619 return 0;
620}
621
622int
623librbd_close(struct rbd_ctx *ctx)
624{
625 return __librbd_close(ctx);
626}
627
628int
629librbd_verify_object_map(struct rbd_ctx *ctx)
630{
631 int n;
632 uint64_t flags;
633 n = rbd_get_flags(ctx->image, &flags);
634 if (n < 0) {
635 prt("rbd_get_flags() failed\n");
636 return n;
637 }
638
639 if ((flags & RBD_FLAG_OBJECT_MAP_INVALID) != 0) {
640 prt("rbd_get_flags() indicates object map is invalid\n");
641 return -EINVAL;
642 }
643 return 0;
644}
645
646ssize_t
647librbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
648{
649 ssize_t n;
650
651 n = rbd_read(ctx->image, off, len, buf);
652 if (n < 0)
653 prt("rbd_read(%llu, %zu) failed\n", off, len);
654
655 return n;
656}
657
658ssize_t
659librbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
660{
661 ssize_t n;
662 int ret;
663
664 n = rbd_write(ctx->image, off, len, buf);
665 if (n < 0) {
666 prt("rbd_write(%llu, %zu) failed\n", off, len);
667 return n;
668 }
669
670 ret = librbd_verify_object_map(ctx);
671 if (ret < 0) {
672 return ret;
673 }
674 return n;
675}
676
677int
678librbd_flush(struct rbd_ctx *ctx)
679{
680 int ret;
681
682 ret = rbd_flush(ctx->image);
683 if (ret < 0) {
684 prt("rbd_flush failed\n");
685 return ret;
686 }
687
688 return librbd_verify_object_map(ctx);
689}
690
691int
692librbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
693{
694 int ret;
695
696 ret = rbd_discard(ctx->image, off, len);
697 if (ret < 0) {
698 prt("rbd_discard(%llu, %llu) failed\n", off, len);
699 return ret;
700 }
701
702 return librbd_verify_object_map(ctx);
703}
704
705ssize_t
706librbd_writesame(struct rbd_ctx *ctx, uint64_t off, size_t len,
707 const char *buf, size_t data_len)
708{
709 ssize_t n;
710 int ret;
711
712 n = rbd_writesame(ctx->image, off, len, buf, data_len, 0);
713 if (n < 0) {
714 prt("rbd_writesame(%llu, %zu) failed\n", off, len);
715 return n;
716 }
717
718 ret = librbd_verify_object_map(ctx);
719 if (ret < 0) {
720 return ret;
721 }
722 return n;
723}
724
c07f9fc5
FG
725ssize_t
726librbd_compare_and_write(struct rbd_ctx *ctx, uint64_t off, size_t len,
727 const char *cmp_buf, const char *buf)
728{
729 ssize_t n;
730 int ret;
731 uint64_t mismatch_off = 0;
732
733 n = rbd_compare_and_write(ctx->image, off, len, cmp_buf, buf, &mismatch_off, 0);
734 if (n == -EINVAL) {
735 return n;
736 } else if (n < 0) {
737 prt("rbd_compare_and_write mismatch(%llu, %zu, %llu) failed\n",
738 off, len, mismatch_off);
739 return n;
740 }
741
742 ret = librbd_verify_object_map(ctx);
743 if (ret < 0) {
744 return ret;
745 }
746 return n;
747
748}
749
7c673cae
FG
750int
751librbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
752{
7c673cae
FG
753 int ret;
754
11fdf7f2 755 ret = rbd_get_size(ctx->image, size);
7c673cae 756 if (ret < 0) {
11fdf7f2 757 prt("rbd_get_size failed\n");
7c673cae
FG
758 return ret;
759 }
760
7c673cae
FG
761 return 0;
762}
763
764int
765__librbd_resize(struct rbd_ctx *ctx, uint64_t size)
766{
767 int ret;
768
769 ret = rbd_resize(ctx->image, size);
770 if (ret < 0) {
771 prt("rbd_resize(%llu) failed\n", size);
772 return ret;
773 }
774
775 return librbd_verify_object_map(ctx);
776}
777
778int
779librbd_resize(struct rbd_ctx *ctx, uint64_t size)
780{
781 return __librbd_resize(ctx, size);
782}
783
11fdf7f2
TL
784int
785__librbd_deep_copy(struct rbd_ctx *ctx, const char *src_snapname,
786 const char *dst_imagename, uint64_t features, int *order,
787 int stripe_unit, int stripe_count) {
788 int ret;
789
790 rbd_image_options_t opts;
791 rbd_image_options_create(&opts);
792 BOOST_SCOPE_EXIT_ALL( (&opts) ) {
793 rbd_image_options_destroy(opts);
794 };
795 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
796 features);
797 ceph_assert(ret == 0);
798 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER,
799 *order);
800 ceph_assert(ret == 0);
801 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
802 stripe_unit);
803 ceph_assert(ret == 0);
804 ret = rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_COUNT,
805 stripe_count);
806 ceph_assert(ret == 0);
807
808 ret = rbd_snap_set(ctx->image, src_snapname);
809 if (ret < 0) {
810 prt("rbd_snap_set(%s@%s) failed\n", ctx->name, src_snapname);
811 return ret;
812 }
813
814 ret = rbd_deep_copy(ctx->image, ioctx, dst_imagename, opts);
815 if (ret < 0) {
816 prt("rbd_deep_copy(%s@%s -> %s) failed\n",
817 ctx->name, src_snapname, dst_imagename);
818 return ret;
819 }
820
821 ret = rbd_snap_set(ctx->image, "");
822 if (ret < 0) {
823 prt("rbd_snap_set(%s@) failed\n", ctx->name);
824 return ret;
825 }
826
827 rbd_image_t image;
828 ret = rbd_open(ioctx, dst_imagename, &image, nullptr);
829 if (ret < 0) {
830 prt("rbd_open(%s) failed\n", dst_imagename);
831 return ret;
832 }
833
834 ret = rbd_snap_unprotect(image, src_snapname);
835 if (ret < 0) {
836 prt("rbd_snap_unprotect(%s@%s) failed\n", dst_imagename,
837 src_snapname);
838 return ret;
839 }
840
841 ret = rbd_snap_remove(image, src_snapname);
842 if (ret < 0) {
843 prt("rbd_snap_remove(%s@%s) failed\n", dst_imagename,
844 src_snapname);
845 return ret;
846 }
847
848 ret = rbd_close(image);
849 if (ret < 0) {
850 prt("rbd_close(%s) failed\n", dst_imagename);
851 return ret;
852 }
853
854 return 0;
855}
856
7c673cae
FG
857int
858__librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
859 const char *dst_imagename, int *order, int stripe_unit,
9f95a23c 860 int stripe_count)
7c673cae
FG
861{
862 int ret;
863
864 ret = rbd_snap_create(ctx->image, src_snapname);
865 if (ret < 0) {
866 prt("rbd_snap_create(%s@%s) failed\n", ctx->name,
867 src_snapname);
868 return ret;
869 }
870
871 ret = rbd_snap_protect(ctx->image, src_snapname);
872 if (ret < 0) {
873 prt("rbd_snap_protect(%s@%s) failed\n", ctx->name,
874 src_snapname);
875 return ret;
876 }
877
11fdf7f2
TL
878 uint64_t features;
879 ret = get_features(&features);
880 if (ret < 0) {
881 return ret;
882 }
883
11fdf7f2
TL
884 if (deep_copy) {
885 ret = __librbd_deep_copy(ctx, src_snapname, dst_imagename, features,
886 order, stripe_unit, stripe_count);
887 if (ret < 0) {
888 prt("deep_copy(%s@%s -> %s) failed\n", ctx->name,
889 src_snapname, dst_imagename);
890 return ret;
891 }
892 } else {
893 ret = rbd_clone2(ioctx, ctx->name, src_snapname, ioctx,
894 dst_imagename, features, order,
895 stripe_unit, stripe_count);
896 if (ret < 0) {
897 prt("rbd_clone2(%s@%s -> %s) failed\n", ctx->name,
898 src_snapname, dst_imagename);
899 return ret;
900 }
7c673cae
FG
901 }
902
903 return 0;
904}
905
906int
907librbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
908 const char *dst_imagename, int *order, int stripe_unit,
909 int stripe_count)
910{
911 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
9f95a23c 912 stripe_unit, stripe_count);
7c673cae
FG
913}
914
915int
916__librbd_flatten(struct rbd_ctx *ctx)
917{
918 int ret;
919
920 ret = rbd_flatten(ctx->image);
921 if (ret < 0) {
922 prt("rbd_flatten failed\n");
923 return ret;
924 }
925
926 return librbd_verify_object_map(ctx);
927}
928
929int
930librbd_flatten(struct rbd_ctx *ctx)
931{
932 return __librbd_flatten(ctx);
933}
934
935const struct rbd_operations librbd_operations = {
936 librbd_open,
937 librbd_close,
938 librbd_read,
939 librbd_write,
940 librbd_flush,
941 librbd_discard,
942 librbd_get_size,
943 librbd_resize,
944 librbd_clone,
945 librbd_flatten,
946 librbd_writesame,
c07f9fc5 947 librbd_compare_and_write,
7c673cae
FG
948};
949
11fdf7f2 950#if defined(WITH_KRBD)
7c673cae
FG
951int
952krbd_open(const char *name, struct rbd_ctx *ctx)
953{
f6b5b4d7 954 char buf[1024];
7c673cae
FG
955 char *devnode;
956 int fd;
957 int ret;
958
959 ret = __librbd_open(name, ctx);
960 if (ret < 0)
961 return ret;
962
f6b5b4d7
TL
963 ret = rados_conf_get(cluster, "rbd_default_map_options", buf,
964 sizeof(buf));
965 if (ret < 0) {
966 simple_err("Could not get rbd_default_map_options value", ret);
967 return ret;
968 }
969
970 ret = krbd_map(krbd, pool, "", name, "", buf, &devnode);
7c673cae
FG
971 if (ret < 0) {
972 prt("krbd_map(%s) failed\n", name);
973 return ret;
974 }
975
976 fd = open(devnode, O_RDWR | o_direct);
977 if (fd < 0) {
978 ret = -errno;
979 prt("open(%s) failed\n", devnode);
980 return ret;
981 }
982
983 ctx->krbd_name = devnode;
984 ctx->krbd_fd = fd;
985
986 return 0;
987}
988
989int
990krbd_close(struct rbd_ctx *ctx)
991{
992 int ret;
993
11fdf7f2 994 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
7c673cae
FG
995
996 if (close(ctx->krbd_fd) < 0) {
997 ret = -errno;
998 prt("close(%s) failed\n", ctx->krbd_name);
999 return ret;
1000 }
1001
1002 ret = krbd_unmap(krbd, ctx->krbd_name, "");
1003 if (ret < 0) {
1004 prt("krbd_unmap(%s) failed\n", ctx->krbd_name);
1005 return ret;
1006 }
1007
1008 free((void *)ctx->krbd_name);
1009
1010 ctx->krbd_name = NULL;
1011 ctx->krbd_fd = -1;
1012
1013 return __librbd_close(ctx);
1014}
11fdf7f2 1015#endif // WITH_KRBD
7c673cae 1016
11fdf7f2 1017#if defined(__linux__)
7c673cae
FG
1018ssize_t
1019krbd_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
1020{
1021 ssize_t n;
1022
1023 n = pread(ctx->krbd_fd, buf, len, off);
1024 if (n < 0) {
1025 n = -errno;
1026 prt("pread(%llu, %zu) failed\n", off, len);
1027 return n;
1028 }
1029
1030 return n;
1031}
1032
1033ssize_t
1034krbd_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
1035{
1036 ssize_t n;
1037
1038 n = pwrite(ctx->krbd_fd, buf, len, off);
1039 if (n < 0) {
1040 n = -errno;
1041 prt("pwrite(%llu, %zu) failed\n", off, len);
1042 return n;
1043 }
1044
1045 return n;
1046}
1047
1048int
1049__krbd_flush(struct rbd_ctx *ctx, bool invalidate)
1050{
1051 int ret;
1052
1053 if (o_direct)
1054 return 0;
1055
1056 /*
1057 * BLKFLSBUF will sync the filesystem on top of the device (we
1058 * don't care about that here, since we write directly to it),
1059 * write out any dirty buffers and invalidate the buffer cache.
1060 * It won't do a hardware cache flush.
1061 *
1062 * fsync() will write out any dirty buffers and do a hardware
1063 * cache flush (which we don't care about either, because for
1064 * krbd it's a noop). It won't try to empty the buffer cache
1065 * nor poke the filesystem before writing out.
1066 *
1067 * Given that, for our purposes, fsync is a flush, while
1068 * BLKFLSBUF is a flush+invalidate.
1069 */
1070 if (invalidate)
1071 ret = ioctl(ctx->krbd_fd, BLKFLSBUF, NULL);
1072 else
1073 ret = fsync(ctx->krbd_fd);
1074 if (ret < 0) {
1075 ret = -errno;
1076 prt("%s failed\n", invalidate ? "BLKFLSBUF" : "fsync");
1077 return ret;
1078 }
1079
1080 return 0;
1081}
1082
1083int
1084krbd_flush(struct rbd_ctx *ctx)
1085{
1086 return __krbd_flush(ctx, false);
1087}
1088
1089int
1090krbd_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
1091{
1092 uint64_t range[2] = { off, len };
1093 int ret;
1094
1095 /*
a8e16298 1096 * BLKZEROOUT goes straight to disk and doesn't do anything
7c673cae
FG
1097 * about dirty buffers. This means we need to flush so that
1098 *
1099 * write 0..3M
1100 * discard 1..2M
1101 *
1102 * results in "data 0000 data" rather than "data data data" on
1103 * disk and invalidate so that
1104 *
1105 * discard 1..2M
1106 * read 0..3M
1107 *
1108 * returns "data 0000 data" rather than "data data data" in
1109 * case 1..2M was cached.
a8e16298
TL
1110 *
1111 * Note: These cache coherency issues are supposed to be fixed
1112 * in recent kernels.
7c673cae
FG
1113 */
1114 ret = __krbd_flush(ctx, true);
1115 if (ret < 0)
1116 return ret;
1117
1118 /*
a8e16298 1119 * off and len must be 512-byte aligned, otherwise BLKZEROOUT
7c673cae
FG
1120 * will fail with -EINVAL. This means that -K (enable krbd
1121 * mode) requires -h 512 or similar.
1122 */
a8e16298 1123 if (ioctl(ctx->krbd_fd, BLKZEROOUT, &range) < 0) {
7c673cae 1124 ret = -errno;
a8e16298 1125 prt("BLKZEROOUT(%llu, %llu) failed\n", off, len);
7c673cae
FG
1126 return ret;
1127 }
1128
1129 return 0;
1130}
1131
1132int
1133krbd_get_size(struct rbd_ctx *ctx, uint64_t *size)
1134{
1135 uint64_t bytes;
1136
1137 if (ioctl(ctx->krbd_fd, BLKGETSIZE64, &bytes) < 0) {
1138 int ret = -errno;
1139 prt("BLKGETSIZE64 failed\n");
1140 return ret;
1141 }
1142
1143 *size = bytes;
1144
1145 return 0;
1146}
1147
1148int
1149krbd_resize(struct rbd_ctx *ctx, uint64_t size)
1150{
1151 int ret;
aee94f69
TL
1152 int count = 0;
1153 uint64_t effective_size;
7c673cae 1154
11fdf7f2 1155 ceph_assert(size % truncbdy == 0);
7c673cae
FG
1156
1157 /*
1158 * When krbd detects a size change, it calls revalidate_disk(),
1159 * which ends up calling invalidate_bdev(), which invalidates
1160 * clean pages and does nothing about dirty pages beyond the
1161 * new size. The preceding cache flush makes sure those pages
1162 * are invalidated, which is what we need on shrink so that
1163 *
1164 * write 0..1M
1165 * resize 0
1166 * resize 2M
1167 * read 0..2M
1168 *
1169 * returns "0000 0000" rather than "data 0000".
1170 */
1171 ret = __krbd_flush(ctx, false);
1172 if (ret < 0)
1173 return ret;
1174
aee94f69
TL
1175 ret = __librbd_resize(ctx, size);
1176 if (ret < 0)
1177 return ret;
1178
1179 for (;;) {
1180 ret = krbd_get_size(ctx, &effective_size);
1181 if (ret < 0)
1182 return ret;
1183
1184 if (effective_size == size)
1185 break;
1186
1187 if (count++ >= 15) {
1188 prt("BLKGETSIZE64 size error: expected 0x%llx, actual 0x%llx\n",
1189 (unsigned long long)size,
1190 (unsigned long long)effective_size);
1191 return -EINVAL;
1192 }
1193
1194 usleep(count * 250 * 1000);
1195 }
1196
1197 return 0;
7c673cae
FG
1198}
1199
1200int
1201krbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1202 const char *dst_imagename, int *order, int stripe_unit,
1203 int stripe_count)
1204{
1205 int ret;
1206
1207 ret = __krbd_flush(ctx, false);
1208 if (ret < 0)
1209 return ret;
1210
1211 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
9f95a23c 1212 stripe_unit, stripe_count);
7c673cae
FG
1213}
1214
1215int
1216krbd_flatten(struct rbd_ctx *ctx)
1217{
1218 int ret;
1219
1220 ret = __krbd_flush(ctx, false);
1221 if (ret < 0)
1222 return ret;
1223
1224 return __librbd_flatten(ctx);
1225}
11fdf7f2 1226#endif // __linux__
7c673cae 1227
11fdf7f2 1228#if defined(WITH_KRBD)
7c673cae
FG
1229const struct rbd_operations krbd_operations = {
1230 krbd_open,
1231 krbd_close,
1232 krbd_read,
1233 krbd_write,
1234 krbd_flush,
1235 krbd_discard,
1236 krbd_get_size,
1237 krbd_resize,
1238 krbd_clone,
1239 krbd_flatten,
1240 NULL,
1241};
11fdf7f2 1242#endif // WITH_KRBD
7c673cae 1243
11fdf7f2 1244#if defined(__linux__)
7c673cae
FG
1245int
1246nbd_open(const char *name, struct rbd_ctx *ctx)
1247{
1248 int r;
1249 int fd;
1250 char dev[4096];
1251 char *devnode;
1252
1253 SubProcess process("rbd-nbd", SubProcess::KEEP, SubProcess::PIPE,
1254 SubProcess::KEEP);
1255 process.add_cmd_arg("map");
f67539c2 1256 process.add_cmd_arg("--io-timeout=600");
7c673cae
FG
1257 std::string img;
1258 img.append(pool);
1259 img.append("/");
1260 img.append(name);
1261 process.add_cmd_arg(img.c_str());
1262
1263 r = __librbd_open(name, ctx);
1264 if (r < 0)
1265 return r;
1266
1267 r = process.spawn();
1268 if (r < 0) {
1269 prt("nbd_open failed to run rbd-nbd error: %s\n", process.err().c_str());
1270 return r;
1271 }
1272 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1273 if (r < 0) {
1274 prt("nbd_open failed to get nbd device path\n");
1275 return r;
1276 }
1277 for (int i = 0; i < r; ++i)
1278 if (dev[i] == 10 || dev[i] == 13)
1279 dev[i] = 0;
1280 dev[r] = 0;
1281 r = process.join();
1282 if (r) {
1283 prt("rbd-nbd failed with error: %s", process.err().c_str());
1284 return -EINVAL;
1285 }
1286
1287 devnode = strdup(dev);
1288 if (!devnode)
1289 return -ENOMEM;
1290
1291 fd = open(devnode, O_RDWR | o_direct);
1292 if (fd < 0) {
1293 r = -errno;
1294 prt("open(%s) failed\n", devnode);
1295 return r;
1296 }
1297
1298 ctx->krbd_name = devnode;
1299 ctx->krbd_fd = fd;
1300
1301 return 0;
1302}
1303
1304int
1305nbd_close(struct rbd_ctx *ctx)
1306{
1307 int r;
1308
11fdf7f2 1309 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
7c673cae
FG
1310
1311 if (close(ctx->krbd_fd) < 0) {
1312 r = -errno;
1313 prt("close(%s) failed\n", ctx->krbd_name);
1314 return r;
1315 }
1316
1317 SubProcess process("rbd-nbd");
1318 process.add_cmd_arg("unmap");
1319 process.add_cmd_arg(ctx->krbd_name);
1320
1321 r = process.spawn();
1322 if (r < 0) {
1323 prt("nbd_close failed to run rbd-nbd error: %s\n", process.err().c_str());
1324 return r;
1325 }
1326 r = process.join();
1327 if (r) {
1328 prt("rbd-nbd failed with error: %d", process.err().c_str());
1329 return -EINVAL;
1330 }
1331
1332 free((void *)ctx->krbd_name);
1333
1334 ctx->krbd_name = NULL;
1335 ctx->krbd_fd = -1;
1336
1337 return __librbd_close(ctx);
1338}
1339
1340int
1341nbd_clone(struct rbd_ctx *ctx, const char *src_snapname,
1342 const char *dst_imagename, int *order, int stripe_unit,
1343 int stripe_count)
1344{
1345 int ret;
1346
1347 ret = __krbd_flush(ctx, false);
1348 if (ret < 0)
1349 return ret;
1350
1351 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
9f95a23c 1352 stripe_unit, stripe_count);
7c673cae
FG
1353}
1354
1355const struct rbd_operations nbd_operations = {
1356 nbd_open,
1357 nbd_close,
1358 krbd_read,
1359 krbd_write,
1360 krbd_flush,
1361 krbd_discard,
1362 krbd_get_size,
1363 krbd_resize,
1364 nbd_clone,
1365 krbd_flatten,
1366 NULL,
1367};
11fdf7f2
TL
1368#endif // __linux__
1369
1370#if defined(__FreeBSD__)
1371int
1372ggate_open(const char *name, struct rbd_ctx *ctx)
1373{
1374 int r;
1375 int fd;
1376 char dev[4096];
1377 char *devnode;
1378
1379 SubProcess process("rbd-ggate", SubProcess::KEEP, SubProcess::PIPE,
1380 SubProcess::KEEP);
1381 process.add_cmd_arg("map");
1382 std::string img;
1383 img.append(pool);
1384 img.append("/");
1385 img.append(name);
1386 process.add_cmd_arg(img.c_str());
1387
1388 r = __librbd_open(name, ctx);
1389 if (r < 0) {
1390 return r;
1391 }
1392
1393 r = process.spawn();
1394 if (r < 0) {
1395 prt("ggate_open failed to run rbd-ggate: %s\n",
1396 process.err().c_str());
1397 return r;
1398 }
1399 r = safe_read(process.get_stdout(), dev, sizeof(dev));
1400 if (r < 0) {
1401 prt("ggate_open failed to get ggate device path\n");
1402 return r;
1403 }
1404 for (int i = 0; i < r; ++i) {
1405 if (dev[i] == '\r' || dev[i] == '\n') {
1406 dev[i] = 0;
1407 }
1408 }
1409 dev[r] = 0;
1410 r = process.join();
1411 if (r) {
1412 prt("rbd-ggate failed with error: %s", process.err().c_str());
1413 return -EINVAL;
1414 }
1415
1416 devnode = strdup(dev);
1417 if (!devnode) {
1418 return -ENOMEM;
1419 }
1420
1421 for (int i = 0; i < 100; i++) {
1422 fd = open(devnode, O_RDWR | o_direct);
1423 if (fd >= 0 || errno != ENOENT) {
1424 break;
1425 }
1426 usleep(100000);
1427 }
1428 if (fd < 0) {
1429 r = -errno;
1430 prt("open(%s) failed\n", devnode);
1431 return r;
1432 }
1433
1434 ctx->krbd_name = devnode;
1435 ctx->krbd_fd = fd;
1436
1437 return 0;
1438}
1439
1440int
1441ggate_close(struct rbd_ctx *ctx)
1442{
1443 int r;
1444
1445 ceph_assert(ctx->krbd_name && ctx->krbd_fd >= 0);
1446
1447 if (close(ctx->krbd_fd) < 0) {
1448 r = -errno;
1449 prt("close(%s) failed\n", ctx->krbd_name);
1450 return r;
1451 }
1452
1453 SubProcess process("rbd-ggate");
1454 process.add_cmd_arg("unmap");
1455 process.add_cmd_arg(ctx->krbd_name);
1456
1457 r = process.spawn();
1458 if (r < 0) {
1459 prt("ggate_close failed to run rbd-nbd: %s\n",
1460 process.err().c_str());
1461 return r;
1462 }
1463 r = process.join();
1464 if (r) {
1465 prt("rbd-ggate failed with error: %d", process.err().c_str());
1466 return -EINVAL;
1467 }
1468
1469 free((void *)ctx->krbd_name);
1470
1471 ctx->krbd_name = NULL;
1472 ctx->krbd_fd = -1;
1473
1474 return __librbd_close(ctx);
1475}
1476
1477ssize_t
1478ggate_read(struct rbd_ctx *ctx, uint64_t off, size_t len, char *buf)
1479{
1480 ssize_t n;
1481
1482 n = pread(ctx->krbd_fd, buf, len, off);
1483 if (n < 0) {
1484 n = -errno;
1485 prt("pread(%llu, %zu) failed\n", off, len);
1486 return n;
1487 }
1488
1489 return n;
1490}
1491
1492ssize_t
1493ggate_write(struct rbd_ctx *ctx, uint64_t off, size_t len, const char *buf)
1494{
1495 ssize_t n;
1496
1497 n = pwrite(ctx->krbd_fd, buf, len, off);
1498 if (n < 0) {
1499 n = -errno;
1500 prt("pwrite(%llu, %zu) failed\n", off, len);
1501 return n;
1502 }
1503
1504 return n;
1505}
1506
1507int
1508__ggate_flush(struct rbd_ctx *ctx, bool invalidate)
1509{
1510 int ret;
1511
1512 if (o_direct) {
1513 return 0;
1514 }
1515
1516 if (invalidate) {
1517 ret = ioctl(ctx->krbd_fd, DIOCGFLUSH, NULL);
1518 } else {
1519 ret = fsync(ctx->krbd_fd);
1520 }
1521 if (ret < 0) {
1522 ret = -errno;
1523 prt("%s failed\n", invalidate ? "DIOCGFLUSH" : "fsync");
1524 return ret;
1525 }
1526
1527 return 0;
1528}
1529
1530int
1531ggate_flush(struct rbd_ctx *ctx)
1532{
1533 return __ggate_flush(ctx, false);
1534}
1535
1536int
1537ggate_discard(struct rbd_ctx *ctx, uint64_t off, uint64_t len)
1538{
1539 off_t range[2] = {static_cast<off_t>(off), static_cast<off_t>(len)};
1540 int ret;
1541
1542 ret = __ggate_flush(ctx, true);
1543 if (ret < 0) {
1544 return ret;
1545 }
1546
1547 if (ioctl(ctx->krbd_fd, DIOCGDELETE, &range) < 0) {
1548 ret = -errno;
1549 prt("DIOCGDELETE(%llu, %llu) failed\n", off, len);
1550 return ret;
1551 }
1552
1553 return 0;
1554}
1555
1556int
1557ggate_get_size(struct rbd_ctx *ctx, uint64_t *size)
1558{
1559 off_t bytes;
1560
1561 if (ioctl(ctx->krbd_fd, DIOCGMEDIASIZE, &bytes) < 0) {
1562 int ret = -errno;
1563 prt("DIOCGMEDIASIZE failed\n");
1564 return ret;
1565 }
1566
1567 *size = bytes;
1568
1569 return 0;
1570}
1571
1572int
1573ggate_resize(struct rbd_ctx *ctx, uint64_t size)
1574{
1575 int ret;
1576
1577 ceph_assert(size % truncbdy == 0);
1578
1579 ret = __ggate_flush(ctx, false);
1580 if (ret < 0) {
1581 return ret;
1582 }
1583
1584 return __librbd_resize(ctx, size);
1585}
1586
1587int
1588ggate_clone(struct rbd_ctx *ctx, const char *src_snapname,
1589 const char *dst_imagename, int *order, int stripe_unit,
1590 int stripe_count)
1591{
1592 int ret;
1593
1594 ret = __ggate_flush(ctx, false);
1595 if (ret < 0) {
1596 return ret;
1597 }
1598
1599 return __librbd_clone(ctx, src_snapname, dst_imagename, order,
9f95a23c 1600 stripe_unit, stripe_count);
11fdf7f2
TL
1601}
1602
1603int
1604ggate_flatten(struct rbd_ctx *ctx)
1605{
1606 int ret;
1607
1608 ret = __ggate_flush(ctx, false);
1609 if (ret < 0) {
1610 return ret;
1611 }
1612
1613 return __librbd_flatten(ctx);
1614}
1615
1616const struct rbd_operations ggate_operations = {
1617 ggate_open,
1618 ggate_close,
1619 ggate_read,
1620 ggate_write,
1621 ggate_flush,
1622 ggate_discard,
1623 ggate_get_size,
1624 ggate_resize,
1625 ggate_clone,
1626 ggate_flatten,
1627 NULL,
1628};
1629#endif // __FreeBSD__
7c673cae
FG
1630
1631struct rbd_ctx ctx = RBD_CTX_INIT;
1632const struct rbd_operations *ops = &librbd_operations;
1633
1634static bool rbd_image_has_parent(struct rbd_ctx *ctx)
1635{
1636 int ret;
11fdf7f2
TL
1637 rbd_linked_image_spec_t parent_image;
1638 rbd_snap_spec_t parent_snap;
7c673cae 1639
11fdf7f2
TL
1640 ret = rbd_get_parent(ctx->image, &parent_image, &parent_snap);
1641 if (ret < 0 && ret != -ENOENT) {
7c673cae
FG
1642 prterrcode("rbd_get_parent_info", ret);
1643 exit(1);
1644 }
11fdf7f2
TL
1645 rbd_linked_image_spec_cleanup(&parent_image);
1646 rbd_snap_spec_cleanup(&parent_snap);
7c673cae
FG
1647
1648 return !ret;
1649}
1650
1651/*
1652 * fsx
1653 */
1654
1655void
1656log4(int operation, int arg0, int arg1, int arg2)
1657{
1658 struct log_entry *le;
1659
1660 le = &oplog[logptr];
1661 le->operation = operation;
1662 if (closeopen)
1663 le->operation = ~ le->operation;
1664 le->args[0] = arg0;
1665 le->args[1] = arg1;
1666 le->args[2] = arg2;
1667 logptr++;
1668 logcount++;
1669 if (logptr >= LOGSIZE)
1670 logptr = 0;
1671}
1672
1673void
1674logdump(void)
1675{
1676 int i, count, down;
1677 struct log_entry *lp;
1678 const char *falloc_type[3] = {"PAST_EOF", "EXTENDING", "INTERIOR"};
1679
1680 prt("LOG DUMP (%d total operations):\n", logcount);
1681 if (logcount < LOGSIZE) {
1682 i = 0;
1683 count = logcount;
1684 } else {
1685 i = logptr;
1686 count = LOGSIZE;
1687 }
1688 for ( ; count > 0; count--) {
1689 int opnum;
1690
1691 opnum = i+1 + (logcount/LOGSIZE)*LOGSIZE;
1692 prt("%d(%3d mod 256): ", opnum, opnum%256);
1693 lp = &oplog[i];
1694 if ((closeopen = lp->operation < 0))
1695 lp->operation = ~ lp->operation;
1696
1697 switch (lp->operation) {
1698 case OP_MAPREAD:
1699 prt("MAPREAD 0x%x thru 0x%x\t(0x%x bytes)",
1700 lp->args[0], lp->args[0] + lp->args[1] - 1,
1701 lp->args[1]);
1702 if (badoff >= lp->args[0] && badoff <
1703 lp->args[0] + lp->args[1])
1704 prt("\t***RRRR***");
1705 break;
1706 case OP_MAPWRITE:
1707 prt("MAPWRITE 0x%x thru 0x%x\t(0x%x bytes)",
1708 lp->args[0], lp->args[0] + lp->args[1] - 1,
1709 lp->args[1]);
1710 if (badoff >= lp->args[0] && badoff <
1711 lp->args[0] + lp->args[1])
1712 prt("\t******WWWW");
1713 break;
1714 case OP_READ:
1715 prt("READ 0x%x thru 0x%x\t(0x%x bytes)",
1716 lp->args[0], lp->args[0] + lp->args[1] - 1,
1717 lp->args[1]);
1718 if (badoff >= lp->args[0] &&
1719 badoff < lp->args[0] + lp->args[1])
1720 prt("\t***RRRR***");
1721 break;
1722 case OP_WRITE:
1723 prt("WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1724 lp->args[0], lp->args[0] + lp->args[1] - 1,
1725 lp->args[1]);
1726 if (lp->args[0] > lp->args[2])
1727 prt(" HOLE");
1728 else if (lp->args[0] + lp->args[1] > lp->args[2])
1729 prt(" EXTEND");
1730 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1731 badoff < lp->args[0] + lp->args[1])
1732 prt("\t***WWWW");
1733 break;
1734 case OP_TRUNCATE:
1735 down = lp->args[0] < lp->args[1];
1736 prt("TRUNCATE %s\tfrom 0x%x to 0x%x",
1737 down ? "DOWN" : "UP", lp->args[1], lp->args[0]);
1738 if (badoff >= lp->args[!down] &&
1739 badoff < lp->args[!!down])
1740 prt("\t******WWWW");
1741 break;
1742 case OP_FALLOCATE:
1743 /* 0: offset 1: length 2: where alloced */
1744 prt("FALLOC 0x%x thru 0x%x\t(0x%x bytes) %s",
1745 lp->args[0], lp->args[0] + lp->args[1],
1746 lp->args[1], falloc_type[lp->args[2]]);
1747 if (badoff >= lp->args[0] &&
1748 badoff < lp->args[0] + lp->args[1])
1749 prt("\t******FFFF");
1750 break;
1751 case OP_PUNCH_HOLE:
1752 prt("PUNCH 0x%x thru 0x%x\t(0x%x bytes)",
1753 lp->args[0], lp->args[0] + lp->args[1] - 1,
1754 lp->args[1]);
1755 if (badoff >= lp->args[0] && badoff <
1756 lp->args[0] + lp->args[1])
1757 prt("\t******PPPP");
1758 break;
1759 case OP_WRITESAME:
1760 prt("WRITESAME 0x%x thru 0x%x\t(0x%x bytes) data_size 0x%x",
1761 lp->args[0], lp->args[0] + lp->args[1] - 1,
1762 lp->args[1], lp->args[2]);
1763 if (badoff >= lp->args[0] &&
1764 badoff < lp->args[0] + lp->args[1])
1765 prt("\t***WSWSWSWS");
1766 break;
c07f9fc5
FG
1767 case OP_COMPARE_AND_WRITE:
1768 prt("COMPARE_AND_WRITE 0x%x thru 0x%x\t(0x%x bytes)",
1769 lp->args[0], lp->args[0] + lp->args[1] - 1,
1770 lp->args[1]);
1771 if (lp->args[0] > lp->args[2])
1772 prt(" HOLE");
1773 else if (lp->args[0] + lp->args[1] > lp->args[2])
1774 prt(" EXTEND");
1775 if ((badoff >= lp->args[0] || badoff >=lp->args[2]) &&
1776 badoff < lp->args[0] + lp->args[1])
1777 prt("\t***WWWW");
1778 break;
7c673cae
FG
1779 case OP_CLONE:
1780 prt("CLONE");
1781 break;
1782 case OP_FLATTEN:
1783 prt("FLATTEN");
1784 break;
1785 case OP_SKIPPED:
1786 prt("SKIPPED (no operation)");
1787 break;
1788 default:
1789 prt("BOGUS LOG ENTRY (operation code = %d)!",
1790 lp->operation);
1791 }
1792 if (closeopen)
1793 prt("\n\t\tCLOSE/OPEN");
1794 prt("\n");
1795 i++;
1796 if (i == LOGSIZE)
1797 i = 0;
1798 }
1799}
1800
1801void
1802save_buffer(char *buffer, off_t bufferlength, int fd)
1803{
1804 off_t ret;
1805 ssize_t byteswritten;
1806
1807 if (fd <= 0 || bufferlength == 0)
1808 return;
1809
1810 if (bufferlength > SSIZE_MAX) {
1811 prt("fsx flaw: overflow in save_buffer\n");
1812 exit(67);
1813 }
1814
1815 ret = lseek(fd, (off_t)0, SEEK_SET);
1816 if (ret == (off_t)-1)
1817 prterr("save_buffer: lseek 0");
1818
1819 byteswritten = write(fd, buffer, (size_t)bufferlength);
1820 if (byteswritten != bufferlength) {
1821 if (byteswritten == -1)
1822 prterr("save_buffer write");
1823 else
1824 warn("save_buffer: short write, 0x%x bytes instead of 0x%llx\n",
1825 (unsigned)byteswritten,
1826 (unsigned long long)bufferlength);
1827 }
1828}
1829
1830
1831void
1832report_failure(int status)
1833{
1834 logdump();
1835
1836 if (fsxgoodfd) {
1837 if (good_buf) {
1838 save_buffer(good_buf, file_size, fsxgoodfd);
1839 prt("Correct content saved for comparison\n");
1840 prt("(maybe hexdump \"%s\" vs \"%s.fsxgood\")\n",
1841 iname, iname);
1842 }
1843 close(fsxgoodfd);
1844 }
1845 sleep(3); // so the log can flush to disk. KLUDGEY!
1846 exit(status);
1847}
1848
/*
 * Combine the two bytes at cp into an unsigned short, with the first
 * byte as the high-order byte.  cp is evaluated twice.
 */
#define short_at(cp) \
	((unsigned short)((((unsigned char *)(cp))[0] << 8) | \
			  ((unsigned char *)(cp))[1]))
1851
1852int
1853fsxcmp(char *good_buf, char *temp_buf, unsigned size)
1854{
1855 if (!skip_partial_discard) {
1856 return memcmp(good_buf, temp_buf, size);
1857 }
1858
1859 for (unsigned i = 0; i < size; i++) {
1860 if (good_buf[i] != temp_buf[i] && good_buf[i] != 0) {
1861 return good_buf[i] - temp_buf[i];
1862 }
1863 }
1864 return 0;
1865}
1866
1867void
1868check_buffers(char *good_buf, char *temp_buf, unsigned offset, unsigned size)
1869{
1870 if (fsxcmp(good_buf + offset, temp_buf, size) != 0) {
1871 unsigned i = 0;
1872 unsigned n = 0;
1873
1874 prt("READ BAD DATA: offset = 0x%x, size = 0x%x, fname = %s\n",
1875 offset, size, iname);
1876 prt("OFFSET\tGOOD\tBAD\tRANGE\n");
1877 while (size > 0) {
1878 unsigned char c = good_buf[offset];
1879 unsigned char t = temp_buf[i];
1880 if (c != t) {
1881 if (n < 16) {
1882 unsigned bad = short_at(&temp_buf[i]);
1883 prt("0x%5x\t0x%04x\t0x%04x", offset,
1884 short_at(&good_buf[offset]), bad);
1885 unsigned op = temp_buf[(offset & 1) ? i+1 : i];
1886 prt("\t0x%5x\n", n);
1887 if (op)
1888 prt("operation# (mod 256) for "
1889 "the bad data may be %u\n",
1890 ((unsigned)op & 0xff));
1891 else
1892 prt("operation# (mod 256) for "
1893 "the bad data unknown, check"
1894 " HOLE and EXTEND ops\n");
1895 }
1896 n++;
1897 badoff = offset;
1898 }
1899 offset++;
1900 i++;
1901 size--;
1902 }
1903 report_failure(110);
1904 }
1905}
1906
1907
1908void
1909check_size(void)
1910{
1911 uint64_t size;
1912 int ret;
1913
1914 ret = ops->get_size(&ctx, &size);
1915 if (ret < 0)
1916 prterrcode("check_size: ops->get_size", ret);
1917
1918 if ((uint64_t)file_size != size) {
1919 prt("Size error: expected 0x%llx stat 0x%llx\n",
1920 (unsigned long long)file_size,
1921 (unsigned long long)size);
1922 report_failure(120);
1923 }
1924}
1925
1926#define TRUNC_HACK_SIZE (200ULL << 9) /* 512-byte aligned for krbd */
1927
1928void
1929check_trunc_hack(void)
1930{
1931 uint64_t size;
1932 int ret;
1933
1934 ret = ops->resize(&ctx, 0ULL);
1935 if (ret < 0)
1936 prterrcode("check_trunc_hack: ops->resize pre", ret);
1937
1938 ret = ops->resize(&ctx, TRUNC_HACK_SIZE);
1939 if (ret < 0)
1940 prterrcode("check_trunc_hack: ops->resize actual", ret);
1941
1942 ret = ops->get_size(&ctx, &size);
1943 if (ret < 0)
1944 prterrcode("check_trunc_hack: ops->get_size", ret);
1945
1946 if (size != TRUNC_HACK_SIZE) {
1947 prt("no extend on truncate! not posix!\n");
1948 exit(130);
1949 }
1950
1951 ret = ops->resize(&ctx, 0ULL);
1952 if (ret < 0)
1953 prterrcode("check_trunc_hack: ops->resize post", ret);
1954}
1955
1956int
1957create_image()
1958{
1959 int r;
1960 int order = 0;
1961 char buf[32];
11fdf7f2
TL
1962 char client_name[256];
1963
1964 sprintf(client_name, "client.%s", client_id);
7c673cae 1965
11fdf7f2 1966 r = rados_create2(&cluster, cluster_name, client_name, 0);
7c673cae
FG
1967 if (r < 0) {
1968 simple_err("Could not create cluster handle", r);
1969 return r;
1970 }
1971 rados_conf_parse_env(cluster, NULL);
1972 r = rados_conf_read_file(cluster, NULL);
1973 if (r < 0) {
1974 simple_err("Error reading ceph config file", r);
1975 goto failed_shutdown;
1976 }
1977 r = rados_connect(cluster);
1978 if (r < 0) {
1979 simple_err("Error connecting to cluster", r);
1980 goto failed_shutdown;
1981 }
11fdf7f2 1982#if defined(WITH_KRBD)
f91f0fd5 1983 r = krbd_create_from_context(rados_cct(cluster), 0, &krbd);
7c673cae
FG
1984 if (r < 0) {
1985 simple_err("Could not create libkrbd handle", r);
1986 goto failed_shutdown;
1987 }
11fdf7f2 1988#endif
7c673cae
FG
1989
1990 r = rados_pool_create(cluster, pool);
1991 if (r < 0 && r != -EEXIST) {
1992 simple_err("Error creating pool", r);
1993 goto failed_krbd;
1994 }
1995 r = rados_ioctx_create(cluster, pool, &ioctx);
1996 if (r < 0) {
1997 simple_err("Error creating ioctx", r);
1998 goto failed_krbd;
1999 }
c07f9fc5
FG
2000 rados_application_enable(ioctx, "rbd", 1);
2001
7c673cae 2002 if (clone_calls || journal_replay) {
11fdf7f2
TL
2003 uint64_t features;
2004 r = get_features(&features);
2005 if (r < 0) {
2006 goto failed_open;
7c673cae 2007 }
11fdf7f2
TL
2008
2009 r = rbd_create2(ioctx, iname, file_size, features, &order);
7c673cae 2010 } else {
11fdf7f2 2011 r = rbd_create(ioctx, iname, file_size, &order);
7c673cae
FG
2012 }
2013 if (r < 0) {
2014 simple_err("Error creating image", r);
2015 goto failed_open;
2016 }
2017
2018 if (journal_replay) {
2019 r = register_journal(ioctx, iname);
2020 if (r < 0) {
2021 goto failed_open;
2022 }
2023 }
2024
2025 r = rados_conf_get(cluster, "rbd_skip_partial_discard", buf,
2026 sizeof(buf));
2027 if (r < 0) {
2028 simple_err("Could not get rbd_skip_partial_discard value", r);
2029 goto failed_open;
2030 }
2031 skip_partial_discard = (strcmp(buf, "true") == 0);
2032
2033 return 0;
2034
2035 failed_open:
2036 rados_ioctx_destroy(ioctx);
2037 failed_krbd:
11fdf7f2 2038#if defined(WITH_KRBD)
7c673cae 2039 krbd_destroy(krbd);
11fdf7f2 2040#endif
7c673cae
FG
2041 failed_shutdown:
2042 rados_shutdown(cluster);
2043 return r;
2044}
2045
2046void
2047doflush(unsigned offset, unsigned size)
2048{
2049 int ret;
2050
2051 if (o_direct)
2052 return;
2053
2054 ret = ops->flush(&ctx);
2055 if (ret < 0)
2056 prterrcode("doflush: ops->flush", ret);
2057}
2058
2059void
2060doread(unsigned offset, unsigned size)
2061{
2062 int ret;
2063
2064 offset -= offset % readbdy;
2065 if (o_direct)
2066 size -= size % readbdy;
2067 if (size == 0) {
2068 if (!quiet && testcalls > simulatedopcount && !o_direct)
2069 prt("skipping zero size read\n");
2070 log4(OP_SKIPPED, OP_READ, offset, size);
2071 return;
2072 }
2073 if (size + offset > file_size) {
2074 if (!quiet && testcalls > simulatedopcount)
2075 prt("skipping seek/read past end of file\n");
2076 log4(OP_SKIPPED, OP_READ, offset, size);
2077 return;
2078 }
2079
2080 log4(OP_READ, offset, size, 0);
2081
2082 if (testcalls <= simulatedopcount)
2083 return;
2084
2085 if (!quiet &&
2086 ((progressinterval && testcalls % progressinterval == 0) ||
2087 (debug &&
2088 (monitorstart == -1 ||
2089 (static_cast<long>(offset + size) > monitorstart &&
2090 (monitorend == -1 ||
2091 static_cast<long>(offset) <= monitorend))))))
2092 prt("%lu read\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2093 offset, offset + size - 1, size);
2094
2095 ret = ops->read(&ctx, offset, size, temp_buf);
2096 if (ret != (int)size) {
2097 if (ret < 0)
2098 prterrcode("doread: ops->read", ret);
2099 else
2100 prt("short read: 0x%x bytes instead of 0x%x\n",
2101 ret, size);
2102 report_failure(141);
2103 }
2104
2105 check_buffers(good_buf, temp_buf, offset, size);
2106}
2107
2108
2109void
2110check_eofpage(char *s, unsigned offset, char *p, int size)
2111{
2112 unsigned long last_page, should_be_zero;
2113
2114 if (offset + size <= (file_size & ~page_mask))
2115 return;
2116 /*
2117 * we landed in the last page of the file
2118 * test to make sure the VM system provided 0's
2119 * beyond the true end of the file mapping
2120 * (as required by mmap def in 1996 posix 1003.1)
2121 */
2122 last_page = ((unsigned long)p + (offset & page_mask) + size) & ~page_mask;
2123
2124 for (should_be_zero = last_page + (file_size & page_mask);
2125 should_be_zero < last_page + page_size;
2126 should_be_zero++)
2127 if (*(char *)should_be_zero) {
2128 prt("Mapped %s: non-zero data past EOF (0x%llx) page offset 0x%x is 0x%04x\n",
2129 s, file_size - 1, should_be_zero & page_mask,
2130 short_at(should_be_zero));
2131 report_failure(205);
2132 }
2133}
2134
2135
2136void
2137gendata(char *original_buf, char *good_buf, unsigned offset, unsigned size)
2138{
2139 while (size--) {
2140 good_buf[offset] = testcalls % 256;
2141 if (offset % 2)
2142 good_buf[offset] += original_buf[offset];
2143 offset++;
2144 }
2145}
2146
2147
2148void
2149dowrite(unsigned offset, unsigned size)
2150{
2151 ssize_t ret;
2152 off_t newsize;
2153
2154 offset -= offset % writebdy;
2155 if (o_direct)
2156 size -= size % writebdy;
2157 if (size == 0) {
2158 if (!quiet && testcalls > simulatedopcount && !o_direct)
2159 prt("skipping zero size write\n");
2160 log4(OP_SKIPPED, OP_WRITE, offset, size);
2161 return;
2162 }
2163
2164 log4(OP_WRITE, offset, size, file_size);
2165
2166 gendata(original_buf, good_buf, offset, size);
2167 if (file_size < offset + size) {
2168 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
2169 if (file_size < newsize)
2170 memset(good_buf + file_size, '\0', newsize - file_size);
2171 file_size = newsize;
2172 if (lite) {
2173 warn("Lite file size bug in fsx!");
2174 report_failure(149);
2175 }
2176 ret = ops->resize(&ctx, newsize);
2177 if (ret < 0) {
2178 prterrcode("dowrite: ops->resize", ret);
2179 report_failure(150);
2180 }
2181 }
2182
2183 if (testcalls <= simulatedopcount)
2184 return;
2185
2186 if (!quiet &&
2187 ((progressinterval && testcalls % progressinterval == 0) ||
2188 (debug &&
2189 (monitorstart == -1 ||
2190 (static_cast<long>(offset + size) > monitorstart &&
2191 (monitorend == -1 ||
2192 static_cast<long>(offset) <= monitorend))))))
2193 prt("%lu write\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2194 offset, offset + size - 1, size);
2195
2196 ret = ops->write(&ctx, offset, size, good_buf + offset);
2197 if (ret != (ssize_t)size) {
2198 if (ret < 0)
2199 prterrcode("dowrite: ops->write", ret);
2200 else
2201 prt("short write: 0x%x bytes instead of 0x%x\n",
2202 ret, size);
2203 report_failure(151);
2204 }
2205
2206 if (flush_enabled)
2207 doflush(offset, size);
2208}
2209
2210
2211void
2212dotruncate(unsigned size)
2213{
2214 int oldsize = file_size;
2215 int ret;
2216
2217 size -= size % truncbdy;
2218 if (size > biggest) {
2219 biggest = size;
2220 if (!quiet && testcalls > simulatedopcount)
2221 prt("truncating to largest ever: 0x%x\n", size);
2222 }
2223
2224 log4(OP_TRUNCATE, size, (unsigned)file_size, 0);
2225
2226 if (size > file_size)
2227 memset(good_buf + file_size, '\0', size - file_size);
2228 else if (size < file_size)
2229 memset(good_buf + size, '\0', file_size - size);
2230 file_size = size;
2231
2232 if (testcalls <= simulatedopcount)
2233 return;
2234
2235 if ((progressinterval && testcalls % progressinterval == 0) ||
2236 (debug && (monitorstart == -1 || monitorend == -1 ||
2237 (long)size <= monitorend)))
2238 prt("%lu trunc\tfrom 0x%x to 0x%x\n", testcalls, oldsize, size);
2239
2240 ret = ops->resize(&ctx, size);
2241 if (ret < 0) {
2242 prterrcode("dotruncate: ops->resize", ret);
2243 report_failure(160);
2244 }
2245}
2246
2247void
2248do_punch_hole(unsigned offset, unsigned length)
2249{
2250 unsigned end_offset;
2251 int max_offset = 0;
2252 int max_len = 0;
2253 int ret;
2254
2255 offset -= offset % holebdy;
2256 length -= length % holebdy;
2257 if (length == 0) {
2258 if (!quiet && testcalls > simulatedopcount)
2259 prt("skipping zero length punch hole\n");
2260 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
2261 return;
2262 }
2263
2264 if (file_size <= (loff_t)offset) {
2265 if (!quiet && testcalls > simulatedopcount)
2266 prt("skipping hole punch off the end of the file\n");
2267 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, length);
2268 return;
2269 }
2270
2271 end_offset = offset + length;
2272
2273 log4(OP_PUNCH_HOLE, offset, length, 0);
2274
2275 if (testcalls <= simulatedopcount)
2276 return;
2277
2278 if ((progressinterval && testcalls % progressinterval == 0) ||
2279 (debug && (monitorstart == -1 || monitorend == -1 ||
2280 (long)end_offset <= monitorend))) {
2281 prt("%lu punch\tfrom 0x%x to 0x%x, (0x%x bytes)\n", testcalls,
2282 offset, offset+length, length);
2283 }
2284
2285 ret = ops->discard(&ctx, (unsigned long long)offset,
2286 (unsigned long long)length);
2287 if (ret < 0) {
2288 prterrcode("do_punch_hole: ops->discard", ret);
2289 report_failure(161);
2290 }
2291
2292 max_offset = offset < file_size ? offset : file_size;
2293 max_len = max_offset + length <= file_size ? length :
2294 file_size - max_offset;
2295 memset(good_buf + max_offset, '\0', max_len);
2296}
2297
2298unsigned get_data_size(unsigned size)
2299{
2300 unsigned i;
2301 unsigned hint;
2302 unsigned max = sqrt((double)size) + 1;
2303 unsigned good = 1;
2304 unsigned curr = good;
2305
2306 hint = get_random() % max;
2307
2308 for (i = 1; i < max && curr < hint; i++) {
2309 if (size % i == 0) {
2310 good = curr;
2311 curr = i;
2312 }
2313 }
2314
2315 if (curr == hint)
2316 good = curr;
2317
2318 return good;
2319}
2320
2321void
2322dowritesame(unsigned offset, unsigned size)
2323{
2324 ssize_t ret;
2325 off_t newsize;
2326 unsigned buf_off;
2327 unsigned data_size;
2328 int n;
2329
2330 offset -= offset % writebdy;
2331 if (o_direct)
2332 size -= size % writebdy;
2333 if (size == 0) {
2334 if (!quiet && testcalls > simulatedopcount && !o_direct)
2335 prt("skipping zero size writesame\n");
2336 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2337 return;
2338 }
2339
2340 data_size = get_data_size(size);
2341
2342 log4(OP_WRITESAME, offset, size, data_size);
2343
2344 gendata(original_buf, good_buf, offset, data_size);
2345 if (file_size < offset + size) {
2346 newsize = ceil(((double)offset + size) / truncbdy) * truncbdy;
2347 if (file_size < newsize)
2348 memset(good_buf + file_size, '\0', newsize - file_size);
2349 file_size = newsize;
2350 if (lite) {
2351 warn("Lite file size bug in fsx!");
2352 report_failure(162);
2353 }
2354 ret = ops->resize(&ctx, newsize);
2355 if (ret < 0) {
2356 prterrcode("dowritesame: ops->resize", ret);
2357 report_failure(163);
2358 }
2359 }
2360
2361 for (n = size / data_size, buf_off = data_size; n > 1; n--) {
2362 memcpy(good_buf + offset + buf_off, good_buf + offset, data_size);
2363 buf_off += data_size;
2364 }
2365
2366 if (testcalls <= simulatedopcount)
2367 return;
2368
2369 if (!quiet &&
2370 ((progressinterval && testcalls % progressinterval == 0) ||
2371 (debug &&
2372 (monitorstart == -1 ||
2373 (static_cast<long>(offset + size) > monitorstart &&
2374 (monitorend == -1 ||
2375 static_cast<long>(offset) <= monitorend))))))
2376 prt("%lu writesame\t0x%x thru\t0x%x\tdata_size\t0x%x(0x%x bytes)\n", testcalls,
2377 offset, offset + size - 1, data_size, size);
2378
2379 ret = ops->writesame(&ctx, offset, size, good_buf + offset, data_size);
2380 if (ret != (ssize_t)size) {
2381 if (ret < 0)
2382 prterrcode("dowritesame: ops->writesame", ret);
2383 else
2384 prt("short writesame: 0x%x bytes instead of 0x%x\n",
2385 ret, size);
2386 report_failure(164);
2387 }
2388
2389 if (flush_enabled)
2390 doflush(offset, size);
2391}
2392
c07f9fc5
FG
2393void
2394docompareandwrite(unsigned offset, unsigned size)
2395{
2396 int ret;
2397
b32b8144
FG
2398 if (skip_partial_discard) {
2399 if (!quiet && testcalls > simulatedopcount)
2400 prt("compare and write disabled\n");
2401 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2402 return;
2403 }
2404
c07f9fc5
FG
2405 offset -= offset % writebdy;
2406 if (o_direct)
2407 size -= size % writebdy;
2408
2409 if (size == 0) {
2410 if (!quiet && testcalls > simulatedopcount && !o_direct)
2411 prt("skipping zero size read\n");
2412 log4(OP_SKIPPED, OP_READ, offset, size);
2413 return;
2414 }
2415
2416 if (size + offset > file_size) {
2417 if (!quiet && testcalls > simulatedopcount)
2418 prt("skipping seek/compare past end of file\n");
2419 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2420 return;
2421 }
2422
2423 memcpy(temp_buf + offset, good_buf + offset, size);
2424 gendata(original_buf, good_buf, offset, size);
2425 log4(OP_COMPARE_AND_WRITE, offset, size, 0);
2426
2427 if (testcalls <= simulatedopcount)
2428 return;
2429
2430 if (!quiet &&
2431 ((progressinterval && testcalls % progressinterval == 0) ||
2432 (debug &&
2433 (monitorstart == -1 ||
2434 (static_cast<long>(offset + size) > monitorstart &&
2435 (monitorend == -1 ||
2436 static_cast<long>(offset) <= monitorend))))))
2437 prt("%lu compareandwrite\t0x%x thru\t0x%x\t(0x%x bytes)\n", testcalls,
2438 offset, offset + size - 1, size);
2439
2440 ret = ops->compare_and_write(&ctx, offset, size, temp_buf + offset,
2441 good_buf + offset);
2442 if (ret != (ssize_t)size) {
2443 if (ret == -EINVAL) {
2444 memcpy(good_buf + offset, temp_buf + offset, size);
2445 return;
2446 }
2447 if (ret < 0)
2448 prterrcode("docompareandwrite: ops->compare_and_write", ret);
2449 else
2450 prt("short write: 0x%x bytes instead of 0x%x\n", ret, size);
2451 report_failure(151);
2452 return;
2453 }
2454
2455 if (flush_enabled)
2456 doflush(offset, size);
2457}
2458
7c673cae
FG
2459void clone_filename(char *buf, size_t len, int clones)
2460{
11fdf7f2
TL
2461#if __GNUC__ && __GNUC__ >= 8
2462#pragma GCC diagnostic push
2463#pragma GCC diagnostic ignored "-Wformat-truncation"
2464#endif
7c673cae
FG
2465 snprintf(buf, len, "%s/fsx-%s-parent%d",
2466 dirpath, iname, clones);
11fdf7f2
TL
2467#if __GNUC__ && __GNUC__ >= 8
2468#pragma GCC diagnostic pop
2469#endif
7c673cae
FG
2470}
2471
2472void clone_imagename(char *buf, size_t len, int clones)
2473{
f67539c2 2474 if (clones > 0) {
7c673cae 2475 snprintf(buf, len, "%s-clone%d", iname, clones);
f67539c2
TL
2476 } else {
2477 strncpy(buf, iname, len - 1);
2478 buf[len - 1] = '\0';
2479 }
7c673cae
FG
2480}
2481
2482void replay_imagename(char *buf, size_t len, int clones)
2483{
2484 clone_imagename(buf, len, clones);
2485 strncat(buf, "-replay", len - strlen(buf));
2486 buf[len - 1] = '\0';
2487}
2488
2489void check_clone(int clonenum, bool replay_image);
2490
2491void
2492do_clone()
2493{
2494 char filename[1024];
2495 char imagename[1024];
2496 char lastimagename[1024];
2497 int ret, fd;
2498 int order = 0, stripe_unit = 0, stripe_count = 0;
2499 uint64_t newsize = file_size;
2500
2501 log4(OP_CLONE, 0, 0, 0);
2502 ++num_clones;
2503
2504 if (randomize_striping) {
2505 order = 18 + get_random() % 8;
2506 stripe_unit = 1ull << (order - 1 - (get_random() % 8));
2507 stripe_count = 2 + get_random() % 14;
2508 }
2509
2510 prt("%lu clone\t%d order %d su %d sc %d\n", testcalls, num_clones,
2511 order, stripe_unit, stripe_count);
2512
2513 clone_imagename(imagename, sizeof(imagename), num_clones);
2514 clone_imagename(lastimagename, sizeof(lastimagename),
2515 num_clones - 1);
11fdf7f2 2516 ceph_assert(strcmp(lastimagename, ctx.name) == 0);
7c673cae
FG
2517
2518 ret = ops->clone(&ctx, "snap", imagename, &order, stripe_unit,
2519 stripe_count);
2520 if (ret < 0) {
2521 prterrcode("do_clone: ops->clone", ret);
2522 exit(165);
2523 }
2524
2525 if (randomize_parent_overlap && rbd_image_has_parent(&ctx)) {
2526 int rand = get_random() % 16 + 1; // [1..16]
2527
2528 if (rand < 13) {
2529 uint64_t overlap;
2530
2531 ret = rbd_get_overlap(ctx.image, &overlap);
2532 if (ret < 0) {
2533 prterrcode("do_clone: rbd_get_overlap", ret);
2534 exit(1);
2535 }
2536
2537 if (rand < 10) { // 9/16
2538 newsize = overlap * ((double)rand / 10);
2539 newsize -= newsize % truncbdy;
2540 } else { // 3/16
2541 newsize = 0;
2542 }
2543
11fdf7f2 2544 ceph_assert(newsize != (uint64_t)file_size);
7c673cae
FG
2545 prt("truncating image %s from 0x%llx (overlap 0x%llx) to 0x%llx\n",
2546 ctx.name, file_size, overlap, newsize);
2547
2548 ret = ops->resize(&ctx, newsize);
2549 if (ret < 0) {
2550 prterrcode("do_clone: ops->resize", ret);
2551 exit(1);
2552 }
2553 } else if (rand < 15) { // 2/16
2554 prt("flattening image %s\n", ctx.name);
2555
2556 ret = ops->flatten(&ctx);
2557 if (ret < 0) {
2558 prterrcode("do_clone: ops->flatten", ret);
2559 exit(1);
2560 }
2561 } else { // 2/16
2562 prt("leaving image %s intact\n", ctx.name);
2563 }
2564 }
2565
2566 clone_filename(filename, sizeof(filename), num_clones);
2567 if ((fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {
2568 simple_err("do_clone: open", -errno);
2569 exit(162);
2570 }
2571 save_buffer(good_buf, newsize, fd);
2572 if ((ret = close(fd)) < 0) {
2573 simple_err("do_clone: close", -errno);
2574 exit(163);
2575 }
2576
2577 /*
2578 * Close parent.
2579 */
2580 if ((ret = ops->close(&ctx)) < 0) {
2581 prterrcode("do_clone: ops->close", ret);
2582 exit(174);
2583 }
2584
2585 if (journal_replay) {
2586 ret = finalize_journal(ioctx, lastimagename, num_clones - 1,
2587 order, stripe_unit, stripe_count);
2588 if (ret < 0) {
2589 exit(EXIT_FAILURE);
2590 }
2591
2592 ret = register_journal(ioctx, imagename);
2593 if (ret < 0) {
2594 exit(EXIT_FAILURE);
2595 }
2596 }
2597
2598 /*
2599 * Open freshly made clone.
2600 */
2601 if ((ret = ops->open(imagename, &ctx)) < 0) {
2602 prterrcode("do_clone: ops->open", ret);
2603 exit(166);
2604 }
2605
2606 if (num_clones > 1) {
2607 if (journal_replay) {
2608 check_clone(num_clones - 2, true);
2609 }
2610 check_clone(num_clones - 2, false);
2611 }
2612}
2613
2614void
2615check_clone(int clonenum, bool replay_image)
2616{
2617 char filename[128];
2618 char imagename[128];
2619 int ret, fd;
2620 struct rbd_ctx cur_ctx = RBD_CTX_INIT;
2621 struct stat file_info;
2622 char *good_buf, *temp_buf;
2623
2624 if (replay_image) {
2625 replay_imagename(imagename, sizeof(imagename), clonenum);
2626 } else {
2627 clone_imagename(imagename, sizeof(imagename), clonenum);
2628 }
2629
2630 if ((ret = ops->open(imagename, &cur_ctx)) < 0) {
2631 prterrcode("check_clone: ops->open", ret);
2632 exit(167);
2633 }
2634
2635 clone_filename(filename, sizeof(filename), clonenum + 1);
2636 if ((fd = open(filename, O_RDONLY)) < 0) {
2637 simple_err("check_clone: open", -errno);
2638 exit(168);
2639 }
2640
2641 prt("checking clone #%d, image %s against file %s\n",
2642 clonenum, imagename, filename);
2643 if ((ret = fstat(fd, &file_info)) < 0) {
2644 simple_err("check_clone: fstat", -errno);
2645 exit(169);
2646 }
2647
2648 good_buf = NULL;
2649 ret = posix_memalign((void **)&good_buf,
11fdf7f2 2650 std::max(writebdy, (int)sizeof(void *)),
7c673cae
FG
2651 file_info.st_size);
2652 if (ret > 0) {
2653 prterrcode("check_clone: posix_memalign(good_buf)", -ret);
2654 exit(96);
2655 }
2656
2657 temp_buf = NULL;
2658 ret = posix_memalign((void **)&temp_buf,
11fdf7f2 2659 std::max(readbdy, (int)sizeof(void *)),
7c673cae
FG
2660 file_info.st_size);
2661 if (ret > 0) {
2662 prterrcode("check_clone: posix_memalign(temp_buf)", -ret);
2663 exit(97);
2664 }
2665
2666 if ((ret = pread(fd, good_buf, file_info.st_size, 0)) < 0) {
2667 simple_err("check_clone: pread", -errno);
2668 exit(170);
2669 }
2670 if ((ret = ops->read(&cur_ctx, 0, file_info.st_size, temp_buf)) < 0) {
2671 prterrcode("check_clone: ops->read", ret);
2672 exit(171);
2673 }
2674 close(fd);
2675 if ((ret = ops->close(&cur_ctx)) < 0) {
2676 prterrcode("check_clone: ops->close", ret);
2677 exit(174);
2678 }
2679 check_buffers(good_buf, temp_buf, 0, file_info.st_size);
2680
2681 if (!replay_image) {
2682 unlink(filename);
2683 }
2684
2685 free(good_buf);
2686 free(temp_buf);
2687}
2688
2689void
2690writefileimage()
2691{
2692 ssize_t ret;
2693
2694 ret = ops->write(&ctx, 0, file_size, good_buf);
2695 if (ret != file_size) {
2696 if (ret < 0)
2697 prterrcode("writefileimage: ops->write", ret);
2698 else
2699 prt("short write: 0x%x bytes instead of 0x%llx\n",
2700 ret, (unsigned long long)file_size);
2701 report_failure(172);
2702 }
2703
2704 if (!lite) {
2705 ret = ops->resize(&ctx, file_size);
2706 if (ret < 0) {
2707 prterrcode("writefileimage: ops->resize", ret);
2708 report_failure(173);
2709 }
2710 }
2711}
2712
2713void
2714do_flatten()
2715{
2716 int ret;
2717
2718 if (!rbd_image_has_parent(&ctx)) {
2719 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2720 return;
2721 }
2722 log4(OP_FLATTEN, 0, 0, 0);
2723 prt("%lu flatten\n", testcalls);
2724
2725 ret = ops->flatten(&ctx);
2726 if (ret < 0) {
2727 prterrcode("writefileimage: ops->flatten", ret);
2728 exit(177);
2729 }
2730}
2731
2732void
2733docloseopen(void)
2734{
2735 char *name;
2736 int ret;
2737
2738 if (testcalls <= simulatedopcount)
2739 return;
2740
2741 name = strdup(ctx.name);
2742
2743 if (debug)
2744 prt("%lu close/open\n", testcalls);
2745
2746 ret = ops->close(&ctx);
2747 if (ret < 0) {
2748 prterrcode("docloseopen: ops->close", ret);
2749 report_failure(180);
2750 }
2751
2752 ret = ops->open(name, &ctx);
2753 if (ret < 0) {
2754 prterrcode("docloseopen: ops->open", ret);
2755 report_failure(181);
2756 }
2757
2758 free(name);
2759}
2760
/*
 * Clamp an (off, len) pair to a region of `size` bytes: off is reduced
 * modulo size (or set to 0 when size is 0), and len is cut back so that
 * off + len does not extend past size.  off and len are modified in
 * place; all three arguments are evaluated more than once.
 */
#define TRIM_OFF_LEN(off, len, size)					\
do {									\
	if (!(size))							\
		(off) = 0;						\
	else								\
		(off) %= (size);					\
	if ((unsigned)(size) < (unsigned)(off) + (unsigned)(len))	\
		(len) = (size) - (off);					\
} while (0)
2770
2771void
2772test(void)
2773{
2774 unsigned long offset;
2775 unsigned long size = maxoplen;
2776 unsigned long rv = get_random();
2777 unsigned long op;
2778
2779 if (simulatedopcount > 0 && testcalls == simulatedopcount)
2780 writefileimage();
2781
2782 testcalls++;
2783
2784 if (closeprob)
2785 closeopen = (rv >> 3) < (1u << 28) / (unsigned)closeprob;
2786
2787 if (debugstart > 0 && testcalls >= debugstart)
2788 debug = 1;
2789
2790 if (!quiet && testcalls < simulatedopcount && testcalls % 100000 == 0)
2791 prt("%lu...\n", testcalls);
2792
2793 offset = get_random();
2794 if (randomoplen)
2795 size = get_random() % (maxoplen + 1);
2796
2797 /* calculate appropriate op to run */
2798 if (lite)
2799 op = rv % OP_MAX_LITE;
2800 else
2801 op = rv % OP_MAX_FULL;
2802
2803 switch (op) {
2804 case OP_MAPREAD:
2805 if (!mapped_reads)
2806 op = OP_READ;
2807 break;
2808 case OP_MAPWRITE:
2809 if (!mapped_writes)
2810 op = OP_WRITE;
2811 break;
2812 case OP_FALLOCATE:
2813 if (!fallocate_calls) {
2814 log4(OP_SKIPPED, OP_FALLOCATE, offset, size);
2815 goto out;
2816 }
2817 break;
2818 case OP_PUNCH_HOLE:
2819 if (!punch_hole_calls) {
2820 log4(OP_SKIPPED, OP_PUNCH_HOLE, offset, size);
2821 goto out;
2822 }
2823 break;
2824 case OP_CLONE:
2825 /* clone, 8% chance */
2826 if (!clone_calls || file_size == 0 || get_random() % 100 >= 8) {
2827 log4(OP_SKIPPED, OP_CLONE, 0, 0);
2828 goto out;
2829 }
2830 break;
2831 case OP_FLATTEN:
2832 /* flatten four times as rarely as clone, 2% chance */
2833 if (get_random() % 100 >= 2) {
2834 log4(OP_SKIPPED, OP_FLATTEN, 0, 0);
2835 goto out;
2836 }
2837 break;
2838 case OP_WRITESAME:
2839 /* writesame not implemented */
2840 if (!ops->writesame) {
2841 log4(OP_SKIPPED, OP_WRITESAME, offset, size);
2842 goto out;
2843 }
d2e6a577 2844 break;
c07f9fc5
FG
2845 case OP_COMPARE_AND_WRITE:
2846 /* compare_and_write not implemented */
2847 if (!ops->compare_and_write) {
2848 log4(OP_SKIPPED, OP_COMPARE_AND_WRITE, offset, size);
2849 goto out;
2850 }
d2e6a577 2851 break;
7c673cae
FG
2852 }
2853
2854 switch (op) {
2855 case OP_READ:
2856 TRIM_OFF_LEN(offset, size, file_size);
2857 doread(offset, size);
2858 break;
2859
2860 case OP_WRITE:
2861 TRIM_OFF_LEN(offset, size, maxfilelen);
2862 dowrite(offset, size);
2863 break;
2864
2865 case OP_MAPREAD:
2866 TRIM_OFF_LEN(offset, size, file_size);
2867 exit(183);
2868 break;
2869
2870 case OP_MAPWRITE:
2871 TRIM_OFF_LEN(offset, size, maxfilelen);
2872 exit(182);
2873 break;
2874
2875 case OP_TRUNCATE:
2876 if (!style)
2877 size = get_random() % maxfilelen;
2878 dotruncate(size);
2879 break;
2880
2881 case OP_PUNCH_HOLE:
2882 TRIM_OFF_LEN(offset, size, file_size);
2883 do_punch_hole(offset, size);
2884 break;
2885
2886 case OP_WRITESAME:
2887 TRIM_OFF_LEN(offset, size, maxfilelen);
2888 dowritesame(offset, size);
2889 break;
c07f9fc5
FG
2890 case OP_COMPARE_AND_WRITE:
2891 TRIM_OFF_LEN(offset, size, file_size);
2892 docompareandwrite(offset, size);
2893 break;
7c673cae
FG
2894
2895 case OP_CLONE:
2896 do_clone();
2897 break;
2898
2899 case OP_FLATTEN:
2900 do_flatten();
2901 break;
2902
2903 default:
2904 prterr("test: unknown operation");
2905 report_failure(42);
2906 break;
2907 }
2908
2909out:
2910 if (sizechecks && testcalls > simulatedopcount)
2911 check_size();
2912 if (closeopen)
2913 docloseopen();
2914}
2915
2916
2917void
2918cleanup(int sig)
2919{
2920 if (sig)
2921 prt("signal %d\n", sig);
2922 prt("testcalls = %lu\n", testcalls);
2923 exit(sig);
2924}
2925
2926
/*
 * Print the command-line synopsis and per-option help to stdout, then
 * exit with status 89.  Help lines for platform- or build-specific
 * options are only compiled in where those options exist.
 *
 * The synopsis flag list includes -g, -G and -M, which main() accepts and
 * which are described below but were missing from the summary.
 */
void
usage(void)
{
	fprintf(stdout, "usage: %s",
		"fsx [-dfgjknqxyACFGHKLMORUWZ] [-b opnum] [-c Prob] [-h holebdy] [-l flen] [-m start:end] [-o oplen] [-p progressinterval] [-r readbdy] [-s style] [-t truncbdy] [-w writebdy] [-D startingop] [-N numops] [-P dirpath] [-S seed] pname iname\n"
" -b opnum: beginning operation number (default 1)\n"
" -c P: 1 in P chance of file close+open at each op (default infinity)\n"
" -d: debug output for all operations\n"
" -f: flush and invalidate cache after I/O\n"
" -g: deep copy instead of clone\n"
" -h holebdy: 4096 would make discards page aligned (default 1)\n"
" -j: journal replay stress test\n"
" -k: keep data on success (default 0)\n"
" -l flen: the upper bound on file size (default 262144)\n"
" -m startop:endop: monitor (print debug output) specified byte range (default 0:infinity)\n"
" -n: no verifications of file size\n"
" -o oplen: the upper bound on operation size (default 65536)\n"
" -p progressinterval: debug output at specified operation interval\n"
" -q: quieter operation\n"
" -r readbdy: 4096 would make reads page aligned (default 1)\n"
" -s style: 1 gives smaller truncates (default 0)\n"
" -t truncbdy: 4096 would make truncates page aligned (default 1)\n"
" -w writebdy: 4096 would make writes page aligned (default 1)\n"
" -x: preallocate file space before starting, XFS only (default 0)\n"
" -y: synchronize changes to a file\n"

" -C: do not use clone calls\n"
" -D startingop: debug output starting at specified operation\n"
#ifdef FALLOCATE
" -F: Do not use fallocate (preallocation) calls\n"
#endif
#if defined(__FreeBSD__)
" -G: enable rbd-ggate mode (use -L, -r and -w too)\n"
#endif
" -H: do not use punch hole calls\n"
#if defined(WITH_KRBD)
" -K: enable krbd mode (use -t and -h too)\n"
#endif
#if defined(__linux__)
" -M: enable rbd-nbd mode (use -t and -h too)\n"
#endif
" -L: fsxLite - no file creations & no file size changes\n"
" -N numops: total # operations to do (default infinity)\n"
" -O: use oplen (see -o flag) for every op (default random)\n"
" -P dirpath: save .fsxlog and .fsxgood files in dirpath (default ./)\n"
" -R: read() system calls only (mapped reads disabled)\n"
" -S seed: for random # generator (default 1) 0 gets timestamp\n"
" -U: disable randomized striping\n"
" -W: mapped write operations DISabled\n"
" -Z: O_DIRECT (use -R, -W, -r and -w too)\n"
" poolname: this is REQUIRED (no default)\n"
" imagename: this is REQUIRED (no default)\n");
	exit(89);
}
2981
2982
/*
 * Parse an integer with an optional single-letter size suffix.
 *
 * s: text to parse; the base is auto-detected by strtol (0x.. hex,
 *    0.. octal, otherwise decimal).
 * e: out-parameter, set to the first character not consumed (after the
 *    suffix, if one was recognized).
 *
 * Suffixes (either case): b = x512, k = x1024, m = x1024*1024, w = x4.
 * Returns the scaled value.  Values that do not fit in an int saturate at
 * INT_MAX / INT_MIN instead of overflowing.
 */
int
getnum(char *s, char **e)
{
	long parsed;
	long long scaled;
	int multiplier = 1;

	*e = (char *) 0;
	parsed = strtol(s, e, 0);

	/* clamp first so that the 64-bit product below cannot overflow */
	if (parsed > INT_MAX)
		parsed = INT_MAX;
	else if (parsed < INT_MIN)
		parsed = INT_MIN;

	if (*e) {
		switch (**e) {
		case 'b':
		case 'B':
			multiplier = 512;
			break;
		case 'k':
		case 'K':
			multiplier = 1024;
			break;
		case 'm':
		case 'M':
			multiplier = 1024 * 1024;
			break;
		case 'w':
		case 'W':
			multiplier = 4;
			break;
		default:
			break;
		}
		/* a recognized suffix is consumed */
		if (multiplier != 1)
			*e = *e + 1;
	}

	scaled = (long long)parsed * multiplier;
	if (scaled > INT_MAX)
		scaled = INT_MAX;
	else if (scaled < INT_MIN)
		scaled = INT_MIN;

	return ((int)scaled);
}
3015
3016void
3017test_fallocate()
3018{
3019#ifdef FALLOCATE
3020 if (!lite && fallocate_calls) {
3021 if (fallocate(fd, 0, 0, 1) && errno == EOPNOTSUPP) {
3022 if(!quiet)
3023 warn("main: filesystem does not support fallocate, disabling\n");
3024 fallocate_calls = 0;
3025 } else {
3026 ftruncate(fd, 0);
3027 }
3028 }
3029#else /* ! FALLOCATE */
3030 fallocate_calls = 0;
3031#endif
3032
3033}
3034
3035void remove_image(rados_ioctx_t ioctx, char *imagename, bool remove_snap,
3036 bool unregister) {
3037 rbd_image_t image;
3038 char errmsg[128];
3039 int ret;
3040
3041 if ((ret = rbd_open(ioctx, imagename, &image, NULL)) < 0) {
3042 sprintf(errmsg, "rbd_open %s", imagename);
3043 prterrcode(errmsg, ret);
3044 report_failure(101);
3045 }
3046 if (remove_snap) {
3047 if ((ret = rbd_snap_unprotect(image, "snap")) < 0) {
3048 sprintf(errmsg, "rbd_snap_unprotect %s@snap",
3049 imagename);
3050 prterrcode(errmsg, ret);
3051 report_failure(102);
3052 }
3053 if ((ret = rbd_snap_remove(image, "snap")) < 0) {
3054 sprintf(errmsg, "rbd_snap_remove %s@snap",
3055 imagename);
3056 prterrcode(errmsg, ret);
3057 report_failure(103);
3058 }
3059 }
3060 if ((ret = rbd_close(image)) < 0) {
3061 sprintf(errmsg, "rbd_close %s", imagename);
3062 prterrcode(errmsg, ret);
3063 report_failure(104);
3064 }
3065
3066 if (unregister &&
3067 (ret = unregister_journal(ioctx, imagename)) < 0) {
3068 report_failure(105);
3069 }
3070
3071 if ((ret = rbd_remove(ioctx, imagename)) < 0) {
3072 sprintf(errmsg, "rbd_remove %s", imagename);
3073 prterrcode(errmsg, ret);
3074 report_failure(106);
3075 }
3076}
3077
3078int
3079main(int argc, char **argv)
3080{
11fdf7f2
TL
3081 enum {
3082 LONG_OPT_CLUSTER = 1000,
3083 LONG_OPT_ID = 1001
3084 };
3085
7c673cae
FG
3086 int i, style, ch, ret;
3087 char *endp;
3088 char goodfile[1024];
3089 char logfile[1024];
3090
11fdf7f2
TL
3091 const char* optstring = "b:c:dfgh:jkl:m:no:p:qr:s:t:w:xyCD:FGHKMLN:OP:RS:UWZ";
3092 const struct option longopts[] = {
3093 {"cluster", 1, NULL, LONG_OPT_CLUSTER},
3094 {"id", 1, NULL, LONG_OPT_ID}};
3095
7c673cae
FG
3096 goodfile[0] = 0;
3097 logfile[0] = 0;
3098
f67539c2 3099 page_size = PAGE_SIZE;
7c673cae
FG
3100 page_mask = page_size - 1;
3101 mmap_mask = page_mask;
3102
3103 setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
3104
11fdf7f2 3105 while ((ch = getopt_long(argc, argv, optstring, longopts, NULL)) != EOF) {
7c673cae 3106 switch (ch) {
11fdf7f2
TL
3107 case LONG_OPT_CLUSTER:
3108 cluster_name = optarg;
3109 break;
3110 case LONG_OPT_ID:
3111 client_id = optarg;
3112 break;
7c673cae
FG
3113 case 'b':
3114 simulatedopcount = getnum(optarg, &endp);
3115 if (!quiet)
3116 fprintf(stdout, "Will begin at operation %lu\n",
3117 simulatedopcount);
3118 if (simulatedopcount == 0)
3119 usage();
3120 simulatedopcount -= 1;
3121 break;
3122 case 'c':
3123 closeprob = getnum(optarg, &endp);
3124 if (!quiet)
3125 fprintf(stdout,
3126 "Chance of close/open is 1 in %d\n",
3127 closeprob);
3128 if (closeprob <= 0)
3129 usage();
3130 break;
3131 case 'd':
3132 debug = 1;
3133 break;
3134 case 'f':
3135 flush_enabled = 1;
3136 break;
11fdf7f2
TL
3137 case 'g':
3138 deep_copy = 1;
3139 break;
7c673cae
FG
3140 case 'h':
3141 holebdy = getnum(optarg, &endp);
3142 if (holebdy <= 0)
3143 usage();
3144 break;
3145 case 'j':
3146 journal_replay = true;
3147 break;
3148 case 'k':
3149 keep_on_success = 1;
3150 break;
3151 case 'l':
3152 {
3153 int _num = getnum(optarg, &endp);
3154 if (_num <= 0)
3155 usage();
3156 maxfilelen = _num;
3157 }
3158 break;
3159 case 'm':
3160 monitorstart = getnum(optarg, &endp);
3161 if (monitorstart < 0)
3162 usage();
3163 if (!endp || *endp++ != ':')
3164 usage();
3165 monitorend = getnum(endp, &endp);
3166 if (monitorend < 0)
3167 usage();
3168 if (monitorend == 0)
3169 monitorend = -1; /* aka infinity */
3170 debug = 1;
3171 break;
3172 case 'n':
3173 sizechecks = 0;
3174 break;
3175 case 'o':
3176 maxoplen = getnum(optarg, &endp);
3177 if (maxoplen <= 0)
3178 usage();
3179 break;
3180 case 'p':
3181 progressinterval = getnum(optarg, &endp);
3182 if (progressinterval == 0)
3183 usage();
3184 break;
3185 case 'q':
3186 quiet = 1;
3187 break;
3188 case 'r':
3189 readbdy = getnum(optarg, &endp);
3190 if (readbdy <= 0)
3191 usage();
3192 break;
3193 case 's':
3194 style = getnum(optarg, &endp);
3195 if (style < 0 || style > 1)
3196 usage();
3197 break;
3198 case 't':
3199 truncbdy = getnum(optarg, &endp);
3200 if (truncbdy <= 0)
3201 usage();
3202 break;
3203 case 'w':
3204 writebdy = getnum(optarg, &endp);
3205 if (writebdy <= 0)
3206 usage();
3207 break;
3208 case 'x':
3209 prealloc = 1;
3210 break;
3211 case 'y':
3212 do_fsync = 1;
3213 break;
3214 case 'C':
3215 clone_calls = 0;
3216 break;
3217 case 'D':
3218 debugstart = getnum(optarg, &endp);
3219 if (debugstart < 1)
3220 usage();
3221 break;
3222 case 'F':
3223 fallocate_calls = 0;
3224 break;
11fdf7f2
TL
3225#if defined(__FreeBSD__)
3226 case 'G':
3227 prt("rbd-ggate mode enabled\n");
3228 ops = &ggate_operations;
3229 break;
3230#endif
7c673cae
FG
3231 case 'H':
3232 punch_hole_calls = 0;
3233 break;
11fdf7f2 3234#if defined(WITH_KRBD)
7c673cae
FG
3235 case 'K':
3236 prt("krbd mode enabled\n");
3237 ops = &krbd_operations;
3238 break;
11fdf7f2
TL
3239#endif
3240#if defined(__linux__)
7c673cae
FG
3241 case 'M':
3242 prt("rbd-nbd mode enabled\n");
3243 ops = &nbd_operations;
3244 break;
11fdf7f2 3245#endif
7c673cae 3246 case 'L':
11fdf7f2 3247 lite = 1;
7c673cae
FG
3248 break;
3249 case 'N':
3250 numops = getnum(optarg, &endp);
3251 if (numops < 0)
3252 usage();
3253 break;
3254 case 'O':
3255 randomoplen = 0;
3256 break;
3257 case 'P':
3258 strncpy(dirpath, optarg, sizeof(dirpath)-1);
3259 dirpath[sizeof(dirpath)-1] = '\0';
3260 strncpy(goodfile, dirpath, sizeof(goodfile)-1);
3261 goodfile[sizeof(goodfile)-1] = '\0';
3262 if (strlen(goodfile) < sizeof(goodfile)-2) {
3263 strcat(goodfile, "/");
3264 } else {
3265 prt("file name to long\n");
3266 exit(1);
3267 }
3268 strncpy(logfile, dirpath, sizeof(logfile)-1);
3269 logfile[sizeof(logfile)-1] = '\0';
3270 if (strlen(logfile) < sizeof(logfile)-2) {
3271 strcat(logfile, "/");
3272 } else {
3273 prt("file path to long\n");
3274 exit(1);
3275 }
3276 break;
3277 case 'R':
3278 mapped_reads = 0;
3279 if (!quiet)
3280 fprintf(stdout, "mapped reads DISABLED\n");
3281 break;
3282 case 'S':
3283 seed = getnum(optarg, &endp);
3284 if (seed == 0)
11fdf7f2 3285 seed = std::random_device()() % 10000;
7c673cae
FG
3286 if (!quiet)
3287 fprintf(stdout, "Seed set to %d\n", seed);
3288 if (seed < 0)
3289 usage();
3290 break;
3291 case 'U':
3292 randomize_striping = 0;
3293 break;
3294 case 'W':
3295 mapped_writes = 0;
3296 if (!quiet)
3297 fprintf(stdout, "mapped writes DISABLED\n");
3298 break;
3299 case 'Z':
f67539c2 3300 #ifdef O_DIRECT
7c673cae 3301 o_direct = O_DIRECT;
f67539c2 3302 #endif
7c673cae
FG
3303 break;
3304 default:
3305 usage();
3306 /* NOTREACHED */
3307 }
11fdf7f2 3308 }
7c673cae
FG
3309 argc -= optind;
3310 argv += optind;
3311 if (argc != 2)
3312 usage();
3313 pool = argv[0];
3314 iname = argv[1];
3315
f67539c2 3316 #ifndef _WIN32
7c673cae
FG
3317 signal(SIGHUP, cleanup);
3318 signal(SIGINT, cleanup);
3319 signal(SIGPIPE, cleanup);
3320 signal(SIGALRM, cleanup);
3321 signal(SIGTERM, cleanup);
3322 signal(SIGXCPU, cleanup);
3323 signal(SIGXFSZ, cleanup);
3324 signal(SIGVTALRM, cleanup);
3325 signal(SIGUSR1, cleanup);
3326 signal(SIGUSR2, cleanup);
f67539c2 3327 #endif
7c673cae
FG
3328
3329 random_generator.seed(seed);
3330
11fdf7f2
TL
3331 if (lite) {
3332 file_size = maxfilelen;
3333 }
3334
7c673cae
FG
3335 ret = create_image();
3336 if (ret < 0) {
3337 prterrcode(iname, ret);
3338 exit(90);
3339 }
3340 ret = ops->open(iname, &ctx);
3341 if (ret < 0) {
3342 simple_err("Error opening image", ret);
3343 exit(91);
3344 }
3345 if (!dirpath[0])
3346 strcat(dirpath, ".");
3347 strncat(goodfile, iname, 256);
3348 strcat (goodfile, ".fsxgood");
3349 fsxgoodfd = open(goodfile, O_RDWR|O_CREAT|O_TRUNC, 0666);
3350 if (fsxgoodfd < 0) {
3351 prterr(goodfile);
3352 exit(92);
3353 }
3354 strncat(logfile, iname, 256);
3355 strcat (logfile, ".fsxlog");
3356 fsxlogf = fopen(logfile, "w");
3357 if (fsxlogf == NULL) {
3358 prterr(logfile);
3359 exit(93);
3360 }
3361
3362 original_buf = (char *) malloc(maxfilelen);
3363 for (i = 0; i < (int)maxfilelen; i++)
3364 original_buf[i] = get_random() % 256;
3365
3366 ret = posix_memalign((void **)&good_buf,
11fdf7f2 3367 std::max(writebdy, (int)sizeof(void *)), maxfilelen);
7c673cae
FG
3368 if (ret > 0) {
3369 if (ret == EINVAL)
3370 prt("writebdy is not a suitable power of two\n");
3371 else
3372 prterrcode("main: posix_memalign(good_buf)", -ret);
3373 exit(94);
3374 }
3375 memset(good_buf, '\0', maxfilelen);
3376
3377 ret = posix_memalign((void **)&temp_buf,
11fdf7f2 3378 std::max(readbdy, (int)sizeof(void *)), maxfilelen);
7c673cae
FG
3379 if (ret > 0) {
3380 if (ret == EINVAL)
3381 prt("readbdy is not a suitable power of two\n");
3382 else
3383 prterrcode("main: posix_memalign(temp_buf)", -ret);
3384 exit(95);
3385 }
3386 memset(temp_buf, '\0', maxfilelen);
3387
3388 if (lite) { /* zero entire existing file */
3389 ssize_t written;
3390
3391 written = ops->write(&ctx, 0, (size_t)maxfilelen, good_buf);
3392 if (written != (ssize_t)maxfilelen) {
3393 if (written < 0) {
3394 prterrcode(iname, written);
3395 warn("main: error on write");
3396 } else
3397 warn("main: short write, 0x%x bytes instead "
3398 "of 0x%lx\n",
3399 (unsigned)written,
3400 maxfilelen);
3401 exit(98);
3402 }
3403 } else
3404 check_trunc_hack();
3405
3406 //test_fallocate();
3407
3408 while (numops == -1 || numops--)
3409 test();
3410
3411 ret = ops->close(&ctx);
3412 if (ret < 0) {
3413 prterrcode("ops->close", ret);
3414 report_failure(99);
3415 }
3416
3417 if (journal_replay) {
3418 char imagename[1024];
3419 clone_imagename(imagename, sizeof(imagename), num_clones);
3420 ret = finalize_journal(ioctx, imagename, num_clones, 0, 0, 0);
3421 if (ret < 0) {
3422 report_failure(100);
3423 }
3424 }
3425
3426 if (num_clones > 0) {
3427 if (journal_replay) {
3428 check_clone(num_clones - 1, true);
3429 }
3430 check_clone(num_clones - 1, false);
3431 }
3432
3433 if (!keep_on_success) {
3434 while (num_clones >= 0) {
3435 static bool remove_snap = false;
3436
3437 if (journal_replay) {
3438 char replayimagename[1024];
3439 replay_imagename(replayimagename,
3440 sizeof(replayimagename),
3441 num_clones);
3442 remove_image(ioctx, replayimagename,
3443 remove_snap,
3444 false);
3445 }
3446
3447 char clonename[128];
3448 clone_imagename(clonename, 128, num_clones);
3449 remove_image(ioctx, clonename, remove_snap,
3450 journal_replay);
3451
3452 remove_snap = true;
3453 num_clones--;
3454 }
3455 }
3456
3457 prt("All operations completed A-OK!\n");
3458 fclose(fsxlogf);
3459
3460 rados_ioctx_destroy(ioctx);
11fdf7f2 3461#if defined(WITH_KRBD)
7c673cae 3462 krbd_destroy(krbd);
11fdf7f2 3463#endif
7c673cae
FG
3464 rados_shutdown(cluster);
3465
3466 free(original_buf);
3467 free(good_buf);
3468 free(temp_buf);
3469
3470 exit(0);
3471 return 0;
3472}