]> git.proxmox.com Git - mirror_zfs.git/blame - cmd/ztest/ztest.c
Add provides lustre-backend-fs to rpm
[mirror_zfs.git] / cmd / ztest / ztest.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
6d974228 23 * Copyright (c) 2011 by Delphix. All rights reserved.
34dc7c2f
BB
24 */
25
34dc7c2f
BB
26/*
27 * The objective of this program is to provide a DMU/ZAP/SPA stress test
28 * that runs entirely in userland, is easy to use, and easy to extend.
29 *
30 * The overall design of the ztest program is as follows:
31 *
32 * (1) For each major functional area (e.g. adding vdevs to a pool,
33 * creating and destroying datasets, reading and writing objects, etc)
34 * we have a simple routine to test that functionality. These
35 * individual routines do not have to do anything "stressful".
36 *
37 * (2) We turn these simple functionality tests into a stress test by
38 * running them all in parallel, with as many threads as desired,
39 * and spread across as many datasets, objects, and vdevs as desired.
40 *
41 * (3) While all this is happening, we inject faults into the pool to
42 * verify that self-healing data really works.
43 *
44 * (4) Every time we open a dataset, we change its checksum and compression
45 * functions. Thus even individual objects vary from block to block
46 * in which checksum they use and whether they're compressed.
47 *
48 * (5) To verify that we never lose on-disk consistency after a crash,
49 * we run the entire test in a child of the main process.
50 * At random times, the child self-immolates with a SIGKILL.
51 * This is the software equivalent of pulling the power cord.
52 * The parent then runs the test again, using the existing
53 * storage pool, as many times as desired.
54 *
55 * (6) To verify that we don't have future leaks or temporal incursions,
56 * many of the functional tests record the transaction group number
57 * as part of their data. When reading old data, they verify that
58 * the transaction group number is less than the current, open txg.
59 * If you add a new test, please do this if applicable.
60 *
1e33ac1e
BB
61 * (7) Threads are created with a reduced stack size, for sanity checking.
62 * Therefore, it's important not to allocate huge buffers on the stack.
63 *
34dc7c2f
BB
64 * When run with no arguments, ztest runs for about five minutes and
65 * produces no output if successful. To get a little bit of information,
66 * specify -V. To get more information, specify -VV, and so on.
67 *
68 * To turn this into an overnight stress test, use -T to specify run time.
69 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
71 * to increase the pool capacity, fanout, and overall stress level.
72 *
73 * The -N(okill) option will suppress kills, so each child runs to completion.
74 * This can be useful when you're trying to distinguish temporal incursions
75 * from plain old race conditions.
76 */
77
78#include <sys/zfs_context.h>
79#include <sys/spa.h>
80#include <sys/dmu.h>
81#include <sys/txg.h>
9babb374 82#include <sys/dbuf.h>
34dc7c2f 83#include <sys/zap.h>
34dc7c2f
BB
84#include <sys/dmu_objset.h>
85#include <sys/poll.h>
86#include <sys/stat.h>
87#include <sys/time.h>
88#include <sys/wait.h>
89#include <sys/mman.h>
90#include <sys/resource.h>
91#include <sys/zio.h>
34dc7c2f 92#include <sys/zil.h>
428870ff 93#include <sys/zil_impl.h>
34dc7c2f 94#include <sys/vdev_impl.h>
b128c09f 95#include <sys/vdev_file.h>
34dc7c2f 96#include <sys/spa_impl.h>
428870ff 97#include <sys/metaslab_impl.h>
34dc7c2f 98#include <sys/dsl_prop.h>
9babb374 99#include <sys/dsl_dataset.h>
428870ff
BB
100#include <sys/dsl_scan.h>
101#include <sys/zio_checksum.h>
34dc7c2f
BB
102#include <sys/refcount.h>
103#include <stdio.h>
104#include <stdio_ext.h>
105#include <stdlib.h>
106#include <unistd.h>
107#include <signal.h>
108#include <umem.h>
109#include <dlfcn.h>
110#include <ctype.h>
111#include <math.h>
112#include <sys/fs/zfs.h>
428870ff 113#include <libnvpair.h>
34dc7c2f
BB
114
115static char cmdname[] = "ztest";
116static char *zopt_pool = cmdname;
117
118static uint64_t zopt_vdevs = 5;
119static uint64_t zopt_vdevtime;
120static int zopt_ashift = SPA_MINBLOCKSHIFT;
121static int zopt_mirrors = 2;
122static int zopt_raidz = 4;
123static int zopt_raidz_parity = 1;
124static size_t zopt_vdev_size = SPA_MINDEVSIZE;
125static int zopt_datasets = 7;
126static int zopt_threads = 23;
127static uint64_t zopt_passtime = 60; /* 60 seconds */
128static uint64_t zopt_killrate = 70; /* 70% kill rate */
129static int zopt_verbose = 0;
130static int zopt_init = 1;
131static char *zopt_dir = "/tmp";
132static uint64_t zopt_time = 300; /* 5 minutes */
428870ff
BB
133static uint64_t zopt_maxloops = 50; /* max loops during spa_freeze() */
134
135#define BT_MAGIC 0x123456789abcdefULL
136#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1)
137
138enum ztest_io_type {
139 ZTEST_IO_WRITE_TAG,
140 ZTEST_IO_WRITE_PATTERN,
141 ZTEST_IO_WRITE_ZEROES,
142 ZTEST_IO_TRUNCATE,
143 ZTEST_IO_SETATTR,
144 ZTEST_IO_TYPES
145};
34dc7c2f
BB
146
147typedef struct ztest_block_tag {
428870ff 148 uint64_t bt_magic;
34dc7c2f
BB
149 uint64_t bt_objset;
150 uint64_t bt_object;
151 uint64_t bt_offset;
428870ff 152 uint64_t bt_gen;
34dc7c2f 153 uint64_t bt_txg;
428870ff 154 uint64_t bt_crtxg;
34dc7c2f
BB
155} ztest_block_tag_t;
156
428870ff
BB
157typedef struct bufwad {
158 uint64_t bw_index;
159 uint64_t bw_txg;
160 uint64_t bw_data;
161} bufwad_t;
162
163/*
164 * XXX -- fix zfs range locks to be generic so we can use them here.
165 */
166typedef enum {
167 RL_READER,
168 RL_WRITER,
169 RL_APPEND
170} rl_type_t;
171
172typedef struct rll {
173 void *rll_writer;
174 int rll_readers;
1e33ac1e
BB
175 kmutex_t rll_lock;
176 kcondvar_t rll_cv;
428870ff
BB
177} rll_t;
178
179typedef struct rl {
180 uint64_t rl_object;
181 uint64_t rl_offset;
182 uint64_t rl_size;
183 rll_t *rl_lock;
184} rl_t;
185
186#define ZTEST_RANGE_LOCKS 64
187#define ZTEST_OBJECT_LOCKS 64
188
189/*
190 * Object descriptor. Used as a template for object lookup/create/remove.
191 */
192typedef struct ztest_od {
193 uint64_t od_dir;
194 uint64_t od_object;
195 dmu_object_type_t od_type;
196 dmu_object_type_t od_crtype;
197 uint64_t od_blocksize;
198 uint64_t od_crblocksize;
199 uint64_t od_gen;
200 uint64_t od_crgen;
201 char od_name[MAXNAMELEN];
202} ztest_od_t;
34dc7c2f 203
428870ff
BB
204/*
205 * Per-dataset state.
206 */
207typedef struct ztest_ds {
208 objset_t *zd_os;
3e31d2b0 209 krwlock_t zd_zilog_lock;
428870ff
BB
210 zilog_t *zd_zilog;
211 uint64_t zd_seq;
212 ztest_od_t *zd_od; /* debugging aid */
213 char zd_name[MAXNAMELEN];
1e33ac1e 214 kmutex_t zd_dirobj_lock;
428870ff
BB
215 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
216 rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
217} ztest_ds_t;
218
219/*
220 * Per-iteration state.
221 */
222typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
223
224typedef struct ztest_info {
225 ztest_func_t *zi_func; /* test function */
226 uint64_t zi_iters; /* iterations per execution */
227 uint64_t *zi_interval; /* execute every <interval> seconds */
228 uint64_t zi_call_count; /* per-pass count */
229 uint64_t zi_call_time; /* per-pass time */
230 uint64_t zi_call_next; /* next time to call this function */
231} ztest_info_t;
34dc7c2f
BB
232
233/*
234 * Note: these aren't static because we want dladdr() to work.
235 */
236ztest_func_t ztest_dmu_read_write;
237ztest_func_t ztest_dmu_write_parallel;
238ztest_func_t ztest_dmu_object_alloc_free;
428870ff 239ztest_func_t ztest_dmu_commit_callbacks;
34dc7c2f
BB
240ztest_func_t ztest_zap;
241ztest_func_t ztest_zap_parallel;
428870ff 242ztest_func_t ztest_zil_commit;
3e31d2b0 243ztest_func_t ztest_zil_remount;
428870ff 244ztest_func_t ztest_dmu_read_write_zcopy;
34dc7c2f 245ztest_func_t ztest_dmu_objset_create_destroy;
428870ff
BB
246ztest_func_t ztest_dmu_prealloc;
247ztest_func_t ztest_fzap;
34dc7c2f 248ztest_func_t ztest_dmu_snapshot_create_destroy;
428870ff
BB
249ztest_func_t ztest_dsl_prop_get_set;
250ztest_func_t ztest_spa_prop_get_set;
34dc7c2f
BB
251ztest_func_t ztest_spa_create_destroy;
252ztest_func_t ztest_fault_inject;
428870ff
BB
253ztest_func_t ztest_ddt_repair;
254ztest_func_t ztest_dmu_snapshot_hold;
b128c09f 255ztest_func_t ztest_spa_rename;
428870ff
BB
256ztest_func_t ztest_scrub;
257ztest_func_t ztest_dsl_dataset_promote_busy;
34dc7c2f
BB
258ztest_func_t ztest_vdev_attach_detach;
259ztest_func_t ztest_vdev_LUN_growth;
260ztest_func_t ztest_vdev_add_remove;
b128c09f 261ztest_func_t ztest_vdev_aux_add_remove;
428870ff 262ztest_func_t ztest_split_pool;
34dc7c2f 263
428870ff
BB
264uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
265uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
266uint64_t zopt_often = 1ULL * NANOSEC; /* every second */
267uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */
268uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */
34dc7c2f
BB
269
270ztest_info_t ztest_info[] = {
271 { ztest_dmu_read_write, 1, &zopt_always },
428870ff 272 { ztest_dmu_write_parallel, 10, &zopt_always },
34dc7c2f 273 { ztest_dmu_object_alloc_free, 1, &zopt_always },
428870ff 274 { ztest_dmu_commit_callbacks, 1, &zopt_always },
34dc7c2f
BB
275 { ztest_zap, 30, &zopt_always },
276 { ztest_zap_parallel, 100, &zopt_always },
428870ff
BB
277 { ztest_split_pool, 1, &zopt_always },
278 { ztest_zil_commit, 1, &zopt_incessant },
3e31d2b0 279 { ztest_zil_remount, 1, &zopt_sometimes },
428870ff
BB
280 { ztest_dmu_read_write_zcopy, 1, &zopt_often },
281 { ztest_dmu_objset_create_destroy, 1, &zopt_often },
282 { ztest_dsl_prop_get_set, 1, &zopt_often },
283 { ztest_spa_prop_get_set, 1, &zopt_sometimes },
284#if 0
285 { ztest_dmu_prealloc, 1, &zopt_sometimes },
286#endif
287 { ztest_fzap, 1, &zopt_sometimes },
288 { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
289 { ztest_spa_create_destroy, 1, &zopt_sometimes },
34dc7c2f 290 { ztest_fault_inject, 1, &zopt_sometimes },
428870ff
BB
291 { ztest_ddt_repair, 1, &zopt_sometimes },
292 { ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
34dc7c2f 293 { ztest_spa_rename, 1, &zopt_rarely },
428870ff 294 { ztest_scrub, 1, &zopt_rarely },
9babb374 295 { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
428870ff
BB
296 { ztest_vdev_attach_detach, 1, &zopt_rarely },
297 { ztest_vdev_LUN_growth, 1, &zopt_rarely },
298 { ztest_vdev_add_remove, 1, &zopt_vdevtime },
b128c09f 299 { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime },
34dc7c2f
BB
300};
301
302#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
303
428870ff
BB
304/*
305 * The following struct is used to hold a list of uncalled commit callbacks.
306 * The callbacks are ordered by txg number.
307 */
308typedef struct ztest_cb_list {
1e33ac1e
BB
309 kmutex_t zcl_callbacks_lock;
310 list_t zcl_callbacks;
428870ff 311} ztest_cb_list_t;
34dc7c2f
BB
312
313/*
314 * Stuff we need to share writably between parent and child.
315 */
316typedef struct ztest_shared {
428870ff
BB
317 char *zs_pool;
318 spa_t *zs_spa;
319 hrtime_t zs_proc_start;
320 hrtime_t zs_proc_stop;
321 hrtime_t zs_thread_start;
322 hrtime_t zs_thread_stop;
323 hrtime_t zs_thread_kill;
34dc7c2f 324 uint64_t zs_enospc_count;
428870ff
BB
325 uint64_t zs_vdev_next_leaf;
326 uint64_t zs_vdev_aux;
34dc7c2f
BB
327 uint64_t zs_alloc;
328 uint64_t zs_space;
1e33ac1e
BB
329 kmutex_t zs_vdev_lock;
330 krwlock_t zs_name_lock;
34dc7c2f 331 ztest_info_t zs_info[ZTEST_FUNCS];
428870ff
BB
332 uint64_t zs_splits;
333 uint64_t zs_mirrors;
334 ztest_ds_t zs_zd[];
34dc7c2f
BB
335} ztest_shared_t;
336
428870ff
BB
337#define ID_PARALLEL -1ULL
338
34dc7c2f 339static char ztest_dev_template[] = "%s/%s.%llua";
b128c09f 340static char ztest_aux_template[] = "%s/%s.%s.%llu";
428870ff
BB
341ztest_shared_t *ztest_shared;
342uint64_t *ztest_seq;
34dc7c2f
BB
343
344static int ztest_random_fd;
345static int ztest_dump_core = 1;
346
b128c09f 347static boolean_t ztest_exiting;
34dc7c2f 348
428870ff
BB
349/* Global commit callback list */
350static ztest_cb_list_t zcl;
090ff092
RC
351/* Commit cb delay */
352static uint64_t zc_min_txg_delay = UINT64_MAX;
353static int zc_cb_counter = 0;
354
355/*
356 * Minimum number of commit callbacks that need to be registered for us to check
357 * whether the minimum txg delay is acceptable.
358 */
359#define ZTEST_COMMIT_CB_MIN_REG 100
360
361/*
362 * If a number of txgs equal to this threshold have been created after a commit
363 * callback has been registered but not called, then we assume there is an
364 * implementation bug.
365 */
366#define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000)
428870ff 367
34dc7c2f 368extern uint64_t metaslab_gang_bang;
9babb374 369extern uint64_t metaslab_df_alloc_threshold;
428870ff 370static uint64_t metaslab_sz;
34dc7c2f 371
428870ff
BB
372enum ztest_object {
373 ZTEST_META_DNODE = 0,
374 ZTEST_DIROBJ,
375 ZTEST_OBJECTS
376};
34dc7c2f
BB
377
378static void usage(boolean_t) __NORETURN;
379
380/*
381 * These libumem hooks provide a reasonable set of defaults for the allocator's
382 * debugging facilities.
383 */
const char *
_umem_debug_init(void)
{
	/* Default $UMEM_DEBUG setting for libumem's debugging hooks. */
	return ("default,verbose");
}
389
const char *
_umem_logging_init(void)
{
	/* Default $UMEM_LOGGING setting for libumem's logging hooks. */
	return ("fail,contents");
}
395
396#define FATAL_MSG_SZ 1024
397
398char *fatal_msg;
399
400static void
401fatal(int do_perror, char *message, ...)
402{
403 va_list args;
404 int save_errno = errno;
40b84e7a 405 char *buf;
34dc7c2f
BB
406
407 (void) fflush(stdout);
40b84e7a 408 buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL);
34dc7c2f
BB
409
410 va_start(args, message);
411 (void) sprintf(buf, "ztest: ");
412 /* LINTED */
413 (void) vsprintf(buf + strlen(buf), message, args);
414 va_end(args);
415 if (do_perror) {
416 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
417 ": %s", strerror(save_errno));
418 }
419 (void) fprintf(stderr, "%s\n", buf);
420 fatal_msg = buf; /* to ease debugging */
421 if (ztest_dump_core)
422 abort();
423 exit(3);
424}
425
426static int
427str2shift(const char *buf)
428{
429 const char *ends = "BKMGTPEZ";
430 int i;
431
432 if (buf[0] == '\0')
433 return (0);
434 for (i = 0; i < strlen(ends); i++) {
435 if (toupper(buf[0]) == ends[i])
436 break;
437 }
438 if (i == strlen(ends)) {
439 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
440 buf);
441 usage(B_FALSE);
442 }
443 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
444 return (10*i);
445 }
446 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
447 usage(B_FALSE);
448 /* NOTREACHED */
449}
450
451static uint64_t
452nicenumtoull(const char *buf)
453{
454 char *end;
455 uint64_t val;
456
457 val = strtoull(buf, &end, 0);
458 if (end == buf) {
459 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
460 usage(B_FALSE);
461 } else if (end[0] == '.') {
462 double fval = strtod(buf, &end);
463 fval *= pow(2, str2shift(end));
464 if (fval > UINT64_MAX) {
465 (void) fprintf(stderr, "ztest: value too large: %s\n",
466 buf);
467 usage(B_FALSE);
468 }
469 val = (uint64_t)fval;
470 } else {
471 int shift = str2shift(end);
472 if (shift >= 64 || (val << shift) >> shift != val) {
473 (void) fprintf(stderr, "ztest: value too large: %s\n",
474 buf);
475 usage(B_FALSE);
476 }
477 val <<= shift;
478 }
479 return (val);
480}
481
/*
 * Print the command-line usage summary (with current defaults filled in)
 * and exit -- status 0 when help was explicitly requested (-h), 1 when
 * called due to a bad option or value.
 */
static void
usage(boolean_t requested)
{
	char nice_vdev_size[10];
	char nice_gang_bang[10];
	FILE *fp = requested ? stdout : stderr;

	/* Render the byte counts in human-readable form (e.g. "64M"). */
	nicenum(zopt_vdev_size, nice_vdev_size);
	nicenum(metaslab_gang_bang, nice_gang_bang);

	(void) fprintf(fp, "Usage: %s\n"
	    "\t[-v vdevs (default: %llu)]\n"
	    "\t[-s size_of_each_vdev (default: %s)]\n"
	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
	    "\t[-m mirror_copies (default: %d)]\n"
	    "\t[-r raidz_disks (default: %d)]\n"
	    "\t[-R raidz_parity (default: %d)]\n"
	    "\t[-d datasets (default: %d)]\n"
	    "\t[-t threads (default: %d)]\n"
	    "\t[-g gang_block_threshold (default: %s)]\n"
	    "\t[-i init_count (default: %d)] initialize pool i times\n"
	    "\t[-k kill_percentage (default: %llu%%)]\n"
	    "\t[-p pool_name (default: %s)]\n"
	    "\t[-f dir (default: %s)] file directory for vdev files\n"
	    "\t[-V] verbose (use multiple times for ever more blather)\n"
	    "\t[-E] use existing pool instead of creating new one\n"
	    "\t[-T time (default: %llu sec)] total run time\n"
	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
	    "\t[-P passtime (default: %llu sec)] time per pass\n"
	    "\t[-h] (print help)\n"
	    "",
	    cmdname,
	    (u_longlong_t)zopt_vdevs,		/* -v */
	    nice_vdev_size,			/* -s */
	    zopt_ashift,			/* -a */
	    zopt_mirrors,			/* -m */
	    zopt_raidz,				/* -r */
	    zopt_raidz_parity,			/* -R */
	    zopt_datasets,			/* -d */
	    zopt_threads,			/* -t */
	    nice_gang_bang,			/* -g */
	    zopt_init,				/* -i */
	    (u_longlong_t)zopt_killrate,	/* -k */
	    zopt_pool,				/* -p */
	    zopt_dir,				/* -f */
	    (u_longlong_t)zopt_time,		/* -T */
	    (u_longlong_t)zopt_maxloops,	/* -F */
	    (u_longlong_t)zopt_passtime);	/* -P */
	exit(requested ? 0 : 1);
}
532
34dc7c2f
BB
/*
 * Parse the ztest command line into the zopt_* globals.  Numeric options
 * accept human-friendly values ("64K", "1.5G") via nicenumtoull().
 * Invalid options exit through usage().
 */
static void
process_options(int argc, char **argv)
{
	int opt;
	uint64_t value;

	/* By default, test gang blocks for blocks 32K and greater */
	metaslab_gang_bang = 32 << 10;

	while ((opt = getopt(argc, argv,
	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:")) != EOF) {
		value = 0;
		/* First switch: pre-convert the numeric-argument options. */
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		/* Second switch: apply each option to its zopt_* global. */
		switch (opt) {
		case 'v':
			zopt_vdevs = value;
			break;
		case 's':
			zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zopt_ashift = value;
			break;
		case 'm':
			zopt_mirrors = value;
			break;
		case 'r':
			zopt_raidz = MAX(1, value);
			break;
		case 'R':
			/* Parity is clamped to the supported range [1, 3]. */
			zopt_raidz_parity = MIN(MAX(value, 1), 3);
			break;
		case 'd':
			zopt_datasets = MAX(1, value);
			break;
		case 't':
			zopt_threads = MAX(1, value);
			break;
		case 'g':
			metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
			break;
		case 'i':
			zopt_init = value;
			break;
		case 'k':
			zopt_killrate = value;
			break;
		case 'p':
			zopt_pool = strdup(optarg);
			break;
		case 'f':
			zopt_dir = strdup(optarg);
			break;
		case 'V':
			zopt_verbose++;
			break;
		case 'E':
			/* Use an existing pool: skip initialization. */
			zopt_init = 0;
			break;
		case 'T':
			zopt_time = value;
			break;
		case 'P':
			zopt_passtime = MAX(1, value);
			break;
		case 'F':
			zopt_maxloops = MAX(1, value);
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	/* Parity can never equal or exceed the raidz width. */
	zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);

	/* Spread vdev add/remove tests evenly across the total run time. */
	zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs :
	    UINT64_MAX >> 2);
}
632
/*
 * Simulate a power failure: snapshot the pool's allocation statistics in
 * shared memory (for the parent's post-mortem verification), then SIGKILL
 * this process.
 */
static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa));
	(void) kill(getpid(), SIGKILL);
}
640
641static uint64_t
642ztest_random(uint64_t range)
643{
644 uint64_t r;
645
646 if (range == 0)
647 return (0);
648
649 if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
650 fatal(1, "short read from /dev/urandom");
651
652 return (r % range);
653}
654
/* ARGSUSED */
/*
 * Tally an out-of-space event in shared state; the tag argument exists
 * only to identify the caller when debugging.
 */
static void
ztest_record_enospc(const char *s)
{
	ztest_shared->zs_enospc_count++;
}
661
662static uint64_t
663ztest_get_ashift(void)
664{
665 if (zopt_ashift == 0)
666 return (SPA_MINBLOCKSHIFT + ztest_random(3));
667 return (zopt_ashift);
668}
669
670static nvlist_t *
b128c09f 671make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
34dc7c2f 672{
40b84e7a 673 char *pathbuf;
34dc7c2f 674 uint64_t vdev;
34dc7c2f
BB
675 nvlist_t *file;
676
40b84e7a
BB
677 pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
678
b128c09f
BB
679 if (ashift == 0)
680 ashift = ztest_get_ashift();
681
682 if (path == NULL) {
683 path = pathbuf;
684
685 if (aux != NULL) {
686 vdev = ztest_shared->zs_vdev_aux;
687 (void) sprintf(path, ztest_aux_template,
688 zopt_dir, zopt_pool, aux, vdev);
689 } else {
428870ff 690 vdev = ztest_shared->zs_vdev_next_leaf++;
b128c09f
BB
691 (void) sprintf(path, ztest_dev_template,
692 zopt_dir, zopt_pool, vdev);
693 }
694 }
34dc7c2f 695
b128c09f
BB
696 if (size != 0) {
697 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
34dc7c2f 698 if (fd == -1)
b128c09f 699 fatal(1, "can't open %s", path);
34dc7c2f 700 if (ftruncate(fd, size) != 0)
b128c09f 701 fatal(1, "can't ftruncate %s", path);
34dc7c2f
BB
702 (void) close(fd);
703 }
704
705 VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
706 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
b128c09f 707 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
34dc7c2f 708 VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
40b84e7a 709 umem_free(pathbuf, MAXPATHLEN);
34dc7c2f
BB
710
711 return (file);
712}
713
714static nvlist_t *
b128c09f 715make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r)
34dc7c2f
BB
716{
717 nvlist_t *raidz, **child;
718 int c;
719
720 if (r < 2)
b128c09f 721 return (make_vdev_file(path, aux, size, ashift));
34dc7c2f
BB
722 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
723
724 for (c = 0; c < r; c++)
b128c09f 725 child[c] = make_vdev_file(path, aux, size, ashift);
34dc7c2f
BB
726
727 VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
728 VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
729 VDEV_TYPE_RAIDZ) == 0);
730 VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
731 zopt_raidz_parity) == 0);
732 VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
733 child, r) == 0);
734
735 for (c = 0; c < r; c++)
736 nvlist_free(child[c]);
737
738 umem_free(child, r * sizeof (nvlist_t *));
739
740 return (raidz);
741}
742
743static nvlist_t *
b128c09f
BB
744make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift,
745 int r, int m)
34dc7c2f
BB
746{
747 nvlist_t *mirror, **child;
748 int c;
749
750 if (m < 1)
b128c09f 751 return (make_vdev_raidz(path, aux, size, ashift, r));
34dc7c2f
BB
752
753 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
754
755 for (c = 0; c < m; c++)
b128c09f 756 child[c] = make_vdev_raidz(path, aux, size, ashift, r);
34dc7c2f
BB
757
758 VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
759 VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
760 VDEV_TYPE_MIRROR) == 0);
761 VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
762 child, m) == 0);
34dc7c2f
BB
763
764 for (c = 0; c < m; c++)
765 nvlist_free(child[c]);
766
767 umem_free(child, m * sizeof (nvlist_t *));
768
769 return (mirror);
770}
771
772static nvlist_t *
b128c09f
BB
773make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
774 int log, int r, int m, int t)
34dc7c2f
BB
775{
776 nvlist_t *root, **child;
777 int c;
778
779 ASSERT(t > 0);
780
781 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
782
b128c09f
BB
783 for (c = 0; c < t; c++) {
784 child[c] = make_vdev_mirror(path, aux, size, ashift, r, m);
785 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
786 log) == 0);
787 }
34dc7c2f
BB
788
789 VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
790 VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
b128c09f 791 VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
34dc7c2f
BB
792 child, t) == 0);
793
794 for (c = 0; c < t; c++)
795 nvlist_free(child[c]);
796
797 umem_free(child, t * sizeof (nvlist_t *));
798
799 return (root);
800}
801
428870ff
BB
802static int
803ztest_random_blocksize(void)
34dc7c2f 804{
428870ff
BB
805 return (1 << (SPA_MINBLOCKSHIFT +
806 ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
807}
34dc7c2f 808
428870ff
BB
809static int
810ztest_random_ibshift(void)
811{
812 return (DN_MIN_INDBLKSHIFT +
813 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
34dc7c2f
BB
814}
815
428870ff
BB
/*
 * Pick a random top-level vdev index that is usable for I/O: not a hole,
 * not a log device (unless log_ok), and with an initialized metaslab
 * group.  Caller must hold the SPA config lock (the loop terminates
 * because at least one normal top-level vdev always exists).
 */
static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	/* Retry until we land on an eligible child of the root vdev. */
	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}
833
428870ff
BB
/*
 * Return a random legal value for the given DSL property.  Never returns
 * ZIO_CHECKSUM_OFF for the checksum property, since ztest depends on
 * checksums for its self-healing verification.
 */
static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}
845
/*
 * Set (or inherit) a uint64 DSL property on the named dataset, then read
 * it back to verify.  ENOSPC is tallied and returned; any other failure
 * asserts.  Returns the dsl_prop_set() error code.
 */
static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char *setpoint;
	uint64_t curval;
	int error;

	error = dsl_prop_set(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL),
	    sizeof (value), 1, &value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT3U(error, ==, 0);

	/* Read the property back and note where it was inherited from. */
	setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
	VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval),
	    1, &curval, setpoint), ==, 0);

	if (zopt_verbose >= 6) {
		VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
		(void) printf("%s %s = %s at '%s'\n",
		    osname, propname, valname, setpoint);
	}
	umem_free(setpoint, MAXPATHLEN);

	return (error);
}
879
/*
 * Set a uint64 pool (SPA) property.  ENOSPC is tallied and returned;
 * any other failure asserts.  Returns the spa_prop_set() error code.
 */
static int
ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = zs->zs_spa;
	nvlist_t *props = NULL;
	int error;

	/* spa_prop_set() takes its properties as an nvlist. */
	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);

	error = spa_prop_set(spa, props);

	nvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT3U(error, ==, 0);

	return (error);
}
902
428870ff
BB
/*
 * Initialize one reader/writer range-lock list entry: no holders, with a
 * mutex/condvar pair protecting the state.
 */
static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL);
}
34dc7c2f 911
428870ff
BB
/*
 * Tear down a range-lock list entry; it must be completely unheld.
 */
static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT(rll->rll_writer == NULL);
	ASSERT(rll->rll_readers == 0);
	mutex_destroy(&rll->rll_lock);
	cv_destroy(&rll->rll_cv);
}
34dc7c2f 920
428870ff
BB
921static void
922ztest_rll_lock(rll_t *rll, rl_type_t type)
923{
1e33ac1e 924 mutex_enter(&rll->rll_lock);
34dc7c2f 925
428870ff
BB
926 if (type == RL_READER) {
927 while (rll->rll_writer != NULL)
1e33ac1e 928 (void) cv_wait(&rll->rll_cv, &rll->rll_lock);
428870ff
BB
929 rll->rll_readers++;
930 } else {
931 while (rll->rll_writer != NULL || rll->rll_readers)
1e33ac1e 932 (void) cv_wait(&rll->rll_cv, &rll->rll_lock);
428870ff
BB
933 rll->rll_writer = curthread;
934 }
34dc7c2f 935
1e33ac1e 936 mutex_exit(&rll->rll_lock);
428870ff 937}
34dc7c2f 938
428870ff
BB
/*
 * Release one hold on a range-lock list entry (the writer hold, or one
 * reader hold) and wake all waiters once the entry is fully unheld.
 */
static void
ztest_rll_unlock(rll_t *rll)
{
	mutex_enter(&rll->rll_lock);

	if (rll->rll_writer) {
		/* Dropping the exclusive writer hold. */
		ASSERT(rll->rll_readers == 0);
		rll->rll_writer = NULL;
	} else {
		/* Dropping one of possibly several reader holds. */
		ASSERT(rll->rll_readers != 0);
		ASSERT(rll->rll_writer == NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		cv_broadcast(&rll->rll_cv);

	mutex_exit(&rll->rll_lock);
}
958
428870ff
BB
959static void
960ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
b128c09f 961{
428870ff 962 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
b128c09f 963
428870ff
BB
964 ztest_rll_lock(rll, type);
965}
b128c09f 966
428870ff
BB
967static void
968ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
969{
970 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
b128c09f 971
428870ff 972 ztest_rll_unlock(rll);
b128c09f
BB
973}
974
428870ff
BB
975static rl_t *
976ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
977 uint64_t size, rl_type_t type)
34dc7c2f 978{
428870ff
BB
979 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
980 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
981 rl_t *rl;
34dc7c2f 982
428870ff
BB
983 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
984 rl->rl_object = object;
985 rl->rl_offset = offset;
986 rl->rl_size = size;
987 rl->rl_lock = rll;
34dc7c2f 988
428870ff
BB
989 ztest_rll_lock(rll, type);
990
991 return (rl);
992}
34dc7c2f 993
428870ff
BB
994static void
995ztest_range_unlock(rl_t *rl)
996{
997 rll_t *rll = rl->rl_lock;
34dc7c2f 998
428870ff 999 ztest_rll_unlock(rll);
34dc7c2f 1000
428870ff
BB
1001 umem_free(rl, sizeof (*rl));
1002}
34dc7c2f 1003
428870ff
BB
1004static void
1005ztest_zd_init(ztest_ds_t *zd, objset_t *os)
1006{
1007 zd->zd_os = os;
1008 zd->zd_zilog = dmu_objset_zil(os);
1009 zd->zd_seq = 0;
1010 dmu_objset_name(os, zd->zd_name);
d6320ddb 1011 int l;
428870ff 1012
3e31d2b0 1013 rw_init(&zd->zd_zilog_lock, NULL, RW_DEFAULT, NULL);
1e33ac1e 1014 mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f 1015
d6320ddb 1016 for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
428870ff 1017 ztest_rll_init(&zd->zd_object_lock[l]);
34dc7c2f 1018
d6320ddb 1019 for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
428870ff 1020 ztest_rll_init(&zd->zd_range_lock[l]);
34dc7c2f
BB
1021}
1022
428870ff
BB
/*
 * Tear down per-dataset state initialized by ztest_zd_init().
 */
static void
ztest_zd_fini(ztest_ds_t *zd)
{
	int l;

	mutex_destroy(&zd->zd_dirobj_lock);
	rw_destroy(&zd->zd_zilog_lock);

	for (l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}
b128c09f 1037
428870ff 1038#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
b128c09f 1039
428870ff
BB
1040static uint64_t
1041ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
1042{
1043 uint64_t txg;
1044 int error;
1045
1046 /*
1047 * Attempt to assign tx to some transaction group.
1048 */
1049 error = dmu_tx_assign(tx, txg_how);
1050 if (error) {
1051 if (error == ERESTART) {
1052 ASSERT(txg_how == TXG_NOWAIT);
1053 dmu_tx_wait(tx);
1054 } else {
1055 ASSERT3U(error, ==, ENOSPC);
1056 ztest_record_enospc(tag);
1057 }
1058 dmu_tx_abort(tx);
1059 return (0);
1060 }
1061 txg = dmu_tx_get_txg(tx);
1062 ASSERT(txg != 0);
1063 return (txg);
1064}
1065
/*
 * Fill 'buf' with copies of the 64-bit pattern 'value', one whole word
 * at a time, up to (but not past a word straddling) buf + size.
 */
static void
ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *cur = buf;
	uint64_t *end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);

	for (; cur < end; cur++)
		*cur = value;
}
1075
1fde1e37 1076#ifndef NDEBUG
428870ff
BB
1077static boolean_t
1078ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
1079{
1080 uint64_t *ip = buf;
1081 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
1082 uint64_t diff = 0;
1083
1084 while (ip < ip_end)
1085 diff |= (value - *ip++);
1086
1087 return (diff == 0);
1088}
1fde1e37 1089#endif
428870ff
BB
1090
/*
 * Stamp a block tag with the full identity of the block it describes:
 * which objset/object/offset it belongs to, the object's generation,
 * the txg of this write, and the object's creation txg.
 */
static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}
1103
/*
 * Verify a block tag against the expected identity.  gen and txg are
 * upper bounds (<=) because a tag may legitimately have been written in
 * an earlier generation/txg; everything else must match exactly.
 */
static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
	ASSERT(bt->bt_magic == BT_MAGIC);
	ASSERT(bt->bt_objset == dmu_objset_id(os));
	ASSERT(bt->bt_object == object);
	ASSERT(bt->bt_offset == offset);
	ASSERT(bt->bt_gen <= gen);
	ASSERT(bt->bt_txg <= txg);
	ASSERT(bt->bt_crtxg == crtxg);
}
1116
1117static ztest_block_tag_t *
1118ztest_bt_bonus(dmu_buf_t *db)
1119{
1120 dmu_object_info_t doi;
1121 ztest_block_tag_t *bt;
1122
1123 dmu_object_info_from_db(db, &doi);
1124 ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
1125 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
1126 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));
1127
1128 return (bt);
1129}
1130
/*
 * ZIL logging ops
 */

/*
 * ztest repurposes fields of lr_create_t that it does not otherwise
 * need (mode, uid, gid, rdev, crtime[1]) to carry its own object
 * creation parameters through the ZIL.
 */
#define	lrz_type	lr_mode
#define	lrz_blocksize	lr_uid
#define	lrz_ibshift	lr_gid
#define	lrz_bonustype	lr_rdev
#define	lrz_bonuslen	lr_crtime[1]
1140
/*
 * Log a TX_CREATE record (the object name follows the lr in memory)
 * unless the ZIL is currently being replayed, in which case the record
 * already exists in the log.
 */
static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
	/* Copy everything past the common header, including the name. */
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}
1157
/*
 * Log a TX_REMOVE record unless the ZIL is being replayed.  The object
 * name follows the lr; itx_oid carries the doomed object's id so the
 * ZIL can cancel pending itxs against that object.
 */
static void
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	itx->itx_oid = object;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}
1175
/*
 * Log a TX_WRITE record unless the ZIL is being replayed.  The write
 * state (copied / need-copy / indirect) is chosen at random to exercise
 * all three ZIL write paths; oversized writes are forced indirect.
 */
static void
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
	itx_t *itx;
	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	/* Data too large to embed in the log record must go indirect. */
	if (lr->lr_length > ZIL_MAX_LOG_DATA)
		write_state = WR_INDIRECT;

	itx = zil_itx_create(TX_WRITE,
	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));

	/*
	 * For WR_COPIED, read the data into the itx now; if the read
	 * fails, fall back to a data-less WR_NEED_COPY record.
	 */
	if (write_state == WR_COPIED &&
	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
		zil_itx_destroy(itx);
		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
		write_state = WR_NEED_COPY;
	}
	itx->itx_private = zd;
	itx->itx_wr_state = write_state;
	itx->itx_sync = (ztest_random(8) == 0);	/* occasionally synchronous */
	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);

	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}
1208
/*
 * Log a TX_TRUNCATE record unless the ZIL is being replayed.  Truncates
 * are never logged synchronously here (itx_sync = B_FALSE).
 */
static void
ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}
1224
/*
 * Log a TX_SETATTR record unless the ZIL is being replayed.  Like
 * truncates, setattrs are never logged synchronously.
 */
static void
ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}
1240
1241/*
1242 * ZIL replay ops
1243 */
/*
 * Replay (or, in open context, originate) a TX_CREATE: create a ZAP or
 * plain object named 'name' under ZTEST_DIROBJ, stamp its bonus buffer
 * with a block tag, and enter it into the directory ZAP.  During ZIL
 * replay lr_foid is nonzero and the specific object id is claimed
 * rather than freshly allocated.
 */
static int
ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	ztest_block_tag_t *bbt;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	uint64_t txg;
	int error = 0;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
	ASSERT(name[0] != '\0');

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	} else {
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	}

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0)
		return (ENOSPC);

	/* Replay must claim a specific id; open context allocates one. */
	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		if (lr->lr_foid == 0) {
			lr->lr_foid = zap_create(os,
			    lr->lrz_type, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		} else {
			error = zap_create_claim(os, lr->lr_foid,
			    lr->lrz_type, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		}
	} else {
		if (lr->lr_foid == 0) {
			lr->lr_foid = dmu_object_alloc(os,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		} else {
			error = dmu_object_claim(os, lr->lr_foid,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		}
	}

	if (error) {
		/* Only a replayed claim of an existing object can collide. */
		ASSERT3U(error, ==, EEXIST);
		ASSERT(zd->zd_zilog->zl_replay);
		dmu_tx_commit(tx);
		return (error);
	}

	ASSERT(lr->lr_foid != 0);

	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
		    lr->lrz_blocksize, lr->lrz_ibshift, tx));

	/* Stamp the bonus buffer so later i/o to this object can be verified. */
	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
	bbt = ztest_bt_bonus(db);
	dmu_buf_will_dirty(db, tx);
	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
	dmu_buf_rele(db, FTAG);

	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
	    &lr->lr_foid, tx));

	(void) ztest_log_create(zd, tx, lr);

	dmu_tx_commit(tx);

	return (0);
}
1327
/*
 * Replay (or originate) a TX_REMOVE: look the name up in ZTEST_DIROBJ,
 * destroy the object it refers to (ZAP or plain, per its actual type),
 * and remove the directory entry.  The object write-lock serializes
 * against concurrent users of the same object id.
 */
static int
ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	uint64_t object, txg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
	ASSERT(name[0] != '\0');

	VERIFY3U(0, ==,
	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
	ASSERT(object != 0);

	ztest_object_lock(zd, object, RL_WRITER);

	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_object_unlock(zd, object);
		return (ENOSPC);
	}

	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
		VERIFY3U(0, ==, zap_destroy(os, object, tx));
	} else {
		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
	}

	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));

	(void) ztest_log_remove(zd, tx, lr, object);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, object);

	return (0);
}
1378
/*
 * Replay (or originate) a TX_WRITE: write 'length' bytes at 'offset' of
 * the object, after verifying any block tag already present there.
 * Lock order is object (reader) then range (writer); both are dropped
 * on every exit path.  About 1 in 8 block-aligned full-block writes go
 * through the dmu_assign_arcbuf() path to exercise loaned arc buffers.
 */
static int
ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	void *data = lr + 1;			/* data follows lr */
	uint64_t offset, length;
	ztest_block_tag_t *bt = data;
	ztest_block_tag_t *bbt;
	uint64_t gen, txg, lrtxg, crtxg;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	arc_buf_t *abuf = NULL;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
		byteswap_uint64_array(bt, sizeof (*bt));

	/* The payload only carries a tag if it starts with the magic. */
	if (bt->bt_magic != BT_MAGIC)
		bt = NULL;

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	dmu_object_info_from_db(db, &doi);

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	gen = bbt->bt_gen;
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);

	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
	    P2PHASE(offset, length) == 0)
		abuf = dmu_request_arcbuf(db, length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		if (abuf != NULL)
			dmu_return_arcbuf(abuf);
		dmu_buf_rele(db, FTAG);
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	if (bt != NULL) {
		/*
		 * Usually, verify the old data before writing new data --
		 * but not always, because we also want to verify correct
		 * behavior when the data was not recently read into cache.
		 */
		ASSERT(offset % doi.doi_data_block_size == 0);
		if (ztest_random(4) != 0) {
			int prefetch = ztest_random(2) ?
			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
			ztest_block_tag_t rbt;

			VERIFY(dmu_read(os, lr->lr_foid, offset,
			    sizeof (rbt), &rbt, prefetch) == 0);
			if (rbt.bt_magic == BT_MAGIC) {
				ztest_bt_verify(&rbt, os, lr->lr_foid,
				    offset, gen, txg, crtxg);
			}
		}

		/*
		 * Writes can appear to be newer than the bonus buffer because
		 * the ztest_get_data() callback does a dmu_read() of the
		 * open-context data, which may be different than the data
		 * as it was when the write was generated.
		 */
		if (zd->zd_zilog->zl_replay) {
			ztest_bt_verify(bt, os, lr->lr_foid, offset,
			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
			    bt->bt_crtxg);
		}

		/*
		 * Set the bt's gen/txg to the bonus buffer's gen/txg
		 * so that all of the usual ASSERTs will work.
		 */
		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
	}

	if (abuf == NULL) {
		dmu_write(os, lr->lr_foid, offset, length, data, tx);
	} else {
		bcopy(data, abuf->b_data, length);
		dmu_assign_arcbuf(db, offset, abuf, tx);
	}

	(void) ztest_log_write(zd, tx, lr);

	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}
1503
/*
 * Replay (or originate) a TX_TRUNCATE: free the byte range
 * [lr_offset, lr_offset + lr_length) of the object while holding the
 * object (reader) and range (writer) locks, then log the truncate.
 */
static int
ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
	    lr->lr_length, tx) == 0);

	(void) ztest_log_truncate(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}
1542
/*
 * Replay (or originate) a TX_SETATTR.  ztest abuses lr_size to carry a
 * new bonus length and lr_mode to carry a new generation number.  In
 * open context the values are chosen randomly here and recorded in the
 * lr for later replay; in replay context they must already be set.
 */
static int
ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	ztest_block_tag_t *bbt;
	uint64_t txg, lrtxg, crtxg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, lr->lr_foid);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	if (zd->zd_zilog->zl_replay) {
		ASSERT(lr->lr_size != 0);
		ASSERT(lr->lr_mode != 0);
		ASSERT(lrtxg != 0);
	} else {
		/*
		 * Randomly change the size and increment the generation.
		 */
		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
		    sizeof (*bbt);
		lr->lr_mode = bbt->bt_gen + 1;
		ASSERT(lrtxg == 0);
	}

	/*
	 * Verify that the current bonus buffer is not newer than our txg.
	 */
	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
	    MAX(txg, lrtxg), crtxg);

	dmu_buf_will_dirty(db, tx);

	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
	ASSERT3U(lr->lr_size, <=, db->db_size);
	VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0);
	bbt = ztest_bt_bonus(db);	/* re-locate: bonus length changed */

	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);

	dmu_buf_rele(db, FTAG);

	(void) ztest_log_setattr(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}
1613
/*
 * Table of ZIL replay handlers, indexed by transaction type.  ztest
 * only generates TX_CREATE, TX_REMOVE, TX_WRITE, TX_TRUNCATE, and
 * TX_SETATTR records, so only those slots are populated; the order
 * must match the TX_* enumeration exactly.
 */
zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
	NULL,					/* 0 no such transaction type */
	(zil_replay_func_t *)ztest_replay_create,	/* TX_CREATE */
	NULL,					/* TX_MKDIR */
	NULL,					/* TX_MKXATTR */
	NULL,					/* TX_SYMLINK */
	(zil_replay_func_t *)ztest_replay_remove,	/* TX_REMOVE */
	NULL,					/* TX_RMDIR */
	NULL,					/* TX_LINK */
	NULL,					/* TX_RENAME */
	(zil_replay_func_t *)ztest_replay_write,	/* TX_WRITE */
	(zil_replay_func_t *)ztest_replay_truncate,	/* TX_TRUNCATE */
	(zil_replay_func_t *)ztest_replay_setattr,	/* TX_SETATTR */
	NULL,					/* TX_ACL */
	NULL,					/* TX_CREATE_ACL */
	NULL,					/* TX_CREATE_ATTR */
	NULL,					/* TX_CREATE_ACL_ATTR */
	NULL,					/* TX_MKDIR_ACL */
	NULL,					/* TX_MKDIR_ATTR */
	NULL,					/* TX_MKDIR_ACL_ATTR */
	NULL,					/* TX_WRITE2 */
};
1636
1637/*
1638 * ZIL get_data callbacks
1639 */
1640
1641static void
1642ztest_get_done(zgd_t *zgd, int error)
1643{
1644 ztest_ds_t *zd = zgd->zgd_private;
1645 uint64_t object = zgd->zgd_rl->rl_object;
1646
1647 if (zgd->zgd_db)
1648 dmu_buf_rele(zgd->zgd_db, zgd);
1649
1650 ztest_range_unlock(zgd->zgd_rl);
1651 ztest_object_unlock(zd, object);
1652
1653 if (error == 0 && zgd->zgd_bp)
1654 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1655
1656 umem_free(zgd, sizeof (*zgd));
1657}
1658
/*
 * ZIL get_data callback: fetch the data for an indirect/need-copy log
 * record.  For an immediate write, copy the data into 'buf'; otherwise
 * initiate a dmu_sync() of the whole block.  On the dmu_sync() success
 * path, ownership of zgd (and its locks) passes to ztest_get_done();
 * on every other path we call ztest_get_done() ourselves.
 */
static int
ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	ztest_ds_t *zd = arg;
	objset_t *os = zd->zd_os;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	uint64_t txg = lr->lr_common.lrc_txg;
	uint64_t crtxg;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ztest_object_lock(zd, object, RL_READER);
	error = dmu_bonus_hold(os, object, FTAG, &db);
	if (error) {
		ztest_object_unlock(zd, object);
		return (error);
	}

	crtxg = ztest_bt_bonus(db)->bt_crtxg;

	/* Object was created after this log record; nothing to sync. */
	if (crtxg == 0 || crtxg > txg) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, object);
		return (ENOENT);
	}

	dmu_object_info_from_db(db, &doi);
	dmu_buf_rele(db, FTAG);
	db = NULL;

	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
	zgd->zgd_zilog = zd->zd_zilog;
	zgd->zgd_private = zd;

	if (buf != NULL) {	/* immediate write */
		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
		    RL_READER);

		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
		ASSERT(error == 0);
	} else {
		/* Indirect write: sync the containing block. */
		size = doi.doi_data_block_size;
		if (ISP2(size)) {
			offset = P2ALIGN(offset, size);
		} else {
			ASSERT(offset < size);
			offset = 0;
		}

		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
		    RL_READER);

		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    ztest_get_done, zgd);

			/* On success, ztest_get_done() owns the cleanup. */
			if (error == 0)
				return (0);
		}
	}

	ztest_get_done(zgd, error);

	return (error);
}
1739
1740static void *
1741ztest_lr_alloc(size_t lrsize, char *name)
1742{
1743 char *lr;
1744 size_t namesize = name ? strlen(name) + 1 : 0;
1745
1746 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);
1747
1748 if (name)
1749 bcopy(name, lr + lrsize, namesize);
1750
1751 return (lr);
1752}
1753
/*
 * Release a record allocated by ztest_lr_alloc().  'name' must match
 * the name passed at allocation so the computed sizes agree.
 */
void
ztest_lr_free(void *lr, size_t lrsize, char *name)
{
	size_t namesize = (name != NULL) ? strlen(name) + 1 : 0;

	umem_free(lr, lrsize + namesize);
}
1761
/*
 * Lookup a bunch of objects.  Returns the number of objects not found.
 * For each object that is found, the od entry is filled in with the
 * object's id, type, blocksize, and generation (from its bonus tag).
 *
 * The caller must hold zd_dirobj_lock.
 */
static int
ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;
	int i;

	ASSERT(mutex_held(&zd->zd_dirobj_lock));

	for (i = 0; i < count; i++, od++) {
		od->od_object = 0;
		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
		    sizeof (uint64_t), 1, &od->od_object);
		if (error) {
			ASSERT(error == ENOENT);
			ASSERT(od->od_object == 0);
			missing++;
		} else {
			dmu_buf_t *db;
			ztest_block_tag_t *bbt;
			dmu_object_info_t doi;

			ASSERT(od->od_object != 0);
			ASSERT(missing == 0);	/* there should be no gaps */

			ztest_object_lock(zd, od->od_object, RL_READER);
			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
			    od->od_object, FTAG, &db));
			dmu_object_info_from_db(db, &doi);
			bbt = ztest_bt_bonus(db);
			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
			od->od_type = doi.doi_type;
			od->od_blocksize = doi.doi_data_block_size;
			od->od_gen = bbt->bt_gen;
			dmu_buf_rele(db, FTAG);
			ztest_object_unlock(zd, od->od_object);
		}
	}

	return (missing);
}
1806
1807static int
1808ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
1809{
1810 int missing = 0;
d6320ddb 1811 int i;
428870ff 1812
1e33ac1e 1813 ASSERT(mutex_held(&zd->zd_dirobj_lock));
428870ff 1814
d6320ddb 1815 for (i = 0; i < count; i++, od++) {
428870ff
BB
1816 if (missing) {
1817 od->od_object = 0;
1818 missing++;
1819 continue;
1820 }
1821
1822 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
1823
1824 lr->lr_doid = od->od_dir;
1825 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
1826 lr->lrz_type = od->od_crtype;
1827 lr->lrz_blocksize = od->od_crblocksize;
1828 lr->lrz_ibshift = ztest_random_ibshift();
1829 lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
1830 lr->lrz_bonuslen = dmu_bonus_max();
1831 lr->lr_gen = od->od_crgen;
1832 lr->lr_crtime[0] = time(NULL);
1833
1834 if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
1835 ASSERT(missing == 0);
1836 od->od_object = 0;
1837 missing++;
1838 } else {
1839 od->od_object = lr->lr_foid;
1840 od->od_type = od->od_crtype;
1841 od->od_blocksize = od->od_crblocksize;
1842 od->od_gen = od->od_crgen;
1843 ASSERT(od->od_object != 0);
1844 }
1845
1846 ztest_lr_free(lr, sizeof (*lr), od->od_name);
1847 }
1848
1849 return (missing);
1850}
1851
1852static int
1853ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
1854{
1855 int missing = 0;
1856 int error;
d6320ddb 1857 int i;
428870ff 1858
1e33ac1e 1859 ASSERT(mutex_held(&zd->zd_dirobj_lock));
428870ff
BB
1860
1861 od += count - 1;
1862
d6320ddb 1863 for (i = count - 1; i >= 0; i--, od--) {
428870ff
BB
1864 if (missing) {
1865 missing++;
1866 continue;
1867 }
1868
1869 if (od->od_object == 0)
1870 continue;
1871
1872 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
1873
1874 lr->lr_doid = od->od_dir;
1875
1876 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
1877 ASSERT3U(error, ==, ENOSPC);
1878 missing++;
1879 } else {
1880 od->od_object = 0;
1881 }
1882 ztest_lr_free(lr, sizeof (*lr), od->od_name);
1883 }
1884
1885 return (missing);
1886}
1887
1888static int
1889ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
1890 void *data)
1891{
1892 lr_write_t *lr;
1893 int error;
1894
1895 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);
1896
1897 lr->lr_foid = object;
1898 lr->lr_offset = offset;
1899 lr->lr_length = size;
1900 lr->lr_blkoff = 0;
1901 BP_ZERO(&lr->lr_blkptr);
1902
1903 bcopy(data, lr + 1, size);
1904
1905 error = ztest_replay_write(zd, lr, B_FALSE);
1906
1907 ztest_lr_free(lr, sizeof (*lr) + size, NULL);
1908
1909 return (error);
1910}
1911
1912static int
1913ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
1914{
1915 lr_truncate_t *lr;
1916 int error;
1917
1918 lr = ztest_lr_alloc(sizeof (*lr), NULL);
1919
1920 lr->lr_foid = object;
1921 lr->lr_offset = offset;
1922 lr->lr_length = size;
1923
1924 error = ztest_replay_truncate(zd, lr, B_FALSE);
1925
1926 ztest_lr_free(lr, sizeof (*lr), NULL);
1927
1928 return (error);
1929}
1930
1931static int
1932ztest_setattr(ztest_ds_t *zd, uint64_t object)
1933{
1934 lr_setattr_t *lr;
1935 int error;
1936
1937 lr = ztest_lr_alloc(sizeof (*lr), NULL);
1938
1939 lr->lr_foid = object;
1940 lr->lr_size = 0;
1941 lr->lr_mode = 0;
1942
1943 error = ztest_replay_setattr(zd, lr, B_FALSE);
1944
1945 ztest_lr_free(lr, sizeof (*lr), NULL);
1946
1947 return (error);
1948}
1949
/*
 * Preallocate the given range of the object and wait for it to reach
 * disk; if the tx cannot be assigned (ENOSPC), free the range instead
 * so the test can make forward progress.
 */
static void
ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	txg_wait_synced(dmu_objset_pool(os), 0);

	ztest_object_lock(zd, object, RL_READER);
	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, object, offset, size);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);

	if (txg != 0) {
		dmu_prealloc(os, object, offset, size, tx);
		dmu_tx_commit(tx);
		txg_wait_synced(dmu_objset_pool(os), txg);
	} else {
		(void) dmu_free_long_range(os, object, offset, size);
	}

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, object);
}
1980
/*
 * Perform one random i/o (block-tag write, pattern write, zero write,
 * truncate, or setattr) against 'object' at 'offset'.  Holds
 * zd_zilog_lock as reader so ztest_zil_remount() cannot close the ZIL
 * out from under the operation.
 */
static void
ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
{
	ztest_block_tag_t wbt;
	dmu_object_info_t doi;
	enum ztest_io_type io_type;
	uint64_t blocksize;
	void *data;

	VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
	blocksize = doi.doi_data_block_size;
	data = umem_alloc(blocksize, UMEM_NOFAIL);

	/*
	 * Pick an i/o type at random, biased toward writing block tags.
	 */
	io_type = ztest_random(ZTEST_IO_TYPES);
	if (ztest_random(2) == 0)
		io_type = ZTEST_IO_WRITE_TAG;

	(void) rw_enter(&zd->zd_zilog_lock, RW_READER);

	switch (io_type) {

	case ZTEST_IO_WRITE_TAG:
		ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
		(void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
		break;

	case ZTEST_IO_WRITE_PATTERN:
		(void) memset(data, 'a' + (object + offset) % 5, blocksize);
		if (ztest_random(2) == 0) {
			/*
			 * Induce fletcher2 collisions to ensure that
			 * zio_ddt_collision() detects and resolves them
			 * when using fletcher2-verify for deduplication.
			 */
			((uint64_t *)data)[0] ^= 1ULL << 63;
			((uint64_t *)data)[4] ^= 1ULL << 63;
		}
		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	case ZTEST_IO_WRITE_ZEROES:
		bzero(data, blocksize);
		(void) ztest_write(zd, object, offset, blocksize, data);
		break;

	case ZTEST_IO_TRUNCATE:
		(void) ztest_truncate(zd, object, offset, blocksize);
		break;

	case ZTEST_IO_SETATTR:
		(void) ztest_setattr(zd, object);
		break;
	default:
		break;
	}

	(void) rw_exit(&zd->zd_zilog_lock);

	umem_free(data, blocksize);
}
2044
/*
 * Initialize an object description template.  The creation parameters
 * (type, blocksize, gen) come from the arguments — a blocksize of 0
 * selects a random one — and the "actual" fields stay cleared until
 * ztest_lookup()/ztest_create() fills them in.  The name encodes the
 * tag, dataset id, and index so it is unique per template.
 */
static void
ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
    dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
{
	od->od_dir = ZTEST_DIROBJ;
	od->od_object = 0;

	od->od_crtype = type;
	od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
	od->od_crgen = gen;

	od->od_type = DMU_OT_NONE;
	od->od_blocksize = 0;
	od->od_gen = 0;

	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
	    tag, (longlong_t)id, (u_longlong_t)index);
}
2066
/*
 * Lookup or create the objects for a test using the od template.
 * If the objects do not all exist, or if 'remove' is specified,
 * remove any existing objects and create new ones.  Otherwise,
 * use the existing objects.  Returns 0 on success, -1 if the objects
 * could not be (re)created.
 */
static int
ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
{
	int count = size / sizeof (*od);
	int rv = 0;

	/* zd_dirobj_lock serializes all directory (ZTEST_DIROBJ) updates. */
	mutex_enter(&zd->zd_dirobj_lock);
	if ((ztest_lookup(zd, od, count) != 0 || remove) &&
	    (ztest_remove(zd, od, count) != 0 ||
	    ztest_create(zd, od, count) != 0))
		rv = -1;
	zd->zd_od = od;
	mutex_exit(&zd->zd_dirobj_lock);

	return (rv);
}
2089
/*
 * Commit the ZIL up to a random object's itxs and remember how far the
 * log got, so a later incarnation of ztest can verify the records
 * survived a crash.
 */
/* ARGSUSED */
void
ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
{
	zilog_t *zilog = zd->zd_zilog;

	/* Reader hold keeps ztest_zil_remount() from closing the zilog. */
	(void) rw_enter(&zd->zd_zilog_lock, RW_READER);

	zil_commit(zilog, ztest_random(ZTEST_OBJECTS));

	/*
	 * Remember the committed values in zd, which is in parent/child
	 * shared memory.  If we die, the next iteration of ztest_run()
	 * will verify that the log really does contain this record.
	 */
	mutex_enter(&zilog->zl_lock);
	ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq);
	zd->zd_seq = zilog->zl_commit_lr_seq;
	mutex_exit(&zilog->zl_lock);

	(void) rw_exit(&zd->zd_zilog_lock);
}
2112
/*
 * This function is designed to simulate the operations that occur during a
 * mount/unmount operation.  We hold the dataset across these operations in an
 * attempt to expose any implicit assumptions about ZIL management.
 */
/* ARGSUSED */
void
ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;

	/* Writer hold excludes all in-flight i/o and zil_commit() calls. */
	(void) rw_enter(&zd->zd_zilog_lock, RW_WRITER);

	/* zfsvfs_teardown() */
	zil_close(zd->zd_zilog);

	/* zfsvfs_setup() */
	VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
	zil_replay(os, zd, ztest_replay_vector);

	(void) rw_exit(&zd->zd_zilog_lock);
}
2135
/*
 * Verify that we can't destroy an active pool, create an existing pool,
 * or create a pool with a bad vdev spec.
 */
/* ARGSUSED */
void
ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa;
	nvlist_t *nvroot;

	/*
	 * Attempt to create using a bad file.
	 */
	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
	VERIFY3U(ENOENT, ==,
	    spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);

	/*
	 * Attempt to create using a bad mirror.
	 */
	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
	VERIFY3U(ENOENT, ==,
	    spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);

	/*
	 * Attempt to create an existing pool.  It shouldn't matter
	 * what's in the nvroot; we should fail with EEXIST.
	 */
	(void) rw_enter(&zs->zs_name_lock, RW_READER);
	nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
	VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL));
	nvlist_free(nvroot);
	/* An open pool must also refuse to be destroyed. */
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool));
	spa_close(spa, FTAG);

	(void) rw_exit(&zs->zs_name_lock);
}
2178
2179static vdev_t *
2180vdev_lookup_by_path(vdev_t *vd, const char *path)
2181{
2182 vdev_t *mvd;
d6320ddb 2183 int c;
428870ff
BB
2184
2185 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
2186 return (vd);
2187
d6320ddb 2188 for (c = 0; c < vd->vdev_children; c++)
428870ff
BB
2189 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
2190 NULL)
2191 return (mvd);
2192
2193 return (NULL);
2194}
2195
2196/*
2197 * Find the first available hole which can be used as a top-level.
2198 */
2199int
2200find_vdev_hole(spa_t *spa)
2201{
2202 vdev_t *rvd = spa->spa_root_vdev;
2203 int c;
2204
2205 ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
2206
2207 for (c = 0; c < rvd->vdev_children; c++) {
2208 vdev_t *cvd = rvd->vdev_child[c];
2209
2210 if (cvd->vdev_ishole)
2211 break;
2212 }
2213 return (c);
2214}
2215
2216/*
2217 * Verify that vdev_add() works as expected.
2218 */
2219/* ARGSUSED */
2220void
2221ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
2222{
2223 ztest_shared_t *zs = ztest_shared;
2224 spa_t *spa = zs->zs_spa;
2225 uint64_t leaves;
2226 uint64_t guid;
2227 nvlist_t *nvroot;
2228 int error;
2229
1e33ac1e 2230 mutex_enter(&zs->zs_vdev_lock);
428870ff
BB
2231 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz;
2232
2233 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2234
2235 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
2236
2237 /*
2238 * If we have slogs then remove them 1/4 of the time.
2239 */
2240 if (spa_has_slogs(spa) && ztest_random(4) == 0) {
2241 /*
2242 * Grab the guid from the head of the log class rotor.
2243 */
2244 guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
2245
2246 spa_config_exit(spa, SCL_VDEV, FTAG);
2247
2248 /*
2249 * We have to grab the zs_name_lock as writer to
2250 * prevent a race between removing a slog (dmu_objset_find)
2251 * and destroying a dataset. Removing the slog will
2252 * grab a reference on the dataset which may cause
2253 * dmu_objset_destroy() to fail with EBUSY thus
2254 * leaving the dataset in an inconsistent state.
2255 */
1e33ac1e 2256 rw_enter(&ztest_shared->zs_name_lock, RW_WRITER);
428870ff 2257 error = spa_vdev_remove(spa, guid, B_FALSE);
1e33ac1e 2258 rw_exit(&ztest_shared->zs_name_lock);
428870ff
BB
2259
2260 if (error && error != EEXIST)
2261 fatal(0, "spa_vdev_remove() = %d", error);
2262 } else {
2263 spa_config_exit(spa, SCL_VDEV, FTAG);
2264
2265 /*
2266 * Make 1/4 of the devices be log devices.
2267 */
2268 nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
2269 ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1);
2270
2271 error = spa_vdev_add(spa, nvroot);
2272 nvlist_free(nvroot);
2273
2274 if (error == ENOSPC)
2275 ztest_record_enospc("spa_vdev_add");
2276 else if (error != 0)
2277 fatal(0, "spa_vdev_add() = %d", error);
2278 }
2279
1e33ac1e 2280 mutex_exit(&ztest_shared->zs_vdev_lock);
428870ff
BB
2281}
2282
2283/*
2284 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
2285 */
2286/* ARGSUSED */
2287void
2288ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
2289{
2290 ztest_shared_t *zs = ztest_shared;
2291 spa_t *spa = zs->zs_spa;
2292 vdev_t *rvd = spa->spa_root_vdev;
2293 spa_aux_vdev_t *sav;
2294 char *aux;
40b84e7a 2295 char *path;
428870ff
BB
2296 uint64_t guid = 0;
2297 int error;
2298
40b84e7a
BB
2299 path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
2300
428870ff
BB
2301 if (ztest_random(2) == 0) {
2302 sav = &spa->spa_spares;
2303 aux = ZPOOL_CONFIG_SPARES;
2304 } else {
2305 sav = &spa->spa_l2cache;
2306 aux = ZPOOL_CONFIG_L2CACHE;
2307 }
2308
1e33ac1e 2309 mutex_enter(&zs->zs_vdev_lock);
428870ff
BB
2310
2311 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2312
2313 if (sav->sav_count != 0 && ztest_random(4) == 0) {
b128c09f
BB
2314 /*
2315 * Pick a random device to remove.
2316 */
2317 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
2318 } else {
2319 /*
2320 * Find an unused device we can add.
2321 */
428870ff 2322 zs->zs_vdev_aux = 0;
b128c09f 2323 for (;;) {
b128c09f
BB
2324 int c;
2325 (void) sprintf(path, ztest_aux_template, zopt_dir,
428870ff 2326 zopt_pool, aux, zs->zs_vdev_aux);
b128c09f
BB
2327 for (c = 0; c < sav->sav_count; c++)
2328 if (strcmp(sav->sav_vdevs[c]->vdev_path,
2329 path) == 0)
2330 break;
2331 if (c == sav->sav_count &&
2332 vdev_lookup_by_path(rvd, path) == NULL)
2333 break;
428870ff 2334 zs->zs_vdev_aux++;
34dc7c2f
BB
2335 }
2336 }
2337
b128c09f 2338 spa_config_exit(spa, SCL_VDEV, FTAG);
34dc7c2f 2339
b128c09f
BB
2340 if (guid == 0) {
2341 /*
2342 * Add a new device.
2343 */
2344 nvlist_t *nvroot = make_vdev_root(NULL, aux,
2345 (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
2346 error = spa_vdev_add(spa, nvroot);
2347 if (error != 0)
2348 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
2349 nvlist_free(nvroot);
2350 } else {
2351 /*
2352 * Remove an existing device. Sometimes, dirty its
2353 * vdev state first to make sure we handle removal
2354 * of devices that have pending state changes.
2355 */
2356 if (ztest_random(2) == 0)
9babb374 2357 (void) vdev_online(spa, guid, 0, NULL);
b128c09f
BB
2358
2359 error = spa_vdev_remove(spa, guid, B_FALSE);
2360 if (error != 0 && error != EBUSY)
2361 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
2362 }
2363
1e33ac1e 2364 mutex_exit(&zs->zs_vdev_lock);
40b84e7a
BB
2365
2366 umem_free(path, MAXPATHLEN);
428870ff
BB
2367}
2368
2369/*
2370 * split a pool if it has mirror tlvdevs
2371 */
2372/* ARGSUSED */
2373void
2374ztest_split_pool(ztest_ds_t *zd, uint64_t id)
2375{
2376 ztest_shared_t *zs = ztest_shared;
2377 spa_t *spa = zs->zs_spa;
2378 vdev_t *rvd = spa->spa_root_vdev;
2379 nvlist_t *tree, **child, *config, *split, **schild;
2380 uint_t c, children, schildren = 0, lastlogid = 0;
2381 int error = 0;
2382
1e33ac1e 2383 mutex_enter(&zs->zs_vdev_lock);
428870ff
BB
2384
2385 /* ensure we have a useable config; mirrors of raidz aren't supported */
2386 if (zs->zs_mirrors < 3 || zopt_raidz > 1) {
1e33ac1e 2387 mutex_exit(&zs->zs_vdev_lock);
428870ff
BB
2388 return;
2389 }
2390
2391 /* clean up the old pool, if any */
2392 (void) spa_destroy("splitp");
2393
2394 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2395
2396 /* generate a config from the existing config */
2397 mutex_enter(&spa->spa_props_lock);
2398 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
2399 &tree) == 0);
2400 mutex_exit(&spa->spa_props_lock);
2401
2402 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
2403 &children) == 0);
2404
2405 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
2406 for (c = 0; c < children; c++) {
2407 vdev_t *tvd = rvd->vdev_child[c];
2408 nvlist_t **mchild;
2409 uint_t mchildren;
2410
2411 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
2412 VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
2413 0) == 0);
2414 VERIFY(nvlist_add_string(schild[schildren],
2415 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
2416 VERIFY(nvlist_add_uint64(schild[schildren],
2417 ZPOOL_CONFIG_IS_HOLE, 1) == 0);
2418 if (lastlogid == 0)
2419 lastlogid = schildren;
2420 ++schildren;
2421 continue;
2422 }
2423 lastlogid = 0;
2424 VERIFY(nvlist_lookup_nvlist_array(child[c],
2425 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
2426 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
2427 }
2428
2429 /* OK, create a config that can be used to split */
2430 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
2431 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
2432 VDEV_TYPE_ROOT) == 0);
2433 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
2434 lastlogid != 0 ? lastlogid : schildren) == 0);
2435
2436 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
2437 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
2438
2439 for (c = 0; c < schildren; c++)
2440 nvlist_free(schild[c]);
2441 free(schild);
2442 nvlist_free(split);
2443
2444 spa_config_exit(spa, SCL_VDEV, FTAG);
2445
1e33ac1e 2446 (void) rw_enter(&zs->zs_name_lock, RW_WRITER);
428870ff 2447 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
1e33ac1e 2448 (void) rw_exit(&zs->zs_name_lock);
428870ff
BB
2449
2450 nvlist_free(config);
2451
2452 if (error == 0) {
2453 (void) printf("successful split - results:\n");
2454 mutex_enter(&spa_namespace_lock);
2455 show_pool_stats(spa);
2456 show_pool_stats(spa_lookup("splitp"));
2457 mutex_exit(&spa_namespace_lock);
2458 ++zs->zs_splits;
2459 --zs->zs_mirrors;
2460 }
1e33ac1e 2461 mutex_exit(&zs->zs_vdev_lock);
428870ff 2462
34dc7c2f
BB
2463}
2464
2465/*
2466 * Verify that we can attach and detach devices.
2467 */
428870ff 2468/* ARGSUSED */
34dc7c2f 2469void
428870ff 2470ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
34dc7c2f 2471{
428870ff
BB
2472 ztest_shared_t *zs = ztest_shared;
2473 spa_t *spa = zs->zs_spa;
b128c09f 2474 spa_aux_vdev_t *sav = &spa->spa_spares;
34dc7c2f
BB
2475 vdev_t *rvd = spa->spa_root_vdev;
2476 vdev_t *oldvd, *newvd, *pvd;
b128c09f 2477 nvlist_t *root;
428870ff 2478 uint64_t leaves;
34dc7c2f
BB
2479 uint64_t leaf, top;
2480 uint64_t ashift = ztest_get_ashift();
fb5f0bc8 2481 uint64_t oldguid, pguid;
34dc7c2f 2482 size_t oldsize, newsize;
40b84e7a 2483 char *oldpath, *newpath;
34dc7c2f 2484 int replacing;
b128c09f
BB
2485 int oldvd_has_siblings = B_FALSE;
2486 int newvd_is_spare = B_FALSE;
2487 int oldvd_is_log;
34dc7c2f 2488 int error, expected_error;
34dc7c2f 2489
40b84e7a
BB
2490 oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
2491 newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
2492
1e33ac1e 2493 mutex_enter(&zs->zs_vdev_lock);
428870ff 2494 leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
34dc7c2f 2495
b128c09f 2496 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
34dc7c2f
BB
2497
2498 /*
2499 * Decide whether to do an attach or a replace.
2500 */
2501 replacing = ztest_random(2);
2502
2503 /*
2504 * Pick a random top-level vdev.
2505 */
428870ff 2506 top = ztest_random_vdev_top(spa, B_TRUE);
34dc7c2f
BB
2507
2508 /*
2509 * Pick a random leaf within it.
2510 */
2511 leaf = ztest_random(leaves);
2512
2513 /*
b128c09f 2514 * Locate this vdev.
34dc7c2f 2515 */
b128c09f 2516 oldvd = rvd->vdev_child[top];
428870ff 2517 if (zs->zs_mirrors >= 1) {
fb5f0bc8 2518 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
428870ff 2519 ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
b128c09f 2520 oldvd = oldvd->vdev_child[leaf / zopt_raidz];
fb5f0bc8
BB
2521 }
2522 if (zopt_raidz > 1) {
2523 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
2524 ASSERT(oldvd->vdev_children == zopt_raidz);
b128c09f 2525 oldvd = oldvd->vdev_child[leaf % zopt_raidz];
fb5f0bc8 2526 }
34dc7c2f
BB
2527
2528 /*
b128c09f
BB
2529 * If we're already doing an attach or replace, oldvd may be a
2530 * mirror vdev -- in which case, pick a random child.
34dc7c2f 2531 */
b128c09f
BB
2532 while (oldvd->vdev_children != 0) {
2533 oldvd_has_siblings = B_TRUE;
fb5f0bc8
BB
2534 ASSERT(oldvd->vdev_children >= 2);
2535 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
b128c09f
BB
2536 }
2537
2538 oldguid = oldvd->vdev_guid;
9babb374 2539 oldsize = vdev_get_min_asize(oldvd);
b128c09f
BB
2540 oldvd_is_log = oldvd->vdev_top->vdev_islog;
2541 (void) strcpy(oldpath, oldvd->vdev_path);
2542 pvd = oldvd->vdev_parent;
fb5f0bc8 2543 pguid = pvd->vdev_guid;
34dc7c2f
BB
2544
2545 /*
b128c09f 2546 * If oldvd has siblings, then half of the time, detach it.
34dc7c2f 2547 */
b128c09f
BB
2548 if (oldvd_has_siblings && ztest_random(2) == 0) {
2549 spa_config_exit(spa, SCL_VDEV, FTAG);
fb5f0bc8
BB
2550 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
2551 if (error != 0 && error != ENODEV && error != EBUSY &&
2552 error != ENOTSUP)
2553 fatal(0, "detach (%s) returned %d", oldpath, error);
40b84e7a 2554 goto out;
b128c09f 2555 }
34dc7c2f
BB
2556
2557 /*
b128c09f
BB
2558 * For the new vdev, choose with equal probability between the two
2559 * standard paths (ending in either 'a' or 'b') or a random hot spare.
34dc7c2f 2560 */
b128c09f
BB
2561 if (sav->sav_count != 0 && ztest_random(3) == 0) {
2562 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
2563 newvd_is_spare = B_TRUE;
2564 (void) strcpy(newpath, newvd->vdev_path);
2565 } else {
2566 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
2567 zopt_dir, zopt_pool, top * leaves + leaf);
2568 if (ztest_random(2) == 0)
2569 newpath[strlen(newpath) - 1] = 'b';
2570 newvd = vdev_lookup_by_path(rvd, newpath);
2571 }
2572
2573 if (newvd) {
9babb374 2574 newsize = vdev_get_min_asize(newvd);
b128c09f
BB
2575 } else {
2576 /*
2577 * Make newsize a little bigger or smaller than oldsize.
2578 * If it's smaller, the attach should fail.
2579 * If it's larger, and we're doing a replace,
2580 * we should get dynamic LUN growth when we're done.
2581 */
2582 newsize = 10 * oldsize / (9 + ztest_random(3));
2583 }
34dc7c2f
BB
2584
2585 /*
2586 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
2587 * unless it's a replace; in that case any non-replacing parent is OK.
2588 *
2589 * If newvd is already part of the pool, it should fail with EBUSY.
2590 *
2591 * If newvd is too small, it should fail with EOVERFLOW.
2592 */
b128c09f
BB
2593 if (pvd->vdev_ops != &vdev_mirror_ops &&
2594 pvd->vdev_ops != &vdev_root_ops && (!replacing ||
2595 pvd->vdev_ops == &vdev_replacing_ops ||
2596 pvd->vdev_ops == &vdev_spare_ops))
34dc7c2f 2597 expected_error = ENOTSUP;
b128c09f
BB
2598 else if (newvd_is_spare && (!replacing || oldvd_is_log))
2599 expected_error = ENOTSUP;
2600 else if (newvd == oldvd)
2601 expected_error = replacing ? 0 : EBUSY;
2602 else if (vdev_lookup_by_path(rvd, newpath) != NULL)
2603 expected_error = EBUSY;
34dc7c2f
BB
2604 else if (newsize < oldsize)
2605 expected_error = EOVERFLOW;
2606 else if (ashift > oldvd->vdev_top->vdev_ashift)
2607 expected_error = EDOM;
2608 else
2609 expected_error = 0;
2610
b128c09f 2611 spa_config_exit(spa, SCL_VDEV, FTAG);
34dc7c2f
BB
2612
2613 /*
2614 * Build the nvlist describing newpath.
2615 */
b128c09f
BB
2616 root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0,
2617 ashift, 0, 0, 0, 1);
34dc7c2f 2618
b128c09f 2619 error = spa_vdev_attach(spa, oldguid, root, replacing);
34dc7c2f 2620
34dc7c2f
BB
2621 nvlist_free(root);
2622
2623 /*
2624 * If our parent was the replacing vdev, but the replace completed,
2625 * then instead of failing with ENOTSUP we may either succeed,
2626 * fail with ENODEV, or fail with EOVERFLOW.
2627 */
2628 if (expected_error == ENOTSUP &&
2629 (error == 0 || error == ENODEV || error == EOVERFLOW))
2630 expected_error = error;
2631
2632 /*
2633 * If someone grew the LUN, the replacement may be too small.
2634 */
b128c09f 2635 if (error == EOVERFLOW || error == EBUSY)
34dc7c2f
BB
2636 expected_error = error;
2637
b128c09f
BB
2638 /* XXX workaround 6690467 */
2639 if (error != expected_error && expected_error != EBUSY) {
2640 fatal(0, "attach (%s %llu, %s %llu, %d) "
2641 "returned %d, expected %d",
2642 oldpath, (longlong_t)oldsize, newpath,
2643 (longlong_t)newsize, replacing, error, expected_error);
34dc7c2f 2644 }
40b84e7a 2645out:
1e33ac1e 2646 mutex_exit(&zs->zs_vdev_lock);
40b84e7a
BB
2647
2648 umem_free(oldpath, MAXPATHLEN);
2649 umem_free(newpath, MAXPATHLEN);
34dc7c2f
BB
2650}
2651
9babb374
BB
2652/*
2653 * Callback function which expands the physical size of the vdev.
2654 */
2655vdev_t *
2656grow_vdev(vdev_t *vd, void *arg)
2657{
1fde1e37 2658 ASSERTV(spa_t *spa = vd->vdev_spa);
9babb374
BB
2659 size_t *newsize = arg;
2660 size_t fsize;
2661 int fd;
2662
2663 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2664 ASSERT(vd->vdev_ops->vdev_op_leaf);
2665
2666 if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
2667 return (vd);
2668
2669 fsize = lseek(fd, 0, SEEK_END);
0e5b68e0 2670 VERIFY(ftruncate(fd, *newsize) == 0);
9babb374
BB
2671
2672 if (zopt_verbose >= 6) {
2673 (void) printf("%s grew from %lu to %lu bytes\n",
2674 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
2675 }
2676 (void) close(fd);
2677 return (NULL);
2678}
2679
2680/*
2681 * Callback function which expands a given vdev by calling vdev_online().
2682 */
2683/* ARGSUSED */
2684vdev_t *
2685online_vdev(vdev_t *vd, void *arg)
2686{
2687 spa_t *spa = vd->vdev_spa;
2688 vdev_t *tvd = vd->vdev_top;
9babb374 2689 uint64_t guid = vd->vdev_guid;
428870ff
BB
2690 uint64_t generation = spa->spa_config_generation + 1;
2691 vdev_state_t newstate = VDEV_STATE_UNKNOWN;
2692 int error;
9babb374
BB
2693
2694 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2695 ASSERT(vd->vdev_ops->vdev_op_leaf);
2696
2697 /* Calling vdev_online will initialize the new metaslabs */
2698 spa_config_exit(spa, SCL_STATE, spa);
428870ff 2699 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
9babb374
BB
2700 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
2701
428870ff
BB
2702 /*
2703 * If vdev_online returned an error or the underlying vdev_open
2704 * failed then we abort the expand. The only way to know that
2705 * vdev_open fails is by checking the returned newstate.
2706 */
2707 if (error || newstate != VDEV_STATE_HEALTHY) {
2708 if (zopt_verbose >= 5) {
2709 (void) printf("Unable to expand vdev, state %llu, "
2710 "error %d\n", (u_longlong_t)newstate, error);
2711 }
2712 return (vd);
2713 }
2714 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
2715
9babb374
BB
2716 /*
2717 * Since we dropped the lock we need to ensure that we're
2718 * still talking to the original vdev. It's possible this
2719 * vdev may have been detached/replaced while we were
2720 * trying to online it.
2721 */
428870ff
BB
2722 if (generation != spa->spa_config_generation) {
2723 if (zopt_verbose >= 5) {
2724 (void) printf("vdev configuration has changed, "
2725 "guid %llu, state %llu, expected gen %llu, "
2726 "got gen %llu\n",
2727 (u_longlong_t)guid,
2728 (u_longlong_t)tvd->vdev_state,
2729 (u_longlong_t)generation,
2730 (u_longlong_t)spa->spa_config_generation);
9babb374
BB
2731 }
2732 return (vd);
2733 }
2734 return (NULL);
2735}
2736
2737/*
2738 * Traverse the vdev tree calling the supplied function.
2739 * We continue to walk the tree until we either have walked all
2740 * children or we receive a non-NULL return from the callback.
2741 * If a NULL callback is passed, then we just return back the first
2742 * leaf vdev we encounter.
2743 */
2744vdev_t *
2745vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
2746{
d6320ddb
BB
2747 uint_t c;
2748
9babb374
BB
2749 if (vd->vdev_ops->vdev_op_leaf) {
2750 if (func == NULL)
2751 return (vd);
2752 else
2753 return (func(vd, arg));
2754 }
2755
d6320ddb 2756 for (c = 0; c < vd->vdev_children; c++) {
9babb374
BB
2757 vdev_t *cvd = vd->vdev_child[c];
2758 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
2759 return (cvd);
2760 }
2761 return (NULL);
2762}
2763
34dc7c2f
BB
2764/*
2765 * Verify that dynamic LUN growth works as expected.
2766 */
428870ff 2767/* ARGSUSED */
34dc7c2f 2768void
428870ff 2769ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
34dc7c2f 2770{
428870ff
BB
2771 ztest_shared_t *zs = ztest_shared;
2772 spa_t *spa = zs->zs_spa;
2773 vdev_t *vd, *tvd;
2774 metaslab_class_t *mc;
2775 metaslab_group_t *mg;
9babb374 2776 size_t psize, newsize;
428870ff
BB
2777 uint64_t top;
2778 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
34dc7c2f 2779
1e33ac1e 2780 mutex_enter(&zs->zs_vdev_lock);
9babb374
BB
2781 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
2782
428870ff 2783 top = ztest_random_vdev_top(spa, B_TRUE);
9babb374 2784
428870ff
BB
2785 tvd = spa->spa_root_vdev->vdev_child[top];
2786 mg = tvd->vdev_mg;
2787 mc = mg->mg_class;
2788 old_ms_count = tvd->vdev_ms_count;
2789 old_class_space = metaslab_class_get_space(mc);
34dc7c2f
BB
2790
2791 /*
9babb374
BB
2792 * Determine the size of the first leaf vdev associated with
2793 * our top-level device.
34dc7c2f 2794 */
9babb374
BB
2795 vd = vdev_walk_tree(tvd, NULL, NULL);
2796 ASSERT3P(vd, !=, NULL);
2797 ASSERT(vd->vdev_ops->vdev_op_leaf);
34dc7c2f 2798
9babb374 2799 psize = vd->vdev_psize;
34dc7c2f 2800
9babb374 2801 /*
428870ff
BB
2802 * We only try to expand the vdev if it's healthy, less than 4x its
2803 * original size, and it has a valid psize.
9babb374 2804 */
428870ff
BB
2805 if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
2806 psize == 0 || psize >= 4 * zopt_vdev_size) {
9babb374 2807 spa_config_exit(spa, SCL_STATE, spa);
1e33ac1e 2808 mutex_exit(&zs->zs_vdev_lock);
9babb374
BB
2809 return;
2810 }
2811 ASSERT(psize > 0);
2812 newsize = psize + psize / 8;
2813 ASSERT3U(newsize, >, psize);
34dc7c2f 2814
9babb374 2815 if (zopt_verbose >= 6) {
428870ff 2816 (void) printf("Expanding LUN %s from %lu to %lu\n",
9babb374
BB
2817 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
2818 }
2819
9babb374
BB
2820 /*
2821 * Growing the vdev is a two step process:
2822 * 1). expand the physical size (i.e. relabel)
2823 * 2). online the vdev to create the new metaslabs
2824 */
2825 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
2826 vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
2827 tvd->vdev_state != VDEV_STATE_HEALTHY) {
2828 if (zopt_verbose >= 5) {
2829 (void) printf("Could not expand LUN because "
428870ff 2830 "the vdev configuration changed.\n");
34dc7c2f 2831 }
428870ff 2832 spa_config_exit(spa, SCL_STATE, spa);
1e33ac1e 2833 mutex_exit(&zs->zs_vdev_lock);
9babb374 2834 return;
34dc7c2f
BB
2835 }
2836
428870ff 2837 spa_config_exit(spa, SCL_STATE, spa);
9babb374
BB
2838
2839 /*
2840 * Expanding the LUN will update the config asynchronously,
2841 * thus we must wait for the async thread to complete any
2842 * pending tasks before proceeding.
2843 */
428870ff
BB
2844 for (;;) {
2845 boolean_t done;
2846 mutex_enter(&spa->spa_async_lock);
2847 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
2848 mutex_exit(&spa->spa_async_lock);
2849 if (done)
2850 break;
2851 txg_wait_synced(spa_get_dsl(spa), 0);
2852 (void) poll(NULL, 0, 100);
2853 }
9babb374
BB
2854
2855 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
428870ff
BB
2856
2857 tvd = spa->spa_root_vdev->vdev_child[top];
2858 new_ms_count = tvd->vdev_ms_count;
2859 new_class_space = metaslab_class_get_space(mc);
2860
2861 if (tvd->vdev_mg != mg || mg->mg_class != mc) {
2862 if (zopt_verbose >= 5) {
2863 (void) printf("Could not verify LUN expansion due to "
2864 "intervening vdev offline or remove.\n");
2865 }
2866 spa_config_exit(spa, SCL_STATE, spa);
1e33ac1e 2867 mutex_exit(&zs->zs_vdev_lock);
428870ff
BB
2868 return;
2869 }
2870
2871 /*
2872 * Make sure we were able to grow the vdev.
2873 */
2874 if (new_ms_count <= old_ms_count)
2875 fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
2876 old_ms_count, new_ms_count);
9babb374
BB
2877
2878 /*
2879 * Make sure we were able to grow the pool.
2880 */
428870ff
BB
2881 if (new_class_space <= old_class_space)
2882 fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
2883 old_class_space, new_class_space);
2884
2885 if (zopt_verbose >= 5) {
9babb374
BB
2886 char oldnumbuf[6], newnumbuf[6];
2887
428870ff
BB
2888 nicenum(old_class_space, oldnumbuf);
2889 nicenum(new_class_space, newnumbuf);
9babb374
BB
2890 (void) printf("%s grew from %s to %s\n",
2891 spa->spa_name, oldnumbuf, newnumbuf);
2892 }
428870ff 2893
9babb374 2894 spa_config_exit(spa, SCL_STATE, spa);
1e33ac1e 2895 mutex_exit(&zs->zs_vdev_lock);
34dc7c2f
BB
2896}
2897
428870ff
BB
2898/*
2899 * Verify that dmu_objset_{create,destroy,open,close} work as expected.
2900 */
34dc7c2f
BB
2901/* ARGSUSED */
2902static void
428870ff 2903ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
34dc7c2f
BB
2904{
2905 /*
428870ff 2906 * Create the objects common to all ztest datasets.
34dc7c2f 2907 */
428870ff 2908 VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
34dc7c2f 2909 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
428870ff 2910}
34dc7c2f 2911
428870ff
BB
2912static int
2913ztest_dataset_create(char *dsname)
2914{
2915 uint64_t zilset = ztest_random(100);
2916 int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
2917 ztest_objset_create_cb, NULL);
2918
2919 if (err || zilset < 80)
2920 return (err);
2921
2922 (void) printf("Setting dataset %s to sync always\n", dsname);
2923 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
2924 ZFS_SYNC_ALWAYS, B_FALSE));
34dc7c2f
BB
2925}
2926
428870ff 2927/* ARGSUSED */
34dc7c2f 2928static int
428870ff 2929ztest_objset_destroy_cb(const char *name, void *arg)
34dc7c2f 2930{
34dc7c2f 2931 objset_t *os;
428870ff 2932 dmu_object_info_t doi;
34dc7c2f
BB
2933 int error;
2934
2935 /*
2936 * Verify that the dataset contains a directory object.
2937 */
428870ff
BB
2938 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os));
2939 error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
34dc7c2f
BB
2940 if (error != ENOENT) {
2941 /* We could have crashed in the middle of destroying it */
2942 ASSERT3U(error, ==, 0);
428870ff
BB
2943 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
2944 ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
34dc7c2f 2945 }
428870ff 2946 dmu_objset_rele(os, FTAG);
34dc7c2f
BB
2947
2948 /*
2949 * Destroy the dataset.
2950 */
428870ff 2951 VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE));
34dc7c2f
BB
2952 return (0);
2953}
2954
428870ff
BB
2955static boolean_t
2956ztest_snapshot_create(char *osname, uint64_t id)
34dc7c2f 2957{
428870ff
BB
2958 char snapname[MAXNAMELEN];
2959 int error;
2960
2961 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
2962 (u_longlong_t)id);
2963
2964 error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1,
572e2857 2965 NULL, NULL, B_FALSE, B_FALSE, -1);
428870ff
BB
2966 if (error == ENOSPC) {
2967 ztest_record_enospc(FTAG);
2968 return (B_FALSE);
2969 }
2970 if (error != 0 && error != EEXIST)
2971 fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error);
2972 return (B_TRUE);
2973}
2974
2975static boolean_t
2976ztest_snapshot_destroy(char *osname, uint64_t id)
2977{
2978 char snapname[MAXNAMELEN];
2979 int error;
2980
2981 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
2982 (u_longlong_t)id);
2983
2984 error = dmu_objset_destroy(snapname, B_FALSE);
2985 if (error != 0 && error != ENOENT)
2986 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
2987 return (B_TRUE);
34dc7c2f
BB
2988}
2989
428870ff 2990/* ARGSUSED */
34dc7c2f 2991void
428870ff 2992ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
34dc7c2f 2993{
428870ff 2994 ztest_shared_t *zs = ztest_shared;
40b84e7a 2995 ztest_ds_t *zdtmp;
428870ff 2996 int iters;
34dc7c2f 2997 int error;
b128c09f 2998 objset_t *os, *os2;
40b84e7a 2999 char *name;
34dc7c2f 3000 zilog_t *zilog;
d6320ddb 3001 int i;
34dc7c2f 3002
40b84e7a
BB
3003 zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL);
3004 name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3005
1e33ac1e 3006 (void) rw_enter(&zs->zs_name_lock, RW_READER);
34dc7c2f 3007
428870ff
BB
3008 (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
3009 zs->zs_pool, (u_longlong_t)id);
34dc7c2f
BB
3010
3011 /*
3012 * If this dataset exists from a previous run, process its replay log
3013 * half of the time. If we don't replay it, then dmu_objset_destroy()
428870ff 3014 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
34dc7c2f
BB
3015 */
3016 if (ztest_random(2) == 0 &&
428870ff 3017 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
40b84e7a
BB
3018 ztest_zd_init(zdtmp, os);
3019 zil_replay(os, zdtmp, ztest_replay_vector);
3020 ztest_zd_fini(zdtmp);
428870ff 3021 dmu_objset_disown(os, FTAG);
34dc7c2f
BB
3022 }
3023
3024 /*
3025 * There may be an old instance of the dataset we're about to
3026 * create lying around from a previous run. If so, destroy it
3027 * and all of its snapshots.
3028 */
428870ff 3029 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
34dc7c2f
BB
3030 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
3031
3032 /*
3033 * Verify that the destroyed dataset is no longer in the namespace.
3034 */
428870ff 3035 VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os));
34dc7c2f
BB
3036
3037 /*
3038 * Verify that we can create a new dataset.
3039 */
428870ff 3040 error = ztest_dataset_create(name);
34dc7c2f
BB
3041 if (error) {
3042 if (error == ENOSPC) {
428870ff 3043 ztest_record_enospc(FTAG);
40b84e7a 3044 goto out;
34dc7c2f
BB
3045 }
3046 fatal(0, "dmu_objset_create(%s) = %d", name, error);
3047 }
3048
428870ff
BB
3049 VERIFY3U(0, ==,
3050 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
3051
40b84e7a 3052 ztest_zd_init(zdtmp, os);
34dc7c2f
BB
3053
3054 /*
3055 * Open the intent log for it.
3056 */
428870ff 3057 zilog = zil_open(os, ztest_get_data);
34dc7c2f
BB
3058
3059 /*
428870ff
BB
3060 * Put some objects in there, do a little I/O to them,
3061 * and randomly take a couple of snapshots along the way.
34dc7c2f 3062 */
428870ff 3063 iters = ztest_random(5);
d6320ddb 3064 for (i = 0; i < iters; i++) {
40b84e7a 3065 ztest_dmu_object_alloc_free(zdtmp, id);
428870ff
BB
3066 if (ztest_random(iters) == 0)
3067 (void) ztest_snapshot_create(name, i);
34dc7c2f
BB
3068 }
3069
3070 /*
3071 * Verify that we cannot create an existing dataset.
3072 */
428870ff
BB
3073 VERIFY3U(EEXIST, ==,
3074 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
34dc7c2f
BB
3075
3076 /*
428870ff 3077 * Verify that we can hold an objset that is also owned.
b128c09f 3078 */
428870ff
BB
3079 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
3080 dmu_objset_rele(os2, FTAG);
34dc7c2f 3081
428870ff
BB
3082 /*
3083 * Verify that we cannot own an objset that is already owned.
3084 */
3085 VERIFY3U(EBUSY, ==,
3086 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
34dc7c2f 3087
428870ff
BB
3088 zil_close(zilog);
3089 dmu_objset_disown(os, FTAG);
40b84e7a
BB
3090 ztest_zd_fini(zdtmp);
3091out:
1e33ac1e 3092 (void) rw_exit(&zs->zs_name_lock);
40b84e7a
BB
3093
3094 umem_free(name, MAXNAMELEN);
3095 umem_free(zdtmp, sizeof (ztest_ds_t));
34dc7c2f
BB
3096}
3097
3098/*
3099 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
3100 */
3101void
428870ff 3102ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
34dc7c2f 3103{
428870ff 3104 ztest_shared_t *zs = ztest_shared;
34dc7c2f 3105
1e33ac1e 3106 (void) rw_enter(&zs->zs_name_lock, RW_READER);
428870ff
BB
3107 (void) ztest_snapshot_destroy(zd->zd_name, id);
3108 (void) ztest_snapshot_create(zd->zd_name, id);
1e33ac1e 3109 (void) rw_exit(&zs->zs_name_lock);
34dc7c2f
BB
3110}
3111
9babb374
BB
3112/*
3113 * Cleanup non-standard snapshots and clones.
3114 */
3115void
428870ff 3116ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
9babb374 3117{
40b84e7a
BB
3118 char *snap1name;
3119 char *clone1name;
3120 char *snap2name;
3121 char *clone2name;
3122 char *snap3name;
9babb374
BB
3123 int error;
3124
40b84e7a
BB
3125 snap1name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3126 clone1name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3127 snap2name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3128 clone2name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3129 snap3name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3130
b8864a23
BB
3131 (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu",
3132 osname, (u_longlong_t)id);
3133 (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu",
3134 osname, (u_longlong_t)id);
3135 (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu",
3136 clone1name, (u_longlong_t)id);
3137 (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu",
3138 osname, (u_longlong_t)id);
3139 (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
3140 clone1name, (u_longlong_t)id);
9babb374 3141
45d1cae3 3142 error = dmu_objset_destroy(clone2name, B_FALSE);
9babb374
BB
3143 if (error && error != ENOENT)
3144 fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
45d1cae3 3145 error = dmu_objset_destroy(snap3name, B_FALSE);
9babb374
BB
3146 if (error && error != ENOENT)
3147 fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
45d1cae3 3148 error = dmu_objset_destroy(snap2name, B_FALSE);
9babb374
BB
3149 if (error && error != ENOENT)
3150 fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
45d1cae3 3151 error = dmu_objset_destroy(clone1name, B_FALSE);
9babb374
BB
3152 if (error && error != ENOENT)
3153 fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
45d1cae3 3154 error = dmu_objset_destroy(snap1name, B_FALSE);
9babb374
BB
3155 if (error && error != ENOENT)
3156 fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
40b84e7a
BB
3157
3158 umem_free(snap1name, MAXNAMELEN);
3159 umem_free(clone1name, MAXNAMELEN);
3160 umem_free(snap2name, MAXNAMELEN);
3161 umem_free(clone2name, MAXNAMELEN);
3162 umem_free(snap3name, MAXNAMELEN);
9babb374
BB
3163}
3164
3165/*
3166 * Verify dsl_dataset_promote handles EBUSY
3167 */
3168void
428870ff 3169ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
9babb374 3170{
428870ff 3171 ztest_shared_t *zs = ztest_shared;
9babb374
BB
3172 objset_t *clone;
3173 dsl_dataset_t *ds;
40b84e7a
BB
3174 char *snap1name;
3175 char *clone1name;
3176 char *snap2name;
3177 char *clone2name;
3178 char *snap3name;
428870ff
BB
3179 char *osname = zd->zd_name;
3180 int error;
9babb374 3181
40b84e7a
BB
3182 snap1name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3183 clone1name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3184 snap2name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3185 clone2name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3186 snap3name = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
3187
1e33ac1e 3188 (void) rw_enter(&zs->zs_name_lock, RW_READER);
9babb374 3189
428870ff 3190 ztest_dsl_dataset_cleanup(osname, id);
9babb374 3191
b8864a23
BB
3192 (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu",
3193 osname, (u_longlong_t)id);
3194 (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu",
3195 osname, (u_longlong_t)id);
3196 (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu",
3197 clone1name, (u_longlong_t)id);
3198 (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu",
3199 osname, (u_longlong_t)id);
3200 (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
3201 clone1name, (u_longlong_t)id);
9babb374
BB
3202
3203 error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
572e2857 3204 NULL, NULL, B_FALSE, B_FALSE, -1);
9babb374
BB
3205 if (error && error != EEXIST) {
3206 if (error == ENOSPC) {
428870ff 3207 ztest_record_enospc(FTAG);
9babb374
BB
3208 goto out;
3209 }
3210 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
3211 }
3212
428870ff 3213 error = dmu_objset_hold(snap1name, FTAG, &clone);
9babb374
BB
3214 if (error)
3215 fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
3216
428870ff
BB
3217 error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0);
3218 dmu_objset_rele(clone, FTAG);
9babb374
BB
3219 if (error) {
3220 if (error == ENOSPC) {
428870ff 3221 ztest_record_enospc(FTAG);
9babb374
BB
3222 goto out;
3223 }
3224 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
3225 }
3226
428870ff 3227 error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
572e2857 3228 NULL, NULL, B_FALSE, B_FALSE, -1);
428870ff
BB
3229 if (error && error != EEXIST) {
3230 if (error == ENOSPC) {
3231 ztest_record_enospc(FTAG);
3232 goto out;
34dc7c2f 3233 }
428870ff
BB
3234 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
3235 }
34dc7c2f 3236
428870ff 3237 error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
572e2857 3238 NULL, NULL, B_FALSE, B_FALSE, -1);
428870ff
BB
3239 if (error && error != EEXIST) {
3240 if (error == ENOSPC) {
3241 ztest_record_enospc(FTAG);
3242 goto out;
3243 }
3244 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
3245 }
34dc7c2f 3246
428870ff
BB
3247 error = dmu_objset_hold(snap3name, FTAG, &clone);
3248 if (error)
3249 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
34dc7c2f 3250
428870ff
BB
3251 error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0);
3252 dmu_objset_rele(clone, FTAG);
3253 if (error) {
3254 if (error == ENOSPC) {
3255 ztest_record_enospc(FTAG);
3256 goto out;
3257 }
3258 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
3259 }
34dc7c2f 3260
428870ff
BB
3261 error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
3262 if (error)
3263 fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
3264 error = dsl_dataset_promote(clone2name, NULL);
3265 if (error != EBUSY)
3266 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
3267 error);
3268 dsl_dataset_disown(ds, FTAG);
34dc7c2f 3269
428870ff
BB
3270out:
3271 ztest_dsl_dataset_cleanup(osname, id);
34dc7c2f 3272
1e33ac1e 3273 (void) rw_exit(&zs->zs_name_lock);
40b84e7a
BB
3274
3275 umem_free(snap1name, MAXNAMELEN);
3276 umem_free(clone1name, MAXNAMELEN);
3277 umem_free(snap2name, MAXNAMELEN);
3278 umem_free(clone2name, MAXNAMELEN);
3279 umem_free(snap3name, MAXNAMELEN);
428870ff 3280}
34dc7c2f 3281
40b84e7a
BB
3282#undef OD_ARRAY_SIZE
3283#define OD_ARRAY_SIZE 4
3284
428870ff
BB
3285/*
3286 * Verify that dmu_object_{alloc,free} work as expected.
3287 */
3288void
3289ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
3290{
40b84e7a
BB
3291 ztest_od_t *od;
3292 int batchsize;
3293 int size;
d6320ddb 3294 int b;
34dc7c2f 3295
40b84e7a
BB
3296 size = sizeof(ztest_od_t) * OD_ARRAY_SIZE;
3297 od = umem_alloc(size, UMEM_NOFAIL);
3298 batchsize = OD_ARRAY_SIZE;
3299
d6320ddb 3300 for (b = 0; b < batchsize; b++)
40b84e7a 3301 ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
34dc7c2f 3302
428870ff
BB
3303 /*
3304 * Destroy the previous batch of objects, create a new batch,
3305 * and do some I/O on the new objects.
3306 */
40b84e7a 3307 if (ztest_object_init(zd, od, size, B_TRUE) != 0)
428870ff 3308 return;
34dc7c2f 3309
428870ff
BB
3310 while (ztest_random(4 * batchsize) != 0)
3311 ztest_io(zd, od[ztest_random(batchsize)].od_object,
3312 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
40b84e7a
BB
3313
3314 umem_free(od, size);
34dc7c2f
BB
3315}
3316
40b84e7a
BB
3317#undef OD_ARRAY_SIZE
3318#define OD_ARRAY_SIZE 2
3319
34dc7c2f
BB
3320/*
3321 * Verify that dmu_{read,write} work as expected.
3322 */
34dc7c2f 3323void
428870ff 3324ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
34dc7c2f 3325{
40b84e7a
BB
3326 int size;
3327 ztest_od_t *od;
3328
428870ff 3329 objset_t *os = zd->zd_os;
40b84e7a
BB
3330 size = sizeof(ztest_od_t) * OD_ARRAY_SIZE;
3331 od = umem_alloc(size, UMEM_NOFAIL);
34dc7c2f
BB
3332 dmu_tx_t *tx;
3333 int i, freeit, error;
3334 uint64_t n, s, txg;
3335 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
428870ff
BB
3336 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
3337 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
34dc7c2f
BB
3338 uint64_t regions = 997;
3339 uint64_t stride = 123456789ULL;
3340 uint64_t width = 40;
3341 int free_percent = 5;
3342
3343 /*
3344 * This test uses two objects, packobj and bigobj, that are always
3345 * updated together (i.e. in the same tx) so that their contents are
3346 * in sync and can be compared. Their contents relate to each other
3347 * in a simple way: packobj is a dense array of 'bufwad' structures,
3348 * while bigobj is a sparse array of the same bufwads. Specifically,
3349 * for any index n, there are three bufwads that should be identical:
3350 *
3351 * packobj, at offset n * sizeof (bufwad_t)
3352 * bigobj, at the head of the nth chunk
3353 * bigobj, at the tail of the nth chunk
3354 *
3355 * The chunk size is arbitrary. It doesn't have to be a power of two,
3356 * and it doesn't have any relation to the object blocksize.
3357 * The only requirement is that it can hold at least two bufwads.
3358 *
3359 * Normally, we write the bufwad to each of these locations.
3360 * However, free_percent of the time we instead write zeroes to
3361 * packobj and perform a dmu_free_range() on bigobj. By comparing
3362 * bigobj to packobj, we can verify that the DMU is correctly
3363 * tracking which parts of an object are allocated and free,
3364 * and that the contents of the allocated blocks are correct.
3365 */
3366
3367 /*
3368 * Read the directory info. If it's the first time, set things up.
3369 */
40b84e7a
BB
3370 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
3371 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
34dc7c2f 3372
40b84e7a
BB
3373 if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
3374 umem_free(od, size);
428870ff 3375 return;
40b84e7a 3376 }
34dc7c2f 3377
428870ff
BB
3378 bigobj = od[0].od_object;
3379 packobj = od[1].od_object;
3380 chunksize = od[0].od_gen;
3381 ASSERT(chunksize == od[1].od_gen);
34dc7c2f
BB
3382
3383 /*
3384 * Prefetch a random chunk of the big object.
3385 * Our aim here is to get some async reads in flight
3386 * for blocks that we may free below; the DMU should
3387 * handle this race correctly.
3388 */
3389 n = ztest_random(regions) * stride + ztest_random(width);
3390 s = 1 + ztest_random(2 * width - 1);
428870ff 3391 dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
34dc7c2f
BB
3392
3393 /*
3394 * Pick a random index and compute the offsets into packobj and bigobj.
3395 */
3396 n = ztest_random(regions) * stride + ztest_random(width);
3397 s = 1 + ztest_random(width - 1);
3398
3399 packoff = n * sizeof (bufwad_t);
3400 packsize = s * sizeof (bufwad_t);
3401
428870ff
BB
3402 bigoff = n * chunksize;
3403 bigsize = s * chunksize;
34dc7c2f
BB
3404
3405 packbuf = umem_alloc(packsize, UMEM_NOFAIL);
3406 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
3407
3408 /*
3409 * free_percent of the time, free a range of bigobj rather than
3410 * overwriting it.
3411 */
3412 freeit = (ztest_random(100) < free_percent);
3413
3414 /*
3415 * Read the current contents of our objects.
3416 */
428870ff 3417 error = dmu_read(os, packobj, packoff, packsize, packbuf,
9babb374 3418 DMU_READ_PREFETCH);
34dc7c2f 3419 ASSERT3U(error, ==, 0);
428870ff 3420 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
9babb374 3421 DMU_READ_PREFETCH);
34dc7c2f
BB
3422 ASSERT3U(error, ==, 0);
3423
3424 /*
3425 * Get a tx for the mods to both packobj and bigobj.
3426 */
3427 tx = dmu_tx_create(os);
3428
428870ff 3429 dmu_tx_hold_write(tx, packobj, packoff, packsize);
34dc7c2f
BB
3430
3431 if (freeit)
428870ff 3432 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
34dc7c2f 3433 else
428870ff 3434 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
34dc7c2f 3435
428870ff
BB
3436 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
3437 if (txg == 0) {
34dc7c2f
BB
3438 umem_free(packbuf, packsize);
3439 umem_free(bigbuf, bigsize);
40b84e7a 3440 umem_free(od, size);
34dc7c2f
BB
3441 return;
3442 }
3443
428870ff
BB
3444 dmu_object_set_checksum(os, bigobj,
3445 (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
3446
3447 dmu_object_set_compress(os, bigobj,
3448 (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
34dc7c2f
BB
3449
3450 /*
3451 * For each index from n to n + s, verify that the existing bufwad
3452 * in packobj matches the bufwads at the head and tail of the
3453 * corresponding chunk in bigobj. Then update all three bufwads
3454 * with the new values we want to write out.
3455 */
3456 for (i = 0; i < s; i++) {
3457 /* LINTED */
3458 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
3459 /* LINTED */
428870ff 3460 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
34dc7c2f 3461 /* LINTED */
428870ff 3462 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
34dc7c2f
BB
3463
3464 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
3465 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
3466
3467 if (pack->bw_txg > txg)
3468 fatal(0, "future leak: got %llx, open txg is %llx",
3469 pack->bw_txg, txg);
3470
3471 if (pack->bw_data != 0 && pack->bw_index != n + i)
3472 fatal(0, "wrong index: got %llx, wanted %llx+%llx",
3473 pack->bw_index, n, i);
3474
3475 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
3476 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
3477
3478 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
3479 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
3480
3481 if (freeit) {
3482 bzero(pack, sizeof (bufwad_t));
3483 } else {
3484 pack->bw_index = n + i;
3485 pack->bw_txg = txg;
3486 pack->bw_data = 1 + ztest_random(-2ULL);
3487 }
3488 *bigH = *pack;
3489 *bigT = *pack;
3490 }
3491
3492 /*
3493 * We've verified all the old bufwads, and made new ones.
3494 * Now write them out.
3495 */
428870ff 3496 dmu_write(os, packobj, packoff, packsize, packbuf, tx);
34dc7c2f
BB
3497
3498 if (freeit) {
428870ff 3499 if (zopt_verbose >= 7) {
34dc7c2f
BB
3500 (void) printf("freeing offset %llx size %llx"
3501 " txg %llx\n",
3502 (u_longlong_t)bigoff,
3503 (u_longlong_t)bigsize,
3504 (u_longlong_t)txg);
3505 }
428870ff 3506 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
34dc7c2f 3507 } else {
428870ff 3508 if (zopt_verbose >= 7) {
34dc7c2f
BB
3509 (void) printf("writing offset %llx size %llx"
3510 " txg %llx\n",
3511 (u_longlong_t)bigoff,
3512 (u_longlong_t)bigsize,
3513 (u_longlong_t)txg);
3514 }
428870ff 3515 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
34dc7c2f
BB
3516 }
3517
3518 dmu_tx_commit(tx);
3519
3520 /*
3521 * Sanity check the stuff we just wrote.
3522 */
3523 {
3524 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
3525 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
3526
428870ff 3527 VERIFY(0 == dmu_read(os, packobj, packoff,
9babb374 3528 packsize, packcheck, DMU_READ_PREFETCH));
428870ff 3529 VERIFY(0 == dmu_read(os, bigobj, bigoff,
9babb374 3530 bigsize, bigcheck, DMU_READ_PREFETCH));
34dc7c2f
BB
3531
3532 ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
3533 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
3534
3535 umem_free(packcheck, packsize);
3536 umem_free(bigcheck, bigsize);
3537 }
3538
3539 umem_free(packbuf, packsize);
3540 umem_free(bigbuf, bigsize);
40b84e7a 3541 umem_free(od, size);
34dc7c2f
BB
3542}
3543
9babb374
BB
3544void
3545compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
428870ff 3546 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
9babb374
BB
3547{
3548 uint64_t i;
3549 bufwad_t *pack;
3550 bufwad_t *bigH;
3551 bufwad_t *bigT;
3552
3553 /*
3554 * For each index from n to n + s, verify that the existing bufwad
3555 * in packobj matches the bufwads at the head and tail of the
3556 * corresponding chunk in bigobj. Then update all three bufwads
3557 * with the new values we want to write out.
3558 */
3559 for (i = 0; i < s; i++) {
3560 /* LINTED */
3561 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
3562 /* LINTED */
428870ff 3563 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
9babb374 3564 /* LINTED */
428870ff 3565 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
9babb374
BB
3566
3567 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
3568 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
3569
3570 if (pack->bw_txg > txg)
3571 fatal(0, "future leak: got %llx, open txg is %llx",
3572 pack->bw_txg, txg);
3573
3574 if (pack->bw_data != 0 && pack->bw_index != n + i)
3575 fatal(0, "wrong index: got %llx, wanted %llx+%llx",
3576 pack->bw_index, n, i);
3577
3578 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
3579 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
3580
3581 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
3582 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
3583
3584 pack->bw_index = n + i;
3585 pack->bw_txg = txg;
3586 pack->bw_data = 1 + ztest_random(-2ULL);
3587
3588 *bigH = *pack;
3589 *bigT = *pack;
3590 }
3591}
3592
40b84e7a
BB
3593#undef OD_ARRAY_SIZE
3594#define OD_ARRAY_SIZE 2
3595
9babb374 3596void
428870ff 3597ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
9babb374 3598{
428870ff 3599 objset_t *os = zd->zd_os;
40b84e7a 3600 ztest_od_t *od;
9babb374
BB
3601 dmu_tx_t *tx;
3602 uint64_t i;
3603 int error;
40b84e7a 3604 int size;
9babb374
BB
3605 uint64_t n, s, txg;
3606 bufwad_t *packbuf, *bigbuf;
428870ff
BB
3607 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
3608 uint64_t blocksize = ztest_random_blocksize();
3609 uint64_t chunksize = blocksize;
9babb374
BB
3610 uint64_t regions = 997;
3611 uint64_t stride = 123456789ULL;
3612 uint64_t width = 9;
3613 dmu_buf_t *bonus_db;
3614 arc_buf_t **bigbuf_arcbufs;
428870ff 3615 dmu_object_info_t doi;
9babb374 3616
40b84e7a
BB
3617 size = sizeof(ztest_od_t) * OD_ARRAY_SIZE;
3618 od = umem_alloc(size, UMEM_NOFAIL);
3619
9babb374
BB
3620 /*
3621 * This test uses two objects, packobj and bigobj, that are always
3622 * updated together (i.e. in the same tx) so that their contents are
3623 * in sync and can be compared. Their contents relate to each other
3624 * in a simple way: packobj is a dense array of 'bufwad' structures,
3625 * while bigobj is a sparse array of the same bufwads. Specifically,
3626 * for any index n, there are three bufwads that should be identical:
3627 *
3628 * packobj, at offset n * sizeof (bufwad_t)
3629 * bigobj, at the head of the nth chunk
3630 * bigobj, at the tail of the nth chunk
3631 *
3632 * The chunk size is set equal to bigobj block size so that
3633 * dmu_assign_arcbuf() can be tested for object updates.
3634 */
3635
3636 /*
3637 * Read the directory info. If it's the first time, set things up.
3638 */
40b84e7a
BB
3639 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
3640 ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
3641
9babb374 3642
40b84e7a
BB
3643 if (ztest_object_init(zd, od, size, B_FALSE) != 0) {
3644 umem_free(od, size);
428870ff 3645 return;
40b84e7a 3646 }
9babb374 3647
428870ff
BB
3648 bigobj = od[0].od_object;
3649 packobj = od[1].od_object;
3650 blocksize = od[0].od_blocksize;
3651 chunksize = blocksize;
3652 ASSERT(chunksize == od[1].od_gen);
9babb374 3653
428870ff
BB
3654 VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
3655 VERIFY(ISP2(doi.doi_data_block_size));
3656 VERIFY(chunksize == doi.doi_data_block_size);
3657 VERIFY(chunksize >= 2 * sizeof (bufwad_t));
9babb374
BB
3658
3659 /*
3660 * Pick a random index and compute the offsets into packobj and bigobj.
3661 */
3662 n = ztest_random(regions) * stride + ztest_random(width);
3663 s = 1 + ztest_random(width - 1);
3664
3665 packoff = n * sizeof (bufwad_t);
3666 packsize = s * sizeof (bufwad_t);
3667
428870ff
BB
3668 bigoff = n * chunksize;
3669 bigsize = s * chunksize;
9babb374
BB
3670
3671 packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
3672 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
3673
428870ff 3674 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
9babb374
BB
3675
3676 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
3677
3678 /*
3679 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
3680 * Iteration 1 test zcopy to already referenced dbufs.
3681 * Iteration 2 test zcopy to dirty dbuf in the same txg.
3682 * Iteration 3 test zcopy to dbuf dirty in previous txg.
3683 * Iteration 4 test zcopy when dbuf is no longer dirty.
3684 * Iteration 5 test zcopy when it can't be done.
3685 * Iteration 6 one more zcopy write.
3686 */
3687 for (i = 0; i < 7; i++) {
3688 uint64_t j;
3689 uint64_t off;
3690
3691 /*
3692 * In iteration 5 (i == 5) use arcbufs
3693 * that don't match bigobj blksz to test
3694 * dmu_assign_arcbuf() when it can't directly
3695 * assign an arcbuf to a dbuf.
3696 */
3697 for (j = 0; j < s; j++) {
3698 if (i != 5) {
3699 bigbuf_arcbufs[j] =
428870ff 3700 dmu_request_arcbuf(bonus_db, chunksize);
9babb374
BB
3701 } else {
3702 bigbuf_arcbufs[2 * j] =
428870ff 3703 dmu_request_arcbuf(bonus_db, chunksize / 2);
9babb374 3704 bigbuf_arcbufs[2 * j + 1] =
428870ff 3705 dmu_request_arcbuf(bonus_db, chunksize / 2);
9babb374
BB
3706 }
3707 }
3708
3709 /*
3710 * Get a tx for the mods to both packobj and bigobj.
3711 */
3712 tx = dmu_tx_create(os);
3713
428870ff
BB
3714 dmu_tx_hold_write(tx, packobj, packoff, packsize);
3715 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
9babb374 3716
428870ff
BB
3717 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
3718 if (txg == 0) {
9babb374
BB
3719 umem_free(packbuf, packsize);
3720 umem_free(bigbuf, bigsize);
3721 for (j = 0; j < s; j++) {
3722 if (i != 5) {
3723 dmu_return_arcbuf(bigbuf_arcbufs[j]);
3724 } else {
3725 dmu_return_arcbuf(
3726 bigbuf_arcbufs[2 * j]);
3727 dmu_return_arcbuf(
3728 bigbuf_arcbufs[2 * j + 1]);
3729 }
3730 }
3731 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
40b84e7a 3732 umem_free(od, size);
9babb374
BB
3733 dmu_buf_rele(bonus_db, FTAG);
3734 return;
3735 }
3736
9babb374
BB
3737 /*
3738 * 50% of the time don't read objects in the 1st iteration to
3739 * test dmu_assign_arcbuf() for the case when there're no
3740 * existing dbufs for the specified offsets.
3741 */
3742 if (i != 0 || ztest_random(2) != 0) {
428870ff 3743 error = dmu_read(os, packobj, packoff,
9babb374
BB
3744 packsize, packbuf, DMU_READ_PREFETCH);
3745 ASSERT3U(error, ==, 0);
428870ff 3746 error = dmu_read(os, bigobj, bigoff, bigsize,
9babb374
BB
3747 bigbuf, DMU_READ_PREFETCH);
3748 ASSERT3U(error, ==, 0);
3749 }
3750 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
428870ff 3751 n, chunksize, txg);
9babb374
BB
3752
3753 /*
3754 * We've verified all the old bufwads, and made new ones.
3755 * Now write them out.
3756 */
428870ff
BB
3757 dmu_write(os, packobj, packoff, packsize, packbuf, tx);
3758 if (zopt_verbose >= 7) {
9babb374
BB
3759 (void) printf("writing offset %llx size %llx"
3760 " txg %llx\n",
3761 (u_longlong_t)bigoff,
3762 (u_longlong_t)bigsize,
3763 (u_longlong_t)txg);
3764 }
428870ff 3765 for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
9babb374
BB
3766 dmu_buf_t *dbt;
3767 if (i != 5) {
3768 bcopy((caddr_t)bigbuf + (off - bigoff),
428870ff 3769 bigbuf_arcbufs[j]->b_data, chunksize);
9babb374
BB
3770 } else {
3771 bcopy((caddr_t)bigbuf + (off - bigoff),
3772 bigbuf_arcbufs[2 * j]->b_data,
428870ff 3773 chunksize / 2);
9babb374 3774 bcopy((caddr_t)bigbuf + (off - bigoff) +
428870ff 3775 chunksize / 2,
9babb374 3776 bigbuf_arcbufs[2 * j + 1]->b_data,
428870ff 3777 chunksize / 2);
9babb374
BB
3778 }
3779
3780 if (i == 1) {
428870ff
BB
3781 VERIFY(dmu_buf_hold(os, bigobj, off,
3782 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
9babb374
BB
3783 }
3784 if (i != 5) {
3785 dmu_assign_arcbuf(bonus_db, off,
3786 bigbuf_arcbufs[j], tx);
3787 } else {
3788 dmu_assign_arcbuf(bonus_db, off,
3789 bigbuf_arcbufs[2 * j], tx);
3790 dmu_assign_arcbuf(bonus_db,
428870ff 3791 off + chunksize / 2,
9babb374
BB
3792 bigbuf_arcbufs[2 * j + 1], tx);
3793 }
3794 if (i == 1) {
3795 dmu_buf_rele(dbt, FTAG);
3796 }
3797 }
3798 dmu_tx_commit(tx);
3799
3800 /*
3801 * Sanity check the stuff we just wrote.
3802 */
3803 {
3804 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
3805 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
3806
428870ff 3807 VERIFY(0 == dmu_read(os, packobj, packoff,
9babb374 3808 packsize, packcheck, DMU_READ_PREFETCH));
428870ff 3809 VERIFY(0 == dmu_read(os, bigobj, bigoff,
9babb374
BB
3810 bigsize, bigcheck, DMU_READ_PREFETCH));
3811
3812 ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
3813 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
3814
3815 umem_free(packcheck, packsize);
3816 umem_free(bigcheck, bigsize);
3817 }
3818 if (i == 2) {
3819 txg_wait_open(dmu_objset_pool(os), 0);
3820 } else if (i == 3) {
3821 txg_wait_synced(dmu_objset_pool(os), 0);
3822 }
3823 }
3824
3825 dmu_buf_rele(bonus_db, FTAG);
3826 umem_free(packbuf, packsize);
3827 umem_free(bigbuf, bigsize);
3828 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
40b84e7a 3829 umem_free(od, size);
9babb374
BB
3830}
3831
428870ff 3832/* ARGSUSED */
34dc7c2f 3833void
428870ff 3834ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
34dc7c2f 3835{
40b84e7a
BB
3836 ztest_od_t *od;
3837
3838 od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
428870ff
BB
3839 uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
3840 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
34dc7c2f
BB
3841
3842 /*
428870ff
BB
3843 * Have multiple threads write to large offsets in an object
3844 * to verify that parallel writes to an object -- even to the
3845 * same blocks within the object -- doesn't cause any trouble.
34dc7c2f 3846 */
40b84e7a 3847 ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
9babb374 3848
40b84e7a 3849 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0)
34dc7c2f 3850 return;
34dc7c2f 3851
428870ff 3852 while (ztest_random(10) != 0)
40b84e7a
BB
3853 ztest_io(zd, od->od_object, offset);
3854
3855 umem_free(od, sizeof(ztest_od_t));
428870ff 3856}
34dc7c2f 3857
428870ff
BB
3858void
3859ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
3860{
40b84e7a 3861 ztest_od_t *od;
428870ff
BB
3862 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
3863 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
3864 uint64_t count = ztest_random(20) + 1;
3865 uint64_t blocksize = ztest_random_blocksize();
3866 void *data;
34dc7c2f 3867
40b84e7a 3868 od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
34dc7c2f 3869
40b84e7a
BB
3870 ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
3871
3872 if (ztest_object_init(zd, od, sizeof (ztest_od_t), !ztest_random(2)) != 0) {
3873 umem_free(od, sizeof(ztest_od_t));
34dc7c2f 3874 return;
40b84e7a 3875 }
34dc7c2f 3876
40b84e7a
BB
3877 if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) {
3878 umem_free(od, sizeof(ztest_od_t));
34dc7c2f 3879 return;
40b84e7a 3880 }
34dc7c2f 3881
40b84e7a 3882 ztest_prealloc(zd, od->od_object, offset, count * blocksize);
34dc7c2f 3883
428870ff 3884 data = umem_zalloc(blocksize, UMEM_NOFAIL);
34dc7c2f 3885
428870ff
BB
3886 while (ztest_random(count) != 0) {
3887 uint64_t randoff = offset + (ztest_random(count) * blocksize);
40b84e7a 3888 if (ztest_write(zd, od->od_object, randoff, blocksize,
428870ff
BB
3889 data) != 0)
3890 break;
3891 while (ztest_random(4) != 0)
40b84e7a 3892 ztest_io(zd, od->od_object, randoff);
9babb374 3893 }
34dc7c2f 3894
428870ff 3895 umem_free(data, blocksize);
40b84e7a 3896 umem_free(od, sizeof(ztest_od_t));
34dc7c2f
BB
3897}
3898
3899/*
3900 * Verify that zap_{create,destroy,add,remove,update} work as expected.
3901 */
3902#define ZTEST_ZAP_MIN_INTS 1
3903#define ZTEST_ZAP_MAX_INTS 4
3904#define ZTEST_ZAP_MAX_PROPS 1000
3905
3906void
428870ff 3907ztest_zap(ztest_ds_t *zd, uint64_t id)
34dc7c2f 3908{
428870ff 3909 objset_t *os = zd->zd_os;
40b84e7a 3910 ztest_od_t *od;
34dc7c2f
BB
3911 uint64_t object;
3912 uint64_t txg, last_txg;
3913 uint64_t value[ZTEST_ZAP_MAX_INTS];
3914 uint64_t zl_ints, zl_intsize, prop;
3915 int i, ints;
3916 dmu_tx_t *tx;
3917 char propname[100], txgname[100];
3918 int error;
34dc7c2f
BB
3919 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
3920
40b84e7a
BB
3921 od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
3922 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
34dc7c2f 3923
40b84e7a
BB
3924 if (ztest_object_init(zd, od, sizeof (ztest_od_t),
3925 !ztest_random(2)) != 0)
3926 goto out;
34dc7c2f 3927
40b84e7a 3928 object = od->od_object;
34dc7c2f 3929
428870ff
BB
3930 /*
3931 * Generate a known hash collision, and verify that
3932 * we can lookup and remove both entries.
3933 */
3934 tx = dmu_tx_create(os);
3935 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
3936 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
3937 if (txg == 0)
40b84e7a 3938 goto out;
428870ff
BB
3939 for (i = 0; i < 2; i++) {
3940 value[i] = i;
3941 VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
3942 1, &value[i], tx));
3943 }
3944 for (i = 0; i < 2; i++) {
3945 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
3946 sizeof (uint64_t), 1, &value[i], tx));
3947 VERIFY3U(0, ==,
3948 zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
3949 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
3950 ASSERT3U(zl_ints, ==, 1);
3951 }
3952 for (i = 0; i < 2; i++) {
3953 VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
34dc7c2f 3954 }
428870ff 3955 dmu_tx_commit(tx);
34dc7c2f 3956
428870ff
BB
3957 /*
3958 * Generate a buch of random entries.
3959 */
34dc7c2f
BB
3960 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
3961
3962 prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
3963 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
3964 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
3965 bzero(value, sizeof (value));
3966 last_txg = 0;
3967
3968 /*
3969 * If these zap entries already exist, validate their contents.
3970 */
3971 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
3972 if (error == 0) {
3973 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
3974 ASSERT3U(zl_ints, ==, 1);
3975
3976 VERIFY(zap_lookup(os, object, txgname, zl_intsize,
3977 zl_ints, &last_txg) == 0);
3978
3979 VERIFY(zap_length(os, object, propname, &zl_intsize,
3980 &zl_ints) == 0);
3981
3982 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
3983 ASSERT3U(zl_ints, ==, ints);
3984
3985 VERIFY(zap_lookup(os, object, propname, zl_intsize,
3986 zl_ints, value) == 0);
3987
3988 for (i = 0; i < ints; i++) {
3989 ASSERT3U(value[i], ==, last_txg + object + i);
3990 }
3991 } else {
3992 ASSERT3U(error, ==, ENOENT);
3993 }
3994
3995 /*
3996 * Atomically update two entries in our zap object.
3997 * The first is named txg_%llu, and contains the txg
3998 * in which the property was last updated. The second
3999 * is named prop_%llu, and the nth element of its value
4000 * should be txg + object + n.
4001 */
4002 tx = dmu_tx_create(os);
428870ff
BB
4003 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4004 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4005 if (txg == 0)
40b84e7a 4006 goto out;
34dc7c2f
BB
4007
4008 if (last_txg > txg)
4009 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
4010
4011 for (i = 0; i < ints; i++)
4012 value[i] = txg + object + i;
4013
428870ff
BB
4014 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
4015 1, &txg, tx));
4016 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
4017 ints, value, tx));
34dc7c2f
BB
4018
4019 dmu_tx_commit(tx);
4020
4021 /*
4022 * Remove a random pair of entries.
4023 */
4024 prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
4025 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
4026 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
4027
4028 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
4029
4030 if (error == ENOENT)
40b84e7a 4031 goto out;
34dc7c2f
BB
4032
4033 ASSERT3U(error, ==, 0);
4034
4035 tx = dmu_tx_create(os);
428870ff
BB
4036 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4037 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4038 if (txg == 0)
40b84e7a 4039 goto out;
428870ff
BB
4040 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
4041 VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
4042 dmu_tx_commit(tx);
40b84e7a
BB
4043out:
4044 umem_free(od, sizeof(ztest_od_t));
428870ff 4045}
34dc7c2f 4046
428870ff
BB
4047/*
4048 * Testcase to test the upgrading of a microzap to fatzap.
4049 */
4050void
4051ztest_fzap(ztest_ds_t *zd, uint64_t id)
4052{
4053 objset_t *os = zd->zd_os;
40b84e7a 4054 ztest_od_t *od;
428870ff 4055 uint64_t object, txg;
d6320ddb 4056 int i;
34dc7c2f 4057
40b84e7a
BB
4058 od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
4059 ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
428870ff 4060
40b84e7a
BB
4061 if (ztest_object_init(zd, od, sizeof (ztest_od_t),
4062 !ztest_random(2)) != 0)
4063 goto out;
4064 object = od->od_object;
34dc7c2f
BB
4065
4066 /*
428870ff
BB
4067 * Add entries to this ZAP and make sure it spills over
4068 * and gets upgraded to a fatzap. Also, since we are adding
4069 * 2050 entries we should see ptrtbl growth and leaf-block split.
34dc7c2f 4070 */
d6320ddb 4071 for (i = 0; i < 2050; i++) {
428870ff
BB
4072 char name[MAXNAMELEN];
4073 uint64_t value = i;
4074 dmu_tx_t *tx;
4075 int error;
34dc7c2f 4076
428870ff 4077 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
b8864a23 4078 (u_longlong_t)id, (u_longlong_t)value);
428870ff
BB
4079
4080 tx = dmu_tx_create(os);
4081 dmu_tx_hold_zap(tx, object, B_TRUE, name);
4082 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4083 if (txg == 0)
40b84e7a 4084 goto out;
428870ff
BB
4085 error = zap_add(os, object, name, sizeof (uint64_t), 1,
4086 &value, tx);
4087 ASSERT(error == 0 || error == EEXIST);
4088 dmu_tx_commit(tx);
34dc7c2f 4089 }
40b84e7a
BB
4090out:
4091 umem_free(od, sizeof(ztest_od_t));
34dc7c2f
BB
4092}
4093
428870ff 4094/* ARGSUSED */
34dc7c2f 4095void
428870ff 4096ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
34dc7c2f 4097{
428870ff 4098 objset_t *os = zd->zd_os;
40b84e7a 4099 ztest_od_t *od;
34dc7c2f
BB
4100 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
4101 dmu_tx_t *tx;
4102 int i, namelen, error;
428870ff 4103 int micro = ztest_random(2);
34dc7c2f
BB
4104 char name[20], string_value[20];
4105 void *data;
4106
40b84e7a
BB
4107 od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
4108 ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
428870ff 4109
40b84e7a
BB
4110 if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
4111 umem_free(od, sizeof(ztest_od_t));
428870ff 4112 return;
40b84e7a 4113 }
428870ff 4114
40b84e7a 4115 object = od->od_object;
428870ff 4116
34dc7c2f
BB
4117 /*
4118 * Generate a random name of the form 'xxx.....' where each
4119 * x is a random printable character and the dots are dots.
4120 * There are 94 such characters, and the name length goes from
4121 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
4122 */
4123 namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
4124
4125 for (i = 0; i < 3; i++)
4126 name[i] = '!' + ztest_random('~' - '!' + 1);
4127 for (; i < namelen - 1; i++)
4128 name[i] = '.';
4129 name[i] = '\0';
4130
428870ff 4131 if ((namelen & 1) || micro) {
34dc7c2f
BB
4132 wsize = sizeof (txg);
4133 wc = 1;
4134 data = &txg;
4135 } else {
4136 wsize = 1;
4137 wc = namelen;
4138 data = string_value;
4139 }
4140
4141 count = -1ULL;
4142 VERIFY(zap_count(os, object, &count) == 0);
4143 ASSERT(count != -1ULL);
4144
4145 /*
4146 * Select an operation: length, lookup, add, update, remove.
4147 */
4148 i = ztest_random(5);
4149
4150 if (i >= 2) {
4151 tx = dmu_tx_create(os);
428870ff
BB
4152 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4153 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4154 if (txg == 0)
34dc7c2f 4155 return;
34dc7c2f
BB
4156 bcopy(name, string_value, namelen);
4157 } else {
4158 tx = NULL;
4159 txg = 0;
4160 bzero(string_value, namelen);
4161 }
4162
4163 switch (i) {
4164
4165 case 0:
4166 error = zap_length(os, object, name, &zl_wsize, &zl_wc);
4167 if (error == 0) {
4168 ASSERT3U(wsize, ==, zl_wsize);
4169 ASSERT3U(wc, ==, zl_wc);
4170 } else {
4171 ASSERT3U(error, ==, ENOENT);
4172 }
4173 break;
4174
4175 case 1:
4176 error = zap_lookup(os, object, name, wsize, wc, data);
4177 if (error == 0) {
4178 if (data == string_value &&
4179 bcmp(name, data, namelen) != 0)
4180 fatal(0, "name '%s' != val '%s' len %d",
4181 name, data, namelen);
4182 } else {
4183 ASSERT3U(error, ==, ENOENT);
4184 }
4185 break;
4186
4187 case 2:
4188 error = zap_add(os, object, name, wsize, wc, data, tx);
4189 ASSERT(error == 0 || error == EEXIST);
4190 break;
4191
4192 case 3:
4193 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
4194 break;
4195
4196 case 4:
4197 error = zap_remove(os, object, name, tx);
4198 ASSERT(error == 0 || error == ENOENT);
4199 break;
4200 }
4201
4202 if (tx != NULL)
4203 dmu_tx_commit(tx);
40b84e7a
BB
4204
4205 umem_free(od, sizeof(ztest_od_t));
34dc7c2f
BB
4206}
4207
428870ff
BB
/*
 * Commit callback data.  One of these is registered per callback and
 * linked onto the global zcl.zcl_callbacks list once its txg is known.
 */
typedef struct ztest_cb_data {
	list_node_t zcd_node;	/* linkage on zcl.zcl_callbacks */
	uint64_t zcd_txg;	/* txg the tx was assigned; 0 if aborted */
	int zcd_expected_err;	/* error the callback expects to be passed */
	boolean_t zcd_added;	/* B_TRUE once linked onto the global list */
	boolean_t zcd_called;	/* B_TRUE once the callback has fired */
	spa_t *zcd_spa;		/* pool of the objset that created the tx */
} ztest_cb_data_t;
4219
/*
 * This is the actual commit callback function.  Registered via
 * dmu_tx_callback_register(); invoked with error == 0 when the txg
 * commits, or ECANCELED when the transaction is aborted.
 */
static void
ztest_commit_callback(void *arg, int error)
{
	ztest_cb_data_t *data = arg;
	uint64_t synced_txg;

	VERIFY(data != NULL);
	VERIFY3S(data->zcd_expected_err, ==, error);
	VERIFY(!data->zcd_called);	/* each callback fires exactly once */

	/* A commit callback must never run before its txg has synced. */
	synced_txg = spa_last_synced_txg(data->zcd_spa);
	if (data->zcd_txg > synced_txg)
		fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
		    ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
		    synced_txg);

	data->zcd_called = B_TRUE;

	if (error == ECANCELED) {
		/* Aborted tx: it was never assigned a txg nor listed. */
		ASSERT3U(data->zcd_txg, ==, 0);
		ASSERT(!data->zcd_added);

		/*
		 * The private callback data should be destroyed here, but
		 * since we are going to check the zcd_called field after
		 * dmu_tx_abort(), we will destroy it there.
		 */
		return;
	}

	ASSERT(data->zcd_added);
	ASSERT3U(data->zcd_txg, !=, 0);

	(void) mutex_enter(&zcl.zcl_callbacks_lock);

	/* See if this cb was called more quickly */
	if ((synced_txg - data->zcd_txg) < zc_min_txg_delay)
		zc_min_txg_delay = synced_txg - data->zcd_txg;

	/* Remove our callback from the list */
	list_remove(&zcl.zcl_callbacks, data);

	(void) mutex_exit(&zcl.zcl_callbacks_lock);

	umem_free(data, sizeof (ztest_cb_data_t));
}
4267
4268/* Allocate and initialize callback data structure */
4269static ztest_cb_data_t *
4270ztest_create_cb_data(objset_t *os, uint64_t txg)
4271{
4272 ztest_cb_data_t *cb_data;
4273
4274 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
4275
4276 cb_data->zcd_txg = txg;
4277 cb_data->zcd_spa = dmu_objset_spa(os);
1e33ac1e 4278 list_link_init(&cb_data->zcd_node);
428870ff
BB
4279
4280 return (cb_data);
4281}
4282
428870ff
BB
/*
 * Commit callback test: register three callbacks on one transaction,
 * deliberately abort it ~1% of the time (expecting ECANCELED delivery),
 * otherwise insert the callbacks in txg order on the global list where
 * ztest_commit_callback() will verify and remove them.
 */
void
ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
{
	objset_t *os = zd->zd_os;
	ztest_od_t *od;
	dmu_tx_t *tx;
	ztest_cb_data_t *cb_data[3], *tmp_cb;
	uint64_t old_txg, txg;
	int i, error = 0;

	od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);

	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
		umem_free(od, sizeof(ztest_od_t));
		return;
	}

	tx = dmu_tx_create(os);

	/* cb 0 is registered before the tx is assigned (txg still 0). */
	cb_data[0] = ztest_create_cb_data(os, 0);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);

	dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t));

	/* Every once in a while, abort the transaction on purpose */
	if (ztest_random(100) == 0)
		error = -1;

	if (!error)
		error = dmu_tx_assign(tx, TXG_NOWAIT);

	txg = error ? 0 : dmu_tx_get_txg(tx);

	cb_data[0]->zcd_txg = txg;
	cb_data[1] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);

	if (error) {
		/*
		 * It's not a strict requirement to call the registered
		 * callbacks from inside dmu_tx_abort(), but that's what
		 * it's supposed to happen in the current implementation
		 * so we will check for that.
		 */
		for (i = 0; i < 2; i++) {
			cb_data[i]->zcd_expected_err = ECANCELED;
			VERIFY(!cb_data[i]->zcd_called);
		}

		dmu_tx_abort(tx);

		/* Abort must have invoked both callbacks synchronously. */
		for (i = 0; i < 2; i++) {
			VERIFY(cb_data[i]->zcd_called);
			umem_free(cb_data[i], sizeof (ztest_cb_data_t));
		}

		umem_free(od, sizeof(ztest_od_t));
		return;
	}

	cb_data[2] = ztest_create_cb_data(os, txg);
	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);

	/*
	 * Read existing data to make sure there isn't a future leak.
	 */
	VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t),
	    &old_txg, DMU_READ_PREFETCH));

	if (old_txg > txg)
		fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
		    old_txg, txg);

	/* Stamp the open txg into the object for the next pass to read. */
	dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx);

	(void) mutex_enter(&zcl.zcl_callbacks_lock);

	/*
	 * Since commit callbacks don't have any ordering requirement and since
	 * it is theoretically possible for a commit callback to be called
	 * after an arbitrary amount of time has elapsed since its txg has been
	 * synced, it is difficult to reliably determine whether a commit
	 * callback hasn't been called due to high load or due to a flawed
	 * implementation.
	 *
	 * In practice, we will assume that if after a certain number of txgs a
	 * commit callback hasn't been called, then most likely there's an
	 * implementation bug..
	 */
	tmp_cb = list_head(&zcl.zcl_callbacks);
	if (tmp_cb != NULL &&
	    tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) {
		fatal(0, "Commit callback threshold exceeded, oldest txg: %"
		    PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
	}

	/*
	 * Let's find the place to insert our callbacks.
	 *
	 * Even though the list is ordered by txg, it is possible for the
	 * insertion point to not be the end because our txg may already be
	 * quiescing at this point and other callbacks in the open txg
	 * (from other objsets) may have sneaked in.
	 */
	tmp_cb = list_tail(&zcl.zcl_callbacks);
	while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
		tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);

	/* Add the 3 callbacks to the list */
	for (i = 0; i < 3; i++) {
		if (tmp_cb == NULL)
			list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
		else
			list_insert_after(&zcl.zcl_callbacks, tmp_cb,
			    cb_data[i]);

		cb_data[i]->zcd_added = B_TRUE;
		VERIFY(!cb_data[i]->zcd_called);

		tmp_cb = cb_data[i];
	}

	zc_cb_counter += 3;

	(void) mutex_exit(&zcl.zcl_callbacks_lock);

	dmu_tx_commit(tx);

	umem_free(od, sizeof(ztest_od_t));
}
4417
4418/* ARGSUSED */
4419void
4420ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
4421{
4422 zfs_prop_t proplist[] = {
4423 ZFS_PROP_CHECKSUM,
4424 ZFS_PROP_COMPRESSION,
4425 ZFS_PROP_COPIES,
4426 ZFS_PROP_DEDUP
4427 };
4428 ztest_shared_t *zs = ztest_shared;
d6320ddb 4429 int p;
428870ff 4430
1e33ac1e 4431 (void) rw_enter(&zs->zs_name_lock, RW_READER);
428870ff 4432
d6320ddb 4433 for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
428870ff
BB
4434 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
4435 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
4436
1e33ac1e 4437 (void) rw_exit(&zs->zs_name_lock);
428870ff
BB
4438}
4439
4440/* ARGSUSED */
4441void
4442ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
4443{
4444 ztest_shared_t *zs = ztest_shared;
4445 nvlist_t *props = NULL;
4446
1e33ac1e 4447 (void) rw_enter(&zs->zs_name_lock, RW_READER);
428870ff
BB
4448
4449 (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
4450 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
4451
4452 VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
4453
4454 if (zopt_verbose >= 6)
4455 dump_nvlist(props, 4);
4456
4457 nvlist_free(props);
4458
1e33ac1e 4459 (void) rw_exit(&zs->zs_name_lock);
428870ff
BB
4460}
4461
4462/*
4463 * Test snapshot hold/release and deferred destroy.
4464 */
4465void
4466ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
34dc7c2f 4467{
34dc7c2f 4468 int error;
428870ff
BB
4469 objset_t *os = zd->zd_os;
4470 objset_t *origin;
4471 char snapname[100];
4472 char fullname[100];
4473 char clonename[100];
4474 char tag[100];
4475 char osname[MAXNAMELEN];
34dc7c2f 4476
1e33ac1e 4477 (void) rw_enter(&ztest_shared->zs_name_lock, RW_READER);
34dc7c2f
BB
4478
4479 dmu_objset_name(os, osname);
4480
1e33ac1e 4481 (void) snprintf(snapname, 100, "sh1_%llu", (u_longlong_t)id);
428870ff 4482 (void) snprintf(fullname, 100, "%s@%s", osname, snapname);
1e33ac1e
BB
4483 (void) snprintf(clonename, 100, "%s/ch1_%llu",osname,(u_longlong_t)id);
4484 (void) snprintf(tag, 100, "tag_%llu", (u_longlong_t)id);
428870ff
BB
4485
4486 /*
4487 * Clean up from any previous run.
4488 */
4489 (void) dmu_objset_destroy(clonename, B_FALSE);
4490 (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
4491 (void) dmu_objset_destroy(fullname, B_FALSE);
4492
4493 /*
4494 * Create snapshot, clone it, mark snap for deferred destroy,
4495 * destroy clone, verify snap was also destroyed.
4496 */
572e2857
BB
4497 error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE,
4498 FALSE, -1);
428870ff
BB
4499 if (error) {
4500 if (error == ENOSPC) {
4501 ztest_record_enospc("dmu_objset_snapshot");
4502 goto out;
34dc7c2f 4503 }
428870ff
BB
4504 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
4505 }
34dc7c2f 4506
428870ff
BB
4507 error = dmu_objset_hold(fullname, FTAG, &origin);
4508 if (error)
4509 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
34dc7c2f 4510
428870ff
BB
4511 error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0);
4512 dmu_objset_rele(origin, FTAG);
4513 if (error) {
34dc7c2f 4514 if (error == ENOSPC) {
428870ff
BB
4515 ztest_record_enospc("dmu_objset_clone");
4516 goto out;
34dc7c2f 4517 }
428870ff
BB
4518 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
4519 }
34dc7c2f 4520
428870ff
BB
4521 error = dmu_objset_destroy(fullname, B_TRUE);
4522 if (error) {
4523 fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
4524 fullname, error);
4525 }
34dc7c2f 4526
428870ff
BB
4527 error = dmu_objset_destroy(clonename, B_FALSE);
4528 if (error)
4529 fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error);
34dc7c2f 4530
428870ff
BB
4531 error = dmu_objset_hold(fullname, FTAG, &origin);
4532 if (error != ENOENT)
4533 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
34dc7c2f 4534
428870ff
BB
4535 /*
4536 * Create snapshot, add temporary hold, verify that we can't
4537 * destroy a held snapshot, mark for deferred destroy,
4538 * release hold, verify snapshot was destroyed.
4539 */
572e2857
BB
4540 error = dmu_objset_snapshot(osname, snapname, NULL, NULL, FALSE,
4541 FALSE, -1);
428870ff
BB
4542 if (error) {
4543 if (error == ENOSPC) {
4544 ztest_record_enospc("dmu_objset_snapshot");
4545 goto out;
34dc7c2f 4546 }
428870ff
BB
4547 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
4548 }
4549
572e2857
BB
4550 error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE,
4551 B_TRUE, -1);
428870ff
BB
4552 if (error)
4553 fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
4554
4555 error = dmu_objset_destroy(fullname, B_FALSE);
4556 if (error != EBUSY) {
4557 fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d",
4558 fullname, error);
4559 }
4560
4561 error = dmu_objset_destroy(fullname, B_TRUE);
4562 if (error) {
4563 fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d",
4564 fullname, error);
34dc7c2f
BB
4565 }
4566
428870ff
BB
4567 error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE);
4568 if (error)
4569 fatal(0, "dsl_dataset_user_release(%s)", fullname, tag);
4570
4571 VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT);
4572
4573out:
1e33ac1e 4574 (void) rw_exit(&ztest_shared->zs_name_lock);
34dc7c2f
BB
4575}
4576
34dc7c2f
BB
4577/*
4578 * Inject random faults into the on-disk data.
4579 */
428870ff 4580/* ARGSUSED */
34dc7c2f 4581void
428870ff 4582ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
34dc7c2f 4583{
428870ff
BB
4584 ztest_shared_t *zs = ztest_shared;
4585 spa_t *spa = zs->zs_spa;
34dc7c2f
BB
4586 int fd;
4587 uint64_t offset;
428870ff 4588 uint64_t leaves;
c5b3a7bb 4589 uint64_t bad = 0x1990c0ffeedecadeull;
34dc7c2f 4590 uint64_t top, leaf;
40b84e7a
BB
4591 char *path0;
4592 char *pathrand;
34dc7c2f 4593 size_t fsize;
34dc7c2f
BB
4594 int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
4595 int iters = 1000;
428870ff
BB
4596 int maxfaults;
4597 int mirror_save;
b128c09f 4598 vdev_t *vd0 = NULL;
34dc7c2f 4599 uint64_t guid0 = 0;
428870ff
BB
4600 boolean_t islog = B_FALSE;
4601
40b84e7a
BB
4602 path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
4603 pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
4604
1e33ac1e 4605 mutex_enter(&zs->zs_vdev_lock);
428870ff
BB
4606 maxfaults = MAXFAULTS();
4607 leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz;
4608 mirror_save = zs->zs_mirrors;
1e33ac1e 4609 mutex_exit(&zs->zs_vdev_lock);
34dc7c2f 4610
b128c09f 4611 ASSERT(leaves >= 1);
34dc7c2f
BB
4612
4613 /*
b128c09f 4614 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
34dc7c2f 4615 */
b128c09f 4616 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
34dc7c2f 4617
b128c09f
BB
4618 if (ztest_random(2) == 0) {
4619 /*
428870ff 4620 * Inject errors on a normal data device or slog device.
b128c09f 4621 */
428870ff
BB
4622 top = ztest_random_vdev_top(spa, B_TRUE);
4623 leaf = ztest_random(leaves) + zs->zs_splits;
34dc7c2f 4624
b128c09f
BB
4625 /*
4626 * Generate paths to the first leaf in this top-level vdev,
4627 * and to the random leaf we selected. We'll induce transient
4628 * write failures and random online/offline activity on leaf 0,
4629 * and we'll write random garbage to the randomly chosen leaf.
4630 */
4631 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
428870ff 4632 zopt_dir, zopt_pool, top * leaves + zs->zs_splits);
b128c09f
BB
4633 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
4634 zopt_dir, zopt_pool, top * leaves + leaf);
34dc7c2f 4635
b128c09f 4636 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
428870ff
BB
4637 if (vd0 != NULL && vd0->vdev_top->vdev_islog)
4638 islog = B_TRUE;
4639
b128c09f
BB
4640 if (vd0 != NULL && maxfaults != 1) {
4641 /*
4642 * Make vd0 explicitly claim to be unreadable,
4643 * or unwriteable, or reach behind its back
4644 * and close the underlying fd. We can do this if
4645 * maxfaults == 0 because we'll fail and reexecute,
4646 * and we can do it if maxfaults >= 2 because we'll
4647 * have enough redundancy. If maxfaults == 1, the
4648 * combination of this with injection of random data
4649 * corruption below exceeds the pool's fault tolerance.
4650 */
4651 vdev_file_t *vf = vd0->vdev_tsd;
4652
4653 if (vf != NULL && ztest_random(3) == 0) {
4654 (void) close(vf->vf_vnode->v_fd);
4655 vf->vf_vnode->v_fd = -1;
4656 } else if (ztest_random(2) == 0) {
4657 vd0->vdev_cant_read = B_TRUE;
4658 } else {
4659 vd0->vdev_cant_write = B_TRUE;
4660 }
4661 guid0 = vd0->vdev_guid;
4662 }
4663 } else {
4664 /*
4665 * Inject errors on an l2cache device.
4666 */
4667 spa_aux_vdev_t *sav = &spa->spa_l2cache;
34dc7c2f 4668
b128c09f
BB
4669 if (sav->sav_count == 0) {
4670 spa_config_exit(spa, SCL_STATE, FTAG);
40b84e7a 4671 goto out;
b128c09f
BB
4672 }
4673 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
34dc7c2f 4674 guid0 = vd0->vdev_guid;
b128c09f
BB
4675 (void) strcpy(path0, vd0->vdev_path);
4676 (void) strcpy(pathrand, vd0->vdev_path);
4677
4678 leaf = 0;
4679 leaves = 1;
4680 maxfaults = INT_MAX; /* no limit on cache devices */
34dc7c2f
BB
4681 }
4682
b128c09f
BB
4683 spa_config_exit(spa, SCL_STATE, FTAG);
4684
34dc7c2f 4685 /*
428870ff
BB
4686 * If we can tolerate two or more faults, or we're dealing
4687 * with a slog, randomly online/offline vd0.
34dc7c2f 4688 */
428870ff 4689 if ((maxfaults >= 2 || islog) && guid0 != 0) {
fb5f0bc8
BB
4690 if (ztest_random(10) < 6) {
4691 int flags = (ztest_random(2) == 0 ?
4692 ZFS_OFFLINE_TEMPORARY : 0);
428870ff
BB
4693
4694 /*
4695 * We have to grab the zs_name_lock as writer to
4696 * prevent a race between offlining a slog and
4697 * destroying a dataset. Offlining the slog will
4698 * grab a reference on the dataset which may cause
4699 * dmu_objset_destroy() to fail with EBUSY thus
4700 * leaving the dataset in an inconsistent state.
4701 */
4702 if (islog)
1e33ac1e
BB
4703 (void) rw_enter(&ztest_shared->zs_name_lock,
4704 RW_WRITER);
428870ff 4705
fb5f0bc8 4706 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
428870ff
BB
4707
4708 if (islog)
1e33ac1e 4709 (void) rw_exit(&ztest_shared->zs_name_lock);
fb5f0bc8
BB
4710 } else {
4711 (void) vdev_online(spa, guid0, 0, NULL);
4712 }
34dc7c2f
BB
4713 }
4714
428870ff 4715 if (maxfaults == 0)
40b84e7a 4716 goto out;
428870ff 4717
34dc7c2f
BB
4718 /*
4719 * We have at least single-fault tolerance, so inject data corruption.
4720 */
4721 fd = open(pathrand, O_RDWR);
4722
4723 if (fd == -1) /* we hit a gap in the device namespace */
40b84e7a 4724 goto out;
34dc7c2f
BB
4725
4726 fsize = lseek(fd, 0, SEEK_END);
4727
4728 while (--iters != 0) {
4729 offset = ztest_random(fsize / (leaves << bshift)) *
4730 (leaves << bshift) + (leaf << bshift) +
4731 (ztest_random(1ULL << (bshift - 1)) & -8ULL);
4732
4733 if (offset >= fsize)
4734 continue;
4735
1e33ac1e 4736 mutex_enter(&zs->zs_vdev_lock);
428870ff 4737 if (mirror_save != zs->zs_mirrors) {
1e33ac1e 4738 mutex_exit(&zs->zs_vdev_lock);
428870ff 4739 (void) close(fd);
40b84e7a 4740 goto out;
428870ff 4741 }
34dc7c2f
BB
4742
4743 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
4744 fatal(1, "can't inject bad word at 0x%llx in %s",
4745 offset, pathrand);
428870ff 4746
1e33ac1e 4747 mutex_exit(&zs->zs_vdev_lock);
428870ff
BB
4748
4749 if (zopt_verbose >= 7)
4750 (void) printf("injected bad word into %s,"
4751 " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
34dc7c2f
BB
4752 }
4753
4754 (void) close(fd);
40b84e7a
BB
4755out:
4756 umem_free(path0, MAXPATHLEN);
4757 umem_free(pathrand, MAXPATHLEN);
34dc7c2f
BB
4758}
4759
/*
 * Verify that DDT repair works as expected: write 'copies' identical
 * dedup'd blocks, deliberately damage the on-disk block, and rely on
 * dedup-ditto to repair it on a later read.
 */
void
ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
{
	ztest_shared_t *zs = ztest_shared;
	spa_t *spa = zs->zs_spa;
	objset_t *os = zd->zd_os;
	ztest_od_t *od;
	uint64_t object, blocksize, txg, pattern, psize;
	enum zio_checksum checksum = spa_dedup_checksum(spa);
	dmu_buf_t *db;
	dmu_tx_t *tx;
	void *buf;
	blkptr_t blk;
	int copies = 2 * ZIO_DEDUPDITTO_MIN;
	int i;

	blocksize = ztest_random_blocksize();
	blocksize = MIN(blocksize, 2048);	/* because we write so many */

	od = umem_alloc(sizeof(ztest_od_t), UMEM_NOFAIL);
	ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);

	if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) {
		umem_free(od, sizeof(ztest_od_t));
		return;
	}

	/*
	 * Take the name lock as writer to prevent anyone else from changing
	 * the pool and dataset properies we need to maintain during this test.
	 */
	(void) rw_enter(&zs->zs_name_lock, RW_WRITER);

	/* Force dedup on (verify via checksum) and a single copy. */
	if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
	    B_FALSE) != 0 ||
	    ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
	    B_FALSE) != 0) {
		(void) rw_exit(&zs->zs_name_lock);
		umem_free(od, sizeof(ztest_od_t));
		return;
	}

	object = od[0].od_object;
	blocksize = od[0].od_blocksize;
	/* Pattern is unique per pool+objset so stale data is detectable. */
	pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os);

	ASSERT(object != 0);

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, 0, copies * blocksize);
	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		(void) rw_exit(&zs->zs_name_lock);
		umem_free(od, sizeof(ztest_od_t));
		return;
	}

	/*
	 * Write all the copies of our block.
	 */
	for (i = 0; i < copies; i++) {
		uint64_t offset = i * blocksize;
		VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
		    DMU_READ_NO_PREFETCH) == 0);
		ASSERT(db->db_offset == offset);
		ASSERT(db->db_size == blocksize);
		/* Either our pattern from a prior run, or all-zero. */
		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
		    ztest_pattern_match(db->db_data, db->db_size, 0ULL));
		dmu_buf_will_fill(db, tx);
		ztest_pattern_set(db->db_data, db->db_size, pattern);
		dmu_buf_rele(db, FTAG);
	}

	dmu_tx_commit(tx);
	txg_wait_synced(spa_get_dsl(spa), txg);

	/*
	 * Find out what block we got.
	 */
	VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
	    DMU_READ_NO_PREFETCH) == 0);
	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
	dmu_buf_rele(db, FTAG);

	/*
	 * Damage the block.  Dedup-ditto will save us when we read it later.
	 */
	psize = BP_GET_PSIZE(&blk);
	buf = zio_buf_alloc(psize);
	ztest_pattern_set(buf, psize, ~pattern);

	(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
	    buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));

	zio_buf_free(buf, psize);

	(void) rw_exit(&zs->zs_name_lock);
	umem_free(od, sizeof(ztest_od_t));
}
4863
34dc7c2f 4864/*
428870ff 4865 * Scrub the pool.
34dc7c2f 4866 */
428870ff
BB
4867/* ARGSUSED */
4868void
4869ztest_scrub(ztest_ds_t *zd, uint64_t id)
34dc7c2f 4870{
428870ff
BB
4871 ztest_shared_t *zs = ztest_shared;
4872 spa_t *spa = zs->zs_spa;
34dc7c2f 4873
428870ff
BB
4874 (void) spa_scan(spa, POOL_SCAN_SCRUB);
4875 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
4876 (void) spa_scan(spa, POOL_SCAN_SCRUB);
4877}
34dc7c2f 4878
428870ff
BB
4879/*
4880 * Rename the pool to a different name and then rename it back.
4881 */
4882/* ARGSUSED */
4883void
4884ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
4885{
4886 ztest_shared_t *zs = ztest_shared;
4887 char *oldname, *newname;
4888 spa_t *spa;
34dc7c2f 4889
1e33ac1e 4890 (void) rw_enter(&zs->zs_name_lock, RW_WRITER);
34dc7c2f 4891
428870ff
BB
4892 oldname = zs->zs_pool;
4893 newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
4894 (void) strcpy(newname, oldname);
4895 (void) strcat(newname, "_tmp");
34dc7c2f
BB
4896
4897 /*
428870ff 4898 * Do the rename
34dc7c2f 4899 */
428870ff 4900 VERIFY3U(0, ==, spa_rename(oldname, newname));
34dc7c2f
BB
4901
4902 /*
428870ff 4903 * Try to open it under the old name, which shouldn't exist
34dc7c2f 4904 */
428870ff 4905 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
34dc7c2f
BB
4906
4907 /*
428870ff
BB
4908 * Open it under the new name and make sure it's still the same spa_t.
4909 */
4910 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
34dc7c2f 4911
428870ff
BB
4912 ASSERT(spa == zs->zs_spa);
4913 spa_close(spa, FTAG);
34dc7c2f
BB
4914
4915 /*
428870ff 4916 * Rename it back to the original
34dc7c2f 4917 */
428870ff 4918 VERIFY3U(0, ==, spa_rename(newname, oldname));
34dc7c2f 4919
428870ff
BB
4920 /*
4921 * Make sure it can still be opened
4922 */
4923 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
34dc7c2f 4924
428870ff
BB
4925 ASSERT(spa == zs->zs_spa);
4926 spa_close(spa, FTAG);
4927
4928 umem_free(newname, strlen(newname) + 1);
4929
1e33ac1e 4930 (void) rw_exit(&zs->zs_name_lock);
34dc7c2f
BB
4931}
4932
428870ff
BB
4933/*
4934 * Verify pool integrity by running zdb.
4935 */
34dc7c2f 4936static void
428870ff 4937ztest_run_zdb(char *pool)
34dc7c2f
BB
4938{
4939 int status;
34dc7c2f 4940 char *bin;
0e8d1b2d
BB
4941 char *zdb;
4942 char *zbuf;
34dc7c2f
BB
4943 FILE *fp;
4944
0e8d1b2d
BB
4945 bin = umem_alloc(MAXPATHLEN + MAXNAMELEN + 20, UMEM_NOFAIL);
4946 zdb = umem_alloc(MAXPATHLEN + MAXNAMELEN + 20, UMEM_NOFAIL);
4947 zbuf = umem_alloc(1024, UMEM_NOFAIL);
34dc7c2f 4948
0e8d1b2d 4949 VERIFY(realpath(getexecname(), bin) != NULL);
341b5f1d 4950 if (strncmp(bin, "/usr/sbin/ztest", 15) == 0) {
0e8d1b2d 4951 strcpy(bin, "/usr/sbin/zdb"); /* Installed */
341b5f1d
BB
4952 } else if (strncmp(bin, "/sbin/ztest", 11) == 0) {
4953 strcpy(bin, "/sbin/zdb"); /* Installed */
0e8d1b2d
BB
4954 } else {
4955 strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */
4956 strcat(bin, "/zdb/zdb");
4957 }
4958
4959 (void) sprintf(zdb,
4960 "%s -bcc%s%s -U %s %s",
4961 bin,
34dc7c2f
BB
4962 zopt_verbose >= 3 ? "s" : "",
4963 zopt_verbose >= 4 ? "v" : "",
428870ff 4964 spa_config_path,
b128c09f 4965 pool);
34dc7c2f
BB
4966
4967 if (zopt_verbose >= 5)
4968 (void) printf("Executing %s\n", strstr(zdb, "zdb "));
4969
4970 fp = popen(zdb, "r");
4971
4972 while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
4973 if (zopt_verbose >= 3)
4974 (void) printf("%s", zbuf);
4975
4976 status = pclose(fp);
4977
4978 if (status == 0)
0e8d1b2d 4979 goto out;
34dc7c2f
BB
4980
4981 ztest_dump_core = 0;
4982 if (WIFEXITED(status))
4983 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
4984 else
4985 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
0e8d1b2d
BB
4986out:
4987 umem_free(bin, MAXPATHLEN + MAXNAMELEN + 20);
4988 umem_free(zdb, MAXPATHLEN + MAXNAMELEN + 20);
4989 umem_free(zbuf, 1024);
34dc7c2f
BB
4990}
4991
4992static void
4993ztest_walk_pool_directory(char *header)
4994{
4995 spa_t *spa = NULL;
4996
4997 if (zopt_verbose >= 6)
4998 (void) printf("%s\n", header);
4999
5000 mutex_enter(&spa_namespace_lock);
5001 while ((spa = spa_next(spa)) != NULL)
5002 if (zopt_verbose >= 6)
5003 (void) printf("\t%s\n", spa_name(spa));
5004 mutex_exit(&spa_namespace_lock);
5005}
5006
/*
 * Export the pool 'oldname' and re-import it as 'newname', verifying
 * namespace invariants (no duplicate imports, old name gone, guid
 * preserved) along the way.
 */
static void
ztest_spa_import_export(char *oldname, char *newname)
{
	nvlist_t *config, *newconfig;
	uint64_t pool_guid;
	spa_t *spa;

	if (zopt_verbose >= 4) {
		(void) printf("import/export: old = %s, new = %s\n",
		    oldname, newname);
	}

	/*
	 * Clean up from previous runs.
	 */
	(void) spa_destroy(newname);

	/*
	 * Get the pool's configuration and guid.
	 */
	VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));

	/*
	 * Kick off a scrub to tickle scrub/export races.
	 */
	if (ztest_random(2) == 0)
		(void) spa_scan(spa, POOL_SCAN_SCRUB);

	pool_guid = spa_guid(spa);
	spa_close(spa, FTAG);

	ztest_walk_pool_directory("pools before export");

	/*
	 * Export it.
	 */
	VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));

	ztest_walk_pool_directory("pools after export");

	/*
	 * Try to import it.
	 */
	newconfig = spa_tryimport(config);
	ASSERT(newconfig != NULL);
	nvlist_free(newconfig);

	/*
	 * Import it under the new name.
	 */
	VERIFY3U(0, ==, spa_import(newname, config, NULL, 0));

	ztest_walk_pool_directory("pools after import");

	/*
	 * Try to import it again -- should fail with EEXIST.
	 */
	VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));

	/*
	 * Try to import it under a different name -- should fail with EEXIST.
	 */
	VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));

	/*
	 * Verify that the pool is no longer visible under the old name.
	 */
	VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));

	/*
	 * Verify that we can open and close the pool using the new name.
	 */
	VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
	ASSERT(pool_guid == spa_guid(spa));
	spa_close(spa, FTAG);

	nvlist_free(config);
}
5085
fb5f0bc8
BB
5086static void
5087ztest_resume(spa_t *spa)
5088{
428870ff
BB
5089 if (spa_suspended(spa) && zopt_verbose >= 6)
5090 (void) printf("resuming from suspended state\n");
5091 spa_vdev_state_enter(spa, SCL_NONE);
5092 vdev_clear(spa, NULL);
5093 (void) spa_vdev_state_exit(spa, NULL, 0);
5094 (void) zio_resume(spa);
fb5f0bc8
BB
5095}
5096
34dc7c2f 5097static void *
fb5f0bc8 5098ztest_resume_thread(void *arg)
34dc7c2f 5099{
b128c09f 5100 spa_t *spa = arg;
34dc7c2f
BB
5101
5102 while (!ztest_exiting) {
428870ff
BB
5103 if (spa_suspended(spa))
5104 ztest_resume(spa);
5105 (void) poll(NULL, 0, 100);
34dc7c2f 5106 }
34dc7c2f 5107
1e33ac1e 5108 thread_exit();
428870ff 5109
1e33ac1e
BB
5110 return (NULL);
5111}
428870ff 5112
/*
 * Seconds past the expected completion deadline before the deadman
 * alarm declares the run hung.
 */
#define	GRACE	300

/*
 * SIGALRM watchdog handler: if a ztest run fails to finish within
 * GRACE seconds of its deadline, abort with a fatal error so a hung
 * pool doesn't stall the test harness forever.
 */
static void
ztest_deadman_alarm(int sig)
{
	fatal(0, "failed to complete within %d seconds of deadline", GRACE);
}
5120
5121static void
5122ztest_execute(ztest_info_t *zi, uint64_t id)
5123{
5124 ztest_shared_t *zs = ztest_shared;
5125 ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets];
5126 hrtime_t functime = gethrtime();
d6320ddb 5127 int i;
428870ff 5128
d6320ddb 5129 for (i = 0; i < zi->zi_iters; i++)
428870ff
BB
5130 zi->zi_func(zd, id);
5131
5132 functime = gethrtime() - functime;
5133
5134 atomic_add_64(&zi->zi_call_count, 1);
5135 atomic_add_64(&zi->zi_call_time, functime);
5136
5137 if (zopt_verbose >= 4) {
5138 Dl_info dli;
5139 (void) dladdr((void *)zi->zi_func, &dli);
5140 (void) printf("%6.2f sec in %s\n",
5141 (double)functime / NANOSEC, dli.dli_sname);
5142 }
5143}
5144
/*
 * Main body of every test thread.  Until the per-pass stop time is
 * reached, repeatedly pick a random test function and run it if its
 * next-call deadline has arrived.  Thread `id` (passed as the arg)
 * determines which dataset ztest_execute() operates on.
 */
static void *
ztest_thread(void *arg)
{
	uint64_t id = (uintptr_t)arg;
	ztest_shared_t *zs = ztest_shared;
	uint64_t call_next;
	hrtime_t now;
	ztest_info_t *zi;

	while ((now = gethrtime()) < zs->zs_thread_stop) {
		/*
		 * See if it's time to force a crash.
		 */
		if (now > zs->zs_thread_kill)
			ztest_kill(zs);

		/*
		 * If we're getting ENOSPC with some regularity, stop.
		 */
		if (zs->zs_enospc_count > 10)
			break;

		/*
		 * Pick a random function to execute.
		 */
		zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)];
		call_next = zi->zi_call_next;

		/*
		 * If the function is due, claim this invocation with a
		 * lock-free compare-and-swap on zi_call_next: the winner
		 * advances the deadline by a random amount (averaging
		 * zi_interval[0]) and runs the function; losers simply
		 * retry with a different random function.
		 */
		if (now >= call_next &&
		    atomic_cas_64(&zi->zi_call_next, call_next, call_next +
		    ztest_random(2 * zi->zi_interval[0] + 1)) == call_next)
			ztest_execute(zi, id);
	}

	thread_exit();

	return (NULL);
}
34dc7c2f 5183
428870ff
BB
/*
 * Format the canonical name of dataset `d` within `pool` ("pool/ds_d")
 * into dsname, which must be at least MAXNAMELEN bytes.
 */
static void
ztest_dataset_name(char *dsname, char *pool, int d)
{
	(void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
}
34dc7c2f 5189
428870ff
BB
5190static void
5191ztest_dataset_destroy(ztest_shared_t *zs, int d)
5192{
5193 char name[MAXNAMELEN];
d6320ddb 5194 int t;
34dc7c2f 5195
428870ff 5196 ztest_dataset_name(name, zs->zs_pool, d);
34dc7c2f 5197
428870ff
BB
5198 if (zopt_verbose >= 3)
5199 (void) printf("Destroying %s to free up space\n", name);
34dc7c2f 5200
428870ff
BB
5201 /*
5202 * Cleanup any non-standard clones and snapshots. In general,
5203 * ztest thread t operates on dataset (t % zopt_datasets),
5204 * so there may be more than one thing to clean up.
5205 */
d6320ddb 5206 for (t = d; t < zopt_threads; t += zopt_datasets)
428870ff
BB
5207 ztest_dsl_dataset_cleanup(name, t);
5208
5209 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
5210 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
5211}
5212
/*
 * Object-leak check: every allocated object in the dataset must be
 * accounted for by an entry in ZTEST_DIROBJ (plus ZTEST_DIROBJ itself).
 */
static void
ztest_dataset_dirobj_verify(ztest_ds_t *zd)
{
	uint64_t usedobjs, dirobjs, scratch;

	/*
	 * ZTEST_DIROBJ is the object directory for the entire dataset.
	 * Therefore, the number of objects in use should equal the
	 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
	 * If not, we have an object leak.
	 *
	 * Note that we can only check this in ztest_dataset_open(),
	 * when the open-context and syncing-context values agree.
	 * That's because zap_count() returns the open-context value,
	 * while dmu_objset_space() returns the rootbp fill count.
	 */
	VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
	dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
	ASSERT3U(dirobjs + 1, ==, usedobjs);
}
5233
/*
 * Open (creating if necessary) dataset `d`, verify its object accounting,
 * replay its ZIL, and re-open the ZIL for new records.  Returns 0 on
 * success or ENOSPC if the dataset could not be created for lack of
 * space (recorded via ztest_record_enospc()).  On success the dataset
 * remains held; ztest_dataset_close() releases it.
 */
static int
ztest_dataset_open(ztest_shared_t *zs, int d)
{
	ztest_ds_t *zd = &zs->zs_zd[d];
	/* Highest ZIL sequence number committed before the last close. */
	uint64_t committed_seq = zd->zd_seq;
	objset_t *os;
	zilog_t *zilog;
	char name[MAXNAMELEN];
	int error;

	ztest_dataset_name(name, zs->zs_pool, d);

	/* Hold the name lock so a concurrent rename can't race us. */
	(void) rw_enter(&zs->zs_name_lock, RW_READER);

	error = ztest_dataset_create(name);
	if (error == ENOSPC) {
		(void) rw_exit(&zs->zs_name_lock);
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT(error == 0 || error == EEXIST);

	VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0);
	(void) rw_exit(&zs->zs_name_lock);

	ztest_zd_init(zd, os);

	zilog = zd->zd_zilog;

	/*
	 * If the on-disk log claims fewer records than we know were
	 * committed before the last close, records have been lost.
	 */
	if (zilog->zl_header->zh_claim_lr_seq != 0 &&
	    zilog->zl_header->zh_claim_lr_seq < committed_seq)
		fatal(0, "missing log records: claimed %llu < committed %llu",
		    zilog->zl_header->zh_claim_lr_seq, committed_seq);

	/* Verify object accounting both before and after ZIL replay. */
	ztest_dataset_dirobj_verify(zd);

	zil_replay(os, zd, ztest_replay_vector);

	ztest_dataset_dirobj_verify(zd);

	if (zopt_verbose >= 6)
		(void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
		    zd->zd_name,
		    (u_longlong_t)zilog->zl_parse_blk_count,
		    (u_longlong_t)zilog->zl_parse_lr_count,
		    (u_longlong_t)zilog->zl_replaying_seq);

	zilog = zil_open(os, ztest_get_data);

	/* Same lost-record check against what replay actually recovered. */
	if (zilog->zl_replaying_seq != 0 &&
	    zilog->zl_replaying_seq < committed_seq)
		fatal(0, "missing log records: replayed %llu < committed %llu",
		    zilog->zl_replaying_seq, committed_seq);

	return (0);
}
5290
/*
 * Close dataset `d`: close its ZIL, release the objset hold taken in
 * ztest_dataset_open(), and tear down the per-dataset test state.
 */
static void
ztest_dataset_close(ztest_shared_t *zs, int d)
{
	ztest_ds_t *zd = &zs->zs_zd[d];

	zil_close(zd->zd_zilog);
	dmu_objset_rele(zd->zd_os, zd);

	ztest_zd_fini(zd);
}
5301
/*
 * Kick off threads to run tests on all datasets in parallel.
 */
static void
ztest_run(ztest_shared_t *zs)
{
	kt_did_t *tid;
	spa_t *spa;
	kthread_t *resume_thread;
	uint64_t object;
	int error;
	int t, d;

	ztest_exiting = B_FALSE;

	/*
	 * Initialize parent/child shared state.
	 */
	mutex_init(&zs->zs_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zs->zs_name_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Each pass runs for zopt_passtime seconds (capped by the overall
	 * process deadline).  With probability zopt_killrate/100, schedule
	 * a simulated crash at a random point within the pass.
	 */
	zs->zs_thread_start = gethrtime();
	zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC;
	zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
	zs->zs_thread_kill = zs->zs_thread_stop;
	if (ztest_random(100) < zopt_killrate)
		zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC);

	mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
	    offsetof(ztest_cb_data_t, zcd_node));

	/*
	 * Open our pool.
	 */
	kernel_init(FREAD | FWRITE);
	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
	spa->spa_debug = B_TRUE;
	zs->zs_spa = spa;

	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;

	/*
	 * We don't expect the pool to suspend unless maxfaults == 0,
	 * in which case ztest_fault_inject() temporarily takes away
	 * the only valid replica.
	 */
	if (MAXFAULTS() == 0)
		spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
	else
		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

	/*
	 * Create a thread to periodically resume suspended I/O.
	 */
	VERIFY3P((resume_thread = thread_create(NULL, 0, ztest_resume_thread,
	    spa, TS_RUN, NULL, 0, 0)), !=, NULL);

	/*
	 * Set a deadman alarm to abort() if we hang.
	 */
	signal(SIGALRM, ztest_deadman_alarm);
	alarm((zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + GRACE);

	/*
	 * Verify that we can safely inquire about any object,
	 * whether it's allocated or not.  To make it interesting,
	 * we probe a 5-wide window around each power of two.
	 * This hits all edge cases, including zero and the max.
	 */
	for (t = 0; t < 64; t++) {
		for (d = -5; d <= 5; d++) {
			error = dmu_object_info(spa->spa_meta_objset,
			    (1ULL << t) + d, NULL);
			ASSERT(error == 0 || error == ENOENT ||
			    error == EINVAL);
		}
	}

	/*
	 * If we got any ENOSPC errors on the previous run, destroy something.
	 */
	if (zs->zs_enospc_count != 0) {
		int d = ztest_random(zopt_datasets);
		ztest_dataset_destroy(zs, d);
	}
	zs->zs_enospc_count = 0;

	tid = umem_zalloc(zopt_threads * sizeof (kt_did_t), UMEM_NOFAIL);

	if (zopt_verbose >= 4)
		(void) printf("starting main threads...\n");

	/*
	 * Kick off all the tests that run in parallel.
	 */
	for (t = 0; t < zopt_threads; t++) {
		kthread_t *thread;

		/*
		 * NOTE(review): this early return leaks `tid` and skips
		 * all of the teardown at the bottom of this function
		 * (datasets opened so far, locks, spa_close).  Since the
		 * child process exits shortly afterward it is probably
		 * benign, but confirm this is intentional.
		 */
		if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0)
			return;

		VERIFY3P(thread = thread_create(NULL, 0, ztest_thread,
		    (void *)(uintptr_t)t, TS_RUN, NULL, 0, 0), !=, NULL);
		tid[t] = thread->t_tid;
	}

	/*
	 * Wait for all of the tests to complete.  We go in reverse order
	 * so we don't close datasets while threads are still using them.
	 */
	for (t = zopt_threads - 1; t >= 0; t--) {
		thread_join(tid[t]);
		if (t < zopt_datasets)
			ztest_dataset_close(zs, t);
	}

	txg_wait_synced(spa_get_dsl(spa), 0);

	/* Capture usage figures for the parent's pass summary. */
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));

	umem_free(tid, zopt_threads * sizeof (kt_did_t));

	/* Kill the resume thread */
	ztest_exiting = B_TRUE;
	thread_join(resume_thread->t_tid);
	ztest_resume(spa);

	/*
	 * Right before closing the pool, kick off a bunch of async I/O;
	 * spa_close() should wait for it to complete.
	 */
	for (object = 1; object < 50; object++)
		dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);

	/* Verify that at least one commit cb was called in a timely fashion */
	if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG)
		VERIFY3U(zc_min_txg_delay, ==, 0);

	spa_close(spa, FTAG);

	/*
	 * Verify that we can loop over all pools.
	 */
	mutex_enter(&spa_namespace_lock);
	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
		if (zopt_verbose > 3)
			(void) printf("spa_next: found %s\n", spa_name(spa));
	mutex_exit(&spa_namespace_lock);

	/*
	 * Verify that we can export the pool and reimport it under a
	 * different name.
	 */
	if (ztest_random(2) == 0) {
		char name[MAXNAMELEN];
		(void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool);
		ztest_spa_import_export(zs->zs_pool, name);
		ztest_spa_import_export(name, zs->zs_pool);
	}

	kernel_fini();

	list_destroy(&zcl.zcl_callbacks);
	mutex_destroy(&zcl.zcl_callbacks_lock);
	rw_destroy(&zs->zs_name_lock);
	mutex_destroy(&zs->zs_vdev_lock);
}
5472
/*
 * Exercise spa_freeze(): generate ZIL activity on a frozen pool (where
 * spa_sync() is disabled, so the ZIL is the only persistence mechanism),
 * then reopen the pool to force log replay and verify nothing was lost.
 */
static void
ztest_freeze(ztest_shared_t *zs)
{
	ztest_ds_t *zd = &zs->zs_zd[0];
	spa_t *spa;
	int numloops = 0;

	if (zopt_verbose >= 3)
		(void) printf("testing spa_freeze()...\n");

	kernel_init(FREAD | FWRITE);
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));

	/*
	 * Force the first log block to be transactionally allocated.
	 * We have to do this before we freeze the pool -- otherwise
	 * the log chain won't be anchored.
	 */
	while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
		ztest_dmu_object_alloc_free(zd, 0);
		zil_commit(zd->zd_zilog, 0);
	}

	txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * Freeze the pool.  This stops spa_sync() from doing anything,
	 * so that the only way to record changes from now on is the ZIL.
	 */
	spa_freeze(spa);

	/*
	 * Run tests that generate log records but don't alter the pool config
	 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
	 * We do a txg_wait_synced() after each iteration to force the txg
	 * to increase well beyond the last synced value in the uberblock.
	 * The ZIL should be OK with that.
	 */
	while (ztest_random(10) != 0 && numloops++ < zopt_maxloops) {
		ztest_dmu_write_parallel(zd, 0);
		ztest_dmu_object_alloc_free(zd, 0);
		txg_wait_synced(spa_get_dsl(spa), 0);
	}

	/*
	 * Commit all of the changes we just generated.
	 */
	zil_commit(zd->zd_zilog, 0);
	txg_wait_synced(spa_get_dsl(spa), 0);

	/*
	 * Close our dataset and close the pool.
	 */
	ztest_dataset_close(zs, 0);
	spa_close(spa, FTAG);
	kernel_fini();

	/*
	 * Open and close the pool and dataset to induce log replay.
	 */
	kernel_init(FREAD | FWRITE);
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	VERIFY3U(0, ==, ztest_dataset_open(zs, 0));
	ztest_dataset_close(zs, 0);
	spa_close(spa, FTAG);
	kernel_fini();
}
5541
5542void
5543print_time(hrtime_t t, char *timebuf)
5544{
5545 hrtime_t s = t / NANOSEC;
5546 hrtime_t m = s / 60;
5547 hrtime_t h = m / 60;
5548 hrtime_t d = h / 24;
5549
5550 s -= m * 60;
5551 m -= h * 60;
5552 h -= d * 24;
5553
5554 timebuf[0] = '\0';
5555
5556 if (d)
5557 (void) sprintf(timebuf,
5558 "%llud%02lluh%02llum%02llus", d, h, m, s);
5559 else if (h)
5560 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
5561 else if (m)
5562 (void) sprintf(timebuf, "%llum%02llus", m, s);
5563 else
5564 (void) sprintf(timebuf, "%llus", s);
5565}
5566
/*
 * Build a random pool-property nvlist for spa_create(), or return NULL
 * half of the time so that pool creation without explicit properties is
 * exercised too.  The caller takes ownership of the returned nvlist.
 */
static nvlist_t *
make_random_props(void)
{
	nvlist_t *props;

	/* Half the time, create the pool with no explicit properties. */
	if (ztest_random(2) == 0)
		return (NULL);

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);

	/*
	 * NOTE(review): this dump is unconditional -- unlike the other
	 * diagnostic output in this file it is not gated on zopt_verbose;
	 * confirm that is intentional.
	 */
	(void) printf("props:\n");
	dump_nvlist(props, 4);

	return (props);
}
5583
34dc7c2f
BB
/*
 * Create a storage pool with the given name and initial vdev size.
 * Then test spa_freeze() functionality.
 */
static void
ztest_init(ztest_shared_t *zs)
{
	spa_t *spa;
	nvlist_t *nvroot, *props;

	mutex_init(&zs->zs_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zs->zs_name_lock, NULL, RW_DEFAULT, NULL);

	kernel_init(FREAD | FWRITE);

	/*
	 * Create the storage pool.
	 */
	(void) spa_destroy(zs->zs_pool);
	ztest_shared->zs_vdev_next_leaf = 0;
	zs->zs_splits = 0;
	zs->zs_mirrors = zopt_mirrors;
	nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
	    0, zopt_raidz, zs->zs_mirrors, 1);
	/*
	 * NOTE(review): nvroot is freed below but `props` (when non-NULL)
	 * is not freed here -- verify whether spa_create() consumes it or
	 * whether this is a small per-init leak.
	 */
	props = make_random_props();
	VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL));
	nvlist_free(nvroot);

	/* Record the metaslab size for the allocation-threshold knob. */
	VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG));
	metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
	spa_close(spa, FTAG);

	kernel_fini();

	/* Verify the new pool with zdb, exercise spa_freeze(), verify again. */
	ztest_run_zdb(zs->zs_pool);

	ztest_freeze(zs);

	ztest_run_zdb(zs->zs_pool);

	(void) rw_destroy(&zs->zs_name_lock);
	(void) mutex_destroy(&zs->zs_vdev_lock);
}
5627
/*
 * ztest driver: create/initialize the pool, then repeatedly fork a child
 * that runs one test pass (ztest_run()), possibly killing itself mid-pass
 * to simulate a crash.  The parent verifies each pass with zdb and repairs
 * any half-finished rename before the next pass.  Parent and child share
 * state through the MAP_SHARED ztest_shared mapping.
 */
int
main(int argc, char **argv)
{
	int kills = 0;		/* passes that ended in a simulated crash */
	int iters = 0;		/* total passes completed */
	ztest_shared_t *zs;
	size_t shared_size;
	ztest_info_t *zi;
	char timebuf[100];
	char numbuf[6];
	spa_t *spa;
	int i, f;

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	/*
	 * NOTE(review): the open() result is not checked; a failure would
	 * leave ztest_random_fd == -1.  Confirm downstream handling.
	 */
	ztest_random_fd = open("/dev/urandom", O_RDONLY);

	dprintf_setup(&argc, argv);
	process_options(argc, argv);

	/* Override location of zpool.cache */
	VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	    zopt_dir) != -1);

	/*
	 * Blow away any existing copy of zpool.cache
	 */
	if (zopt_init != 0)
		(void) remove(spa_config_path);

	shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t);

	/*
	 * Shared anonymous mapping so the forked child's counters and
	 * results are visible to the parent.
	 * NOTE(review): mmap() failure (MAP_FAILED) is not checked here.
	 */
	zs = ztest_shared = (void *)mmap(0,
	    P2ROUNDUP(shared_size, getpagesize()),
	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);

	if (zopt_verbose >= 1) {
		(void) printf("%llu vdevs, %d datasets, %d threads,"
		    " %llu seconds...\n",
		    (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
		    (u_longlong_t)zopt_time);
	}

	/*
	 * Create and initialize our storage pool.
	 */
	for (i = 1; i <= zopt_init; i++) {
		bzero(zs, sizeof (ztest_shared_t));
		if (zopt_verbose >= 3 && zopt_init != 1)
			(void) printf("ztest_init(), pass %d\n", i);
		zs->zs_pool = zopt_pool;
		ztest_init(zs);
	}

	zs->zs_pool = zopt_pool;
	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC;

	/*
	 * Seed each test function's first-call deadline; functions whose
	 * interval exceeds the total runtime never run (UINT64_MAX).
	 */
	for (f = 0; f < ZTEST_FUNCS; f++) {
		zi = &zs->zs_info[f];
		*zi = ztest_info[f];
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zi->zi_call_next = UINT64_MAX;
		else
			zi->zi_call_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop.  These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		pid_t pid;

		/*
		 * Initialize the workload counters for each function.
		 */
		for (f = 0; f < ZTEST_FUNCS; f++) {
			zi = &zs->zs_info[f];
			zi->zi_call_count = 0;
			zi->zi_call_time = 0;
		}

		/* Set the allocation switch size */
		metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;

		pid = fork();

		if (pid == -1)
			fatal(1, "fork failed");

		if (pid == 0) {	/* child */
			struct rlimit rl = { 1024, 1024 };
			(void) setrlimit(RLIMIT_NOFILE, &rl);
			(void) enable_extended_FILE_stdio(-1, -1);
			ztest_run(zs);
			exit(0);
		}

		while (waitpid(pid, &status, 0) != pid)
			continue;

		/*
		 * A clean exit or a SIGKILL (our own simulated crash) is
		 * success; anything else aborts the whole run.
		 */
		if (WIFEXITED(status)) {
			if (WEXITSTATUS(status) != 0) {
				(void) fprintf(stderr,
				    "child exited with code %d\n",
				    WEXITSTATUS(status));
				exit(2);
			}
		} else if (WIFSIGNALED(status)) {
			if (WTERMSIG(status) != SIGKILL) {
				(void) fprintf(stderr,
				    "child died with signal %d\n",
				    WTERMSIG(status));
				exit(3);
			}
			kills++;
		} else {
			(void) fprintf(stderr, "something strange happened "
			    "to child\n");
			exit(4);
		}

		iters++;

		if (zopt_verbose >= 1) {
			hrtime_t now = gethrtime();

			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf);

			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    (u_longlong_t)zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (zopt_time * NANOSEC), timebuf);
		}

		if (zopt_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (f = 0; f < ZTEST_FUNCS; f++) {
				Dl_info dli;

				zi = &zs->zs_info[f];
				print_time(zi->zi_call_time, timebuf);
				(void) dladdr((void *)zi->zi_func, &dli);
				(void) printf("%7llu %9s %s\n",
				    (u_longlong_t)zi->zi_call_count, timebuf,
				    dli.dli_sname);
			}
			(void) printf("\n");
		}

		/*
		 * It's possible that we killed a child during a rename test,
		 * in which case we'll have a 'ztest_tmp' pool lying around
		 * instead of 'ztest'.  Do a blind rename in case this happened.
		 */
		kernel_init(FREAD);
		if (spa_open(zopt_pool, &spa, FTAG) == 0) {
			spa_close(spa, FTAG);
		} else {
			char tmpname[MAXNAMELEN];
			kernel_fini();
			kernel_init(FREAD | FWRITE);
			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
			    zopt_pool);
			(void) spa_rename(tmpname, zopt_pool);
		}
		kernel_fini();

		ztest_run_zdb(zopt_pool);
	}

	if (zopt_verbose >= 1) {
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	return (0);
}