]> git.proxmox.com Git - mirror_zfs.git/blame - cmd/ztest/ztest.c
Rebase master to b117
[mirror_zfs.git] / cmd / ztest / ztest.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
9babb374 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
34dc7c2f
BB
23 * Use is subject to license terms.
24 */
25
34dc7c2f
BB
26/*
27 * The objective of this program is to provide a DMU/ZAP/SPA stress test
28 * that runs entirely in userland, is easy to use, and easy to extend.
29 *
30 * The overall design of the ztest program is as follows:
31 *
32 * (1) For each major functional area (e.g. adding vdevs to a pool,
33 * creating and destroying datasets, reading and writing objects, etc)
34 * we have a simple routine to test that functionality. These
35 * individual routines do not have to do anything "stressful".
36 *
37 * (2) We turn these simple functionality tests into a stress test by
38 * running them all in parallel, with as many threads as desired,
39 * and spread across as many datasets, objects, and vdevs as desired.
40 *
41 * (3) While all this is happening, we inject faults into the pool to
42 * verify that self-healing data really works.
43 *
44 * (4) Every time we open a dataset, we change its checksum and compression
45 * functions. Thus even individual objects vary from block to block
46 * in which checksum they use and whether they're compressed.
47 *
48 * (5) To verify that we never lose on-disk consistency after a crash,
49 * we run the entire test in a child of the main process.
50 * At random times, the child self-immolates with a SIGKILL.
51 * This is the software equivalent of pulling the power cord.
52 * The parent then runs the test again, using the existing
53 * storage pool, as many times as desired.
54 *
55 * (6) To verify that we don't have future leaks or temporal incursions,
56 * many of the functional tests record the transaction group number
57 * as part of their data. When reading old data, they verify that
58 * the transaction group number is less than the current, open txg.
59 * If you add a new test, please do this if applicable.
60 *
61 * When run with no arguments, ztest runs for about five minutes and
62 * produces no output if successful. To get a little bit of information,
63 * specify -V. To get more information, specify -VV, and so on.
64 *
65 * To turn this into an overnight stress test, use -T to specify run time.
66 *
67 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
68 * to increase the pool capacity, fanout, and overall stress level.
69 *
70 * The -N(okill) option will suppress kills, so each child runs to completion.
71 * This can be useful when you're trying to distinguish temporal incursions
72 * from plain old race conditions.
73 */
74
75#include <sys/zfs_context.h>
76#include <sys/spa.h>
77#include <sys/dmu.h>
78#include <sys/txg.h>
9babb374 79#include <sys/dbuf.h>
34dc7c2f 80#include <sys/zap.h>
34dc7c2f
BB
81#include <sys/dmu_objset.h>
82#include <sys/poll.h>
83#include <sys/stat.h>
84#include <sys/time.h>
85#include <sys/wait.h>
86#include <sys/mman.h>
87#include <sys/resource.h>
88#include <sys/zio.h>
89#include <sys/zio_checksum.h>
90#include <sys/zio_compress.h>
91#include <sys/zil.h>
92#include <sys/vdev_impl.h>
b128c09f 93#include <sys/vdev_file.h>
34dc7c2f
BB
94#include <sys/spa_impl.h>
95#include <sys/dsl_prop.h>
9babb374 96#include <sys/dsl_dataset.h>
34dc7c2f
BB
97#include <sys/refcount.h>
98#include <stdio.h>
99#include <stdio_ext.h>
100#include <stdlib.h>
101#include <unistd.h>
102#include <signal.h>
103#include <umem.h>
104#include <dlfcn.h>
105#include <ctype.h>
106#include <math.h>
107#include <sys/fs/zfs.h>
108
109static char cmdname[] = "ztest";
110static char *zopt_pool = cmdname;
111
112static uint64_t zopt_vdevs = 5;
113static uint64_t zopt_vdevtime;
114static int zopt_ashift = SPA_MINBLOCKSHIFT;
115static int zopt_mirrors = 2;
116static int zopt_raidz = 4;
117static int zopt_raidz_parity = 1;
118static size_t zopt_vdev_size = SPA_MINDEVSIZE;
119static int zopt_datasets = 7;
120static int zopt_threads = 23;
121static uint64_t zopt_passtime = 60; /* 60 seconds */
122static uint64_t zopt_killrate = 70; /* 70% kill rate */
123static int zopt_verbose = 0;
124static int zopt_init = 1;
125static char *zopt_dir = "/tmp";
126static uint64_t zopt_time = 300; /* 5 minutes */
127static int zopt_maxfaults;
34dc7c2f
BB
128
/*
 * Block tag record: six 64-bit fields.  ztest_args_t carries two of
 * these (za_rbt and za_wbt).
 * NOTE(review): presumably this is stamped into test data so that reads
 * can be checked against what was written -- confirm against the users
 * of za_rbt/za_wbt.
 */
typedef struct ztest_block_tag {
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_offset;
	uint64_t	bt_txg;
	uint64_t	bt_thread;
	uint64_t	bt_seq;
} ztest_block_tag_t;
137
/*
 * Argument block handed to every test function (see ztest_func_t).
 * It carries the pool name, the open spa/objset/zilog handles, the
 * thread handle, and assorted per-instance bookkeeping.
 */
typedef struct ztest_args {
	char		za_pool[MAXNAMELEN];
	spa_t		*za_spa;
	objset_t	*za_os;
	zilog_t		*za_zilog;
	thread_t	za_thread;
	uint64_t	za_instance;
	uint64_t	za_random;
	uint64_t	za_diroff;
	uint64_t	za_diroff_shared;
	uint64_t	za_zil_seq;
	hrtime_t	za_start;
	hrtime_t	za_stop;
	hrtime_t	za_kill;
	/*
	 * Thread-local variables can go here to aid debugging.
	 */
	ztest_block_tag_t	za_rbt;
	ztest_block_tag_t	za_wbt;
	dmu_object_info_t	za_doi;
	dmu_buf_t		*za_dbuf;
} ztest_args_t;
160
161typedef void ztest_func_t(ztest_args_t *);
162
163/*
164 * Note: these aren't static because we want dladdr() to work.
165 */
166ztest_func_t ztest_dmu_read_write;
9babb374 167ztest_func_t ztest_dmu_read_write_zcopy;
34dc7c2f
BB
168ztest_func_t ztest_dmu_write_parallel;
169ztest_func_t ztest_dmu_object_alloc_free;
170ztest_func_t ztest_zap;
171ztest_func_t ztest_zap_parallel;
172ztest_func_t ztest_traverse;
173ztest_func_t ztest_dsl_prop_get_set;
174ztest_func_t ztest_dmu_objset_create_destroy;
175ztest_func_t ztest_dmu_snapshot_create_destroy;
9babb374 176ztest_func_t ztest_dsl_dataset_promote_busy;
34dc7c2f
BB
177ztest_func_t ztest_spa_create_destroy;
178ztest_func_t ztest_fault_inject;
b128c09f 179ztest_func_t ztest_spa_rename;
34dc7c2f
BB
180ztest_func_t ztest_vdev_attach_detach;
181ztest_func_t ztest_vdev_LUN_growth;
182ztest_func_t ztest_vdev_add_remove;
b128c09f 183ztest_func_t ztest_vdev_aux_add_remove;
34dc7c2f 184ztest_func_t ztest_scrub;
34dc7c2f
BB
185
/*
 * Describes one test function: what to call, how often to call it, and
 * the call accounting kept for it.  The ztest_info[] table initializes
 * only the first three fields; the rest start out zero.
 */
typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
	uint64_t	zi_calls;	/* per-pass count */
	uint64_t	zi_call_time;	/* per-pass time */
	uint64_t	zi_call_total;	/* cumulative total */
	uint64_t	zi_call_target;	/* target cumulative total */
} ztest_info_t;
195
196uint64_t zopt_always = 0; /* all the time */
197uint64_t zopt_often = 1; /* every second */
198uint64_t zopt_sometimes = 10; /* every 10 seconds */
199uint64_t zopt_rarely = 60; /* every 60 seconds */
200
/*
 * Table of test functions.  Each initializer supplies, in order, the
 * first three ztest_info_t fields: the test function (zi_func), the
 * iterations per execution (zi_iters), and a pointer to the variable
 * holding the execution interval in seconds (zi_interval).  The
 * remaining accounting fields are left zero-initialized.
 */
ztest_info_t ztest_info[] = {
	{ ztest_dmu_read_write,			1,	&zopt_always	},
	{ ztest_dmu_read_write_zcopy,		1,	&zopt_always	},
	{ ztest_dmu_write_parallel,		30,	&zopt_always	},
	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
	{ ztest_zap,				30,	&zopt_always	},
	{ ztest_zap_parallel,			100,	&zopt_always	},
	{ ztest_dsl_prop_get_set,		1,	&zopt_sometimes	},
	{ ztest_dmu_objset_create_destroy,	1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes	},
	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
	{ ztest_fault_inject,			1,	&zopt_sometimes	},
	{ ztest_spa_rename,			1,	&zopt_rarely	},
	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
	{ ztest_vdev_aux_add_remove,		1,	&zopt_vdevtime	},
	{ ztest_scrub,				1,	&zopt_vdevtime	},
};
221
222#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
223
224#define ZTEST_SYNC_LOCKS 16
225
/*
 * Stuff we need to share writably between parent and child.
 *
 * zs_vdev_lock is taken by the vdev add/remove/attach tests around their
 * vdev manipulation; zs_name_lock is taken as reader while a test relies
 * on the pool name.  zs_info has one slot per ztest_info[] entry
 * (ZTEST_FUNCS), and zs_sync_lock/zs_seq each have ZTEST_SYNC_LOCKS slots.
 */
typedef struct ztest_shared {
	mutex_t		zs_vdev_lock;
	rwlock_t	zs_name_lock;
	uint64_t	zs_vdev_primaries;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_enospc_count;
	hrtime_t	zs_start_time;
	hrtime_t	zs_stop_time;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	ztest_info_t	zs_info[ZTEST_FUNCS];
	mutex_t		zs_sync_lock[ZTEST_SYNC_LOCKS];
	uint64_t	zs_seq[ZTEST_SYNC_LOCKS];
} ztest_shared_t;
243
244static char ztest_dev_template[] = "%s/%s.%llua";
b128c09f 245static char ztest_aux_template[] = "%s/%s.%s.%llu";
34dc7c2f
BB
246static ztest_shared_t *ztest_shared;
247
248static int ztest_random_fd;
249static int ztest_dump_core = 1;
250
9babb374 251static uint64_t metaslab_sz;
b128c09f 252static boolean_t ztest_exiting;
34dc7c2f
BB
253
254extern uint64_t metaslab_gang_bang;
9babb374 255extern uint64_t metaslab_df_alloc_threshold;
34dc7c2f
BB
256
257#define ZTEST_DIROBJ 1
258#define ZTEST_MICROZAP_OBJ 2
259#define ZTEST_FATZAP_OBJ 3
260
261#define ZTEST_DIROBJ_BLOCKSIZE (1 << 10)
262#define ZTEST_DIRSIZE 256
263
264static void usage(boolean_t) __NORETURN;
265
266/*
267 * These libumem hooks provide a reasonable set of defaults for the allocator's
268 * debugging facilities.
269 */
/*
 * libumem hook: returns the string used as the $UMEM_DEBUG setting.
 * Declared with an explicit (void) parameter list to match
 * _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose");	/* $UMEM_DEBUG setting */
}
275
/*
 * libumem hook: returns the string used as the $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char logging_setting[] = "fail,contents";

	return (logging_setting);
}
281
282#define FATAL_MSG_SZ 1024
283
284char *fatal_msg;
285
286static void
287fatal(int do_perror, char *message, ...)
288{
289 va_list args;
290 int save_errno = errno;
291 char buf[FATAL_MSG_SZ];
292
293 (void) fflush(stdout);
294
295 va_start(args, message);
296 (void) sprintf(buf, "ztest: ");
297 /* LINTED */
298 (void) vsprintf(buf + strlen(buf), message, args);
299 va_end(args);
300 if (do_perror) {
301 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
302 ": %s", strerror(save_errno));
303 }
304 (void) fprintf(stderr, "%s\n", buf);
305 fatal_msg = buf; /* to ease debugging */
306 if (ztest_dump_core)
307 abort();
308 exit(3);
309}
310
311static int
312str2shift(const char *buf)
313{
314 const char *ends = "BKMGTPEZ";
315 int i;
316
317 if (buf[0] == '\0')
318 return (0);
319 for (i = 0; i < strlen(ends); i++) {
320 if (toupper(buf[0]) == ends[i])
321 break;
322 }
323 if (i == strlen(ends)) {
324 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
325 buf);
326 usage(B_FALSE);
327 }
328 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
329 return (10*i);
330 }
331 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
332 usage(B_FALSE);
333 /* NOTREACHED */
334}
335
336static uint64_t
337nicenumtoull(const char *buf)
338{
339 char *end;
340 uint64_t val;
341
342 val = strtoull(buf, &end, 0);
343 if (end == buf) {
344 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
345 usage(B_FALSE);
346 } else if (end[0] == '.') {
347 double fval = strtod(buf, &end);
348 fval *= pow(2, str2shift(end));
349 if (fval > UINT64_MAX) {
350 (void) fprintf(stderr, "ztest: value too large: %s\n",
351 buf);
352 usage(B_FALSE);
353 }
354 val = (uint64_t)fval;
355 } else {
356 int shift = str2shift(end);
357 if (shift >= 64 || (val << shift) >> shift != val) {
358 (void) fprintf(stderr, "ztest: value too large: %s\n",
359 buf);
360 usage(B_FALSE);
361 }
362 val <<= shift;
363 }
364 return (val);
365}
366
367static void
368usage(boolean_t requested)
369{
370 char nice_vdev_size[10];
371 char nice_gang_bang[10];
372 FILE *fp = requested ? stdout : stderr;
373
374 nicenum(zopt_vdev_size, nice_vdev_size);
375 nicenum(metaslab_gang_bang, nice_gang_bang);
376
377 (void) fprintf(fp, "Usage: %s\n"
378 "\t[-v vdevs (default: %llu)]\n"
379 "\t[-s size_of_each_vdev (default: %s)]\n"
380 "\t[-a alignment_shift (default: %d) (use 0 for random)]\n"
381 "\t[-m mirror_copies (default: %d)]\n"
382 "\t[-r raidz_disks (default: %d)]\n"
383 "\t[-R raidz_parity (default: %d)]\n"
384 "\t[-d datasets (default: %d)]\n"
385 "\t[-t threads (default: %d)]\n"
386 "\t[-g gang_block_threshold (default: %s)]\n"
387 "\t[-i initialize pool i times (default: %d)]\n"
388 "\t[-k kill percentage (default: %llu%%)]\n"
389 "\t[-p pool_name (default: %s)]\n"
390 "\t[-f file directory for vdev files (default: %s)]\n"
391 "\t[-V(erbose)] (use multiple times for ever more blather)\n"
392 "\t[-E(xisting)] (use existing pool instead of creating new one)\n"
393 "\t[-T time] total run time (default: %llu sec)\n"
394 "\t[-P passtime] time per pass (default: %llu sec)\n"
34dc7c2f
BB
395 "\t[-h] (print help)\n"
396 "",
397 cmdname,
398 (u_longlong_t)zopt_vdevs, /* -v */
399 nice_vdev_size, /* -s */
400 zopt_ashift, /* -a */
401 zopt_mirrors, /* -m */
402 zopt_raidz, /* -r */
403 zopt_raidz_parity, /* -R */
404 zopt_datasets, /* -d */
405 zopt_threads, /* -t */
406 nice_gang_bang, /* -g */
407 zopt_init, /* -i */
408 (u_longlong_t)zopt_killrate, /* -k */
409 zopt_pool, /* -p */
410 zopt_dir, /* -f */
411 (u_longlong_t)zopt_time, /* -T */
b128c09f 412 (u_longlong_t)zopt_passtime); /* -P */
34dc7c2f
BB
413 exit(requested ? 0 : 1);
414}
415
416static uint64_t
417ztest_random(uint64_t range)
418{
419 uint64_t r;
420
421 if (range == 0)
422 return (0);
423
424 if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r))
425 fatal(1, "short read from /dev/urandom");
426
427 return (r % range);
428}
429
fb5f0bc8 430/* ARGSUSED */
34dc7c2f
BB
431static void
432ztest_record_enospc(char *s)
433{
34dc7c2f
BB
434 ztest_shared->zs_enospc_count++;
435}
436
437static void
438process_options(int argc, char **argv)
439{
440 int opt;
441 uint64_t value;
442
443 /* By default, test gang blocks for blocks 32K and greater */
444 metaslab_gang_bang = 32 << 10;
445
34dc7c2f 446 while ((opt = getopt(argc, argv,
b128c09f 447 "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) {
34dc7c2f
BB
448 value = 0;
449 switch (opt) {
450 case 'v':
451 case 's':
452 case 'a':
453 case 'm':
454 case 'r':
455 case 'R':
456 case 'd':
457 case 't':
458 case 'g':
459 case 'i':
460 case 'k':
461 case 'T':
462 case 'P':
34dc7c2f
BB
463 value = nicenumtoull(optarg);
464 }
465 switch (opt) {
466 case 'v':
467 zopt_vdevs = value;
468 break;
469 case 's':
470 zopt_vdev_size = MAX(SPA_MINDEVSIZE, value);
471 break;
472 case 'a':
473 zopt_ashift = value;
474 break;
475 case 'm':
476 zopt_mirrors = value;
477 break;
478 case 'r':
479 zopt_raidz = MAX(1, value);
480 break;
481 case 'R':
482 zopt_raidz_parity = MIN(MAX(value, 1), 2);
483 break;
484 case 'd':
485 zopt_datasets = MAX(1, value);
486 break;
487 case 't':
488 zopt_threads = MAX(1, value);
489 break;
490 case 'g':
491 metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value);
492 break;
493 case 'i':
494 zopt_init = value;
495 break;
496 case 'k':
497 zopt_killrate = value;
498 break;
499 case 'p':
500 zopt_pool = strdup(optarg);
501 break;
502 case 'f':
503 zopt_dir = strdup(optarg);
504 break;
505 case 'V':
506 zopt_verbose++;
507 break;
508 case 'E':
509 zopt_init = 0;
510 break;
511 case 'T':
512 zopt_time = value;
513 break;
514 case 'P':
515 zopt_passtime = MAX(1, value);
516 break;
34dc7c2f
BB
517 case 'h':
518 usage(B_TRUE);
519 break;
520 case '?':
521 default:
522 usage(B_FALSE);
523 break;
524 }
525 }
526
527 zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1);
528
529 zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX);
530 zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1;
531}
532
533static uint64_t
534ztest_get_ashift(void)
535{
536 if (zopt_ashift == 0)
537 return (SPA_MINBLOCKSHIFT + ztest_random(3));
538 return (zopt_ashift);
539}
540
541static nvlist_t *
b128c09f 542make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift)
34dc7c2f 543{
b128c09f 544 char pathbuf[MAXPATHLEN];
34dc7c2f 545 uint64_t vdev;
34dc7c2f
BB
546 nvlist_t *file;
547
b128c09f
BB
548 if (ashift == 0)
549 ashift = ztest_get_ashift();
550
551 if (path == NULL) {
552 path = pathbuf;
553
554 if (aux != NULL) {
555 vdev = ztest_shared->zs_vdev_aux;
556 (void) sprintf(path, ztest_aux_template,
557 zopt_dir, zopt_pool, aux, vdev);
558 } else {
559 vdev = ztest_shared->zs_vdev_primaries++;
560 (void) sprintf(path, ztest_dev_template,
561 zopt_dir, zopt_pool, vdev);
562 }
563 }
34dc7c2f 564
b128c09f
BB
565 if (size != 0) {
566 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
34dc7c2f 567 if (fd == -1)
b128c09f 568 fatal(1, "can't open %s", path);
34dc7c2f 569 if (ftruncate(fd, size) != 0)
b128c09f 570 fatal(1, "can't ftruncate %s", path);
34dc7c2f
BB
571 (void) close(fd);
572 }
573
574 VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
575 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
b128c09f 576 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
34dc7c2f
BB
577 VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);
578
579 return (file);
580}
581
582static nvlist_t *
b128c09f 583make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r)
34dc7c2f
BB
584{
585 nvlist_t *raidz, **child;
586 int c;
587
588 if (r < 2)
b128c09f 589 return (make_vdev_file(path, aux, size, ashift));
34dc7c2f
BB
590 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);
591
592 for (c = 0; c < r; c++)
b128c09f 593 child[c] = make_vdev_file(path, aux, size, ashift);
34dc7c2f
BB
594
595 VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
596 VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
597 VDEV_TYPE_RAIDZ) == 0);
598 VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
599 zopt_raidz_parity) == 0);
600 VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
601 child, r) == 0);
602
603 for (c = 0; c < r; c++)
604 nvlist_free(child[c]);
605
606 umem_free(child, r * sizeof (nvlist_t *));
607
608 return (raidz);
609}
610
611static nvlist_t *
b128c09f
BB
612make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift,
613 int r, int m)
34dc7c2f
BB
614{
615 nvlist_t *mirror, **child;
616 int c;
617
618 if (m < 1)
b128c09f 619 return (make_vdev_raidz(path, aux, size, ashift, r));
34dc7c2f
BB
620
621 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);
622
623 for (c = 0; c < m; c++)
b128c09f 624 child[c] = make_vdev_raidz(path, aux, size, ashift, r);
34dc7c2f
BB
625
626 VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
627 VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
628 VDEV_TYPE_MIRROR) == 0);
629 VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
630 child, m) == 0);
34dc7c2f
BB
631
632 for (c = 0; c < m; c++)
633 nvlist_free(child[c]);
634
635 umem_free(child, m * sizeof (nvlist_t *));
636
637 return (mirror);
638}
639
640static nvlist_t *
b128c09f
BB
641make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift,
642 int log, int r, int m, int t)
34dc7c2f
BB
643{
644 nvlist_t *root, **child;
645 int c;
646
647 ASSERT(t > 0);
648
649 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);
650
b128c09f
BB
651 for (c = 0; c < t; c++) {
652 child[c] = make_vdev_mirror(path, aux, size, ashift, r, m);
653 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
654 log) == 0);
655 }
34dc7c2f
BB
656
657 VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
658 VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
b128c09f 659 VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
34dc7c2f
BB
660 child, t) == 0);
661
662 for (c = 0; c < t; c++)
663 nvlist_free(child[c]);
664
665 umem_free(child, t * sizeof (nvlist_t *));
666
667 return (root);
668}
669
670static void
671ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx)
672{
673 int bs = SPA_MINBLOCKSHIFT +
674 ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1);
675 int ibs = DN_MIN_INDBLKSHIFT +
676 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1);
677 int error;
678
679 error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx);
680 if (error) {
681 char osname[300];
682 dmu_objset_name(os, osname);
683 fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d",
684 osname, object, 1 << bs, ibs, error);
685 }
686}
687
688static uint8_t
689ztest_random_checksum(void)
690{
691 uint8_t checksum;
692
693 do {
694 checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS);
695 } while (zio_checksum_table[checksum].ci_zbt);
696
697 if (checksum == ZIO_CHECKSUM_OFF)
698 checksum = ZIO_CHECKSUM_ON;
699
700 return (checksum);
701}
702
703static uint8_t
704ztest_random_compress(void)
705{
706 return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS));
707}
708
34dc7c2f 709static int
fb5f0bc8 710ztest_replay_create(objset_t *os, lr_create_t *lr, boolean_t byteswap)
34dc7c2f 711{
34dc7c2f
BB
712 dmu_tx_t *tx;
713 int error;
714
715 if (byteswap)
716 byteswap_uint64_array(lr, sizeof (*lr));
717
718 tx = dmu_tx_create(os);
719 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
fb5f0bc8 720 error = dmu_tx_assign(tx, TXG_WAIT);
34dc7c2f
BB
721 if (error) {
722 dmu_tx_abort(tx);
723 return (error);
724 }
725
726 error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0,
727 DMU_OT_NONE, 0, tx);
728 ASSERT3U(error, ==, 0);
729 dmu_tx_commit(tx);
730
731 if (zopt_verbose >= 5) {
732 char osname[MAXNAMELEN];
733 dmu_objset_name(os, osname);
734 (void) printf("replay create of %s object %llu"
735 " in txg %llu = %d\n",
736 osname, (u_longlong_t)lr->lr_doid,
fb5f0bc8 737 (u_longlong_t)dmu_tx_get_txg(tx), error);
34dc7c2f
BB
738 }
739
740 return (error);
741}
742
743static int
fb5f0bc8 744ztest_replay_remove(objset_t *os, lr_remove_t *lr, boolean_t byteswap)
34dc7c2f 745{
34dc7c2f
BB
746 dmu_tx_t *tx;
747 int error;
748
749 if (byteswap)
750 byteswap_uint64_array(lr, sizeof (*lr));
751
752 tx = dmu_tx_create(os);
753 dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END);
fb5f0bc8 754 error = dmu_tx_assign(tx, TXG_WAIT);
34dc7c2f
BB
755 if (error) {
756 dmu_tx_abort(tx);
757 return (error);
758 }
759
760 error = dmu_object_free(os, lr->lr_doid, tx);
761 dmu_tx_commit(tx);
762
763 return (error);
764}
765
766zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
767 NULL, /* 0 no such transaction type */
768 ztest_replay_create, /* TX_CREATE */
769 NULL, /* TX_MKDIR */
770 NULL, /* TX_MKXATTR */
771 NULL, /* TX_SYMLINK */
772 ztest_replay_remove, /* TX_REMOVE */
773 NULL, /* TX_RMDIR */
774 NULL, /* TX_LINK */
775 NULL, /* TX_RENAME */
776 NULL, /* TX_WRITE */
777 NULL, /* TX_TRUNCATE */
778 NULL, /* TX_SETATTR */
779 NULL, /* TX_ACL */
780};
781
782/*
783 * Verify that we can't destroy an active pool, create an existing pool,
784 * or create a pool with a bad vdev spec.
785 */
786void
787ztest_spa_create_destroy(ztest_args_t *za)
788{
789 int error;
790 spa_t *spa;
791 nvlist_t *nvroot;
792
793 /*
794 * Attempt to create using a bad file.
795 */
b128c09f
BB
796 nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
797 error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL);
34dc7c2f
BB
798 nvlist_free(nvroot);
799 if (error != ENOENT)
800 fatal(0, "spa_create(bad_file) = %d", error);
801
802 /*
803 * Attempt to create using a bad mirror.
804 */
b128c09f
BB
805 nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1);
806 error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL);
34dc7c2f
BB
807 nvlist_free(nvroot);
808 if (error != ENOENT)
809 fatal(0, "spa_create(bad_mirror) = %d", error);
810
811 /*
812 * Attempt to create an existing pool. It shouldn't matter
813 * what's in the nvroot; we should fail with EEXIST.
814 */
815 (void) rw_rdlock(&ztest_shared->zs_name_lock);
b128c09f
BB
816 nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1);
817 error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL);
34dc7c2f
BB
818 nvlist_free(nvroot);
819 if (error != EEXIST)
820 fatal(0, "spa_create(whatever) = %d", error);
821
822 error = spa_open(za->za_pool, &spa, FTAG);
823 if (error)
824 fatal(0, "spa_open() = %d", error);
825
826 error = spa_destroy(za->za_pool);
827 if (error != EBUSY)
828 fatal(0, "spa_destroy() = %d", error);
829
830 spa_close(spa, FTAG);
831 (void) rw_unlock(&ztest_shared->zs_name_lock);
832}
833
b128c09f
BB
834static vdev_t *
835vdev_lookup_by_path(vdev_t *vd, const char *path)
836{
837 vdev_t *mvd;
838
839 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
840 return (vd);
841
842 for (int c = 0; c < vd->vdev_children; c++)
843 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
844 NULL)
845 return (mvd);
846
847 return (NULL);
848}
849
34dc7c2f
BB
850/*
851 * Verify that vdev_add() works as expected.
852 */
853void
854ztest_vdev_add_remove(ztest_args_t *za)
855{
856 spa_t *spa = za->za_spa;
857 uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
858 nvlist_t *nvroot;
859 int error;
860
34dc7c2f
BB
861 (void) mutex_lock(&ztest_shared->zs_vdev_lock);
862
b128c09f 863 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
34dc7c2f
BB
864
865 ztest_shared->zs_vdev_primaries =
866 spa->spa_root_vdev->vdev_children * leaves;
867
b128c09f 868 spa_config_exit(spa, SCL_VDEV, FTAG);
34dc7c2f
BB
869
870 /*
871 * Make 1/4 of the devices be log devices.
872 */
b128c09f 873 nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
34dc7c2f
BB
874 ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1);
875
876 error = spa_vdev_add(spa, nvroot);
877 nvlist_free(nvroot);
878
879 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
880
881 if (error == ENOSPC)
882 ztest_record_enospc("spa_vdev_add");
883 else if (error != 0)
884 fatal(0, "spa_vdev_add() = %d", error);
34dc7c2f
BB
885}
886
b128c09f
BB
887/*
888 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
889 */
890void
891ztest_vdev_aux_add_remove(ztest_args_t *za)
34dc7c2f 892{
b128c09f
BB
893 spa_t *spa = za->za_spa;
894 vdev_t *rvd = spa->spa_root_vdev;
895 spa_aux_vdev_t *sav;
896 char *aux;
897 uint64_t guid = 0;
898 int error;
34dc7c2f 899
b128c09f
BB
900 if (ztest_random(2) == 0) {
901 sav = &spa->spa_spares;
902 aux = ZPOOL_CONFIG_SPARES;
903 } else {
904 sav = &spa->spa_l2cache;
905 aux = ZPOOL_CONFIG_L2CACHE;
906 }
907
908 (void) mutex_lock(&ztest_shared->zs_vdev_lock);
909
910 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
911
912 if (sav->sav_count != 0 && ztest_random(4) == 0) {
913 /*
914 * Pick a random device to remove.
915 */
916 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
917 } else {
918 /*
919 * Find an unused device we can add.
920 */
921 ztest_shared->zs_vdev_aux = 0;
922 for (;;) {
923 char path[MAXPATHLEN];
924 int c;
925 (void) sprintf(path, ztest_aux_template, zopt_dir,
926 zopt_pool, aux, ztest_shared->zs_vdev_aux);
927 for (c = 0; c < sav->sav_count; c++)
928 if (strcmp(sav->sav_vdevs[c]->vdev_path,
929 path) == 0)
930 break;
931 if (c == sav->sav_count &&
932 vdev_lookup_by_path(rvd, path) == NULL)
933 break;
934 ztest_shared->zs_vdev_aux++;
34dc7c2f
BB
935 }
936 }
937
b128c09f 938 spa_config_exit(spa, SCL_VDEV, FTAG);
34dc7c2f 939
b128c09f
BB
940 if (guid == 0) {
941 /*
942 * Add a new device.
943 */
944 nvlist_t *nvroot = make_vdev_root(NULL, aux,
945 (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
946 error = spa_vdev_add(spa, nvroot);
947 if (error != 0)
948 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
949 nvlist_free(nvroot);
950 } else {
951 /*
952 * Remove an existing device. Sometimes, dirty its
953 * vdev state first to make sure we handle removal
954 * of devices that have pending state changes.
955 */
956 if (ztest_random(2) == 0)
9babb374 957 (void) vdev_online(spa, guid, 0, NULL);
b128c09f
BB
958
959 error = spa_vdev_remove(spa, guid, B_FALSE);
960 if (error != 0 && error != EBUSY)
961 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
962 }
963
964 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
34dc7c2f
BB
965}
966
967/*
968 * Verify that we can attach and detach devices.
969 */
970void
971ztest_vdev_attach_detach(ztest_args_t *za)
972{
973 spa_t *spa = za->za_spa;
b128c09f 974 spa_aux_vdev_t *sav = &spa->spa_spares;
34dc7c2f
BB
975 vdev_t *rvd = spa->spa_root_vdev;
976 vdev_t *oldvd, *newvd, *pvd;
b128c09f 977 nvlist_t *root;
34dc7c2f
BB
978 uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
979 uint64_t leaf, top;
980 uint64_t ashift = ztest_get_ashift();
fb5f0bc8 981 uint64_t oldguid, pguid;
34dc7c2f
BB
982 size_t oldsize, newsize;
983 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
984 int replacing;
b128c09f
BB
985 int oldvd_has_siblings = B_FALSE;
986 int newvd_is_spare = B_FALSE;
987 int oldvd_is_log;
34dc7c2f 988 int error, expected_error;
34dc7c2f
BB
989
990 (void) mutex_lock(&ztest_shared->zs_vdev_lock);
991
b128c09f 992 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
34dc7c2f
BB
993
994 /*
995 * Decide whether to do an attach or a replace.
996 */
997 replacing = ztest_random(2);
998
999 /*
1000 * Pick a random top-level vdev.
1001 */
1002 top = ztest_random(rvd->vdev_children);
1003
1004 /*
1005 * Pick a random leaf within it.
1006 */
1007 leaf = ztest_random(leaves);
1008
1009 /*
b128c09f 1010 * Locate this vdev.
34dc7c2f 1011 */
b128c09f 1012 oldvd = rvd->vdev_child[top];
fb5f0bc8
BB
1013 if (zopt_mirrors >= 1) {
1014 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
1015 ASSERT(oldvd->vdev_children >= zopt_mirrors);
b128c09f 1016 oldvd = oldvd->vdev_child[leaf / zopt_raidz];
fb5f0bc8
BB
1017 }
1018 if (zopt_raidz > 1) {
1019 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
1020 ASSERT(oldvd->vdev_children == zopt_raidz);
b128c09f 1021 oldvd = oldvd->vdev_child[leaf % zopt_raidz];
fb5f0bc8 1022 }
34dc7c2f
BB
1023
1024 /*
b128c09f
BB
1025 * If we're already doing an attach or replace, oldvd may be a
1026 * mirror vdev -- in which case, pick a random child.
34dc7c2f 1027 */
b128c09f
BB
1028 while (oldvd->vdev_children != 0) {
1029 oldvd_has_siblings = B_TRUE;
fb5f0bc8
BB
1030 ASSERT(oldvd->vdev_children >= 2);
1031 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
b128c09f
BB
1032 }
1033
1034 oldguid = oldvd->vdev_guid;
9babb374 1035 oldsize = vdev_get_min_asize(oldvd);
b128c09f
BB
1036 oldvd_is_log = oldvd->vdev_top->vdev_islog;
1037 (void) strcpy(oldpath, oldvd->vdev_path);
1038 pvd = oldvd->vdev_parent;
fb5f0bc8 1039 pguid = pvd->vdev_guid;
34dc7c2f
BB
1040
1041 /*
b128c09f 1042 * If oldvd has siblings, then half of the time, detach it.
34dc7c2f 1043 */
b128c09f
BB
1044 if (oldvd_has_siblings && ztest_random(2) == 0) {
1045 spa_config_exit(spa, SCL_VDEV, FTAG);
fb5f0bc8
BB
1046 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
1047 if (error != 0 && error != ENODEV && error != EBUSY &&
1048 error != ENOTSUP)
1049 fatal(0, "detach (%s) returned %d", oldpath, error);
b128c09f
BB
1050 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
1051 return;
1052 }
34dc7c2f
BB
1053
1054 /*
b128c09f
BB
1055 * For the new vdev, choose with equal probability between the two
1056 * standard paths (ending in either 'a' or 'b') or a random hot spare.
34dc7c2f 1057 */
b128c09f
BB
1058 if (sav->sav_count != 0 && ztest_random(3) == 0) {
1059 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
1060 newvd_is_spare = B_TRUE;
1061 (void) strcpy(newpath, newvd->vdev_path);
1062 } else {
1063 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
1064 zopt_dir, zopt_pool, top * leaves + leaf);
1065 if (ztest_random(2) == 0)
1066 newpath[strlen(newpath) - 1] = 'b';
1067 newvd = vdev_lookup_by_path(rvd, newpath);
1068 }
1069
1070 if (newvd) {
9babb374 1071 newsize = vdev_get_min_asize(newvd);
b128c09f
BB
1072 } else {
1073 /*
1074 * Make newsize a little bigger or smaller than oldsize.
1075 * If it's smaller, the attach should fail.
1076 * If it's larger, and we're doing a replace,
1077 * we should get dynamic LUN growth when we're done.
1078 */
1079 newsize = 10 * oldsize / (9 + ztest_random(3));
1080 }
34dc7c2f
BB
1081
1082 /*
1083 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
1084 * unless it's a replace; in that case any non-replacing parent is OK.
1085 *
1086 * If newvd is already part of the pool, it should fail with EBUSY.
1087 *
1088 * If newvd is too small, it should fail with EOVERFLOW.
1089 */
b128c09f
BB
1090 if (pvd->vdev_ops != &vdev_mirror_ops &&
1091 pvd->vdev_ops != &vdev_root_ops && (!replacing ||
1092 pvd->vdev_ops == &vdev_replacing_ops ||
1093 pvd->vdev_ops == &vdev_spare_ops))
34dc7c2f 1094 expected_error = ENOTSUP;
b128c09f
BB
1095 else if (newvd_is_spare && (!replacing || oldvd_is_log))
1096 expected_error = ENOTSUP;
1097 else if (newvd == oldvd)
1098 expected_error = replacing ? 0 : EBUSY;
1099 else if (vdev_lookup_by_path(rvd, newpath) != NULL)
1100 expected_error = EBUSY;
34dc7c2f
BB
1101 else if (newsize < oldsize)
1102 expected_error = EOVERFLOW;
1103 else if (ashift > oldvd->vdev_top->vdev_ashift)
1104 expected_error = EDOM;
1105 else
1106 expected_error = 0;
1107
b128c09f 1108 spa_config_exit(spa, SCL_VDEV, FTAG);
34dc7c2f
BB
1109
1110 /*
1111 * Build the nvlist describing newpath.
1112 */
b128c09f
BB
1113 root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0,
1114 ashift, 0, 0, 0, 1);
34dc7c2f 1115
b128c09f 1116 error = spa_vdev_attach(spa, oldguid, root, replacing);
34dc7c2f 1117
34dc7c2f
BB
1118 nvlist_free(root);
1119
1120 /*
1121 * If our parent was the replacing vdev, but the replace completed,
1122 * then instead of failing with ENOTSUP we may either succeed,
1123 * fail with ENODEV, or fail with EOVERFLOW.
1124 */
1125 if (expected_error == ENOTSUP &&
1126 (error == 0 || error == ENODEV || error == EOVERFLOW))
1127 expected_error = error;
1128
1129 /*
1130 * If someone grew the LUN, the replacement may be too small.
1131 */
b128c09f 1132 if (error == EOVERFLOW || error == EBUSY)
34dc7c2f
BB
1133 expected_error = error;
1134
b128c09f
BB
1135 /* XXX workaround 6690467 */
1136 if (error != expected_error && expected_error != EBUSY) {
1137 fatal(0, "attach (%s %llu, %s %llu, %d) "
1138 "returned %d, expected %d",
1139 oldpath, (longlong_t)oldsize, newpath,
1140 (longlong_t)newsize, replacing, error, expected_error);
34dc7c2f
BB
1141 }
1142
1143 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
1144}
1145
9babb374
BB
1146/*
1147 * Callback function which expands the physical size of the vdev.
1148 */
1149vdev_t *
1150grow_vdev(vdev_t *vd, void *arg)
1151{
1152 spa_t *spa = vd->vdev_spa;
1153 size_t *newsize = arg;
1154 size_t fsize;
1155 int fd;
1156
1157 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
1158 ASSERT(vd->vdev_ops->vdev_op_leaf);
1159
1160 if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
1161 return (vd);
1162
1163 fsize = lseek(fd, 0, SEEK_END);
1164 (void) ftruncate(fd, *newsize);
1165
1166 if (zopt_verbose >= 6) {
1167 (void) printf("%s grew from %lu to %lu bytes\n",
1168 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
1169 }
1170 (void) close(fd);
1171 return (NULL);
1172}
1173
1174/*
1175 * Callback function which expands a given vdev by calling vdev_online().
1176 */
1177/* ARGSUSED */
1178vdev_t *
1179online_vdev(vdev_t *vd, void *arg)
1180{
1181 spa_t *spa = vd->vdev_spa;
1182 vdev_t *tvd = vd->vdev_top;
1183 vdev_t *pvd = vd->vdev_parent;
1184 uint64_t guid = vd->vdev_guid;
1185
1186 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
1187 ASSERT(vd->vdev_ops->vdev_op_leaf);
1188
1189 /* Calling vdev_online will initialize the new metaslabs */
1190 spa_config_exit(spa, SCL_STATE, spa);
1191 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
1192 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
1193
1194 /*
1195 * Since we dropped the lock we need to ensure that we're
1196 * still talking to the original vdev. It's possible this
1197 * vdev may have been detached/replaced while we were
1198 * trying to online it.
1199 */
1200 if (vd != vdev_lookup_by_guid(tvd, guid) || vd->vdev_parent != pvd) {
1201 if (zopt_verbose >= 6) {
1202 (void) printf("vdev %p has disappeared, was "
1203 "guid %llu\n", (void *)vd, (u_longlong_t)guid);
1204 }
1205 return (vd);
1206 }
1207 return (NULL);
1208}
1209
1210/*
1211 * Traverse the vdev tree calling the supplied function.
1212 * We continue to walk the tree until we either have walked all
1213 * children or we receive a non-NULL return from the callback.
1214 * If a NULL callback is passed, then we just return back the first
1215 * leaf vdev we encounter.
1216 */
1217vdev_t *
1218vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
1219{
1220 if (vd->vdev_ops->vdev_op_leaf) {
1221 if (func == NULL)
1222 return (vd);
1223 else
1224 return (func(vd, arg));
1225 }
1226
1227 for (uint_t c = 0; c < vd->vdev_children; c++) {
1228 vdev_t *cvd = vd->vdev_child[c];
1229 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
1230 return (cvd);
1231 }
1232 return (NULL);
1233}
1234
34dc7c2f
BB
1235/*
1236 * Verify that dynamic LUN growth works as expected.
1237 */
34dc7c2f
BB
1238void
1239ztest_vdev_LUN_growth(ztest_args_t *za)
1240{
1241 spa_t *spa = za->za_spa;
9babb374
BB
1242 vdev_t *vd, *tvd = NULL;
1243 size_t psize, newsize;
1244 uint64_t spa_newsize, spa_cursize, ms_count;
34dc7c2f
BB
1245
1246 (void) mutex_lock(&ztest_shared->zs_vdev_lock);
9babb374
BB
1247 mutex_enter(&spa_namespace_lock);
1248 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
1249
1250 while (tvd == NULL || tvd->vdev_islog) {
1251 uint64_t vdev;
1252
1253 vdev = ztest_random(spa->spa_root_vdev->vdev_children);
1254 tvd = spa->spa_root_vdev->vdev_child[vdev];
1255 }
34dc7c2f
BB
1256
1257 /*
9babb374
BB
1258 * Determine the size of the first leaf vdev associated with
1259 * our top-level device.
34dc7c2f 1260 */
9babb374
BB
1261 vd = vdev_walk_tree(tvd, NULL, NULL);
1262 ASSERT3P(vd, !=, NULL);
1263 ASSERT(vd->vdev_ops->vdev_op_leaf);
34dc7c2f 1264
9babb374 1265 psize = vd->vdev_psize;
34dc7c2f 1266
9babb374
BB
1267 /*
1268 * We only try to expand the vdev if it's less than 4x its
1269 * original size and it has a valid psize.
1270 */
1271 if (psize == 0 || psize >= 4 * zopt_vdev_size) {
1272 spa_config_exit(spa, SCL_STATE, spa);
1273 mutex_exit(&spa_namespace_lock);
1274 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
1275 return;
1276 }
1277 ASSERT(psize > 0);
1278 newsize = psize + psize / 8;
1279 ASSERT3U(newsize, >, psize);
34dc7c2f 1280
9babb374
BB
1281 if (zopt_verbose >= 6) {
1282 (void) printf("Expanding vdev %s from %lu to %lu\n",
1283 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
1284 }
1285
1286 spa_cursize = spa_get_space(spa);
1287 ms_count = tvd->vdev_ms_count;
1288
1289 /*
1290 * Growing the vdev is a two step process:
1291 * 1). expand the physical size (i.e. relabel)
1292 * 2). online the vdev to create the new metaslabs
1293 */
1294 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
1295 vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
1296 tvd->vdev_state != VDEV_STATE_HEALTHY) {
1297 if (zopt_verbose >= 5) {
1298 (void) printf("Could not expand LUN because "
1299 "some vdevs were not healthy\n");
34dc7c2f 1300 }
9babb374
BB
1301 (void) spa_config_exit(spa, SCL_STATE, spa);
1302 mutex_exit(&spa_namespace_lock);
1303 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
1304 return;
34dc7c2f
BB
1305 }
1306
9babb374
BB
1307 (void) spa_config_exit(spa, SCL_STATE, spa);
1308 mutex_exit(&spa_namespace_lock);
1309
1310 /*
1311 * Expanding the LUN will update the config asynchronously,
1312 * thus we must wait for the async thread to complete any
1313 * pending tasks before proceeding.
1314 */
1315 mutex_enter(&spa->spa_async_lock);
1316 while (spa->spa_async_thread != NULL || spa->spa_async_tasks)
1317 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
1318 mutex_exit(&spa->spa_async_lock);
1319
1320 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
1321 spa_newsize = spa_get_space(spa);
1322
1323 /*
1324 * Make sure we were able to grow the pool.
1325 */
1326 if (ms_count >= tvd->vdev_ms_count ||
1327 spa_cursize >= spa_newsize) {
1328 (void) printf("Top-level vdev metaslab count: "
1329 "before %llu, after %llu\n",
1330 (u_longlong_t)ms_count,
1331 (u_longlong_t)tvd->vdev_ms_count);
1332 fatal(0, "LUN expansion failed: before %llu, "
1333 "after %llu\n", spa_cursize, spa_newsize);
1334 } else if (zopt_verbose >= 5) {
1335 char oldnumbuf[6], newnumbuf[6];
1336
1337 nicenum(spa_cursize, oldnumbuf);
1338 nicenum(spa_newsize, newnumbuf);
1339 (void) printf("%s grew from %s to %s\n",
1340 spa->spa_name, oldnumbuf, newnumbuf);
1341 }
1342 spa_config_exit(spa, SCL_STATE, spa);
34dc7c2f
BB
1343 (void) mutex_unlock(&ztest_shared->zs_vdev_lock);
1344}
1345
1346/* ARGSUSED */
1347static void
1348ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
1349{
1350 /*
1351 * Create the directory object.
1352 */
1353 VERIFY(dmu_object_claim(os, ZTEST_DIROBJ,
1354 DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE,
1355 DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0);
1356
1357 VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ,
1358 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
1359
1360 VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ,
1361 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
1362}
1363
1364static int
1365ztest_destroy_cb(char *name, void *arg)
1366{
1367 ztest_args_t *za = arg;
1368 objset_t *os;
1369 dmu_object_info_t *doi = &za->za_doi;
1370 int error;
1371
1372 /*
1373 * Verify that the dataset contains a directory object.
1374 */
1375 error = dmu_objset_open(name, DMU_OST_OTHER,
b128c09f 1376 DS_MODE_USER | DS_MODE_READONLY, &os);
34dc7c2f
BB
1377 ASSERT3U(error, ==, 0);
1378 error = dmu_object_info(os, ZTEST_DIROBJ, doi);
1379 if (error != ENOENT) {
1380 /* We could have crashed in the middle of destroying it */
1381 ASSERT3U(error, ==, 0);
1382 ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER);
1383 ASSERT3S(doi->doi_physical_blks, >=, 0);
1384 }
1385 dmu_objset_close(os);
1386
1387 /*
1388 * Destroy the dataset.
1389 */
1390 error = dmu_objset_destroy(name);
b128c09f
BB
1391 if (error) {
1392 (void) dmu_objset_open(name, DMU_OST_OTHER,
1393 DS_MODE_USER | DS_MODE_READONLY, &os);
1394 fatal(0, "dmu_objset_destroy(os=%p) = %d\n", &os, error);
1395 }
34dc7c2f
BB
1396 return (0);
1397}
1398
1399/*
1400 * Verify that dmu_objset_{create,destroy,open,close} work as expected.
1401 */
1402static uint64_t
1403ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode)
1404{
1405 itx_t *itx;
1406 lr_create_t *lr;
1407 size_t namesize;
1408 char name[24];
1409
1410 (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object);
1411 namesize = strlen(name) + 1;
1412
1413 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize +
1414 ztest_random(ZIL_MAX_BLKSZ));
1415 lr = (lr_create_t *)&itx->itx_lr;
1416 bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr));
1417 lr->lr_doid = object;
1418 lr->lr_foid = 0;
1419 lr->lr_mode = mode;
1420 lr->lr_uid = 0;
1421 lr->lr_gid = 0;
1422 lr->lr_gen = dmu_tx_get_txg(tx);
1423 lr->lr_crtime[0] = time(NULL);
1424 lr->lr_crtime[1] = 0;
1425 lr->lr_rdev = 0;
1426 bcopy(name, (char *)(lr + 1), namesize);
1427
1428 return (zil_itx_assign(zilog, itx, tx));
1429}
1430
1431void
1432ztest_dmu_objset_create_destroy(ztest_args_t *za)
1433{
1434 int error;
b128c09f 1435 objset_t *os, *os2;
34dc7c2f 1436 char name[100];
b128c09f 1437 int basemode, expected_error;
34dc7c2f
BB
1438 zilog_t *zilog;
1439 uint64_t seq;
1440 uint64_t objects;
34dc7c2f
BB
1441
1442 (void) rw_rdlock(&ztest_shared->zs_name_lock);
1443 (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool,
1444 (u_longlong_t)za->za_instance);
1445
b128c09f
BB
1446 basemode = DS_MODE_TYPE(za->za_instance);
1447 if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER)
1448 basemode = DS_MODE_USER;
34dc7c2f
BB
1449
1450 /*
1451 * If this dataset exists from a previous run, process its replay log
1452 * half of the time. If we don't replay it, then dmu_objset_destroy()
1453 * (invoked from ztest_destroy_cb() below) should just throw it away.
1454 */
1455 if (ztest_random(2) == 0 &&
b128c09f 1456 dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) {
fb5f0bc8 1457 zil_replay(os, os, ztest_replay_vector);
34dc7c2f
BB
1458 dmu_objset_close(os);
1459 }
1460
1461 /*
1462 * There may be an old instance of the dataset we're about to
1463 * create lying around from a previous run. If so, destroy it
1464 * and all of its snapshots.
1465 */
1466 (void) dmu_objset_find(name, ztest_destroy_cb, za,
1467 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1468
1469 /*
1470 * Verify that the destroyed dataset is no longer in the namespace.
1471 */
1472 error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
1473 if (error != ENOENT)
1474 fatal(1, "dmu_objset_open(%s) found destroyed dataset %p",
1475 name, os);
1476
1477 /*
1478 * Verify that we can create a new dataset.
1479 */
1480 error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
1481 ztest_create_cb, NULL);
1482 if (error) {
1483 if (error == ENOSPC) {
1484 ztest_record_enospc("dmu_objset_create");
1485 (void) rw_unlock(&ztest_shared->zs_name_lock);
1486 return;
1487 }
1488 fatal(0, "dmu_objset_create(%s) = %d", name, error);
1489 }
1490
1491 error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os);
1492 if (error) {
1493 fatal(0, "dmu_objset_open(%s) = %d", name, error);
1494 }
1495
1496 /*
1497 * Open the intent log for it.
1498 */
1499 zilog = zil_open(os, NULL);
1500
1501 /*
1502 * Put a random number of objects in there.
1503 */
1504 objects = ztest_random(20);
1505 seq = 0;
1506 while (objects-- != 0) {
1507 uint64_t object;
1508 dmu_tx_t *tx = dmu_tx_create(os);
1509 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name));
1510 error = dmu_tx_assign(tx, TXG_WAIT);
1511 if (error) {
1512 dmu_tx_abort(tx);
1513 } else {
1514 object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
1515 DMU_OT_NONE, 0, tx);
1516 ztest_set_random_blocksize(os, object, tx);
1517 seq = ztest_log_create(zilog, tx, object,
1518 DMU_OT_UINT64_OTHER);
1519 dmu_write(os, object, 0, sizeof (name), name, tx);
1520 dmu_tx_commit(tx);
1521 }
1522 if (ztest_random(5) == 0) {
1523 zil_commit(zilog, seq, object);
1524 }
1525 if (ztest_random(100) == 0) {
1526 error = zil_suspend(zilog);
1527 if (error == 0) {
1528 zil_resume(zilog);
1529 }
1530 }
1531 }
1532
1533 /*
1534 * Verify that we cannot create an existing dataset.
1535 */
1536 error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, NULL, NULL);
1537 if (error != EEXIST)
1538 fatal(0, "created existing dataset, error = %d", error);
1539
1540 /*
b128c09f 1541 * Verify that multiple dataset holds are allowed, but only when
34dc7c2f 1542 * the new access mode is compatible with the base mode.
b128c09f
BB
1543 */
1544 if (basemode == DS_MODE_OWNER) {
1545 error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_USER,
1546 &os2);
1547 if (error)
1548 fatal(0, "dmu_objset_open('%s') = %d", name, error);
1549 else
34dc7c2f
BB
1550 dmu_objset_close(os2);
1551 }
b128c09f
BB
1552 error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os2);
1553 expected_error = (basemode == DS_MODE_OWNER) ? EBUSY : 0;
1554 if (error != expected_error)
1555 fatal(0, "dmu_objset_open('%s') = %d, expected %d",
1556 name, error, expected_error);
1557 if (error == 0)
1558 dmu_objset_close(os2);
34dc7c2f
BB
1559
1560 zil_close(zilog);
1561 dmu_objset_close(os);
1562
1563 error = dmu_objset_destroy(name);
1564 if (error)
1565 fatal(0, "dmu_objset_destroy(%s) = %d", name, error);
1566
1567 (void) rw_unlock(&ztest_shared->zs_name_lock);
1568}
1569
1570/*
1571 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
1572 */
1573void
1574ztest_dmu_snapshot_create_destroy(ztest_args_t *za)
1575{
1576 int error;
1577 objset_t *os = za->za_os;
1578 char snapname[100];
1579 char osname[MAXNAMELEN];
1580
1581 (void) rw_rdlock(&ztest_shared->zs_name_lock);
1582 dmu_objset_name(os, osname);
1583 (void) snprintf(snapname, 100, "%s@%llu", osname,
1584 (u_longlong_t)za->za_instance);
1585
1586 error = dmu_objset_destroy(snapname);
1587 if (error != 0 && error != ENOENT)
1588 fatal(0, "dmu_objset_destroy() = %d", error);
9babb374
BB
1589 error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1,
1590 NULL, FALSE);
34dc7c2f
BB
1591 if (error == ENOSPC)
1592 ztest_record_enospc("dmu_take_snapshot");
1593 else if (error != 0 && error != EEXIST)
1594 fatal(0, "dmu_take_snapshot() = %d", error);
1595 (void) rw_unlock(&ztest_shared->zs_name_lock);
1596}
1597
9babb374
BB
1598/*
1599 * Cleanup non-standard snapshots and clones.
1600 */
1601void
1602ztest_dsl_dataset_cleanup(char *osname, uint64_t curval)
1603{
1604 char snap1name[100];
1605 char clone1name[100];
1606 char snap2name[100];
1607 char clone2name[100];
1608 char snap3name[100];
1609 int error;
1610
1611 (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
1612 (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
1613 (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
1614 (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
1615 (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
1616
1617 error = dmu_objset_destroy(clone2name);
1618 if (error && error != ENOENT)
1619 fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error);
1620 error = dmu_objset_destroy(snap3name);
1621 if (error && error != ENOENT)
1622 fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error);
1623 error = dmu_objset_destroy(snap2name);
1624 if (error && error != ENOENT)
1625 fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error);
1626 error = dmu_objset_destroy(clone1name);
1627 if (error && error != ENOENT)
1628 fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error);
1629 error = dmu_objset_destroy(snap1name);
1630 if (error && error != ENOENT)
1631 fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error);
1632}
1633
1634/*
1635 * Verify dsl_dataset_promote handles EBUSY
1636 */
1637void
1638ztest_dsl_dataset_promote_busy(ztest_args_t *za)
1639{
1640 int error;
1641 objset_t *os = za->za_os;
1642 objset_t *clone;
1643 dsl_dataset_t *ds;
1644 char snap1name[100];
1645 char clone1name[100];
1646 char snap2name[100];
1647 char clone2name[100];
1648 char snap3name[100];
1649 char osname[MAXNAMELEN];
1650 uint64_t curval = za->za_instance;
1651
1652 (void) rw_rdlock(&ztest_shared->zs_name_lock);
1653
1654 dmu_objset_name(os, osname);
1655 ztest_dsl_dataset_cleanup(osname, curval);
1656
1657 (void) snprintf(snap1name, 100, "%s@s1_%llu", osname, curval);
1658 (void) snprintf(clone1name, 100, "%s/c1_%llu", osname, curval);
1659 (void) snprintf(snap2name, 100, "%s@s2_%llu", clone1name, curval);
1660 (void) snprintf(clone2name, 100, "%s/c2_%llu", osname, curval);
1661 (void) snprintf(snap3name, 100, "%s@s3_%llu", clone1name, curval);
1662
1663 error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1,
1664 NULL, FALSE);
1665 if (error && error != EEXIST) {
1666 if (error == ENOSPC) {
1667 ztest_record_enospc("dmu_take_snapshot");
1668 goto out;
1669 }
1670 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error);
1671 }
1672
1673 error = dmu_objset_open(snap1name, DMU_OST_OTHER,
1674 DS_MODE_USER | DS_MODE_READONLY, &clone);
1675 if (error)
1676 fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error);
1677
1678 error = dmu_objset_create(clone1name, DMU_OST_OTHER, clone, 0,
1679 NULL, NULL);
1680 dmu_objset_close(clone);
1681 if (error) {
1682 if (error == ENOSPC) {
1683 ztest_record_enospc("dmu_objset_create");
1684 goto out;
1685 }
1686 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error);
1687 }
1688
1689 error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1,
1690 NULL, FALSE);
1691 if (error && error != EEXIST) {
1692 if (error == ENOSPC) {
1693 ztest_record_enospc("dmu_take_snapshot");
1694 goto out;
1695 }
1696 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error);
1697 }
1698
1699 error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1,
1700 NULL, FALSE);
1701 if (error && error != EEXIST) {
1702 if (error == ENOSPC) {
1703 ztest_record_enospc("dmu_take_snapshot");
1704 goto out;
1705 }
1706 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
1707 }
1708
1709 error = dmu_objset_open(snap3name, DMU_OST_OTHER,
1710 DS_MODE_USER | DS_MODE_READONLY, &clone);
1711 if (error)
1712 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error);
1713
1714 error = dmu_objset_create(clone2name, DMU_OST_OTHER, clone, 0,
1715 NULL, NULL);
1716 dmu_objset_close(clone);
1717 if (error) {
1718 if (error == ENOSPC) {
1719 ztest_record_enospc("dmu_objset_create");
1720 goto out;
1721 }
1722 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
1723 }
1724
1725 error = dsl_dataset_own(snap1name, DS_MODE_READONLY, FTAG, &ds);
1726 if (error)
1727 fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
1728 error = dsl_dataset_promote(clone2name);
1729 if (error != EBUSY)
1730 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
1731 error);
1732 dsl_dataset_disown(ds, FTAG);
1733
1734out:
1735 ztest_dsl_dataset_cleanup(osname, curval);
1736
1737 (void) rw_unlock(&ztest_shared->zs_name_lock);
1738}
1739
34dc7c2f
BB
1740/*
1741 * Verify that dmu_object_{alloc,free} work as expected.
1742 */
1743void
1744ztest_dmu_object_alloc_free(ztest_args_t *za)
1745{
1746 objset_t *os = za->za_os;
1747 dmu_buf_t *db;
1748 dmu_tx_t *tx;
1749 uint64_t batchobj, object, batchsize, endoff, temp;
1750 int b, c, error, bonuslen;
1751 dmu_object_info_t *doi = &za->za_doi;
1752 char osname[MAXNAMELEN];
1753
1754 dmu_objset_name(os, osname);
1755
1756 endoff = -8ULL;
1757 batchsize = 2;
1758
1759 /*
1760 * Create a batch object if necessary, and record it in the directory.
1761 */
b128c09f 1762 VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
9babb374 1763 sizeof (uint64_t), &batchobj, DMU_READ_PREFETCH));
34dc7c2f
BB
1764 if (batchobj == 0) {
1765 tx = dmu_tx_create(os);
1766 dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
1767 sizeof (uint64_t));
1768 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1769 error = dmu_tx_assign(tx, TXG_WAIT);
1770 if (error) {
1771 ztest_record_enospc("create a batch object");
1772 dmu_tx_abort(tx);
1773 return;
1774 }
1775 batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
1776 DMU_OT_NONE, 0, tx);
1777 ztest_set_random_blocksize(os, batchobj, tx);
1778 dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
1779 sizeof (uint64_t), &batchobj, tx);
1780 dmu_tx_commit(tx);
1781 }
1782
1783 /*
1784 * Destroy the previous batch of objects.
1785 */
1786 for (b = 0; b < batchsize; b++) {
b128c09f 1787 VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
9babb374 1788 sizeof (uint64_t), &object, DMU_READ_PREFETCH));
34dc7c2f
BB
1789 if (object == 0)
1790 continue;
1791 /*
1792 * Read and validate contents.
1793 * We expect the nth byte of the bonus buffer to be n.
1794 */
1795 VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
1796 za->za_dbuf = db;
1797
1798 dmu_object_info_from_db(db, doi);
1799 ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER);
1800 ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER);
1801 ASSERT3S(doi->doi_physical_blks, >=, 0);
1802
1803 bonuslen = doi->doi_bonus_size;
1804
1805 for (c = 0; c < bonuslen; c++) {
1806 if (((uint8_t *)db->db_data)[c] !=
1807 (uint8_t)(c + bonuslen)) {
1808 fatal(0,
1809 "bad bonus: %s, obj %llu, off %d: %u != %u",
1810 osname, object, c,
1811 ((uint8_t *)db->db_data)[c],
1812 (uint8_t)(c + bonuslen));
1813 }
1814 }
1815
1816 dmu_buf_rele(db, FTAG);
1817 za->za_dbuf = NULL;
1818
1819 /*
1820 * We expect the word at endoff to be our object number.
1821 */
1822 VERIFY(0 == dmu_read(os, object, endoff,
9babb374 1823 sizeof (uint64_t), &temp, DMU_READ_PREFETCH));
34dc7c2f
BB
1824
1825 if (temp != object) {
1826 fatal(0, "bad data in %s, got %llu, expected %llu",
1827 osname, temp, object);
1828 }
1829
1830 /*
1831 * Destroy old object and clear batch entry.
1832 */
1833 tx = dmu_tx_create(os);
1834 dmu_tx_hold_write(tx, batchobj,
1835 b * sizeof (uint64_t), sizeof (uint64_t));
1836 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
1837 error = dmu_tx_assign(tx, TXG_WAIT);
1838 if (error) {
1839 ztest_record_enospc("free object");
1840 dmu_tx_abort(tx);
1841 return;
1842 }
1843 error = dmu_object_free(os, object, tx);
1844 if (error) {
1845 fatal(0, "dmu_object_free('%s', %llu) = %d",
1846 osname, object, error);
1847 }
1848 object = 0;
1849
1850 dmu_object_set_checksum(os, batchobj,
1851 ztest_random_checksum(), tx);
1852 dmu_object_set_compress(os, batchobj,
1853 ztest_random_compress(), tx);
1854
1855 dmu_write(os, batchobj, b * sizeof (uint64_t),
1856 sizeof (uint64_t), &object, tx);
1857
1858 dmu_tx_commit(tx);
1859 }
1860
1861 /*
1862 * Before creating the new batch of objects, generate a bunch of churn.
1863 */
1864 for (b = ztest_random(100); b > 0; b--) {
1865 tx = dmu_tx_create(os);
1866 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1867 error = dmu_tx_assign(tx, TXG_WAIT);
1868 if (error) {
1869 ztest_record_enospc("churn objects");
1870 dmu_tx_abort(tx);
1871 return;
1872 }
1873 object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
1874 DMU_OT_NONE, 0, tx);
1875 ztest_set_random_blocksize(os, object, tx);
1876 error = dmu_object_free(os, object, tx);
1877 if (error) {
1878 fatal(0, "dmu_object_free('%s', %llu) = %d",
1879 osname, object, error);
1880 }
1881 dmu_tx_commit(tx);
1882 }
1883
1884 /*
1885 * Create a new batch of objects with randomly chosen
1886 * blocksizes and record them in the batch directory.
1887 */
1888 for (b = 0; b < batchsize; b++) {
1889 uint32_t va_blksize;
1890 u_longlong_t va_nblocks;
1891
1892 tx = dmu_tx_create(os);
1893 dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t),
1894 sizeof (uint64_t));
1895 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1896 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff,
1897 sizeof (uint64_t));
1898 error = dmu_tx_assign(tx, TXG_WAIT);
1899 if (error) {
1900 ztest_record_enospc("create batchobj");
1901 dmu_tx_abort(tx);
1902 return;
1903 }
1904 bonuslen = (int)ztest_random(dmu_bonus_max()) + 1;
1905
1906 object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
1907 DMU_OT_PLAIN_OTHER, bonuslen, tx);
1908
1909 ztest_set_random_blocksize(os, object, tx);
1910
1911 dmu_object_set_checksum(os, object,
1912 ztest_random_checksum(), tx);
1913 dmu_object_set_compress(os, object,
1914 ztest_random_compress(), tx);
1915
1916 dmu_write(os, batchobj, b * sizeof (uint64_t),
1917 sizeof (uint64_t), &object, tx);
1918
1919 /*
1920 * Write to both the bonus buffer and the regular data.
1921 */
1922 VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0);
1923 za->za_dbuf = db;
1924 ASSERT3U(bonuslen, <=, db->db_size);
1925
1926 dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
1927 ASSERT3S(va_nblocks, >=, 0);
1928
1929 dmu_buf_will_dirty(db, tx);
1930
1931 /*
1932 * See comments above regarding the contents of
1933 * the bonus buffer and the word at endoff.
1934 */
1935 for (c = 0; c < bonuslen; c++)
1936 ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
1937
1938 dmu_buf_rele(db, FTAG);
1939 za->za_dbuf = NULL;
1940
1941 /*
1942 * Write to a large offset to increase indirection.
1943 */
1944 dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx);
1945
1946 dmu_tx_commit(tx);
1947 }
1948}
1949
1950/*
1951 * Verify that dmu_{read,write} work as expected.
1952 */
/*
 * One record of the dmu read/write test; identical copies are expected in
 * packobj and at the head and tail of the matching bigobj chunk.
 */
typedef struct bufwad {
	uint64_t	bw_index;	/* index n this record belongs to */
	uint64_t	bw_txg;		/* presumably the txg that wrote it */
	uint64_t	bw_data;	/* payload word; 0 skips the index check */
} bufwad_t;
1958
/*
 * Per-test state that ztest_dmu_read_write() keeps in the directory
 * object at za_diroff.  All zeroes means nothing has been set up yet.
 */
typedef struct dmu_read_write_dir {
	uint64_t	dd_packobj;	/* object number of the dense object */
	uint64_t	dd_bigobj;	/* object number of the sparse object */
	uint64_t	dd_chunk;	/* bigobj chunk size in bytes; 0 = unset */
} dmu_read_write_dir_t;
1964
1965void
1966ztest_dmu_read_write(ztest_args_t *za)
1967{
1968 objset_t *os = za->za_os;
1969 dmu_read_write_dir_t dd;
1970 dmu_tx_t *tx;
1971 int i, freeit, error;
1972 uint64_t n, s, txg;
1973 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
1974 uint64_t packoff, packsize, bigoff, bigsize;
1975 uint64_t regions = 997;
1976 uint64_t stride = 123456789ULL;
1977 uint64_t width = 40;
1978 int free_percent = 5;
1979
1980 /*
1981 * This test uses two objects, packobj and bigobj, that are always
1982 * updated together (i.e. in the same tx) so that their contents are
1983 * in sync and can be compared. Their contents relate to each other
1984 * in a simple way: packobj is a dense array of 'bufwad' structures,
1985 * while bigobj is a sparse array of the same bufwads. Specifically,
1986 * for any index n, there are three bufwads that should be identical:
1987 *
1988 * packobj, at offset n * sizeof (bufwad_t)
1989 * bigobj, at the head of the nth chunk
1990 * bigobj, at the tail of the nth chunk
1991 *
1992 * The chunk size is arbitrary. It doesn't have to be a power of two,
1993 * and it doesn't have any relation to the object blocksize.
1994 * The only requirement is that it can hold at least two bufwads.
1995 *
1996 * Normally, we write the bufwad to each of these locations.
1997 * However, free_percent of the time we instead write zeroes to
1998 * packobj and perform a dmu_free_range() on bigobj. By comparing
1999 * bigobj to packobj, we can verify that the DMU is correctly
2000 * tracking which parts of an object are allocated and free,
2001 * and that the contents of the allocated blocks are correct.
2002 */
2003
2004 /*
2005 * Read the directory info. If it's the first time, set things up.
2006 */
2007 VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
9babb374 2008 sizeof (dd), &dd, DMU_READ_PREFETCH));
34dc7c2f
BB
2009 if (dd.dd_chunk == 0) {
2010 ASSERT(dd.dd_packobj == 0);
2011 ASSERT(dd.dd_bigobj == 0);
2012 tx = dmu_tx_create(os);
2013 dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
2014 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
2015 error = dmu_tx_assign(tx, TXG_WAIT);
2016 if (error) {
2017 ztest_record_enospc("create r/w directory");
2018 dmu_tx_abort(tx);
2019 return;
2020 }
2021
2022 dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
2023 DMU_OT_NONE, 0, tx);
2024 dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
2025 DMU_OT_NONE, 0, tx);
2026 dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t);
2027
2028 ztest_set_random_blocksize(os, dd.dd_packobj, tx);
2029 ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
2030
2031 dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
2032 tx);
2033 dmu_tx_commit(tx);
2034 }
2035
2036 /*
2037 * Prefetch a random chunk of the big object.
2038 * Our aim here is to get some async reads in flight
2039 * for blocks that we may free below; the DMU should
2040 * handle this race correctly.
2041 */
2042 n = ztest_random(regions) * stride + ztest_random(width);
2043 s = 1 + ztest_random(2 * width - 1);
2044 dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk);
2045
2046 /*
2047 * Pick a random index and compute the offsets into packobj and bigobj.
2048 */
2049 n = ztest_random(regions) * stride + ztest_random(width);
2050 s = 1 + ztest_random(width - 1);
2051
2052 packoff = n * sizeof (bufwad_t);
2053 packsize = s * sizeof (bufwad_t);
2054
2055 bigoff = n * dd.dd_chunk;
2056 bigsize = s * dd.dd_chunk;
2057
2058 packbuf = umem_alloc(packsize, UMEM_NOFAIL);
2059 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
2060
2061 /*
2062 * free_percent of the time, free a range of bigobj rather than
2063 * overwriting it.
2064 */
2065 freeit = (ztest_random(100) < free_percent);
2066
2067 /*
2068 * Read the current contents of our objects.
2069 */
9babb374
BB
2070 error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf,
2071 DMU_READ_PREFETCH);
34dc7c2f 2072 ASSERT3U(error, ==, 0);
9babb374
BB
2073 error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf,
2074 DMU_READ_PREFETCH);
34dc7c2f
BB
2075 ASSERT3U(error, ==, 0);
2076
2077 /*
2078 * Get a tx for the mods to both packobj and bigobj.
2079 */
2080 tx = dmu_tx_create(os);
2081
2082 dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
2083
2084 if (freeit)
2085 dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize);
2086 else
2087 dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
2088
2089 error = dmu_tx_assign(tx, TXG_WAIT);
2090
2091 if (error) {
2092 ztest_record_enospc("dmu r/w range");
2093 dmu_tx_abort(tx);
2094 umem_free(packbuf, packsize);
2095 umem_free(bigbuf, bigsize);
2096 return;
2097 }
2098
2099 txg = dmu_tx_get_txg(tx);
2100
2101 /*
2102 * For each index from n to n + s, verify that the existing bufwad
2103 * in packobj matches the bufwads at the head and tail of the
2104 * corresponding chunk in bigobj. Then update all three bufwads
2105 * with the new values we want to write out.
2106 */
2107 for (i = 0; i < s; i++) {
2108 /* LINTED */
2109 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
2110 /* LINTED */
2111 bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
2112 /* LINTED */
2113 bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
2114
2115 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
2116 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
2117
2118 if (pack->bw_txg > txg)
2119 fatal(0, "future leak: got %llx, open txg is %llx",
2120 pack->bw_txg, txg);
2121
2122 if (pack->bw_data != 0 && pack->bw_index != n + i)
2123 fatal(0, "wrong index: got %llx, wanted %llx+%llx",
2124 pack->bw_index, n, i);
2125
2126 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
2127 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
2128
2129 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
2130 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
2131
2132 if (freeit) {
2133 bzero(pack, sizeof (bufwad_t));
2134 } else {
2135 pack->bw_index = n + i;
2136 pack->bw_txg = txg;
2137 pack->bw_data = 1 + ztest_random(-2ULL);
2138 }
2139 *bigH = *pack;
2140 *bigT = *pack;
2141 }
2142
2143 /*
2144 * We've verified all the old bufwads, and made new ones.
2145 * Now write them out.
2146 */
2147 dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
2148
2149 if (freeit) {
2150 if (zopt_verbose >= 6) {
2151 (void) printf("freeing offset %llx size %llx"
2152 " txg %llx\n",
2153 (u_longlong_t)bigoff,
2154 (u_longlong_t)bigsize,
2155 (u_longlong_t)txg);
2156 }
2157 VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff,
2158 bigsize, tx));
2159 } else {
2160 if (zopt_verbose >= 6) {
2161 (void) printf("writing offset %llx size %llx"
2162 " txg %llx\n",
2163 (u_longlong_t)bigoff,
2164 (u_longlong_t)bigsize,
2165 (u_longlong_t)txg);
2166 }
2167 dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx);
2168 }
2169
2170 dmu_tx_commit(tx);
2171
2172 /*
2173 * Sanity check the stuff we just wrote.
2174 */
2175 {
2176 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
2177 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
2178
2179 VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
9babb374 2180 packsize, packcheck, DMU_READ_PREFETCH));
34dc7c2f 2181 VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
9babb374 2182 bigsize, bigcheck, DMU_READ_PREFETCH));
34dc7c2f
BB
2183
2184 ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
2185 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
2186
2187 umem_free(packcheck, packsize);
2188 umem_free(bigcheck, bigsize);
2189 }
2190
2191 umem_free(packbuf, packsize);
2192 umem_free(bigbuf, bigsize);
2193}
2194
9babb374
BB
2195void
2196compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
2197 uint64_t bigsize, uint64_t n, dmu_read_write_dir_t dd, uint64_t txg)
2198{
2199 uint64_t i;
2200 bufwad_t *pack;
2201 bufwad_t *bigH;
2202 bufwad_t *bigT;
2203
2204 /*
2205 * For each index from n to n + s, verify that the existing bufwad
2206 * in packobj matches the bufwads at the head and tail of the
2207 * corresponding chunk in bigobj. Then update all three bufwads
2208 * with the new values we want to write out.
2209 */
2210 for (i = 0; i < s; i++) {
2211 /* LINTED */
2212 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
2213 /* LINTED */
2214 bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk);
2215 /* LINTED */
2216 bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1;
2217
2218 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
2219 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
2220
2221 if (pack->bw_txg > txg)
2222 fatal(0, "future leak: got %llx, open txg is %llx",
2223 pack->bw_txg, txg);
2224
2225 if (pack->bw_data != 0 && pack->bw_index != n + i)
2226 fatal(0, "wrong index: got %llx, wanted %llx+%llx",
2227 pack->bw_index, n, i);
2228
2229 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
2230 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
2231
2232 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
2233 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
2234
2235 pack->bw_index = n + i;
2236 pack->bw_txg = txg;
2237 pack->bw_data = 1 + ztest_random(-2ULL);
2238
2239 *bigH = *pack;
2240 *bigT = *pack;
2241 }
2242}
2243
2244void
2245ztest_dmu_read_write_zcopy(ztest_args_t *za)
2246{
2247 objset_t *os = za->za_os;
2248 dmu_read_write_dir_t dd;
2249 dmu_tx_t *tx;
2250 uint64_t i;
2251 int error;
2252 uint64_t n, s, txg;
2253 bufwad_t *packbuf, *bigbuf;
2254 uint64_t packoff, packsize, bigoff, bigsize;
2255 uint64_t regions = 997;
2256 uint64_t stride = 123456789ULL;
2257 uint64_t width = 9;
2258 dmu_buf_t *bonus_db;
2259 arc_buf_t **bigbuf_arcbufs;
2260 dmu_object_info_t *doi = &za->za_doi;
2261
2262 /*
2263 * This test uses two objects, packobj and bigobj, that are always
2264 * updated together (i.e. in the same tx) so that their contents are
2265 * in sync and can be compared. Their contents relate to each other
2266 * in a simple way: packobj is a dense array of 'bufwad' structures,
2267 * while bigobj is a sparse array of the same bufwads. Specifically,
2268 * for any index n, there are three bufwads that should be identical:
2269 *
2270 * packobj, at offset n * sizeof (bufwad_t)
2271 * bigobj, at the head of the nth chunk
2272 * bigobj, at the tail of the nth chunk
2273 *
2274 * The chunk size is set equal to bigobj block size so that
2275 * dmu_assign_arcbuf() can be tested for object updates.
2276 */
2277
2278 /*
2279 * Read the directory info. If it's the first time, set things up.
2280 */
2281 VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
2282 sizeof (dd), &dd, DMU_READ_PREFETCH));
2283 if (dd.dd_chunk == 0) {
2284 ASSERT(dd.dd_packobj == 0);
2285 ASSERT(dd.dd_bigobj == 0);
2286 tx = dmu_tx_create(os);
2287 dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd));
2288 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
2289 error = dmu_tx_assign(tx, TXG_WAIT);
2290 if (error) {
2291 ztest_record_enospc("create r/w directory");
2292 dmu_tx_abort(tx);
2293 return;
2294 }
2295
2296 dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
2297 DMU_OT_NONE, 0, tx);
2298 dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0,
2299 DMU_OT_NONE, 0, tx);
2300 ztest_set_random_blocksize(os, dd.dd_packobj, tx);
2301 ztest_set_random_blocksize(os, dd.dd_bigobj, tx);
2302
2303 VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
2304 ASSERT(doi->doi_data_block_size >= 2 * sizeof (bufwad_t));
2305 ASSERT(ISP2(doi->doi_data_block_size));
2306 dd.dd_chunk = doi->doi_data_block_size;
2307
2308 dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd,
2309 tx);
2310 dmu_tx_commit(tx);
2311 } else {
2312 VERIFY(dmu_object_info(os, dd.dd_bigobj, doi) == 0);
2313 VERIFY(ISP2(doi->doi_data_block_size));
2314 VERIFY(dd.dd_chunk == doi->doi_data_block_size);
2315 VERIFY(dd.dd_chunk >= 2 * sizeof (bufwad_t));
2316 }
2317
2318 /*
2319 * Pick a random index and compute the offsets into packobj and bigobj.
2320 */
2321 n = ztest_random(regions) * stride + ztest_random(width);
2322 s = 1 + ztest_random(width - 1);
2323
2324 packoff = n * sizeof (bufwad_t);
2325 packsize = s * sizeof (bufwad_t);
2326
2327 bigoff = n * dd.dd_chunk;
2328 bigsize = s * dd.dd_chunk;
2329
2330 packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
2331 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
2332
2333 VERIFY(dmu_bonus_hold(os, dd.dd_bigobj, FTAG, &bonus_db) == 0);
2334
2335 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
2336
2337 /*
2338 * Iteration 0 test zcopy for DB_UNCACHED dbufs.
2339 * Iteration 1 test zcopy to already referenced dbufs.
2340 * Iteration 2 test zcopy to dirty dbuf in the same txg.
2341 * Iteration 3 test zcopy to dbuf dirty in previous txg.
2342 * Iteration 4 test zcopy when dbuf is no longer dirty.
2343 * Iteration 5 test zcopy when it can't be done.
2344 * Iteration 6 one more zcopy write.
2345 */
2346 for (i = 0; i < 7; i++) {
2347 uint64_t j;
2348 uint64_t off;
2349
2350 /*
2351 * In iteration 5 (i == 5) use arcbufs
2352 * that don't match bigobj blksz to test
2353 * dmu_assign_arcbuf() when it can't directly
2354 * assign an arcbuf to a dbuf.
2355 */
2356 for (j = 0; j < s; j++) {
2357 if (i != 5) {
2358 bigbuf_arcbufs[j] =
2359 dmu_request_arcbuf(bonus_db,
2360 dd.dd_chunk);
2361 } else {
2362 bigbuf_arcbufs[2 * j] =
2363 dmu_request_arcbuf(bonus_db,
2364 dd.dd_chunk / 2);
2365 bigbuf_arcbufs[2 * j + 1] =
2366 dmu_request_arcbuf(bonus_db,
2367 dd.dd_chunk / 2);
2368 }
2369 }
2370
2371 /*
2372 * Get a tx for the mods to both packobj and bigobj.
2373 */
2374 tx = dmu_tx_create(os);
2375
2376 dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize);
2377 dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize);
2378
2379 if (ztest_random(100) == 0) {
2380 error = -1;
2381 } else {
2382 error = dmu_tx_assign(tx, TXG_WAIT);
2383 }
2384
2385 if (error) {
2386 if (error != -1) {
2387 ztest_record_enospc("dmu r/w range");
2388 }
2389 dmu_tx_abort(tx);
2390 umem_free(packbuf, packsize);
2391 umem_free(bigbuf, bigsize);
2392 for (j = 0; j < s; j++) {
2393 if (i != 5) {
2394 dmu_return_arcbuf(bigbuf_arcbufs[j]);
2395 } else {
2396 dmu_return_arcbuf(
2397 bigbuf_arcbufs[2 * j]);
2398 dmu_return_arcbuf(
2399 bigbuf_arcbufs[2 * j + 1]);
2400 }
2401 }
2402 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
2403 dmu_buf_rele(bonus_db, FTAG);
2404 return;
2405 }
2406
2407 txg = dmu_tx_get_txg(tx);
2408
2409 /*
2410 * 50% of the time don't read objects in the 1st iteration to
2411 * test dmu_assign_arcbuf() for the case when there're no
2412 * existing dbufs for the specified offsets.
2413 */
2414 if (i != 0 || ztest_random(2) != 0) {
2415 error = dmu_read(os, dd.dd_packobj, packoff,
2416 packsize, packbuf, DMU_READ_PREFETCH);
2417 ASSERT3U(error, ==, 0);
2418 error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize,
2419 bigbuf, DMU_READ_PREFETCH);
2420 ASSERT3U(error, ==, 0);
2421 }
2422 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
2423 n, dd, txg);
2424
2425 /*
2426 * We've verified all the old bufwads, and made new ones.
2427 * Now write them out.
2428 */
2429 dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx);
2430 if (zopt_verbose >= 6) {
2431 (void) printf("writing offset %llx size %llx"
2432 " txg %llx\n",
2433 (u_longlong_t)bigoff,
2434 (u_longlong_t)bigsize,
2435 (u_longlong_t)txg);
2436 }
2437 for (off = bigoff, j = 0; j < s; j++, off += dd.dd_chunk) {
2438 dmu_buf_t *dbt;
2439 if (i != 5) {
2440 bcopy((caddr_t)bigbuf + (off - bigoff),
2441 bigbuf_arcbufs[j]->b_data, dd.dd_chunk);
2442 } else {
2443 bcopy((caddr_t)bigbuf + (off - bigoff),
2444 bigbuf_arcbufs[2 * j]->b_data,
2445 dd.dd_chunk / 2);
2446 bcopy((caddr_t)bigbuf + (off - bigoff) +
2447 dd.dd_chunk / 2,
2448 bigbuf_arcbufs[2 * j + 1]->b_data,
2449 dd.dd_chunk / 2);
2450 }
2451
2452 if (i == 1) {
2453 VERIFY(dmu_buf_hold(os, dd.dd_bigobj, off,
2454 FTAG, &dbt) == 0);
2455 }
2456 if (i != 5) {
2457 dmu_assign_arcbuf(bonus_db, off,
2458 bigbuf_arcbufs[j], tx);
2459 } else {
2460 dmu_assign_arcbuf(bonus_db, off,
2461 bigbuf_arcbufs[2 * j], tx);
2462 dmu_assign_arcbuf(bonus_db,
2463 off + dd.dd_chunk / 2,
2464 bigbuf_arcbufs[2 * j + 1], tx);
2465 }
2466 if (i == 1) {
2467 dmu_buf_rele(dbt, FTAG);
2468 }
2469 }
2470 dmu_tx_commit(tx);
2471
2472 /*
2473 * Sanity check the stuff we just wrote.
2474 */
2475 {
2476 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
2477 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
2478
2479 VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff,
2480 packsize, packcheck, DMU_READ_PREFETCH));
2481 VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff,
2482 bigsize, bigcheck, DMU_READ_PREFETCH));
2483
2484 ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
2485 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
2486
2487 umem_free(packcheck, packsize);
2488 umem_free(bigcheck, bigsize);
2489 }
2490 if (i == 2) {
2491 txg_wait_open(dmu_objset_pool(os), 0);
2492 } else if (i == 3) {
2493 txg_wait_synced(dmu_objset_pool(os), 0);
2494 }
2495 }
2496
2497 dmu_buf_rele(bonus_db, FTAG);
2498 umem_free(packbuf, packsize);
2499 umem_free(bigbuf, bigsize);
2500 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
2501}
2502
34dc7c2f
BB
2503void
2504ztest_dmu_check_future_leak(ztest_args_t *za)
2505{
2506 objset_t *os = za->za_os;
2507 dmu_buf_t *db;
2508 ztest_block_tag_t *bt;
2509 dmu_object_info_t *doi = &za->za_doi;
2510
2511 /*
2512 * Make sure that, if there is a write record in the bonus buffer
2513 * of the ZTEST_DIROBJ, that the txg for this record is <= the
2514 * last synced txg of the pool.
2515 */
2516 VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
2517 za->za_dbuf = db;
2518 VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0);
2519 ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt));
2520 ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
2521 ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0);
2522 bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt));
2523 if (bt->bt_objset != 0) {
2524 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
2525 ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ);
2526 ASSERT3U(bt->bt_offset, ==, -1ULL);
2527 ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa));
2528 }
2529 dmu_buf_rele(db, FTAG);
2530 za->za_dbuf = NULL;
2531}
2532
2533void
2534ztest_dmu_write_parallel(ztest_args_t *za)
2535{
2536 objset_t *os = za->za_os;
2537 ztest_block_tag_t *rbt = &za->za_rbt;
2538 ztest_block_tag_t *wbt = &za->za_wbt;
2539 const size_t btsize = sizeof (ztest_block_tag_t);
2540 dmu_buf_t *db;
2541 int b, error;
2542 int bs = ZTEST_DIROBJ_BLOCKSIZE;
2543 int do_free = 0;
b128c09f 2544 uint64_t off, txg, txg_how;
34dc7c2f
BB
2545 mutex_t *lp;
2546 char osname[MAXNAMELEN];
2547 char iobuf[SPA_MAXBLOCKSIZE];
2548 blkptr_t blk = { 0 };
2549 uint64_t blkoff;
2550 zbookmark_t zb;
2551 dmu_tx_t *tx = dmu_tx_create(os);
9babb374
BB
2552 dmu_buf_t *bonus_db;
2553 arc_buf_t *abuf = NULL;
34dc7c2f
BB
2554
2555 dmu_objset_name(os, osname);
2556
2557 /*
2558 * Have multiple threads write to large offsets in ZTEST_DIROBJ
2559 * to verify that having multiple threads writing to the same object
2560 * in parallel doesn't cause any trouble.
2561 */
2562 if (ztest_random(4) == 0) {
2563 /*
2564 * Do the bonus buffer instead of a regular block.
2565 * We need a lock to serialize resize vs. others,
2566 * so we hash on the objset ID.
2567 */
2568 b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS;
2569 off = -1ULL;
2570 dmu_tx_hold_bonus(tx, ZTEST_DIROBJ);
2571 } else {
2572 b = ztest_random(ZTEST_SYNC_LOCKS);
2573 off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT);
2574 if (ztest_random(4) == 0) {
2575 do_free = 1;
2576 dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs);
2577 } else {
2578 dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs);
2579 }
2580 }
2581
9babb374
BB
2582 if (off != -1ULL && P2PHASE(off, bs) == 0 && !do_free &&
2583 ztest_random(8) == 0) {
2584 VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &bonus_db) == 0);
2585 abuf = dmu_request_arcbuf(bonus_db, bs);
2586 }
2587
34dc7c2f
BB
2588 txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT;
2589 error = dmu_tx_assign(tx, txg_how);
2590 if (error) {
2591 if (error == ERESTART) {
2592 ASSERT(txg_how == TXG_NOWAIT);
2593 dmu_tx_wait(tx);
2594 } else {
2595 ztest_record_enospc("dmu write parallel");
2596 }
2597 dmu_tx_abort(tx);
9babb374
BB
2598 if (abuf != NULL) {
2599 dmu_return_arcbuf(abuf);
2600 dmu_buf_rele(bonus_db, FTAG);
2601 }
34dc7c2f
BB
2602 return;
2603 }
b128c09f 2604 txg = dmu_tx_get_txg(tx);
34dc7c2f
BB
2605
2606 lp = &ztest_shared->zs_sync_lock[b];
2607 (void) mutex_lock(lp);
2608
2609 wbt->bt_objset = dmu_objset_id(os);
2610 wbt->bt_object = ZTEST_DIROBJ;
2611 wbt->bt_offset = off;
b128c09f 2612 wbt->bt_txg = txg;
34dc7c2f
BB
2613 wbt->bt_thread = za->za_instance;
2614 wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */
2615
b128c09f
BB
2616 /*
2617 * Occasionally, write an all-zero block to test the behavior
2618 * of blocks that compress into holes.
2619 */
2620 if (off != -1ULL && ztest_random(8) == 0)
2621 bzero(wbt, btsize);
2622
34dc7c2f
BB
2623 if (off == -1ULL) {
2624 dmu_object_info_t *doi = &za->za_doi;
2625 char *dboff;
2626
2627 VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0);
2628 za->za_dbuf = db;
2629 dmu_object_info_from_db(db, doi);
2630 ASSERT3U(doi->doi_bonus_size, <=, db->db_size);
2631 ASSERT3U(doi->doi_bonus_size, >=, btsize);
2632 ASSERT3U(doi->doi_bonus_size % btsize, ==, 0);
2633 dboff = (char *)db->db_data + doi->doi_bonus_size - btsize;
2634 bcopy(dboff, rbt, btsize);
2635 if (rbt->bt_objset != 0) {
2636 ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
2637 ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
2638 ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
2639 ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg);
2640 }
2641 if (ztest_random(10) == 0) {
2642 int newsize = (ztest_random(db->db_size /
2643 btsize) + 1) * btsize;
2644
2645 ASSERT3U(newsize, >=, btsize);
2646 ASSERT3U(newsize, <=, db->db_size);
2647 VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0);
2648 dboff = (char *)db->db_data + newsize - btsize;
2649 }
2650 dmu_buf_will_dirty(db, tx);
2651 bcopy(wbt, dboff, btsize);
2652 dmu_buf_rele(db, FTAG);
2653 za->za_dbuf = NULL;
2654 } else if (do_free) {
2655 VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0);
9babb374 2656 } else if (abuf == NULL) {
34dc7c2f 2657 dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx);
9babb374
BB
2658 } else {
2659 bcopy(wbt, abuf->b_data, btsize);
2660 dmu_assign_arcbuf(bonus_db, off, abuf, tx);
2661 dmu_buf_rele(bonus_db, FTAG);
34dc7c2f
BB
2662 }
2663
2664 (void) mutex_unlock(lp);
2665
2666 if (ztest_random(1000) == 0)
2667 (void) poll(NULL, 0, 1); /* open dn_notxholds window */
2668
2669 dmu_tx_commit(tx);
2670
2671 if (ztest_random(10000) == 0)
b128c09f 2672 txg_wait_synced(dmu_objset_pool(os), txg);
34dc7c2f 2673
b128c09f 2674 if (off == -1ULL || do_free)
34dc7c2f
BB
2675 return;
2676
2677 if (ztest_random(2) != 0)
2678 return;
2679
2680 /*
2681 * dmu_sync() the block we just wrote.
2682 */
2683 (void) mutex_lock(lp);
2684
2685 blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
2686 error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db);
2687 za->za_dbuf = db;
2688 if (error) {
34dc7c2f
BB
2689 (void) mutex_unlock(lp);
2690 return;
2691 }
2692 blkoff = off - blkoff;
b128c09f 2693 error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
34dc7c2f
BB
2694 dmu_buf_rele(db, FTAG);
2695 za->za_dbuf = NULL;
2696
9babb374
BB
2697 if (error) {
2698 (void) mutex_unlock(lp);
34dc7c2f 2699 return;
9babb374 2700 }
34dc7c2f 2701
9babb374
BB
2702 if (blk.blk_birth == 0) { /* concurrent free */
2703 (void) mutex_unlock(lp);
34dc7c2f 2704 return;
9babb374 2705 }
34dc7c2f
BB
2706
2707 txg_suspend(dmu_objset_pool(os));
2708
9babb374
BB
2709 (void) mutex_unlock(lp);
2710
34dc7c2f
BB
2711 ASSERT(blk.blk_fill == 1);
2712 ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
2713 ASSERT3U(BP_GET_LEVEL(&blk), ==, 0);
2714 ASSERT3U(BP_GET_LSIZE(&blk), ==, bs);
2715
2716 /*
2717 * Read the block that dmu_sync() returned to make sure its contents
2718 * match what we wrote. We do this while still txg_suspend()ed
2719 * to ensure that the block can't be reused before we read it.
2720 */
2721 zb.zb_objset = dmu_objset_id(os);
2722 zb.zb_object = ZTEST_DIROBJ;
2723 zb.zb_level = 0;
2724 zb.zb_blkid = off / bs;
2725 error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs,
2726 NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb));
2727 ASSERT3U(error, ==, 0);
2728
2729 txg_resume(dmu_objset_pool(os));
2730
2731 bcopy(&iobuf[blkoff], rbt, btsize);
2732
2733 if (rbt->bt_objset == 0) /* concurrent free */
2734 return;
2735
b128c09f
BB
2736 if (wbt->bt_objset == 0) /* all-zero overwrite */
2737 return;
2738
34dc7c2f
BB
2739 ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset);
2740 ASSERT3U(rbt->bt_object, ==, wbt->bt_object);
2741 ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset);
2742
2743 /*
2744 * The semantic of dmu_sync() is that we always push the most recent
2745 * version of the data, so in the face of concurrent updates we may
2746 * see a newer version of the block. That's OK.
2747 */
2748 ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg);
2749 if (rbt->bt_thread == wbt->bt_thread)
2750 ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq);
2751 else
2752 ASSERT3U(rbt->bt_seq, >, wbt->bt_seq);
2753}
2754
2755/*
2756 * Verify that zap_{create,destroy,add,remove,update} work as expected.
2757 */
2758#define ZTEST_ZAP_MIN_INTS 1
2759#define ZTEST_ZAP_MAX_INTS 4
2760#define ZTEST_ZAP_MAX_PROPS 1000
2761
2762void
2763ztest_zap(ztest_args_t *za)
2764{
2765 objset_t *os = za->za_os;
2766 uint64_t object;
2767 uint64_t txg, last_txg;
2768 uint64_t value[ZTEST_ZAP_MAX_INTS];
2769 uint64_t zl_ints, zl_intsize, prop;
2770 int i, ints;
2771 dmu_tx_t *tx;
2772 char propname[100], txgname[100];
2773 int error;
2774 char osname[MAXNAMELEN];
2775 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
2776
2777 dmu_objset_name(os, osname);
2778
2779 /*
2780 * Create a new object if necessary, and record it in the directory.
2781 */
2782 VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff,
9babb374 2783 sizeof (uint64_t), &object, DMU_READ_PREFETCH));
34dc7c2f
BB
2784
2785 if (object == 0) {
2786 tx = dmu_tx_create(os);
2787 dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff,
2788 sizeof (uint64_t));
2789 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL);
2790 error = dmu_tx_assign(tx, TXG_WAIT);
2791 if (error) {
2792 ztest_record_enospc("create zap test obj");
2793 dmu_tx_abort(tx);
2794 return;
2795 }
2796 object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
2797 if (error) {
2798 fatal(0, "zap_create('%s', %llu) = %d",
2799 osname, object, error);
2800 }
2801 ASSERT(object != 0);
2802 dmu_write(os, ZTEST_DIROBJ, za->za_diroff,
2803 sizeof (uint64_t), &object, tx);
2804 /*
2805 * Generate a known hash collision, and verify that
2806 * we can lookup and remove both entries.
2807 */
2808 for (i = 0; i < 2; i++) {
2809 value[i] = i;
2810 error = zap_add(os, object, hc[i], sizeof (uint64_t),
2811 1, &value[i], tx);
2812 ASSERT3U(error, ==, 0);
2813 }
2814 for (i = 0; i < 2; i++) {
2815 error = zap_add(os, object, hc[i], sizeof (uint64_t),
2816 1, &value[i], tx);
2817 ASSERT3U(error, ==, EEXIST);
2818 error = zap_length(os, object, hc[i],
2819 &zl_intsize, &zl_ints);
2820 ASSERT3U(error, ==, 0);
2821 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
2822 ASSERT3U(zl_ints, ==, 1);
2823 }
2824 for (i = 0; i < 2; i++) {
2825 error = zap_remove(os, object, hc[i], tx);
2826 ASSERT3U(error, ==, 0);
2827 }
2828
2829 dmu_tx_commit(tx);
2830 }
2831
2832 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
2833
2834 prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
2835 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
2836 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
2837 bzero(value, sizeof (value));
2838 last_txg = 0;
2839
2840 /*
2841 * If these zap entries already exist, validate their contents.
2842 */
2843 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
2844 if (error == 0) {
2845 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
2846 ASSERT3U(zl_ints, ==, 1);
2847
2848 VERIFY(zap_lookup(os, object, txgname, zl_intsize,
2849 zl_ints, &last_txg) == 0);
2850
2851 VERIFY(zap_length(os, object, propname, &zl_intsize,
2852 &zl_ints) == 0);
2853
2854 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
2855 ASSERT3U(zl_ints, ==, ints);
2856
2857 VERIFY(zap_lookup(os, object, propname, zl_intsize,
2858 zl_ints, value) == 0);
2859
2860 for (i = 0; i < ints; i++) {
2861 ASSERT3U(value[i], ==, last_txg + object + i);
2862 }
2863 } else {
2864 ASSERT3U(error, ==, ENOENT);
2865 }
2866
2867 /*
2868 * Atomically update two entries in our zap object.
2869 * The first is named txg_%llu, and contains the txg
2870 * in which the property was last updated. The second
2871 * is named prop_%llu, and the nth element of its value
2872 * should be txg + object + n.
2873 */
2874 tx = dmu_tx_create(os);
2875 dmu_tx_hold_zap(tx, object, TRUE, NULL);
2876 error = dmu_tx_assign(tx, TXG_WAIT);
2877 if (error) {
2878 ztest_record_enospc("create zap entry");
2879 dmu_tx_abort(tx);
2880 return;
2881 }
2882 txg = dmu_tx_get_txg(tx);
2883
2884 if (last_txg > txg)
2885 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
2886
2887 for (i = 0; i < ints; i++)
2888 value[i] = txg + object + i;
2889
2890 error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx);
2891 if (error)
2892 fatal(0, "zap_update('%s', %llu, '%s') = %d",
2893 osname, object, txgname, error);
2894
2895 error = zap_update(os, object, propname, sizeof (uint64_t),
2896 ints, value, tx);
2897 if (error)
2898 fatal(0, "zap_update('%s', %llu, '%s') = %d",
2899 osname, object, propname, error);
2900
2901 dmu_tx_commit(tx);
2902
2903 /*
2904 * Remove a random pair of entries.
2905 */
2906 prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
2907 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
2908 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
2909
2910 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
2911
2912 if (error == ENOENT)
2913 return;
2914
2915 ASSERT3U(error, ==, 0);
2916
2917 tx = dmu_tx_create(os);
2918 dmu_tx_hold_zap(tx, object, TRUE, NULL);
2919 error = dmu_tx_assign(tx, TXG_WAIT);
2920 if (error) {
2921 ztest_record_enospc("remove zap entry");
2922 dmu_tx_abort(tx);
2923 return;
2924 }
2925 error = zap_remove(os, object, txgname, tx);
2926 if (error)
2927 fatal(0, "zap_remove('%s', %llu, '%s') = %d",
2928 osname, object, txgname, error);
2929
2930 error = zap_remove(os, object, propname, tx);
2931 if (error)
2932 fatal(0, "zap_remove('%s', %llu, '%s') = %d",
2933 osname, object, propname, error);
2934
2935 dmu_tx_commit(tx);
2936
2937 /*
2938 * Once in a while, destroy the object.
2939 */
2940 if (ztest_random(1000) != 0)
2941 return;
2942
2943 tx = dmu_tx_create(os);
2944 dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t));
2945 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
2946 error = dmu_tx_assign(tx, TXG_WAIT);
2947 if (error) {
2948 ztest_record_enospc("destroy zap object");
2949 dmu_tx_abort(tx);
2950 return;
2951 }
2952 error = zap_destroy(os, object, tx);
2953 if (error)
2954 fatal(0, "zap_destroy('%s', %llu) = %d",
2955 osname, object, error);
2956 object = 0;
2957 dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t),
2958 &object, tx);
2959 dmu_tx_commit(tx);
2960}
2961
2962void
2963ztest_zap_parallel(ztest_args_t *za)
2964{
2965 objset_t *os = za->za_os;
2966 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
2967 dmu_tx_t *tx;
2968 int i, namelen, error;
2969 char name[20], string_value[20];
2970 void *data;
2971
2972 /*
2973 * Generate a random name of the form 'xxx.....' where each
2974 * x is a random printable character and the dots are dots.
2975 * There are 94 such characters, and the name length goes from
2976 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
2977 */
2978 namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
2979
2980 for (i = 0; i < 3; i++)
2981 name[i] = '!' + ztest_random('~' - '!' + 1);
2982 for (; i < namelen - 1; i++)
2983 name[i] = '.';
2984 name[i] = '\0';
2985
2986 if (ztest_random(2) == 0)
2987 object = ZTEST_MICROZAP_OBJ;
2988 else
2989 object = ZTEST_FATZAP_OBJ;
2990
2991 if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) {
2992 wsize = sizeof (txg);
2993 wc = 1;
2994 data = &txg;
2995 } else {
2996 wsize = 1;
2997 wc = namelen;
2998 data = string_value;
2999 }
3000
3001 count = -1ULL;
3002 VERIFY(zap_count(os, object, &count) == 0);
3003 ASSERT(count != -1ULL);
3004
3005 /*
3006 * Select an operation: length, lookup, add, update, remove.
3007 */
3008 i = ztest_random(5);
3009
3010 if (i >= 2) {
3011 tx = dmu_tx_create(os);
3012 dmu_tx_hold_zap(tx, object, TRUE, NULL);
3013 error = dmu_tx_assign(tx, TXG_WAIT);
3014 if (error) {
3015 ztest_record_enospc("zap parallel");
3016 dmu_tx_abort(tx);
3017 return;
3018 }
3019 txg = dmu_tx_get_txg(tx);
3020 bcopy(name, string_value, namelen);
3021 } else {
3022 tx = NULL;
3023 txg = 0;
3024 bzero(string_value, namelen);
3025 }
3026
3027 switch (i) {
3028
3029 case 0:
3030 error = zap_length(os, object, name, &zl_wsize, &zl_wc);
3031 if (error == 0) {
3032 ASSERT3U(wsize, ==, zl_wsize);
3033 ASSERT3U(wc, ==, zl_wc);
3034 } else {
3035 ASSERT3U(error, ==, ENOENT);
3036 }
3037 break;
3038
3039 case 1:
3040 error = zap_lookup(os, object, name, wsize, wc, data);
3041 if (error == 0) {
3042 if (data == string_value &&
3043 bcmp(name, data, namelen) != 0)
3044 fatal(0, "name '%s' != val '%s' len %d",
3045 name, data, namelen);
3046 } else {
3047 ASSERT3U(error, ==, ENOENT);
3048 }
3049 break;
3050
3051 case 2:
3052 error = zap_add(os, object, name, wsize, wc, data, tx);
3053 ASSERT(error == 0 || error == EEXIST);
3054 break;
3055
3056 case 3:
3057 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
3058 break;
3059
3060 case 4:
3061 error = zap_remove(os, object, name, tx);
3062 ASSERT(error == 0 || error == ENOENT);
3063 break;
3064 }
3065
3066 if (tx != NULL)
3067 dmu_tx_commit(tx);
3068}
3069
3070void
3071ztest_dsl_prop_get_set(ztest_args_t *za)
3072{
3073 objset_t *os = za->za_os;
3074 int i, inherit;
3075 uint64_t value;
3076 const char *prop, *valname;
3077 char setpoint[MAXPATHLEN];
3078 char osname[MAXNAMELEN];
3079 int error;
3080
3081 (void) rw_rdlock(&ztest_shared->zs_name_lock);
3082
3083 dmu_objset_name(os, osname);
3084
3085 for (i = 0; i < 2; i++) {
3086 if (i == 0) {
3087 prop = "checksum";
3088 value = ztest_random_checksum();
3089 inherit = (value == ZIO_CHECKSUM_INHERIT);
3090 } else {
3091 prop = "compression";
3092 value = ztest_random_compress();
3093 inherit = (value == ZIO_COMPRESS_INHERIT);
3094 }
3095
3096 error = dsl_prop_set(osname, prop, sizeof (value),
3097 !inherit, &value);
3098
3099 if (error == ENOSPC) {
3100 ztest_record_enospc("dsl_prop_set");
3101 break;
3102 }
3103
3104 ASSERT3U(error, ==, 0);
3105
3106 VERIFY3U(dsl_prop_get(osname, prop, sizeof (value),
3107 1, &value, setpoint), ==, 0);
3108
3109 if (i == 0)
3110 valname = zio_checksum_table[value].ci_name;
3111 else
3112 valname = zio_compress_table[value].ci_name;
3113
3114 if (zopt_verbose >= 6) {
3115 (void) printf("%s %s = %s for '%s'\n",
3116 osname, prop, valname, setpoint);
3117 }
3118 }
3119
3120 (void) rw_unlock(&ztest_shared->zs_name_lock);
3121}
3122
34dc7c2f
BB
3123/*
3124 * Inject random faults into the on-disk data.
3125 */
3126void
3127ztest_fault_inject(ztest_args_t *za)
3128{
3129 int fd;
3130 uint64_t offset;
3131 uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz;
3132 uint64_t bad = 0x1990c0ffeedecade;
3133 uint64_t top, leaf;
3134 char path0[MAXPATHLEN];
3135 char pathrand[MAXPATHLEN];
3136 size_t fsize;
3137 spa_t *spa = za->za_spa;
3138 int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
3139 int iters = 1000;
b128c09f
BB
3140 int maxfaults = zopt_maxfaults;
3141 vdev_t *vd0 = NULL;
34dc7c2f
BB
3142 uint64_t guid0 = 0;
3143
b128c09f 3144 ASSERT(leaves >= 1);
34dc7c2f
BB
3145
3146 /*
b128c09f 3147 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
34dc7c2f 3148 */
b128c09f 3149 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
34dc7c2f 3150
b128c09f
BB
3151 if (ztest_random(2) == 0) {
3152 /*
3153 * Inject errors on a normal data device.
3154 */
3155 top = ztest_random(spa->spa_root_vdev->vdev_children);
3156 leaf = ztest_random(leaves);
34dc7c2f 3157
b128c09f
BB
3158 /*
3159 * Generate paths to the first leaf in this top-level vdev,
3160 * and to the random leaf we selected. We'll induce transient
3161 * write failures and random online/offline activity on leaf 0,
3162 * and we'll write random garbage to the randomly chosen leaf.
3163 */
3164 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
3165 zopt_dir, zopt_pool, top * leaves + 0);
3166 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
3167 zopt_dir, zopt_pool, top * leaves + leaf);
34dc7c2f 3168
b128c09f
BB
3169 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
3170 if (vd0 != NULL && maxfaults != 1) {
3171 /*
3172 * Make vd0 explicitly claim to be unreadable,
3173 * or unwriteable, or reach behind its back
3174 * and close the underlying fd. We can do this if
3175 * maxfaults == 0 because we'll fail and reexecute,
3176 * and we can do it if maxfaults >= 2 because we'll
3177 * have enough redundancy. If maxfaults == 1, the
3178 * combination of this with injection of random data
3179 * corruption below exceeds the pool's fault tolerance.
3180 */
3181 vdev_file_t *vf = vd0->vdev_tsd;
3182
3183 if (vf != NULL && ztest_random(3) == 0) {
3184 (void) close(vf->vf_vnode->v_fd);
3185 vf->vf_vnode->v_fd = -1;
3186 } else if (ztest_random(2) == 0) {
3187 vd0->vdev_cant_read = B_TRUE;
3188 } else {
3189 vd0->vdev_cant_write = B_TRUE;
3190 }
3191 guid0 = vd0->vdev_guid;
3192 }
3193 } else {
3194 /*
3195 * Inject errors on an l2cache device.
3196 */
3197 spa_aux_vdev_t *sav = &spa->spa_l2cache;
34dc7c2f 3198
b128c09f
BB
3199 if (sav->sav_count == 0) {
3200 spa_config_exit(spa, SCL_STATE, FTAG);
3201 return;
3202 }
3203 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
34dc7c2f 3204 guid0 = vd0->vdev_guid;
b128c09f
BB
3205 (void) strcpy(path0, vd0->vdev_path);
3206 (void) strcpy(pathrand, vd0->vdev_path);
3207
3208 leaf = 0;
3209 leaves = 1;
3210 maxfaults = INT_MAX; /* no limit on cache devices */
34dc7c2f
BB
3211 }
3212
b128c09f
BB
3213 spa_config_exit(spa, SCL_STATE, FTAG);
3214
3215 if (maxfaults == 0)
3216 return;
34dc7c2f
BB
3217
3218 /*
3219 * If we can tolerate two or more faults, randomly online/offline vd0.
3220 */
b128c09f 3221 if (maxfaults >= 2 && guid0 != 0) {
fb5f0bc8
BB
3222 if (ztest_random(10) < 6) {
3223 int flags = (ztest_random(2) == 0 ?
3224 ZFS_OFFLINE_TEMPORARY : 0);
3225 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
3226 } else {
3227 (void) vdev_online(spa, guid0, 0, NULL);
3228 }
34dc7c2f
BB
3229 }
3230
3231 /*
3232 * We have at least single-fault tolerance, so inject data corruption.
3233 */
3234 fd = open(pathrand, O_RDWR);
3235
3236 if (fd == -1) /* we hit a gap in the device namespace */
3237 return;
3238
3239 fsize = lseek(fd, 0, SEEK_END);
3240
3241 while (--iters != 0) {
3242 offset = ztest_random(fsize / (leaves << bshift)) *
3243 (leaves << bshift) + (leaf << bshift) +
3244 (ztest_random(1ULL << (bshift - 1)) & -8ULL);
3245
3246 if (offset >= fsize)
3247 continue;
3248
3249 if (zopt_verbose >= 6)
3250 (void) printf("injecting bad word into %s,"
3251 " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
3252
3253 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
3254 fatal(1, "can't inject bad word at 0x%llx in %s",
3255 offset, pathrand);
3256 }
3257
3258 (void) close(fd);
3259}
3260
3261/*
3262 * Scrub the pool.
3263 */
3264void
3265ztest_scrub(ztest_args_t *za)
3266{
3267 spa_t *spa = za->za_spa;
3268
b128c09f 3269 (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
34dc7c2f 3270 (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
b128c09f 3271 (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
34dc7c2f
BB
3272}
3273
3274/*
3275 * Rename the pool to a different name and then rename it back.
3276 */
3277void
3278ztest_spa_rename(ztest_args_t *za)
3279{
3280 char *oldname, *newname;
3281 int error;
3282 spa_t *spa;
3283
3284 (void) rw_wrlock(&ztest_shared->zs_name_lock);
3285
3286 oldname = za->za_pool;
3287 newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
3288 (void) strcpy(newname, oldname);
3289 (void) strcat(newname, "_tmp");
3290
3291 /*
3292 * Do the rename
3293 */
3294 error = spa_rename(oldname, newname);
3295 if (error)
3296 fatal(0, "spa_rename('%s', '%s') = %d", oldname,
3297 newname, error);
3298
3299 /*
3300 * Try to open it under the old name, which shouldn't exist
3301 */
3302 error = spa_open(oldname, &spa, FTAG);
3303 if (error != ENOENT)
3304 fatal(0, "spa_open('%s') = %d", oldname, error);
3305
3306 /*
3307 * Open it under the new name and make sure it's still the same spa_t.
3308 */
3309 error = spa_open(newname, &spa, FTAG);
3310 if (error != 0)
3311 fatal(0, "spa_open('%s') = %d", newname, error);
3312
3313 ASSERT(spa == za->za_spa);
3314 spa_close(spa, FTAG);
3315
3316 /*
3317 * Rename it back to the original
3318 */
3319 error = spa_rename(newname, oldname);
3320 if (error)
3321 fatal(0, "spa_rename('%s', '%s') = %d", newname,
3322 oldname, error);
3323
3324 /*
3325 * Make sure it can still be opened
3326 */
3327 error = spa_open(oldname, &spa, FTAG);
3328 if (error != 0)
3329 fatal(0, "spa_open('%s') = %d", oldname, error);
3330
3331 ASSERT(spa == za->za_spa);
3332 spa_close(spa, FTAG);
3333
3334 umem_free(newname, strlen(newname) + 1);
3335
3336 (void) rw_unlock(&ztest_shared->zs_name_lock);
3337}
3338
3339
3340/*
3341 * Completely obliterate one disk.
3342 */
3343static void
3344ztest_obliterate_one_disk(uint64_t vdev)
3345{
3346 int fd;
3347 char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN];
3348 size_t fsize;
3349
3350 if (zopt_maxfaults < 2)
3351 return;
3352
3353 (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
3354 (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name);
3355
3356 fd = open(dev_name, O_RDWR);
3357
3358 if (fd == -1)
3359 fatal(1, "can't open %s", dev_name);
3360
3361 /*
3362 * Determine the size.
3363 */
3364 fsize = lseek(fd, 0, SEEK_END);
3365
3366 (void) close(fd);
3367
3368 /*
3369 * Rename the old device to dev_name.old (useful for debugging).
3370 */
3371 VERIFY(rename(dev_name, copy_name) == 0);
3372
3373 /*
3374 * Create a new one.
3375 */
3376 VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0);
3377 VERIFY(ftruncate(fd, fsize) == 0);
3378 (void) close(fd);
3379}
3380
3381static void
3382ztest_replace_one_disk(spa_t *spa, uint64_t vdev)
3383{
3384 char dev_name[MAXPATHLEN];
b128c09f 3385 nvlist_t *root;
34dc7c2f
BB
3386 int error;
3387 uint64_t guid;
34dc7c2f
BB
3388 vdev_t *vd;
3389
3390 (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev);
3391
3392 /*
3393 * Build the nvlist describing dev_name.
3394 */
b128c09f 3395 root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1);
34dc7c2f 3396
b128c09f 3397 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
34dc7c2f
BB
3398 if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL)
3399 guid = 0;
3400 else
3401 guid = vd->vdev_guid;
b128c09f 3402 spa_config_exit(spa, SCL_VDEV, FTAG);
34dc7c2f
BB
3403 error = spa_vdev_attach(spa, guid, root, B_TRUE);
3404 if (error != 0 &&
3405 error != EBUSY &&
3406 error != ENOTSUP &&
3407 error != ENODEV &&
3408 error != EDOM)
3409 fatal(0, "spa_vdev_attach(in-place) = %d", error);
3410
34dc7c2f
BB
3411 nvlist_free(root);
3412}
3413
3414static void
3415ztest_verify_blocks(char *pool)
3416{
3417 int status;
3418 char zdb[MAXPATHLEN + MAXNAMELEN + 20];
3419 char zbuf[1024];
3420 char *bin;
3421 char *ztest;
3422 char *isa;
3423 int isalen;
3424 FILE *fp;
3425
3426 (void) realpath(getexecname(), zdb);
3427
3428 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
3429 bin = strstr(zdb, "/usr/bin/");
3430 ztest = strstr(bin, "/ztest");
3431 isa = bin + 8;
3432 isalen = ztest - isa;
3433 isa = strdup(isa);
3434 /* LINTED */
3435 (void) sprintf(bin,
9babb374 3436 "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s",
34dc7c2f
BB
3437 isalen,
3438 isa,
3439 zopt_verbose >= 3 ? "s" : "",
3440 zopt_verbose >= 4 ? "v" : "",
b128c09f 3441 pool);
34dc7c2f
BB
3442 free(isa);
3443
3444 if (zopt_verbose >= 5)
3445 (void) printf("Executing %s\n", strstr(zdb, "zdb "));
3446
3447 fp = popen(zdb, "r");
3448
3449 while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
3450 if (zopt_verbose >= 3)
3451 (void) printf("%s", zbuf);
3452
3453 status = pclose(fp);
3454
3455 if (status == 0)
3456 return;
3457
3458 ztest_dump_core = 0;
3459 if (WIFEXITED(status))
3460 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
3461 else
3462 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
3463}
3464
3465static void
3466ztest_walk_pool_directory(char *header)
3467{
3468 spa_t *spa = NULL;
3469
3470 if (zopt_verbose >= 6)
3471 (void) printf("%s\n", header);
3472
3473 mutex_enter(&spa_namespace_lock);
3474 while ((spa = spa_next(spa)) != NULL)
3475 if (zopt_verbose >= 6)
3476 (void) printf("\t%s\n", spa_name(spa));
3477 mutex_exit(&spa_namespace_lock);
3478}
3479
3480static void
3481ztest_spa_import_export(char *oldname, char *newname)
3482{
fb5f0bc8 3483 nvlist_t *config, *newconfig;
34dc7c2f
BB
3484 uint64_t pool_guid;
3485 spa_t *spa;
3486 int error;
3487
3488 if (zopt_verbose >= 4) {
3489 (void) printf("import/export: old = %s, new = %s\n",
3490 oldname, newname);
3491 }
3492
3493 /*
3494 * Clean up from previous runs.
3495 */
3496 (void) spa_destroy(newname);
3497
3498 /*
3499 * Get the pool's configuration and guid.
3500 */
3501 error = spa_open(oldname, &spa, FTAG);
3502 if (error)
3503 fatal(0, "spa_open('%s') = %d", oldname, error);
3504
fb5f0bc8
BB
3505 /*
3506 * Kick off a scrub to tickle scrub/export races.
3507 */
3508 if (ztest_random(2) == 0)
3509 (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
3510
34dc7c2f
BB
3511 pool_guid = spa_guid(spa);
3512 spa_close(spa, FTAG);
3513
3514 ztest_walk_pool_directory("pools before export");
3515
3516 /*
3517 * Export it.
3518 */
fb5f0bc8 3519 error = spa_export(oldname, &config, B_FALSE, B_FALSE);
34dc7c2f
BB
3520 if (error)
3521 fatal(0, "spa_export('%s') = %d", oldname, error);
3522
3523 ztest_walk_pool_directory("pools after export");
3524
fb5f0bc8
BB
3525 /*
3526 * Try to import it.
3527 */
3528 newconfig = spa_tryimport(config);
3529 ASSERT(newconfig != NULL);
3530 nvlist_free(newconfig);
3531
34dc7c2f
BB
3532 /*
3533 * Import it under the new name.
3534 */
3535 error = spa_import(newname, config, NULL);
3536 if (error)
3537 fatal(0, "spa_import('%s') = %d", newname, error);
3538
3539 ztest_walk_pool_directory("pools after import");
3540
3541 /*
3542 * Try to import it again -- should fail with EEXIST.
3543 */
3544 error = spa_import(newname, config, NULL);
3545 if (error != EEXIST)
3546 fatal(0, "spa_import('%s') twice", newname);
3547
3548 /*
3549 * Try to import it under a different name -- should fail with EEXIST.
3550 */
3551 error = spa_import(oldname, config, NULL);
3552 if (error != EEXIST)
3553 fatal(0, "spa_import('%s') under multiple names", newname);
3554
3555 /*
3556 * Verify that the pool is no longer visible under the old name.
3557 */
3558 error = spa_open(oldname, &spa, FTAG);
3559 if (error != ENOENT)
3560 fatal(0, "spa_open('%s') = %d", newname, error);
3561
3562 /*
3563 * Verify that we can open and close the pool using the new name.
3564 */
3565 error = spa_open(newname, &spa, FTAG);
3566 if (error)
3567 fatal(0, "spa_open('%s') = %d", newname, error);
3568 ASSERT(pool_guid == spa_guid(spa));
3569 spa_close(spa, FTAG);
3570
3571 nvlist_free(config);
3572}
3573
fb5f0bc8
BB
3574static void
3575ztest_resume(spa_t *spa)
3576{
3577 if (spa_suspended(spa)) {
3578 spa_vdev_state_enter(spa);
3579 vdev_clear(spa, NULL);
3580 (void) spa_vdev_state_exit(spa, NULL, 0);
9babb374 3581 (void) zio_resume(spa);
fb5f0bc8
BB
3582 }
3583}
3584
34dc7c2f 3585static void *
fb5f0bc8 3586ztest_resume_thread(void *arg)
34dc7c2f 3587{
b128c09f 3588 spa_t *spa = arg;
34dc7c2f
BB
3589
3590 while (!ztest_exiting) {
b128c09f 3591 (void) poll(NULL, 0, 1000);
fb5f0bc8 3592 ztest_resume(spa);
34dc7c2f 3593 }
34dc7c2f
BB
3594 return (NULL);
3595}
3596
3597static void *
3598ztest_thread(void *arg)
3599{
3600 ztest_args_t *za = arg;
3601 ztest_shared_t *zs = ztest_shared;
3602 hrtime_t now, functime;
3603 ztest_info_t *zi;
3604 int f, i;
3605
3606 while ((now = gethrtime()) < za->za_stop) {
3607 /*
3608 * See if it's time to force a crash.
3609 */
3610 if (now > za->za_kill) {
3611 zs->zs_alloc = spa_get_alloc(za->za_spa);
3612 zs->zs_space = spa_get_space(za->za_spa);
3613 (void) kill(getpid(), SIGKILL);
3614 }
3615
3616 /*
3617 * Pick a random function.
3618 */
3619 f = ztest_random(ZTEST_FUNCS);
3620 zi = &zs->zs_info[f];
3621
3622 /*
3623 * Decide whether to call it, based on the requested frequency.
3624 */
3625 if (zi->zi_call_target == 0 ||
3626 (double)zi->zi_call_total / zi->zi_call_target >
3627 (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC))
3628 continue;
3629
3630 atomic_add_64(&zi->zi_calls, 1);
3631 atomic_add_64(&zi->zi_call_total, 1);
3632
3633 za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) *
3634 ZTEST_DIRSIZE;
3635 za->za_diroff_shared = (1ULL << 63);
3636
3637 for (i = 0; i < zi->zi_iters; i++)
3638 zi->zi_func(za);
3639
3640 functime = gethrtime() - now;
3641
3642 atomic_add_64(&zi->zi_call_time, functime);
3643
3644 if (zopt_verbose >= 4) {
3645 Dl_info dli;
3646 (void) dladdr((void *)zi->zi_func, &dli);
3647 (void) printf("%6.2f sec in %s\n",
3648 (double)functime / NANOSEC, dli.dli_sname);
3649 }
3650
3651 /*
3652 * If we're getting ENOSPC with some regularity, stop.
3653 */
3654 if (zs->zs_enospc_count > 10)
3655 break;
3656 }
3657
3658 return (NULL);
3659}
3660
3661/*
3662 * Kick off threads to run tests on all datasets in parallel.
3663 */
3664static void
3665ztest_run(char *pool)
3666{
3667 int t, d, error;
3668 ztest_shared_t *zs = ztest_shared;
3669 ztest_args_t *za;
3670 spa_t *spa;
3671 char name[100];
b128c09f
BB
3672 thread_t resume_tid;
3673
3674 ztest_exiting = B_FALSE;
34dc7c2f
BB
3675
3676 (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL);
3677 (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL);
3678
3679 for (t = 0; t < ZTEST_SYNC_LOCKS; t++)
3680 (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL);
3681
3682 /*
3683 * Destroy one disk before we even start.
3684 * It's mirrored, so everything should work just fine.
3685 * This makes us exercise fault handling very early in spa_load().
3686 */
3687 ztest_obliterate_one_disk(0);
3688
3689 /*
3690 * Verify that the sum of the sizes of all blocks in the pool
3691 * equals the SPA's allocated space total.
3692 */
3693 ztest_verify_blocks(pool);
3694
3695 /*
3696 * Kick off a replacement of the disk we just obliterated.
3697 */
3698 kernel_init(FREAD | FWRITE);
b128c09f 3699 VERIFY(spa_open(pool, &spa, FTAG) == 0);
34dc7c2f
BB
3700 ztest_replace_one_disk(spa, 0);
3701 if (zopt_verbose >= 5)
3702 show_pool_stats(spa);
3703 spa_close(spa, FTAG);
3704 kernel_fini();
3705
3706 kernel_init(FREAD | FWRITE);
3707
3708 /*
3709 * Verify that we can export the pool and reimport it under a
3710 * different name.
3711 */
3712 if (ztest_random(2) == 0) {
3713 (void) snprintf(name, 100, "%s_import", pool);
3714 ztest_spa_import_export(pool, name);
3715 ztest_spa_import_export(name, pool);
3716 }
3717
3718 /*
3719 * Verify that we can loop over all pools.
3720 */
3721 mutex_enter(&spa_namespace_lock);
3722 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
3723 if (zopt_verbose > 3) {
3724 (void) printf("spa_next: found %s\n", spa_name(spa));
3725 }
3726 }
3727 mutex_exit(&spa_namespace_lock);
3728
3729 /*
b128c09f 3730 * Open our pool.
34dc7c2f 3731 */
b128c09f 3732 VERIFY(spa_open(pool, &spa, FTAG) == 0);
34dc7c2f 3733
fb5f0bc8
BB
3734 /*
3735 * We don't expect the pool to suspend unless maxfaults == 0,
3736 * in which case ztest_fault_inject() temporarily takes away
3737 * the only valid replica.
3738 */
3739 if (zopt_maxfaults == 0)
3740 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
3741 else
3742 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
3743
34dc7c2f 3744 /*
b128c09f 3745 * Create a thread to periodically resume suspended I/O.
34dc7c2f 3746 */
fb5f0bc8 3747 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
b128c09f 3748 &resume_tid) == 0);
34dc7c2f
BB
3749
3750 /*
3751 * Verify that we can safely inquire about about any object,
3752 * whether it's allocated or not. To make it interesting,
3753 * we probe a 5-wide window around each power of two.
3754 * This hits all edge cases, including zero and the max.
3755 */
3756 for (t = 0; t < 64; t++) {
3757 for (d = -5; d <= 5; d++) {
3758 error = dmu_object_info(spa->spa_meta_objset,
3759 (1ULL << t) + d, NULL);
3760 ASSERT(error == 0 || error == ENOENT ||
3761 error == EINVAL);
3762 }
3763 }
3764
3765 /*
3766 * Now kick off all the tests that run in parallel.
3767 */
3768 zs->zs_enospc_count = 0;
3769
3770 za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL);
3771
3772 if (zopt_verbose >= 4)
3773 (void) printf("starting main threads...\n");
3774
34dc7c2f
BB
3775 za[0].za_start = gethrtime();
3776 za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC;
3777 za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time);
3778 za[0].za_kill = za[0].za_stop;
3779 if (ztest_random(100) < zopt_killrate)
3780 za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC);
3781
3782 for (t = 0; t < zopt_threads; t++) {
3783 d = t % zopt_datasets;
3784
3785 (void) strcpy(za[t].za_pool, pool);
3786 za[t].za_os = za[d].za_os;
3787 za[t].za_spa = spa;
3788 za[t].za_zilog = za[d].za_zilog;
3789 za[t].za_instance = t;
3790 za[t].za_random = ztest_random(-1ULL);
3791 za[t].za_start = za[0].za_start;
3792 za[t].za_stop = za[0].za_stop;
3793 za[t].za_kill = za[0].za_kill;
3794
3795 if (t < zopt_datasets) {
34dc7c2f
BB
3796 int test_future = FALSE;
3797 (void) rw_rdlock(&ztest_shared->zs_name_lock);
3798 (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
3799 error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0,
3800 ztest_create_cb, NULL);
3801 if (error == EEXIST) {
3802 test_future = TRUE;
3803 } else if (error == ENOSPC) {
3804 zs->zs_enospc_count++;
3805 (void) rw_unlock(&ztest_shared->zs_name_lock);
3806 break;
3807 } else if (error != 0) {
3808 fatal(0, "dmu_objset_create(%s) = %d",
3809 name, error);
3810 }
3811 error = dmu_objset_open(name, DMU_OST_OTHER,
b128c09f 3812 DS_MODE_USER, &za[d].za_os);
34dc7c2f
BB
3813 if (error)
3814 fatal(0, "dmu_objset_open('%s') = %d",
3815 name, error);
3816 (void) rw_unlock(&ztest_shared->zs_name_lock);
3817 if (test_future)
3818 ztest_dmu_check_future_leak(&za[t]);
fb5f0bc8
BB
3819 zil_replay(za[d].za_os, za[d].za_os,
3820 ztest_replay_vector);
34dc7c2f
BB
3821 za[d].za_zilog = zil_open(za[d].za_os, NULL);
3822 }
3823
b128c09f
BB
3824 VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND,
3825 &za[t].za_thread) == 0);
34dc7c2f
BB
3826 }
3827
3828 while (--t >= 0) {
b128c09f 3829 VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0);
34dc7c2f
BB
3830 if (t < zopt_datasets) {
3831 zil_close(za[t].za_zilog);
3832 dmu_objset_close(za[t].za_os);
3833 }
3834 }
3835
3836 if (zopt_verbose >= 3)
3837 show_pool_stats(spa);
3838
3839 txg_wait_synced(spa_get_dsl(spa), 0);
3840
3841 zs->zs_alloc = spa_get_alloc(spa);
3842 zs->zs_space = spa_get_space(spa);
3843
3844 /*
3845 * If we had out-of-space errors, destroy a random objset.
3846 */
3847 if (zs->zs_enospc_count != 0) {
3848 (void) rw_rdlock(&ztest_shared->zs_name_lock);
3849 d = (int)ztest_random(zopt_datasets);
3850 (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d);
3851 if (zopt_verbose >= 3)
3852 (void) printf("Destroying %s to free up space\n", name);
9babb374
BB
3853
3854 /* Cleanup any non-standard clones and snapshots */
3855 ztest_dsl_dataset_cleanup(name, za[d].za_instance);
3856
34dc7c2f
BB
3857 (void) dmu_objset_find(name, ztest_destroy_cb, &za[d],
3858 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
3859 (void) rw_unlock(&ztest_shared->zs_name_lock);
3860 }
3861
3862 txg_wait_synced(spa_get_dsl(spa), 0);
3863
b128c09f
BB
3864 umem_free(za, zopt_threads * sizeof (ztest_args_t));
3865
3866 /* Kill the resume thread */
3867 ztest_exiting = B_TRUE;
3868 VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
fb5f0bc8 3869 ztest_resume(spa);
b128c09f 3870
34dc7c2f
BB
3871 /*
3872 * Right before closing the pool, kick off a bunch of async I/O;
3873 * spa_close() should wait for it to complete.
3874 */
3875 for (t = 1; t < 50; t++)
3876 dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15);
3877
34dc7c2f
BB
3878 spa_close(spa, FTAG);
3879
3880 kernel_fini();
3881}
3882
3883void
3884print_time(hrtime_t t, char *timebuf)
3885{
3886 hrtime_t s = t / NANOSEC;
3887 hrtime_t m = s / 60;
3888 hrtime_t h = m / 60;
3889 hrtime_t d = h / 24;
3890
3891 s -= m * 60;
3892 m -= h * 60;
3893 h -= d * 24;
3894
3895 timebuf[0] = '\0';
3896
3897 if (d)
3898 (void) sprintf(timebuf,
3899 "%llud%02lluh%02llum%02llus", d, h, m, s);
3900 else if (h)
3901 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
3902 else if (m)
3903 (void) sprintf(timebuf, "%llum%02llus", m, s);
3904 else
3905 (void) sprintf(timebuf, "%llus", s);
3906}
3907
3908/*
3909 * Create a storage pool with the given name and initial vdev size.
3910 * Then create the specified number of datasets in the pool.
3911 */
3912static void
3913ztest_init(char *pool)
3914{
3915 spa_t *spa;
3916 int error;
3917 nvlist_t *nvroot;
3918
3919 kernel_init(FREAD | FWRITE);
3920
3921 /*
3922 * Create the storage pool.
3923 */
3924 (void) spa_destroy(pool);
3925 ztest_shared->zs_vdev_primaries = 0;
b128c09f
BB
3926 nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0,
3927 0, zopt_raidz, zopt_mirrors, 1);
3928 error = spa_create(pool, nvroot, NULL, NULL, NULL);
34dc7c2f
BB
3929 nvlist_free(nvroot);
3930
3931 if (error)
3932 fatal(0, "spa_create() = %d", error);
3933 error = spa_open(pool, &spa, FTAG);
3934 if (error)
3935 fatal(0, "spa_open() = %d", error);
3936
9babb374
BB
3937 metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
3938
34dc7c2f
BB
3939 if (zopt_verbose >= 3)
3940 show_pool_stats(spa);
3941
3942 spa_close(spa, FTAG);
3943
3944 kernel_fini();
3945}
3946
3947int
3948main(int argc, char **argv)
3949{
3950 int kills = 0;
3951 int iters = 0;
3952 int i, f;
3953 ztest_shared_t *zs;
3954 ztest_info_t *zi;
3955 char timebuf[100];
3956 char numbuf[6];
3957
3958 (void) setvbuf(stdout, NULL, _IOLBF, 0);
3959
3960 /* Override location of zpool.cache */
b128c09f 3961 spa_config_path = "/tmp/zpool.cache";
34dc7c2f
BB
3962
3963 ztest_random_fd = open("/dev/urandom", O_RDONLY);
3964
3965 process_options(argc, argv);
3966
34dc7c2f
BB
3967 /*
3968 * Blow away any existing copy of zpool.cache
3969 */
3970 if (zopt_init != 0)
3971 (void) remove("/tmp/zpool.cache");
3972
3973 zs = ztest_shared = (void *)mmap(0,
3974 P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()),
3975 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
3976
3977 if (zopt_verbose >= 1) {
3978 (void) printf("%llu vdevs, %d datasets, %d threads,"
3979 " %llu seconds...\n",
3980 (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads,
3981 (u_longlong_t)zopt_time);
3982 }
3983
3984 /*
3985 * Create and initialize our storage pool.
3986 */
3987 for (i = 1; i <= zopt_init; i++) {
3988 bzero(zs, sizeof (ztest_shared_t));
3989 if (zopt_verbose >= 3 && zopt_init != 1)
3990 (void) printf("ztest_init(), pass %d\n", i);
3991 ztest_init(zopt_pool);
3992 }
3993
3994 /*
3995 * Initialize the call targets for each function.
3996 */
3997 for (f = 0; f < ZTEST_FUNCS; f++) {
3998 zi = &zs->zs_info[f];
3999
4000 *zi = ztest_info[f];
4001
4002 if (*zi->zi_interval == 0)
4003 zi->zi_call_target = UINT64_MAX;
4004 else
4005 zi->zi_call_target = zopt_time / *zi->zi_interval;
4006 }
4007
4008 zs->zs_start_time = gethrtime();
4009 zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC;
4010
4011 /*
4012 * Run the tests in a loop. These tests include fault injection
4013 * to verify that self-healing data works, and forced crashes
4014 * to verify that we never lose on-disk consistency.
4015 */
4016 while (gethrtime() < zs->zs_stop_time) {
4017 int status;
4018 pid_t pid;
4019 char *tmp;
4020
4021 /*
4022 * Initialize the workload counters for each function.
4023 */
4024 for (f = 0; f < ZTEST_FUNCS; f++) {
4025 zi = &zs->zs_info[f];
4026 zi->zi_calls = 0;
4027 zi->zi_call_time = 0;
4028 }
4029
9babb374
BB
4030 /* Set the allocation switch size */
4031 metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1;
4032
34dc7c2f
BB
4033 pid = fork();
4034
4035 if (pid == -1)
4036 fatal(1, "fork failed");
4037
4038 if (pid == 0) { /* child */
4039 struct rlimit rl = { 1024, 1024 };
4040 (void) setrlimit(RLIMIT_NOFILE, &rl);
4041 (void) enable_extended_FILE_stdio(-1, -1);
4042 ztest_run(zopt_pool);
4043 exit(0);
4044 }
4045
4046 while (waitpid(pid, &status, 0) != pid)
4047 continue;
4048
4049 if (WIFEXITED(status)) {
4050 if (WEXITSTATUS(status) != 0) {
4051 (void) fprintf(stderr,
4052 "child exited with code %d\n",
4053 WEXITSTATUS(status));
4054 exit(2);
4055 }
4056 } else if (WIFSIGNALED(status)) {
4057 if (WTERMSIG(status) != SIGKILL) {
4058 (void) fprintf(stderr,
4059 "child died with signal %d\n",
4060 WTERMSIG(status));
4061 exit(3);
4062 }
4063 kills++;
4064 } else {
4065 (void) fprintf(stderr, "something strange happened "
4066 "to child\n");
4067 exit(4);
4068 }
4069
4070 iters++;
4071
4072 if (zopt_verbose >= 1) {
4073 hrtime_t now = gethrtime();
4074
4075 now = MIN(now, zs->zs_stop_time);
4076 print_time(zs->zs_stop_time - now, timebuf);
4077 nicenum(zs->zs_space, numbuf);
4078
4079 (void) printf("Pass %3d, %8s, %3llu ENOSPC, "
4080 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
4081 iters,
4082 WIFEXITED(status) ? "Complete" : "SIGKILL",
4083 (u_longlong_t)zs->zs_enospc_count,
4084 100.0 * zs->zs_alloc / zs->zs_space,
4085 numbuf,
4086 100.0 * (now - zs->zs_start_time) /
4087 (zopt_time * NANOSEC), timebuf);
4088 }
4089
4090 if (zopt_verbose >= 2) {
4091 (void) printf("\nWorkload summary:\n\n");
4092 (void) printf("%7s %9s %s\n",
4093 "Calls", "Time", "Function");
4094 (void) printf("%7s %9s %s\n",
4095 "-----", "----", "--------");
4096 for (f = 0; f < ZTEST_FUNCS; f++) {
4097 Dl_info dli;
4098
4099 zi = &zs->zs_info[f];
4100 print_time(zi->zi_call_time, timebuf);
4101 (void) dladdr((void *)zi->zi_func, &dli);
4102 (void) printf("%7llu %9s %s\n",
4103 (u_longlong_t)zi->zi_calls, timebuf,
4104 dli.dli_sname);
4105 }
4106 (void) printf("\n");
4107 }
4108
4109 /*
4110 * It's possible that we killed a child during a rename test, in
4111 * which case we'll have a 'ztest_tmp' pool lying around instead
4112 * of 'ztest'. Do a blind rename in case this happened.
4113 */
4114 tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL);
4115 (void) strcpy(tmp, zopt_pool);
4116 (void) strcat(tmp, "_tmp");
4117 kernel_init(FREAD | FWRITE);
4118 (void) spa_rename(tmp, zopt_pool);
4119 kernel_fini();
4120 umem_free(tmp, strlen(tmp) + 1);
4121 }
4122
4123 ztest_verify_blocks(zopt_pool);
4124
4125 if (zopt_verbose >= 1) {
4126 (void) printf("%d killed, %d completed, %.0f%% kill rate\n",
4127 kills, iters - kills, (100.0 * kills) / MAX(1, iters));
4128 }
4129
4130 return (0);
4131}