4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright (c) 2013 Steven Hartland. All rights reserved.
28 * LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
29 * It has the following characteristics:
31 * - Thread Safe. libzfs_core is accessible concurrently from multiple
32 * threads. This is accomplished primarily by avoiding global data
33 * (e.g. caching). Since it's thread-safe, there is no reason for a
34 * process to have multiple libzfs "instances". Therefore, we store
35 * our few pieces of data (e.g. the file descriptor) in global
36 * variables. The fd is reference-counted so that the libzfs_core
37 * library can be "initialized" multiple times (e.g. by different
38 * consumers within the same process).
40 * - Committed Interface. The libzfs_core interface will be committed,
41 * therefore consumers can compile against it and be confident that
42 * their code will continue to work on future releases of this code.
43 * Currently, the interface is Evolving (not Committed), but we intend
44 * to commit to it once it is more complete and we determine that it
45 * meets the needs of all consumers.
47 * - Programmatic Error Handling. libzfs_core communicates errors with
48 * defined error numbers, and doesn't print anything to stdout/stderr.
50 * - Thin Layer. libzfs_core is a thin layer, marshaling arguments
51 * to/from the kernel ioctls. There is generally a 1:1 correspondence
52 * between libzfs_core functions and ioctls to /dev/zfs.
54 * - Clear Atomicity. Because libzfs_core functions are generally 1:1
55 * with kernel ioctls, and kernel ioctls are generally atomic, each
56 * libzfs_core function is atomic. For example, creating multiple
57 * snapshots with a single call to lzc_snapshot() is atomic -- it
58 * can't fail with only some of the requested snapshots created, even
59 * in the event of power loss or system crash.
61 * - Continued libzfs Support. Some higher-level operations (e.g.
62 * support for "zfs send -R") are too complicated to fit the scope of
63 * libzfs_core. This functionality will continue to live in libzfs.
64 * Where appropriate, libzfs will use the underlying atomic operations
65 * of libzfs_core. For example, libzfs may implement "zfs send -R |
66 * zfs receive" by using individual "send one snapshot", rename,
67 * destroy, and "receive one snapshot" operations in libzfs_core.
68 * /sbin/zfs and /sbin/zpool will link with both libzfs and
69 * libzfs_core. Other consumers should aim to use only libzfs_core,
70 * since that will be the supported, stable interface going forwards.
73 #include <libzfs_core.h>
81 #include <sys/nvpair.h>
82 #include <sys/param.h>
83 #include <sys/types.h>
85 #include <sys/zfs_ioctl.h>
/*
 * Process-global state: the /dev/zfs file descriptor is shared by all
 * threads and reference-counted so that libzfs_core_init()/fini() may be
 * called multiple times by independent consumers in one process.
 * g_lock guards g_fd and g_refcount.
 */
static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
static int g_refcount;
92 libzfs_core_init(void)
94 (void) pthread_mutex_lock(&g_lock
);
95 if (g_refcount
== 0) {
96 g_fd
= open("/dev/zfs", O_RDWR
);
98 (void) pthread_mutex_unlock(&g_lock
);
103 (void) pthread_mutex_unlock(&g_lock
);
108 libzfs_core_fini(void)
110 (void) pthread_mutex_lock(&g_lock
);
111 ASSERT3S(g_refcount
, >, 0);
115 (void) pthread_mutex_unlock(&g_lock
);
119 lzc_ioctl(zfs_ioc_t ioc
, const char *name
,
120 nvlist_t
*source
, nvlist_t
**resultp
)
122 zfs_cmd_t zc
= {"\0"};
127 ASSERT3S(g_refcount
, >, 0);
129 (void) strlcpy(zc
.zc_name
, name
, sizeof (zc
.zc_name
));
131 packed
= fnvlist_pack(source
, &size
);
132 zc
.zc_nvlist_src
= (uint64_t)(uintptr_t)packed
;
133 zc
.zc_nvlist_src_size
= size
;
135 if (resultp
!= NULL
) {
137 zc
.zc_nvlist_dst_size
= MAX(size
* 2, 128 * 1024);
138 zc
.zc_nvlist_dst
= (uint64_t)(uintptr_t)
139 malloc(zc
.zc_nvlist_dst_size
);
140 if (zc
.zc_nvlist_dst
== (uint64_t)0) {
146 while (ioctl(g_fd
, ioc
, &zc
) != 0) {
147 if (errno
== ENOMEM
&& resultp
!= NULL
) {
148 free((void *)(uintptr_t)zc
.zc_nvlist_dst
);
149 zc
.zc_nvlist_dst_size
*= 2;
150 zc
.zc_nvlist_dst
= (uint64_t)(uintptr_t)
151 malloc(zc
.zc_nvlist_dst_size
);
152 if (zc
.zc_nvlist_dst
== (uint64_t)0) {
161 if (zc
.zc_nvlist_dst_filled
) {
162 *resultp
= fnvlist_unpack((void *)(uintptr_t)zc
.zc_nvlist_dst
,
163 zc
.zc_nvlist_dst_size
);
167 fnvlist_pack_free(packed
, size
);
168 free((void *)(uintptr_t)zc
.zc_nvlist_dst
);
173 lzc_create(const char *fsname
, dmu_objset_type_t type
, nvlist_t
*props
)
176 nvlist_t
*args
= fnvlist_alloc();
177 fnvlist_add_int32(args
, "type", type
);
179 fnvlist_add_nvlist(args
, "props", props
);
180 error
= lzc_ioctl(ZFS_IOC_CREATE
, fsname
, args
, NULL
);
186 lzc_clone(const char *fsname
, const char *origin
,
190 nvlist_t
*args
= fnvlist_alloc();
191 fnvlist_add_string(args
, "origin", origin
);
193 fnvlist_add_nvlist(args
, "props", props
);
194 error
= lzc_ioctl(ZFS_IOC_CLONE
, fsname
, args
, NULL
);
202 * The keys in the snaps nvlist are the snapshots to be created.
203 * They must all be in the same pool.
205 * The props nvlist is properties to set. Currently only user properties
206 * are supported. { user:prop_name -> string value }
208 * The returned results nvlist will have an entry for each snapshot that failed.
209 * The value will be the (int32) error code.
211 * The return value will be 0 if all snapshots were created, otherwise it will
212 * be the errno of a (unspecified) snapshot that failed.
215 lzc_snapshot(nvlist_t
*snaps
, nvlist_t
*props
, nvlist_t
**errlist
)
220 char pool
[MAXNAMELEN
];
224 /* determine the pool name */
225 elem
= nvlist_next_nvpair(snaps
, NULL
);
228 (void) strlcpy(pool
, nvpair_name(elem
), sizeof (pool
));
229 pool
[strcspn(pool
, "/@")] = '\0';
231 args
= fnvlist_alloc();
232 fnvlist_add_nvlist(args
, "snaps", snaps
);
234 fnvlist_add_nvlist(args
, "props", props
);
236 error
= lzc_ioctl(ZFS_IOC_SNAPSHOT
, pool
, args
, errlist
);
243 * Destroys snapshots.
245 * The keys in the snaps nvlist are the snapshots to be destroyed.
246 * They must all be in the same pool.
248 * Snapshots that do not exist will be silently ignored.
250 * If 'defer' is not set, and a snapshot has user holds or clones, the
251 * destroy operation will fail and none of the snapshots will be
254 * If 'defer' is set, and a snapshot has user holds or clones, it will be
255 * marked for deferred destruction, and will be destroyed when the last hold
256 * or clone is removed/destroyed.
258 * The return value will be 0 if all snapshots were destroyed (or marked for
259 * later destruction if 'defer' is set) or didn't exist to begin with.
261 * Otherwise the return value will be the errno of a (unspecified) snapshot
262 * that failed, no snapshots will be destroyed, and the errlist will have an
263 * entry for each snapshot that failed. The value in the errlist will be
264 * the (int32) error code.
267 lzc_destroy_snaps(nvlist_t
*snaps
, boolean_t defer
, nvlist_t
**errlist
)
272 char pool
[MAXNAMELEN
];
274 /* determine the pool name */
275 elem
= nvlist_next_nvpair(snaps
, NULL
);
278 (void) strlcpy(pool
, nvpair_name(elem
), sizeof (pool
));
279 pool
[strcspn(pool
, "/@")] = '\0';
281 args
= fnvlist_alloc();
282 fnvlist_add_nvlist(args
, "snaps", snaps
);
284 fnvlist_add_boolean(args
, "defer");
286 error
= lzc_ioctl(ZFS_IOC_DESTROY_SNAPS
, pool
, args
, errlist
);
293 lzc_snaprange_space(const char *firstsnap
, const char *lastsnap
,
302 /* determine the fs name */
303 (void) strlcpy(fs
, firstsnap
, sizeof (fs
));
304 atp
= strchr(fs
, '@');
309 args
= fnvlist_alloc();
310 fnvlist_add_string(args
, "firstsnap", firstsnap
);
312 err
= lzc_ioctl(ZFS_IOC_SPACE_SNAPS
, lastsnap
, args
, &result
);
315 *usedp
= fnvlist_lookup_uint64(result
, "used");
316 fnvlist_free(result
);
322 lzc_exists(const char *dataset
)
325 * The objset_stats ioctl is still legacy, so we need to construct our
326 * own zfs_cmd_t rather than using zfsc_ioctl().
328 zfs_cmd_t zc
= {"\0"};
330 (void) strlcpy(zc
.zc_name
, dataset
, sizeof (zc
.zc_name
));
331 return (ioctl(g_fd
, ZFS_IOC_OBJSET_STATS
, &zc
) == 0);
335 * Create "user holds" on snapshots. If there is a hold on a snapshot,
336 * the snapshot can not be destroyed. (However, it can be marked for deletion
337 * by lzc_destroy_snaps(defer=B_TRUE).)
339 * The keys in the nvlist are snapshot names.
340 * The snapshots must all be in the same pool.
341 * The value is the name of the hold (string type).
343 * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
344 * In this case, when the cleanup_fd is closed (including on process
345 * termination), the holds will be released. If the system is shut down
346 * uncleanly, the holds will be released when the pool is next opened
349 * Holds for snapshots which don't exist will be skipped and have an entry
350 * added to errlist, but will not cause an overall failure.
352 * The return value will be 0 if all holds, for snapshots that existed,
353 * were successfully created.
355 * Otherwise the return value will be the errno of a (unspecified) hold that
356 * failed and no holds will be created.
358 * In all cases the errlist will have an entry for each hold that failed
359 * (name = snapshot), with its value being the error code (int32).
362 lzc_hold(nvlist_t
*holds
, int cleanup_fd
, nvlist_t
**errlist
)
364 char pool
[MAXNAMELEN
];
369 /* determine the pool name */
370 elem
= nvlist_next_nvpair(holds
, NULL
);
373 (void) strlcpy(pool
, nvpair_name(elem
), sizeof (pool
));
374 pool
[strcspn(pool
, "/@")] = '\0';
376 args
= fnvlist_alloc();
377 fnvlist_add_nvlist(args
, "holds", holds
);
378 if (cleanup_fd
!= -1)
379 fnvlist_add_int32(args
, "cleanup_fd", cleanup_fd
);
381 error
= lzc_ioctl(ZFS_IOC_HOLD
, pool
, args
, errlist
);
387 * Release "user holds" on snapshots. If the snapshot has been marked for
388 * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
389 * any clones, and all the user holds are removed, then the snapshot will be
392 * The keys in the nvlist are snapshot names.
393 * The snapshots must all be in the same pool.
394 * The value is a nvlist whose keys are the holds to remove.
396 * Holds which failed to release because they didn't exist will have an entry
397 * added to errlist, but will not cause an overall failure.
399 * The return value will be 0 if the nvl holds was empty or all holds that
400 * existed, were successfully removed.
402 * Otherwise the return value will be the errno of a (unspecified) hold that
403 * failed to release and no holds will be released.
405 * In all cases the errlist will have an entry for each hold that failed to
409 lzc_release(nvlist_t
*holds
, nvlist_t
**errlist
)
411 char pool
[MAXNAMELEN
];
414 /* determine the pool name */
415 elem
= nvlist_next_nvpair(holds
, NULL
);
418 (void) strlcpy(pool
, nvpair_name(elem
), sizeof (pool
));
419 pool
[strcspn(pool
, "/@")] = '\0';
421 return (lzc_ioctl(ZFS_IOC_RELEASE
, pool
, holds
, errlist
));
425 * Retrieve list of user holds on the specified snapshot.
427 * On success, *holdsp will be set to a nvlist which the caller must free.
428 * The keys are the names of the holds, and the value is the creation time
429 * of the hold (uint64) in seconds since the epoch.
432 lzc_get_holds(const char *snapname
, nvlist_t
**holdsp
)
435 nvlist_t
*innvl
= fnvlist_alloc();
436 error
= lzc_ioctl(ZFS_IOC_GET_HOLDS
, snapname
, innvl
, holdsp
);
443 * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
445 * If "from" is NULL, a full (non-incremental) stream will be sent.
446 * If "from" is non-NULL, it must be the full name of a snapshot or
447 * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
448 * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or
449 * bookmark must represent an earlier point in the history of "snapname").
450 * It can be an earlier snapshot in the same filesystem or zvol as "snapname",
451 * or it can be the origin of "snapname"'s filesystem, or an earlier
452 * snapshot in the origin, etc.
454 * "fd" is the file descriptor to write the send stream to.
457 lzc_send(const char *snapname
, const char *from
, int fd
)
462 args
= fnvlist_alloc();
463 fnvlist_add_int32(args
, "fd", fd
);
465 fnvlist_add_string(args
, "fromsnap", from
);
466 err
= lzc_ioctl(ZFS_IOC_SEND_NEW
, snapname
, args
, NULL
);
472 * If fromsnap is NULL, a full (non-incremental) stream will be estimated.
475 lzc_send_space(const char *snapname
, const char *fromsnap
, uint64_t *spacep
)
481 args
= fnvlist_alloc();
482 if (fromsnap
!= NULL
)
483 fnvlist_add_string(args
, "fromsnap", fromsnap
);
484 err
= lzc_ioctl(ZFS_IOC_SEND_SPACE
, snapname
, args
, &result
);
487 *spacep
= fnvlist_lookup_uint64(result
, "space");
/*
 * Read exactly 'ilen' bytes from 'fd' into 'buf', looping over short
 * reads. Returns 0 on success, or EIO on read error or premature EOF.
 */
static int
recv_read(int fd, void *buf, int ilen)
{
    char *cp = buf;
    int len = ilen;
    int rv;

    do {
        rv = read(fd, cp, len);
        cp += rv;
        len -= rv;
    } while (rv > 0);

    /* rv < 0: read error; len != 0: EOF before ilen bytes arrived */
    if (rv < 0 || len != 0)
        return (EIO);

    return (0);
}
512 * The simplest receive case: receive from the specified fd, creating the
513 * specified snapshot. Apply the specified properties a "received" properties
514 * (which can be overridden by locally-set properties). If the stream is a
515 * clone, its origin snapshot must be specified by 'origin'. The 'force'
516 * flag will cause the target filesystem to be rolled back or destroyed if
517 * necessary to receive.
519 * Return 0 on success or an errno on failure.
521 * Note: this interface does not work on dedup'd streams
522 * (those with DMU_BACKUP_FEATURE_DEDUP).
525 lzc_receive(const char *snapname
, nvlist_t
*props
, const char *origin
,
526 boolean_t force
, int fd
)
529 * The receive ioctl is still legacy, so we need to construct our own
530 * zfs_cmd_t rather than using zfsc_ioctl().
532 zfs_cmd_t zc
= {"\0"};
536 dmu_replay_record_t drr
;
539 ASSERT3S(g_refcount
, >, 0);
541 /* zc_name is name of containing filesystem */
542 (void) strlcpy(zc
.zc_name
, snapname
, sizeof (zc
.zc_name
));
543 atp
= strchr(zc
.zc_name
, '@');
548 /* if the fs does not exist, try its parent. */
549 if (!lzc_exists(zc
.zc_name
)) {
550 char *slashp
= strrchr(zc
.zc_name
, '/');
557 /* zc_value is full name of the snapshot to create */
558 (void) strlcpy(zc
.zc_value
, snapname
, sizeof (zc
.zc_value
));
561 /* zc_nvlist_src is props to set */
562 packed
= fnvlist_pack(props
, &size
);
563 zc
.zc_nvlist_src
= (uint64_t)(uintptr_t)packed
;
564 zc
.zc_nvlist_src_size
= size
;
567 /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
569 (void) strlcpy(zc
.zc_string
, origin
, sizeof (zc
.zc_string
));
571 /* zc_begin_record is non-byteswapped BEGIN record */
572 error
= recv_read(fd
, &drr
, sizeof (drr
));
575 zc
.zc_begin_record
= drr
.drr_u
.drr_begin
;
577 /* zc_cookie is fd to read from */
580 /* zc guid is force flag */
583 /* zc_cleanup_fd is unused */
584 zc
.zc_cleanup_fd
= -1;
586 error
= ioctl(g_fd
, ZFS_IOC_RECV
, &zc
);
592 fnvlist_pack_free(packed
, size
);
593 free((void*)(uintptr_t)zc
.zc_nvlist_dst
);
598 * Roll back this filesystem or volume to its most recent snapshot.
599 * If snapnamebuf is not NULL, it will be filled in with the name
600 * of the most recent snapshot.
602 * Return 0 on success or an errno on failure.
605 lzc_rollback(const char *fsname
, char *snapnamebuf
, int snapnamelen
)
611 args
= fnvlist_alloc();
612 err
= lzc_ioctl(ZFS_IOC_ROLLBACK
, fsname
, args
, &result
);
614 if (err
== 0 && snapnamebuf
!= NULL
) {
615 const char *snapname
= fnvlist_lookup_string(result
, "target");
616 (void) strlcpy(snapnamebuf
, snapname
, snapnamelen
);
624 * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
625 * the name of the snapshot (e.g. "pool/fs@snap"). All the bookmarks and
626 * snapshots must be in the same pool.
628 * The returned results nvlist will have an entry for each bookmark that failed.
629 * The value will be the (int32) error code.
631 * The return value will be 0 if all bookmarks were created, otherwise it will
632 * be the errno of a (undetermined) bookmarks that failed.
635 lzc_bookmark(nvlist_t
*bookmarks
, nvlist_t
**errlist
)
639 char pool
[MAXNAMELEN
];
641 /* determine the pool name */
642 elem
= nvlist_next_nvpair(bookmarks
, NULL
);
645 (void) strlcpy(pool
, nvpair_name(elem
), sizeof (pool
));
646 pool
[strcspn(pool
, "/#")] = '\0';
648 error
= lzc_ioctl(ZFS_IOC_BOOKMARK
, pool
, bookmarks
, errlist
);
654 * Retrieve bookmarks.
656 * Retrieve the list of bookmarks for the given file system. The props
657 * parameter is an nvlist of property names (with no values) that will be
658 * returned for each bookmark.
660 * The following are valid properties on bookmarks, all of which are numbers
661 * (represented as uint64 in the nvlist)
663 * "guid" - globally unique identifier of the snapshot it refers to
664 * "createtxg" - txg when the snapshot it refers to was created
665 * "creation" - timestamp when the snapshot it refers to was created
667 * The format of the returned nvlist as follows:
668 * <short name of bookmark> -> {
669 * <name of property> -> {
675 lzc_get_bookmarks(const char *fsname
, nvlist_t
*props
, nvlist_t
**bmarks
)
677 return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS
, fsname
, props
, bmarks
));
681 * Destroys bookmarks.
683 * The keys in the bmarks nvlist are the bookmarks to be destroyed.
684 * They must all be in the same pool. Bookmarks are specified as
687 * Bookmarks that do not exist will be silently ignored.
689 * The return value will be 0 if all bookmarks that existed were destroyed.
691 * Otherwise the return value will be the errno of a (undetermined) bookmark
692 * that failed, no bookmarks will be destroyed, and the errlist will have an
693 * entry for each bookmarks that failed. The value in the errlist will be
694 * the (int32) error code.
697 lzc_destroy_bookmarks(nvlist_t
*bmarks
, nvlist_t
**errlist
)
701 char pool
[MAXNAMELEN
];
703 /* determine the pool name */
704 elem
= nvlist_next_nvpair(bmarks
, NULL
);
707 (void) strlcpy(pool
, nvpair_name(elem
), sizeof (pool
));
708 pool
[strcspn(pool
, "/#")] = '\0';
710 error
= lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS
, pool
, bmarks
, errlist
);