]>
git.proxmox.com Git - mirror_zfs-debian.git/blob - lib/libzpool/kernel.c
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
34 #include <sys/processor.h>
35 #include <sys/zfs_context.h>
36 #include <sys/utsname.h>
37 #include <sys/systeminfo.h>
40 * Emulation of kernel services in userland.
45 vnode_t
*rootdir
= (vnode_t
*)0xabcd1234;
46 char hw_serial
[HW_HOSTID_LEN
];
48 struct utsname utsname
= {
49 "userland", "libzpool", "1", "1", "na"
52 /* this only exists to have its address taken */
56 * =========================================================================
58 * =========================================================================
62 zk_thread_create(void (*func
)(), void *arg
)
66 VERIFY(thr_create(0, 0, (void *(*)(void *))func
, arg
, THR_DETACHED
,
69 return ((void *)(uintptr_t)tid
);
73 * =========================================================================
75 * =========================================================================
79 kstat_create(char *module
, int instance
, char *name
, char *class,
80 uchar_t type
, ulong_t ndata
, uchar_t ks_flag
)
87 kstat_install(kstat_t
*ksp
)
92 kstat_delete(kstat_t
*ksp
)
96 * =========================================================================
98 * =========================================================================
101 zmutex_init(kmutex_t
*mp
)
104 mp
->initialized
= B_TRUE
;
105 (void) _mutex_init(&mp
->m_lock
, USYNC_THREAD
, NULL
);
109 zmutex_destroy(kmutex_t
*mp
)
111 ASSERT(mp
->initialized
== B_TRUE
);
112 ASSERT(mp
->m_owner
== NULL
);
113 (void) _mutex_destroy(&(mp
)->m_lock
);
114 mp
->m_owner
= (void *)-1UL;
115 mp
->initialized
= B_FALSE
;
119 mutex_enter(kmutex_t
*mp
)
121 ASSERT(mp
->initialized
== B_TRUE
);
122 ASSERT(mp
->m_owner
!= (void *)-1UL);
123 ASSERT(mp
->m_owner
!= curthread
);
124 VERIFY(mutex_lock(&mp
->m_lock
) == 0);
125 ASSERT(mp
->m_owner
== NULL
);
126 mp
->m_owner
= curthread
;
130 mutex_tryenter(kmutex_t
*mp
)
132 ASSERT(mp
->initialized
== B_TRUE
);
133 ASSERT(mp
->m_owner
!= (void *)-1UL);
134 if (0 == mutex_trylock(&mp
->m_lock
)) {
135 ASSERT(mp
->m_owner
== NULL
);
136 mp
->m_owner
= curthread
;
144 mutex_exit(kmutex_t
*mp
)
146 ASSERT(mp
->initialized
== B_TRUE
);
147 ASSERT(mutex_owner(mp
) == curthread
);
149 VERIFY(mutex_unlock(&mp
->m_lock
) == 0);
153 mutex_owner(kmutex_t
*mp
)
155 ASSERT(mp
->initialized
== B_TRUE
);
156 return (mp
->m_owner
);
160 * =========================================================================
162 * =========================================================================
166 rw_init(krwlock_t
*rwlp
, char *name
, int type
, void *arg
)
168 rwlock_init(&rwlp
->rw_lock
, USYNC_THREAD
, NULL
);
169 rwlp
->rw_owner
= NULL
;
170 rwlp
->initialized
= B_TRUE
;
174 rw_destroy(krwlock_t
*rwlp
)
176 rwlock_destroy(&rwlp
->rw_lock
);
177 rwlp
->rw_owner
= (void *)-1UL;
178 rwlp
->initialized
= B_FALSE
;
182 rw_enter(krwlock_t
*rwlp
, krw_t rw
)
184 ASSERT(!RW_LOCK_HELD(rwlp
));
185 ASSERT(rwlp
->initialized
== B_TRUE
);
186 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
187 ASSERT(rwlp
->rw_owner
!= curthread
);
190 VERIFY(rw_rdlock(&rwlp
->rw_lock
) == 0);
192 VERIFY(rw_wrlock(&rwlp
->rw_lock
) == 0);
194 rwlp
->rw_owner
= curthread
;
198 rw_exit(krwlock_t
*rwlp
)
200 ASSERT(rwlp
->initialized
== B_TRUE
);
201 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
203 rwlp
->rw_owner
= NULL
;
204 VERIFY(rw_unlock(&rwlp
->rw_lock
) == 0);
208 rw_tryenter(krwlock_t
*rwlp
, krw_t rw
)
212 ASSERT(rwlp
->initialized
== B_TRUE
);
213 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
216 rv
= rw_tryrdlock(&rwlp
->rw_lock
);
218 rv
= rw_trywrlock(&rwlp
->rw_lock
);
221 rwlp
->rw_owner
= curthread
;
230 rw_tryupgrade(krwlock_t
*rwlp
)
232 ASSERT(rwlp
->initialized
== B_TRUE
);
233 ASSERT(rwlp
->rw_owner
!= (void *)-1UL);
239 * =========================================================================
240 * condition variables
241 * =========================================================================
245 cv_init(kcondvar_t
*cv
, char *name
, int type
, void *arg
)
247 VERIFY(cond_init(cv
, type
, NULL
) == 0);
251 cv_destroy(kcondvar_t
*cv
)
253 VERIFY(cond_destroy(cv
) == 0);
257 cv_wait(kcondvar_t
*cv
, kmutex_t
*mp
)
259 ASSERT(mutex_owner(mp
) == curthread
);
261 int ret
= cond_wait(cv
, &mp
->m_lock
);
262 VERIFY(ret
== 0 || ret
== EINTR
);
263 mp
->m_owner
= curthread
;
267 cv_timedwait(kcondvar_t
*cv
, kmutex_t
*mp
, clock_t abstime
)
274 delta
= abstime
- ddi_get_lbolt();
278 ts
.tv_sec
= delta
/ hz
;
279 ts
.tv_nsec
= (delta
% hz
) * (NANOSEC
/ hz
);
281 ASSERT(mutex_owner(mp
) == curthread
);
283 error
= cond_reltimedwait(cv
, &mp
->m_lock
, &ts
);
284 mp
->m_owner
= curthread
;
298 cv_signal(kcondvar_t
*cv
)
300 VERIFY(cond_signal(cv
) == 0);
304 cv_broadcast(kcondvar_t
*cv
)
306 VERIFY(cond_broadcast(cv
) == 0);
310 * =========================================================================
312 * =========================================================================
315 * Note: for the xxxat() versions of these functions, we assume that the
316 * starting vp is always rootdir (which is true for spa_directory.c, the only
317 * ZFS consumer of these interfaces). We assert this is true, and then emulate
318 * them by adding '/' in front of the path.
323 vn_open(char *path
, int x1
, int flags
, int mode
, vnode_t
**vpp
, int x2
, int x3
)
328 char realpath
[MAXPATHLEN
];
333 * If we're accessing a real disk from userland, we need to use
334 * the character interface to avoid caching. This is particularly
335 * important if we're trying to look at a real in-kernel storage
336 * pool from userland, e.g. via zdb, because otherwise we won't
337 * see the changes occurring under the segmap cache.
338 * On the other hand, the stupid character device returns zero
339 * for its size. So -- gag -- we open the block device to get
340 * its size, and remember it for subsequent VOP_GETATTR().
342 if (strncmp(path
, "/dev/", 5) == 0) {
344 fd
= open64(path
, O_RDONLY
);
347 if (fstat64(fd
, &st
) == -1) {
352 (void) sprintf(realpath
, "%s", path
);
353 dsk
= strstr(path
, "/dsk/");
355 (void) sprintf(realpath
+ (dsk
- path
) + 1, "r%s",
358 (void) sprintf(realpath
, "%s", path
);
359 if (!(flags
& FCREAT
) && stat64(realpath
, &st
) == -1)
364 old_umask
= umask(0);
367 * The construct 'flags - FREAD' conveniently maps combinations of
368 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
370 fd
= open64(realpath
, flags
- FREAD
, mode
);
373 (void) umask(old_umask
);
378 if (fstat64(fd
, &st
) == -1) {
384 (void) fcntl(fd
, F_SETFD
, FD_CLOEXEC
);
386 *vpp
= vp
= umem_zalloc(sizeof (vnode_t
), UMEM_NOFAIL
);
389 vp
->v_size
= st
.st_size
;
390 vp
->v_path
= spa_strdup(path
);
397 vn_openat(char *path
, int x1
, int flags
, int mode
, vnode_t
**vpp
, int x2
,
398 int x3
, vnode_t
*startvp
, int fd
)
400 char *realpath
= umem_alloc(strlen(path
) + 2, UMEM_NOFAIL
);
403 ASSERT(startvp
== rootdir
);
404 (void) sprintf(realpath
, "/%s", path
);
406 /* fd ignored for now, need if want to simulate nbmand support */
407 ret
= vn_open(realpath
, x1
, flags
, mode
, vpp
, x2
, x3
);
409 umem_free(realpath
, strlen(path
) + 2);
416 vn_rdwr(int uio
, vnode_t
*vp
, void *addr
, ssize_t len
, offset_t offset
,
417 int x1
, int x2
, rlim64_t x3
, void *x4
, ssize_t
*residp
)
419 ssize_t rc
, done
= 0, split
;
421 if (uio
== UIO_READ
) {
422 rc
= pread64(vp
->v_fd
, addr
, len
, offset
);
425 * To simulate partial disk writes, we split writes into two
426 * system calls so that the process can be killed in between.
428 split
= (len
> 0 ? rand() % len
: 0);
429 rc
= pwrite64(vp
->v_fd
, addr
, split
, offset
);
432 rc
= pwrite64(vp
->v_fd
, (char *)addr
+ split
,
433 len
- split
, offset
+ split
);
443 *residp
= len
- done
;
444 else if (done
!= len
)
450 vn_close(vnode_t
*vp
)
453 spa_strfree(vp
->v_path
);
454 umem_free(vp
, sizeof (vnode_t
));
458 * At a minimum we need to update the size since vdev_reopen()
459 * will no longer call vn_openat().
462 fop_getattr(vnode_t
*vp
, vattr_t
*vap
)
466 if (fstat64(vp
->v_fd
, &st
) == -1) {
471 vap
->va_size
= st
.st_size
;
478 * =========================================================================
479 * Figure out which debugging statements to print
480 * =========================================================================
483 static char *dprintf_string
;
484 static int dprintf_print_all
;
487 dprintf_find_string(const char *string
)
489 char *tmp_str
= dprintf_string
;
490 int len
= strlen(string
);
493 * Find out if this is a string we want to print.
494 * String format: file1.c,function_name1,file2.c,file3.c
497 while (tmp_str
!= NULL
) {
498 if (strncmp(tmp_str
, string
, len
) == 0 &&
499 (tmp_str
[len
] == ',' || tmp_str
[len
] == '\0'))
501 tmp_str
= strchr(tmp_str
, ',');
503 tmp_str
++; /* Get rid of , */
509 dprintf_setup(int *argc
, char **argv
)
514 * Debugging can be specified two ways: by setting the
515 * environment variable ZFS_DEBUG, or by including a
516 * "debug=..." argument on the command line. The command
517 * line setting overrides the environment variable.
520 for (i
= 1; i
< *argc
; i
++) {
521 int len
= strlen("debug=");
522 /* First look for a command line argument */
523 if (strncmp("debug=", argv
[i
], len
) == 0) {
524 dprintf_string
= argv
[i
] + len
;
525 /* Remove from args */
526 for (j
= i
; j
< *argc
; j
++)
533 if (dprintf_string
== NULL
) {
534 /* Look for ZFS_DEBUG environment variable */
535 dprintf_string
= getenv("ZFS_DEBUG");
539 * Are we just turning on all debugging?
541 if (dprintf_find_string("on"))
542 dprintf_print_all
= 1;
546 * =========================================================================
548 * =========================================================================
551 __dprintf(const char *file
, const char *func
, int line
, const char *fmt
, ...)
557 * Get rid of annoying "../common/" prefix to filename.
559 newfile
= strrchr(file
, '/');
560 if (newfile
!= NULL
) {
561 newfile
= newfile
+ 1; /* Get rid of leading / */
566 if (dprintf_print_all
||
567 dprintf_find_string(newfile
) ||
568 dprintf_find_string(func
)) {
569 /* Print out just the function name if requested */
571 if (dprintf_find_string("pid"))
572 (void) printf("%d ", getpid());
573 if (dprintf_find_string("tid"))
574 (void) printf("%u ", thr_self());
575 if (dprintf_find_string("cpu"))
576 (void) printf("%u ", getcpuid());
577 if (dprintf_find_string("time"))
578 (void) printf("%llu ", gethrtime());
579 if (dprintf_find_string("long"))
580 (void) printf("%s, line %d: ", newfile
, line
);
581 (void) printf("%s: ", func
);
583 (void) vprintf(fmt
, adx
);
589 #endif /* ZFS_DEBUG */
592 * =========================================================================
593 * cmn_err() and panic()
594 * =========================================================================
596 static char ce_prefix
[CE_IGNORE
][10] = { "", "NOTICE: ", "WARNING: ", "" };
597 static char ce_suffix
[CE_IGNORE
][2] = { "", "\n", "\n", "" };
600 vpanic(const char *fmt
, va_list adx
)
602 (void) fprintf(stderr
, "error: ");
603 (void) vfprintf(stderr
, fmt
, adx
);
604 (void) fprintf(stderr
, "\n");
606 abort(); /* think of it as a "user-level crash dump" */
610 panic(const char *fmt
, ...)
620 vcmn_err(int ce
, const char *fmt
, va_list adx
)
624 if (ce
!= CE_NOTE
) { /* suppress noise in userland stress testing */
625 (void) fprintf(stderr
, "%s", ce_prefix
[ce
]);
626 (void) vfprintf(stderr
, fmt
, adx
);
627 (void) fprintf(stderr
, "%s", ce_suffix
[ce
]);
633 cmn_err(int ce
, const char *fmt
, ...)
638 vcmn_err(ce
, fmt
, adx
);
643 * =========================================================================
645 * =========================================================================
648 kobj_open_file(char *name
)
653 /* set vp as the _fd field of the file */
654 if (vn_openat(name
, UIO_SYSSPACE
, FREAD
, 0, &vp
, 0, 0, rootdir
,
656 return ((void *)-1UL);
658 file
= umem_zalloc(sizeof (struct _buf
), UMEM_NOFAIL
);
659 file
->_fd
= (intptr_t)vp
;
664 kobj_read_file(struct _buf
*file
, char *buf
, unsigned size
, unsigned off
)
668 vn_rdwr(UIO_READ
, (vnode_t
*)file
->_fd
, buf
, size
, (offset_t
)off
,
669 UIO_SYSSPACE
, 0, 0, 0, &resid
);
671 return (size
- resid
);
675 kobj_close_file(struct _buf
*file
)
677 vn_close((vnode_t
*)file
->_fd
);
678 umem_free(file
, sizeof (struct _buf
));
682 kobj_get_filesize(struct _buf
*file
, uint64_t *size
)
685 vnode_t
*vp
= (vnode_t
*)file
->_fd
;
687 if (fstat64(vp
->v_fd
, &st
) == -1) {
696 * =========================================================================
698 * =========================================================================
704 poll(0, 0, ticks
* (1000 / hz
));
708 * Find highest one bit set.
709 * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
710 * High order bit is 31 (or 63 in _LP64 kernel).
720 if (i
& 0xffffffff00000000ul
) {
724 if (i
& 0xffff0000) {
742 static int random_fd
= -1, urandom_fd
= -1;
745 random_get_bytes_common(uint8_t *ptr
, size_t len
, int fd
)
753 bytes
= read(fd
, ptr
, resid
);
754 ASSERT3S(bytes
, >=, 0);
763 random_get_bytes(uint8_t *ptr
, size_t len
)
765 return (random_get_bytes_common(ptr
, len
, random_fd
));
769 random_get_pseudo_bytes(uint8_t *ptr
, size_t len
)
771 return (random_get_bytes_common(ptr
, len
, urandom_fd
));
775 ddi_strtoul(const char *hw_serial
, char **nptr
, int base
, unsigned long *result
)
779 *result
= strtoul(hw_serial
, &end
, base
);
786 ddi_strtoull(const char *str
, char **nptr
, int base
, u_longlong_t
*result
)
790 *result
= strtoull(str
, &end
, base
);
797 * =========================================================================
798 * kernel emulation setup & teardown
799 * =========================================================================
802 umem_out_of_memory(void)
804 char errmsg
[] = "out of memory -- generating core dump\n";
806 (void) fprintf(stderr
, "%s", errmsg
);
812 kernel_init(int mode
)
814 umem_nofail_callback(umem_out_of_memory
);
816 physmem
= sysconf(_SC_PHYS_PAGES
);
818 dprintf("physmem = %llu pages (%.2f GB)\n", physmem
,
819 (double)physmem
* sysconf(_SC_PAGE_SIZE
) / (1ULL << 30));
821 (void) snprintf(hw_serial
, sizeof (hw_serial
), "%ld",
822 (mode
& FWRITE
) ? gethostid() : 0);
824 VERIFY((random_fd
= open("/dev/random", O_RDONLY
)) != -1);
825 VERIFY((urandom_fd
= open("/dev/urandom", O_RDONLY
)) != -1);
859 crgetngroups(cred_t
*cr
)
865 crgetgroups(cred_t
*cr
)
871 zfs_secpolicy_snapshot_perms(const char *name
, cred_t
*cr
)
877 zfs_secpolicy_rename_perms(const char *from
, const char *to
, cred_t
*cr
)
883 zfs_secpolicy_destroy_perms(const char *name
, cred_t
*cr
)
889 ksid_lookupdomain(const char *dom
)
893 kd
= umem_zalloc(sizeof (ksiddomain_t
), UMEM_NOFAIL
);
894 kd
->kd_name
= spa_strdup(dom
);
899 ksiddomain_rele(ksiddomain_t
*ksid
)
901 spa_strfree(ksid
->kd_name
);
902 umem_free(ksid
, sizeof (ksiddomain_t
));
906 * Do not change the length of the returned string; it must be freed
910 kmem_asprintf(const char *fmt
, ...)
917 size
= vsnprintf(NULL
, 0, fmt
, adx
) + 1;
920 buf
= kmem_alloc(size
, KM_SLEEP
);
923 size
= vsnprintf(buf
, size
, fmt
, adx
);
931 zfs_onexit_fd_hold(int fd
, minor_t
*minorp
)
939 zfs_onexit_fd_rele(int fd
)
945 zfs_onexit_add_cb(minor_t minor
, void (*func
)(void *), void *data
,
946 uint64_t *action_handle
)
953 zfs_onexit_del_cb(minor_t minor
, uint64_t action_handle
, boolean_t fire
)
960 zfs_onexit_cb_data(minor_t minor
, uint64_t action_handle
, void **data
)