]> git.proxmox.com Git - mirror_zfs.git/blame - lib/libzpool/kernel.c
cstyle: Allow spaces in all comments
[mirror_zfs.git] / lib / libzpool / kernel.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
572e2857 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
34dc7c2f
BB
23 */
24
34dc7c2f
BB
25#include <assert.h>
26#include <fcntl.h>
27#include <poll.h>
28#include <stdio.h>
29#include <stdlib.h>
30#include <string.h>
31#include <zlib.h>
1e33ac1e 32#include <sys/signal.h>
34dc7c2f
BB
33#include <sys/spa.h>
34#include <sys/stat.h>
35#include <sys/processor.h>
36#include <sys/zfs_context.h>
13fe0198 37#include <sys/rrwlock.h>
34dc7c2f 38#include <sys/utsname.h>
d603ed6c 39#include <sys/time.h>
d164b209 40#include <sys/systeminfo.h>
34dc7c2f
BB
41
42/*
43 * Emulation of kernel services in userland.
44 */
45
428870ff 46int aok;
34dc7c2f
BB
47uint64_t physmem;
48vnode_t *rootdir = (vnode_t *)0xabcd1234;
d164b209 49char hw_serial[HW_HOSTID_LEN];
34dc7c2f
BB
50
51struct utsname utsname = {
52 "userland", "libzpool", "1", "1", "na"
53};
54
428870ff
BB
55/* this only exists to have its address taken */
56struct proc p0;
57
34dc7c2f
BB
58/*
59 * =========================================================================
60 * threads
61 * =========================================================================
62 */
1e33ac1e
BB
63
/* Broadcast by zk_thread_exit() each time an emulated thread goes away. */
pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER;
/* Protects kthread_nr. */
pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
/* Thread-specific-data key under which each thread stores its kthread_t. */
pthread_key_t kthread_key;
/* Count of live emulated threads, including the main thread. */
int kthread_nr = 0;
68
69static void
70thread_init(void)
71{
72 kthread_t *kt;
73
74 VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0);
75
76 /* Create entry for primary kthread */
77 kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
78 kt->t_tid = pthread_self();
79 kt->t_func = NULL;
80
81 VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);
82
83 /* Only the main thread should be running at the moment */
84 ASSERT3S(kthread_nr, ==, 0);
85 kthread_nr = 1;
86}
87
88static void
89thread_fini(void)
90{
91 kthread_t *kt = curthread;
92
93 ASSERT(pthread_equal(kt->t_tid, pthread_self()));
94 ASSERT3P(kt->t_func, ==, NULL);
95
96 umem_free(kt, sizeof(kthread_t));
97
98 /* Wait for all threads to exit via thread_exit() */
99 VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
100
101 kthread_nr--; /* Main thread is exiting */
102
103 while (kthread_nr > 0)
104 VERIFY3S(pthread_cond_wait(&kthread_cond, &kthread_lock), ==,
105 0);
106
107 ASSERT3S(kthread_nr, ==, 0);
108 VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
109
110 VERIFY3S(pthread_key_delete(kthread_key), ==, 0);
111}
112
34dc7c2f 113kthread_t *
1e33ac1e
BB
114zk_thread_current(void)
115{
116 kthread_t *kt = pthread_getspecific(kthread_key);
117
118 ASSERT3P(kt, !=, NULL);
119
120 return kt;
121}
122
123void *
124zk_thread_helper(void *arg)
34dc7c2f 125{
1e33ac1e
BB
126 kthread_t *kt = (kthread_t *) arg;
127
128 VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);
34dc7c2f 129
1e33ac1e
BB
130 VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
131 kthread_nr++;
132 VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
34dc7c2f 133
1e33ac1e
BB
134 kt->t_tid = pthread_self();
135 ((thread_func_arg_t) kt->t_func)(kt->t_arg);
136
137 /* Unreachable, thread must exit with thread_exit() */
138 abort();
139
140 return NULL;
141}
142
143kthread_t *
144zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg,
0aebd4f9 145 size_t len, proc_t *pp, int state, pri_t pri, int detachstate)
1e33ac1e
BB
146{
147 kthread_t *kt;
148 pthread_attr_t attr;
149 size_t stack;
150
151 ASSERT3S(state & ~TS_RUN, ==, 0);
152
153 kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL);
154 kt->t_func = func;
155 kt->t_arg = arg;
156
157 /*
158 * The Solaris kernel stack size is 24k for x86/x86_64.
159 * The Linux kernel stack size is 8k for x86/x86_64.
160 *
161 * We reduce the default stack size in userspace, to ensure
162 * we observe stack overruns in user space as well as in
4b2f65b2
ED
163 * kernel space. In practice we can't set the userspace stack
164 * size to 8k because differences in stack usage between kernel
165 * space and userspace could lead to spurious stack overflows
166 * (especially when debugging is enabled). Nevertheless, we try
167 * to set it to the lowest value that works (currently 8k*4).
168 * PTHREAD_STACK_MIN is the minimum stack required for a NULL
169 * procedure in user space and is added in to the stack
170 * requirements.
1e33ac1e
BB
171 *
172 * Some buggy NPTL threading implementations include the
173 * guard area within the stack size allocations. In
174 * this case we allocate an extra page to account for the
175 * guard area since we only have two pages of usable stack
176 * on Linux.
177 */
178
79c6e4c4 179 stack = PTHREAD_STACK_MIN + MAX(stksize, STACK_SIZE) * 4;
1e33ac1e
BB
180
181 VERIFY3S(pthread_attr_init(&attr), ==, 0);
182 VERIFY3S(pthread_attr_setstacksize(&attr, stack), ==, 0);
183 VERIFY3S(pthread_attr_setguardsize(&attr, PAGESIZE), ==, 0);
0aebd4f9 184 VERIFY3S(pthread_attr_setdetachstate(&attr, detachstate), ==, 0);
1e33ac1e
BB
185
186 VERIFY3S(pthread_create(&kt->t_tid, &attr, &zk_thread_helper, kt),
187 ==, 0);
188
189 VERIFY3S(pthread_attr_destroy(&attr), ==, 0);
190
191 return kt;
192}
193
194void
195zk_thread_exit(void)
196{
197 kthread_t *kt = curthread;
198
199 ASSERT(pthread_equal(kt->t_tid, pthread_self()));
200
201 umem_free(kt, sizeof(kthread_t));
202
203 pthread_mutex_lock(&kthread_lock);
204 kthread_nr--;
205 pthread_mutex_unlock(&kthread_lock);
206
207 pthread_cond_broadcast(&kthread_cond);
208 pthread_exit((void *)TS_MAGIC);
209}
210
211void
212zk_thread_join(kt_did_t tid)
213{
214 void *ret;
215
216 pthread_join((pthread_t)tid, &ret);
217 VERIFY3P(ret, ==, (void *)TS_MAGIC);
34dc7c2f
BB
218}
219
220/*
221 * =========================================================================
222 * kstats
223 * =========================================================================
224 */
225/*ARGSUSED*/
226kstat_t *
330847ff
MA
227kstat_create(const char *module, int instance, const char *name,
228 const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
34dc7c2f
BB
229{
230 return (NULL);
231}
232
233/*ARGSUSED*/
234void
235kstat_install(kstat_t *ksp)
236{}
237
238/*ARGSUSED*/
239void
240kstat_delete(kstat_t *ksp)
241{}
242
1421c891 243/*ARGSUSED*/
330847ff
MA
244void
245kstat_waitq_enter(kstat_io_t *kiop)
246{}
247
248/*ARGSUSED*/
249void
250kstat_waitq_exit(kstat_io_t *kiop)
251{}
252
253/*ARGSUSED*/
254void
255kstat_runq_enter(kstat_io_t *kiop)
256{}
257
258/*ARGSUSED*/
259void
260kstat_runq_exit(kstat_io_t *kiop)
261{}
262
263/*ARGSUSED*/
264void
265kstat_waitq_to_runq(kstat_io_t *kiop)
266{}
267
268/*ARGSUSED*/
269void
270kstat_runq_back_to_waitq(kstat_io_t *kiop)
271{}
272
1421c891
PS
273void
274kstat_set_raw_ops(kstat_t *ksp,
275 int (*headers)(char *buf, size_t size),
276 int (*data)(char *buf, size_t size, void *data),
277 void *(*addr)(kstat_t *ksp, loff_t index))
278{}
279
34dc7c2f
BB
280/*
281 * =========================================================================
282 * mutexes
283 * =========================================================================
284 */
1e33ac1e 285
34dc7c2f 286void
1e33ac1e 287mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
34dc7c2f 288{
1e33ac1e
BB
289 ASSERT3S(type, ==, MUTEX_DEFAULT);
290 ASSERT3P(cookie, ==, NULL);
291 mp->m_owner = MTX_INIT;
292 mp->m_magic = MTX_MAGIC;
293 VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0);
34dc7c2f
BB
294}
295
296void
1e33ac1e 297mutex_destroy(kmutex_t *mp)
34dc7c2f 298{
1e33ac1e
BB
299 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
300 ASSERT3P(mp->m_owner, ==, MTX_INIT);
301 VERIFY3S(pthread_mutex_destroy(&(mp)->m_lock), ==, 0);
302 mp->m_owner = MTX_DEST;
303 mp->m_magic = 0;
34dc7c2f
BB
304}
305
306void
307mutex_enter(kmutex_t *mp)
308{
1e33ac1e
BB
309 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
310 ASSERT3P(mp->m_owner, !=, MTX_DEST);
311 ASSERT3P(mp->m_owner, !=, curthread);
312 VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0);
313 ASSERT3P(mp->m_owner, ==, MTX_INIT);
34dc7c2f
BB
314 mp->m_owner = curthread;
315}
316
317int
318mutex_tryenter(kmutex_t *mp)
319{
1e33ac1e
BB
320 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
321 ASSERT3P(mp->m_owner, !=, MTX_DEST);
322 if (0 == pthread_mutex_trylock(&mp->m_lock)) {
323 ASSERT3P(mp->m_owner, ==, MTX_INIT);
34dc7c2f
BB
324 mp->m_owner = curthread;
325 return (1);
326 } else {
327 return (0);
328 }
329}
330
331void
332mutex_exit(kmutex_t *mp)
333{
1e33ac1e
BB
334 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
335 ASSERT3P(mutex_owner(mp), ==, curthread);
336 mp->m_owner = MTX_INIT;
337 VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0);
34dc7c2f
BB
338}
339
340void *
341mutex_owner(kmutex_t *mp)
342{
1e33ac1e 343 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
34dc7c2f
BB
344 return (mp->m_owner);
345}
346
1e33ac1e
BB
347int
348mutex_held(kmutex_t *mp)
349{
350 return (mp->m_owner == curthread);
351}
352
34dc7c2f
BB
353/*
354 * =========================================================================
355 * rwlocks
356 * =========================================================================
357 */
1e33ac1e 358
34dc7c2f
BB
359void
360rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
361{
1e33ac1e
BB
362 ASSERT3S(type, ==, RW_DEFAULT);
363 ASSERT3P(arg, ==, NULL);
364 VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0);
365 rwlp->rw_owner = RW_INIT;
366 rwlp->rw_wr_owner = RW_INIT;
367 rwlp->rw_readers = 0;
368 rwlp->rw_magic = RW_MAGIC;
34dc7c2f
BB
369}
370
371void
372rw_destroy(krwlock_t *rwlp)
373{
1e33ac1e
BB
374 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
375
376 VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0);
377 rwlp->rw_magic = 0;
34dc7c2f
BB
378}
379
380void
381rw_enter(krwlock_t *rwlp, krw_t rw)
382{
1e33ac1e
BB
383 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
384 ASSERT3P(rwlp->rw_owner, !=, curthread);
385 ASSERT3P(rwlp->rw_wr_owner, !=, curthread);
34dc7c2f 386
1e33ac1e
BB
387 if (rw == RW_READER) {
388 VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0);
389 ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
390
391 atomic_inc_uint(&rwlp->rw_readers);
392 } else {
393 VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0);
394 ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
395 ASSERT3U(rwlp->rw_readers, ==, 0);
396
397 rwlp->rw_wr_owner = curthread;
398 }
34dc7c2f
BB
399
400 rwlp->rw_owner = curthread;
401}
402
403void
404rw_exit(krwlock_t *rwlp)
405{
1e33ac1e
BB
406 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
407 ASSERT(RW_LOCK_HELD(rwlp));
408
409 if (RW_READ_HELD(rwlp))
410 atomic_dec_uint(&rwlp->rw_readers);
411 else
412 rwlp->rw_wr_owner = RW_INIT;
34dc7c2f 413
1e33ac1e
BB
414 rwlp->rw_owner = RW_INIT;
415 VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0);
34dc7c2f
BB
416}
417
418int
419rw_tryenter(krwlock_t *rwlp, krw_t rw)
420{
421 int rv;
422
1e33ac1e 423 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
34dc7c2f
BB
424
425 if (rw == RW_READER)
1e33ac1e 426 rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
34dc7c2f 427 else
1e33ac1e 428 rv = pthread_rwlock_trywrlock(&rwlp->rw_lock);
34dc7c2f
BB
429
430 if (rv == 0) {
1e33ac1e
BB
431 ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
432
433 if (rw == RW_READER)
434 atomic_inc_uint(&rwlp->rw_readers);
435 else {
436 ASSERT3U(rwlp->rw_readers, ==, 0);
437 rwlp->rw_wr_owner = curthread;
438 }
439
34dc7c2f
BB
440 rwlp->rw_owner = curthread;
441 return (1);
442 }
443
1e33ac1e
BB
444 VERIFY3S(rv, ==, EBUSY);
445
34dc7c2f
BB
446 return (0);
447}
448
34dc7c2f
BB
449int
450rw_tryupgrade(krwlock_t *rwlp)
451{
1e33ac1e 452 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
34dc7c2f
BB
453
454 return (0);
455}
456
457/*
458 * =========================================================================
459 * condition variables
460 * =========================================================================
461 */
1e33ac1e 462
34dc7c2f
BB
463void
464cv_init(kcondvar_t *cv, char *name, int type, void *arg)
465{
1e33ac1e
BB
466 ASSERT3S(type, ==, CV_DEFAULT);
467 cv->cv_magic = CV_MAGIC;
468 VERIFY3S(pthread_cond_init(&cv->cv, NULL), ==, 0);
34dc7c2f
BB
469}
470
471void
472cv_destroy(kcondvar_t *cv)
473{
1e33ac1e
BB
474 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
475 VERIFY3S(pthread_cond_destroy(&cv->cv), ==, 0);
476 cv->cv_magic = 0;
34dc7c2f
BB
477}
478
479void
480cv_wait(kcondvar_t *cv, kmutex_t *mp)
481{
1e33ac1e
BB
482 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
483 ASSERT3P(mutex_owner(mp), ==, curthread);
484 mp->m_owner = MTX_INIT;
485 int ret = pthread_cond_wait(&cv->cv, &mp->m_lock);
486 if (ret != 0)
487 VERIFY3S(ret, ==, EINTR);
34dc7c2f
BB
488 mp->m_owner = curthread;
489}
490
491clock_t
492cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
493{
494 int error;
1e33ac1e 495 struct timeval tv;
34dc7c2f
BB
496 timestruc_t ts;
497 clock_t delta;
498
1e33ac1e
BB
499 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
500
34dc7c2f 501top:
428870ff 502 delta = abstime - ddi_get_lbolt();
34dc7c2f
BB
503 if (delta <= 0)
504 return (-1);
505
1e33ac1e
BB
506 VERIFY(gettimeofday(&tv, NULL) == 0);
507
508 ts.tv_sec = tv.tv_sec + delta / hz;
509 ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz);
510 if (ts.tv_nsec >= NANOSEC) {
511 ts.tv_sec++;
512 ts.tv_nsec -= NANOSEC;
513 }
34dc7c2f 514
1e33ac1e
BB
515 ASSERT3P(mutex_owner(mp), ==, curthread);
516 mp->m_owner = MTX_INIT;
517 error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
34dc7c2f
BB
518 mp->m_owner = curthread;
519
1e33ac1e 520 if (error == ETIMEDOUT)
34dc7c2f
BB
521 return (-1);
522
523 if (error == EINTR)
524 goto top;
525
1e33ac1e 526 VERIFY3S(error, ==, 0);
34dc7c2f
BB
527
528 return (1);
529}
530
63fd3c6c
AL
531/*ARGSUSED*/
532clock_t
533cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
534 int flag)
535{
536 int error;
537 timestruc_t ts;
538 hrtime_t delta;
539
540 ASSERT(flag == 0);
541
542top:
543 delta = tim - gethrtime();
544 if (delta <= 0)
545 return (-1);
546
547 ts.tv_sec = delta / NANOSEC;
548 ts.tv_nsec = delta % NANOSEC;
549
550 ASSERT(mutex_owner(mp) == curthread);
551 mp->m_owner = NULL;
552 error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
553 mp->m_owner = curthread;
554
555 if (error == ETIME)
556 return (-1);
557
558 if (error == EINTR)
559 goto top;
560
561 ASSERT(error == 0);
562
563 return (1);
564}
565
34dc7c2f
BB
566void
567cv_signal(kcondvar_t *cv)
568{
1e33ac1e
BB
569 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
570 VERIFY3S(pthread_cond_signal(&cv->cv), ==, 0);
34dc7c2f
BB
571}
572
573void
574cv_broadcast(kcondvar_t *cv)
575{
1e33ac1e
BB
576 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
577 VERIFY3S(pthread_cond_broadcast(&cv->cv), ==, 0);
34dc7c2f
BB
578}
579
580/*
581 * =========================================================================
582 * vnode operations
583 * =========================================================================
584 */
585/*
586 * Note: for the xxxat() versions of these functions, we assume that the
587 * starting vp is always rootdir (which is true for spa_directory.c, the only
588 * ZFS consumer of these interfaces). We assert this is true, and then emulate
589 * them by adding '/' in front of the path.
590 */
591
592/*ARGSUSED*/
593int
594vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
595{
596 int fd;
597 vnode_t *vp;
a4914d38 598 int old_umask = 0;
5ae4e2c2 599 char *realpath;
34dc7c2f 600 struct stat64 st;
4d58b69d 601 int err;
34dc7c2f 602
5ae4e2c2
BB
603 realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
604
34dc7c2f
BB
605 /*
606 * If we're accessing a real disk from userland, we need to use
607 * the character interface to avoid caching. This is particularly
608 * important if we're trying to look at a real in-kernel storage
609 * pool from userland, e.g. via zdb, because otherwise we won't
610 * see the changes occurring under the segmap cache.
611 * On the other hand, the stupid character device returns zero
612 * for its size. So -- gag -- we open the block device to get
613 * its size, and remember it for subsequent VOP_GETATTR().
614 */
d603ed6c 615#if defined(__sun__) || defined(__sun)
34dc7c2f 616 if (strncmp(path, "/dev/", 5) == 0) {
d603ed6c
BB
617#else
618 if (0) {
619#endif
34dc7c2f
BB
620 char *dsk;
621 fd = open64(path, O_RDONLY);
5ae4e2c2
BB
622 if (fd == -1) {
623 err = errno;
624 free(realpath);
625 return (err);
626 }
34dc7c2f 627 if (fstat64(fd, &st) == -1) {
5ae4e2c2 628 err = errno;
34dc7c2f 629 close(fd);
5ae4e2c2
BB
630 free(realpath);
631 return (err);
34dc7c2f
BB
632 }
633 close(fd);
634 (void) sprintf(realpath, "%s", path);
635 dsk = strstr(path, "/dsk/");
636 if (dsk != NULL)
637 (void) sprintf(realpath + (dsk - path) + 1, "r%s",
638 dsk + 1);
639 } else {
640 (void) sprintf(realpath, "%s", path);
5ae4e2c2
BB
641 if (!(flags & FCREAT) && stat64(realpath, &st) == -1) {
642 err = errno;
643 free(realpath);
644 return (err);
645 }
34dc7c2f
BB
646 }
647
d603ed6c
BB
648 if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) {
649#ifdef __linux__
650 flags |= O_DIRECT;
651#endif
652 /* We shouldn't be writing to block devices in userspace */
653 VERIFY(!(flags & FWRITE));
654 }
655
34dc7c2f
BB
656 if (flags & FCREAT)
657 old_umask = umask(0);
658
659 /*
660 * The construct 'flags - FREAD' conveniently maps combinations of
661 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
662 */
663 fd = open64(realpath, flags - FREAD, mode);
5ae4e2c2 664 free(realpath);
34dc7c2f
BB
665
666 if (flags & FCREAT)
667 (void) umask(old_umask);
668
669 if (fd == -1)
670 return (errno);
671
8d4e8140 672 if (fstat64_blk(fd, &st) == -1) {
4d58b69d 673 err = errno;
34dc7c2f 674 close(fd);
4d58b69d 675 return (err);
34dc7c2f
BB
676 }
677
678 (void) fcntl(fd, F_SETFD, FD_CLOEXEC);
679
680 *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
681
682 vp->v_fd = fd;
683 vp->v_size = st.st_size;
684 vp->v_path = spa_strdup(path);
685
686 return (0);
687}
688
689/*ARGSUSED*/
690int
691vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
692 int x3, vnode_t *startvp, int fd)
693{
694 char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
695 int ret;
696
697 ASSERT(startvp == rootdir);
698 (void) sprintf(realpath, "/%s", path);
699
700 /* fd ignored for now, need if want to simulate nbmand support */
701 ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
702
703 umem_free(realpath, strlen(path) + 2);
704
705 return (ret);
706}
707
708/*ARGSUSED*/
709int
710vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
711 int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
712{
4d58b69d 713 ssize_t rc, done = 0, split;
34dc7c2f
BB
714
715 if (uio == UIO_READ) {
4d58b69d 716 rc = pread64(vp->v_fd, addr, len, offset);
34dc7c2f
BB
717 } else {
718 /*
719 * To simulate partial disk writes, we split writes into two
720 * system calls so that the process can be killed in between.
721 */
9ae529ec
CS
722 int sectors = len >> SPA_MINBLOCKSHIFT;
723 split = (sectors > 0 ? rand() % sectors : 0) <<
724 SPA_MINBLOCKSHIFT;
4d58b69d
RC
725 rc = pwrite64(vp->v_fd, addr, split, offset);
726 if (rc != -1) {
727 done = rc;
728 rc = pwrite64(vp->v_fd, (char *)addr + split,
729 len - split, offset + split);
730 }
34dc7c2f
BB
731 }
732
d603ed6c
BB
733#ifdef __linux__
734 if (rc == -1 && errno == EINVAL) {
735 /*
736 * Under Linux, this most likely means an alignment issue
737 * (memory or disk) due to O_DIRECT, so we abort() in order to
738 * catch the offender.
739 */
740 abort();
741 }
742#endif
4d58b69d 743 if (rc == -1)
34dc7c2f 744 return (errno);
4d58b69d
RC
745
746 done += rc;
747
34dc7c2f 748 if (residp)
4d58b69d
RC
749 *residp = len - done;
750 else if (done != len)
34dc7c2f
BB
751 return (EIO);
752 return (0);
753}
754
755void
756vn_close(vnode_t *vp)
757{
758 close(vp->v_fd);
759 spa_strfree(vp->v_path);
760 umem_free(vp, sizeof (vnode_t));
761}
762
428870ff
BB
763/*
764 * At a minimum we need to update the size since vdev_reopen()
765 * will no longer call vn_openat().
766 */
767int
768fop_getattr(vnode_t *vp, vattr_t *vap)
769{
770 struct stat64 st;
8d4e8140 771 int err;
428870ff 772
8d4e8140
RC
773 if (fstat64_blk(vp->v_fd, &st) == -1) {
774 err = errno;
428870ff 775 close(vp->v_fd);
8d4e8140 776 return (err);
428870ff
BB
777 }
778
779 vap->va_size = st.st_size;
780 return (0);
781}
782
34dc7c2f
BB
783/*
784 * =========================================================================
785 * Figure out which debugging statements to print
786 * =========================================================================
787 */
788
/* Comma-separated debug selectors; set by dprintf_setup(). */
static char *dprintf_string;
/* Nonzero once the selector "on" was seen: print every dprintf. */
static int dprintf_print_all;
791
792int
793dprintf_find_string(const char *string)
794{
795 char *tmp_str = dprintf_string;
796 int len = strlen(string);
797
798 /*
799 * Find out if this is a string we want to print.
800 * String format: file1.c,function_name1,file2.c,file3.c
801 */
802
803 while (tmp_str != NULL) {
804 if (strncmp(tmp_str, string, len) == 0 &&
805 (tmp_str[len] == ',' || tmp_str[len] == '\0'))
806 return (1);
807 tmp_str = strchr(tmp_str, ',');
808 if (tmp_str != NULL)
809 tmp_str++; /* Get rid of , */
810 }
811 return (0);
812}
813
814void
815dprintf_setup(int *argc, char **argv)
816{
817 int i, j;
818
819 /*
820 * Debugging can be specified two ways: by setting the
821 * environment variable ZFS_DEBUG, or by including a
822 * "debug=..." argument on the command line. The command
823 * line setting overrides the environment variable.
824 */
825
826 for (i = 1; i < *argc; i++) {
827 int len = strlen("debug=");
828 /* First look for a command line argument */
829 if (strncmp("debug=", argv[i], len) == 0) {
830 dprintf_string = argv[i] + len;
831 /* Remove from args */
832 for (j = i; j < *argc; j++)
833 argv[j] = argv[j+1];
834 argv[j] = NULL;
835 (*argc)--;
836 }
837 }
838
839 if (dprintf_string == NULL) {
840 /* Look for ZFS_DEBUG environment variable */
841 dprintf_string = getenv("ZFS_DEBUG");
842 }
843
844 /*
845 * Are we just turning on all debugging?
846 */
847 if (dprintf_find_string("on"))
848 dprintf_print_all = 1;
849}
850
851/*
852 * =========================================================================
853 * debug printfs
854 * =========================================================================
855 */
856void
857__dprintf(const char *file, const char *func, int line, const char *fmt, ...)
858{
859 const char *newfile;
860 va_list adx;
861
862 /*
863 * Get rid of annoying "../common/" prefix to filename.
864 */
865 newfile = strrchr(file, '/');
866 if (newfile != NULL) {
867 newfile = newfile + 1; /* Get rid of leading / */
868 } else {
869 newfile = file;
870 }
871
872 if (dprintf_print_all ||
873 dprintf_find_string(newfile) ||
874 dprintf_find_string(func)) {
875 /* Print out just the function name if requested */
876 flockfile(stdout);
877 if (dprintf_find_string("pid"))
878 (void) printf("%d ", getpid());
879 if (dprintf_find_string("tid"))
1e33ac1e 880 (void) printf("%u ", (uint_t) pthread_self());
34dc7c2f
BB
881 if (dprintf_find_string("cpu"))
882 (void) printf("%u ", getcpuid());
883 if (dprintf_find_string("time"))
884 (void) printf("%llu ", gethrtime());
885 if (dprintf_find_string("long"))
886 (void) printf("%s, line %d: ", newfile, line);
887 (void) printf("%s: ", func);
888 va_start(adx, fmt);
889 (void) vprintf(fmt, adx);
890 va_end(adx);
891 funlockfile(stdout);
892 }
893}
894
34dc7c2f
BB
895/*
896 * =========================================================================
897 * cmn_err() and panic()
898 * =========================================================================
899 */
900static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
901static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
902
/*
 * Print "error: <formatted message>" and a newline to stderr, then abort().
 * Does not return.
 */
void
vpanic(const char *fmt, va_list adx)
{
	FILE *out = stderr;

	(void) fprintf(out, "error: ");
	(void) vfprintf(out, fmt, adx);
	(void) fprintf(out, "\n");

	/* The resulting core file is the user-level "crash dump". */
	abort();
}
912
/*
 * printf-style front end to vpanic(): format the message and abort.
 */
void
panic(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vpanic(fmt, ap);
	va_end(ap);
}
922
923void
924vcmn_err(int ce, const char *fmt, va_list adx)
925{
926 if (ce == CE_PANIC)
927 vpanic(fmt, adx);
928 if (ce != CE_NOTE) { /* suppress noise in userland stress testing */
929 (void) fprintf(stderr, "%s", ce_prefix[ce]);
930 (void) vfprintf(stderr, fmt, adx);
931 (void) fprintf(stderr, "%s", ce_suffix[ce]);
932 }
933}
934
/*
 * printf-style front end to vcmn_err().
 */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vcmn_err(ce, fmt, ap);
	va_end(ap);
}
945
946/*
947 * =========================================================================
948 * kobj interfaces
949 * =========================================================================
950 */
951struct _buf *
952kobj_open_file(char *name)
953{
954 struct _buf *file;
955 vnode_t *vp;
956
957 /* set vp as the _fd field of the file */
958 if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
959 -1) != 0)
960 return ((void *)-1UL);
961
962 file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
963 file->_fd = (intptr_t)vp;
964 return (file);
965}
966
967int
968kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
969{
970 ssize_t resid;
971
972 vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
973 UIO_SYSSPACE, 0, 0, 0, &resid);
974
975 return (size - resid);
976}
977
978void
979kobj_close_file(struct _buf *file)
980{
981 vn_close((vnode_t *)file->_fd);
982 umem_free(file, sizeof (struct _buf));
983}
984
985int
986kobj_get_filesize(struct _buf *file, uint64_t *size)
987{
988 struct stat64 st;
989 vnode_t *vp = (vnode_t *)file->_fd;
990
991 if (fstat64(vp->v_fd, &st) == -1) {
992 vn_close(vp);
993 return (errno);
994 }
995 *size = st.st_size;
996 return (0);
997}
998
999/*
1000 * =========================================================================
1001 * misc routines
1002 * =========================================================================
1003 */
1004
1005void
1006delay(clock_t ticks)
1007{
1008 poll(0, 0, ticks * (1000 / hz));
1009}
1010
1011/*
1012 * Find highest one bit set.
1013 * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
1014 * High order bit is 31 (or 63 in _LP64 kernel).
1015 */
1016int
1017highbit(ulong_t i)
1018{
1019 register int h = 1;
1020
1021 if (i == 0)
1022 return (0);
1023#ifdef _LP64
1024 if (i & 0xffffffff00000000ul) {
1025 h += 32; i >>= 32;
1026 }
1027#endif
1028 if (i & 0xffff0000) {
1029 h += 16; i >>= 16;
1030 }
1031 if (i & 0xff00) {
1032 h += 8; i >>= 8;
1033 }
1034 if (i & 0xf0) {
1035 h += 4; i >>= 4;
1036 }
1037 if (i & 0xc) {
1038 h += 2; i >>= 2;
1039 }
1040 if (i & 0x2) {
1041 h += 1;
1042 }
1043 return (h);
1044}
1045
/* /dev/random and /dev/urandom descriptors; opened in kernel_init(). */
static int random_fd = -1, urandom_fd = -1;
1047
1048static int
1049random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
1050{
1051 size_t resid = len;
1052 ssize_t bytes;
1053
1054 ASSERT(fd != -1);
1055
1056 while (resid != 0) {
1057 bytes = read(fd, ptr, resid);
1058 ASSERT3S(bytes, >=, 0);
1059 ptr += bytes;
1060 resid -= bytes;
1061 }
1062
1063 return (0);
1064}
1065
1066int
1067random_get_bytes(uint8_t *ptr, size_t len)
1068{
1069 return (random_get_bytes_common(ptr, len, random_fd));
1070}
1071
1072int
1073random_get_pseudo_bytes(uint8_t *ptr, size_t len)
1074{
1075 return (random_get_bytes_common(ptr, len, urandom_fd));
1076}
1077
/*
 * Convert the string hw_serial to an unsigned long in the given base.
 *
 * The value is stored in *result.  If nptr is non-NULL it receives a
 * pointer to the first character that was not consumed.  Returns 0 on
 * success or the errno set by strtoul() (e.g. ERANGE on overflow).
 *
 * errno is cleared first so that a stale value left by an earlier call is
 * never reported, and it is checked regardless of the converted value so
 * that an out-of-range input is not mistaken for success.
 */
int
ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
{
	char *end;

	errno = 0;
	*result = strtoul(hw_serial, &end, base);
	if (nptr != NULL)
		*nptr = end;

	return (errno);
}
1088
428870ff
BB
1089int
1090ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
1091{
1092 char *end;
1093
1094 *result = strtoull(str, &end, base);
1095 if (*result == 0)
1096 return (errno);
1097 return (0);
1098}
1099
34dc7c2f
BB
1100/*
1101 * =========================================================================
1102 * kernel emulation setup & teardown
1103 * =========================================================================
1104 */
/*
 * Callback registered with umem_nofail_callback() by kernel_init(): report
 * the condition on stderr and abort().
 */
static int
umem_out_of_memory(void)
{
	static const char msg[] = "out of memory -- generating core dump\n";

	(void) fprintf(stderr, "%s", msg);
	abort();

	return (0);
}
1114
1115void
1116kernel_init(int mode)
1117{
13fe0198
MA
1118 extern uint_t rrw_tsd_key;
1119
34dc7c2f
BB
1120 umem_nofail_callback(umem_out_of_memory);
1121
1122 physmem = sysconf(_SC_PHYS_PAGES);
1123
1124 dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
1125 (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
1126
428870ff
BB
1127 (void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
1128 (mode & FWRITE) ? gethostid() : 0);
34dc7c2f
BB
1129
1130 VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
1131 VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
1132
1e33ac1e 1133 thread_init();
b128c09f
BB
1134 system_taskq_init();
1135
34dc7c2f 1136 spa_init(mode);
13fe0198
MA
1137
1138 tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
34dc7c2f
BB
1139}
1140
1141void
1142kernel_fini(void)
1143{
1144 spa_fini();
1145
428870ff 1146 system_taskq_fini();
1e33ac1e 1147 thread_fini();
428870ff 1148
34dc7c2f
BB
1149 close(random_fd);
1150 close(urandom_fd);
1151
1152 random_fd = -1;
1153 urandom_fd = -1;
1154}
1155
34dc7c2f
BB
1156uid_t
1157crgetuid(cred_t *cr)
1158{
1159 return (0);
1160}
1161
6f1ffb06
MA
1162uid_t
1163crgetruid(cred_t *cr)
1164{
1165 return (0);
1166}
1167
34dc7c2f
BB
1168gid_t
1169crgetgid(cred_t *cr)
1170{
1171 return (0);
1172}
1173
1174int
1175crgetngroups(cred_t *cr)
1176{
1177 return (0);
1178}
1179
1180gid_t *
1181crgetgroups(cred_t *cr)
1182{
1183 return (NULL);
1184}
1185
1186int
1187zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
1188{
1189 return (0);
1190}
1191
1192int
1193zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
1194{
1195 return (0);
1196}
1197
1198int
1199zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
1200{
1201 return (0);
1202}
1203
1204ksiddomain_t *
1205ksid_lookupdomain(const char *dom)
1206{
1207 ksiddomain_t *kd;
1208
1209 kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
1210 kd->kd_name = spa_strdup(dom);
1211 return (kd);
1212}
1213
1214void
1215ksiddomain_rele(ksiddomain_t *ksid)
1216{
1217 spa_strfree(ksid->kd_name);
1218 umem_free(ksid, sizeof (ksiddomain_t));
1219}
428870ff 1220
/*
 * Format fmt/adx into a newly allocated string using vasprintf() and return
 * it; failure is fatal.  The caller's va_list is left untouched because the
 * formatting is done on a copy.
 */
char *
kmem_vasprintf(const char *fmt, va_list adx)
{
	va_list ap;
	char *str = NULL;
	int rc;

	va_copy(ap, adx);
	rc = vasprintf(&str, fmt, ap);
	VERIFY(rc != -1);
	va_end(ap);

	return (str);
}
1233
/*
 * printf-style formatting into a newly allocated string using vasprintf();
 * failure is fatal.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list ap;
	char *str = NULL;
	int rc;

	va_start(ap, fmt);
	rc = vasprintf(&str, fmt, ap);
	VERIFY(rc != -1);
	va_end(ap);

	return (str);
}
572e2857
BB
1246
1247/* ARGSUSED */
1248int
1249zfs_onexit_fd_hold(int fd, minor_t *minorp)
1250{
1251 *minorp = 0;
1252 return (0);
1253}
1254
/* Userland stub: does nothing. */
/* ARGSUSED */
void
zfs_onexit_fd_rele(int fd)
{
	/* nothing to release */
}
1260
1261/* ARGSUSED */
1262int
1263zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
1264 uint64_t *action_handle)
1265{
1266 return (0);
1267}
1268
1269/* ARGSUSED */
1270int
1271zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
1272{
1273 return (0);
1274}
1275
1276/* ARGSUSED */
1277int
1278zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
1279{
1280 return (0);
1281}