]> git.proxmox.com Git - mirror_zfs.git/blob - lib/libzpool/kernel.c
Illumos #4374
[mirror_zfs.git] / lib / libzpool / kernel.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <assert.h>
26 #include <fcntl.h>
27 #include <poll.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <zlib.h>
32 #include <sys/signal.h>
33 #include <sys/spa.h>
34 #include <sys/stat.h>
35 #include <sys/processor.h>
36 #include <sys/zfs_context.h>
37 #include <sys/rrwlock.h>
38 #include <sys/utsname.h>
39 #include <sys/time.h>
40 #include <sys/systeminfo.h>
41
42 /*
43 * Emulation of kernel services in userland.
44 */
45
46 int aok;
47 uint64_t physmem;
48 vnode_t *rootdir = (vnode_t *)0xabcd1234;
49 char hw_serial[HW_HOSTID_LEN];
50
51 struct utsname utsname = {
52 "userland", "libzpool", "1", "1", "na"
53 };
54
55 /* this only exists to have its address taken */
56 struct proc p0;
57
58 /*
59 * =========================================================================
60 * threads
61 * =========================================================================
62 */
63
/*
 * Shared state for the thread emulation.  kthread_nr counts the live
 * emulated threads and is modified under kthread_lock; kthread_cond is
 * broadcast when a thread exits; kthread_key stores each thread's kthread_t.
 */
pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER;
pthread_key_t kthread_key;
int kthread_nr = 0;
68
69 static void
70 thread_init(void)
71 {
72 kthread_t *kt;
73
74 VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0);
75
76 /* Create entry for primary kthread */
77 kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL);
78 kt->t_tid = pthread_self();
79 kt->t_func = NULL;
80
81 VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);
82
83 /* Only the main thread should be running at the moment */
84 ASSERT3S(kthread_nr, ==, 0);
85 kthread_nr = 1;
86 }
87
88 static void
89 thread_fini(void)
90 {
91 kthread_t *kt = curthread;
92
93 ASSERT(pthread_equal(kt->t_tid, pthread_self()));
94 ASSERT3P(kt->t_func, ==, NULL);
95
96 umem_free(kt, sizeof (kthread_t));
97
98 /* Wait for all threads to exit via thread_exit() */
99 VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
100
101 kthread_nr--; /* Main thread is exiting */
102
103 while (kthread_nr > 0)
104 VERIFY3S(pthread_cond_wait(&kthread_cond, &kthread_lock), ==,
105 0);
106
107 ASSERT3S(kthread_nr, ==, 0);
108 VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
109
110 VERIFY3S(pthread_key_delete(kthread_key), ==, 0);
111 }
112
113 kthread_t *
114 zk_thread_current(void)
115 {
116 kthread_t *kt = pthread_getspecific(kthread_key);
117
118 ASSERT3P(kt, !=, NULL);
119
120 return (kt);
121 }
122
123 void *
124 zk_thread_helper(void *arg)
125 {
126 kthread_t *kt = (kthread_t *) arg;
127
128 VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0);
129
130 VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0);
131 kthread_nr++;
132 VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0);
133
134 kt->t_tid = pthread_self();
135 ((thread_func_arg_t) kt->t_func)(kt->t_arg);
136
137 /* Unreachable, thread must exit with thread_exit() */
138 abort();
139
140 return (NULL);
141 }
142
143 kthread_t *
144 zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg,
145 size_t len, proc_t *pp, int state, pri_t pri, int detachstate)
146 {
147 kthread_t *kt;
148 pthread_attr_t attr;
149 size_t stack;
150
151 ASSERT3S(state & ~TS_RUN, ==, 0);
152
153 kt = umem_zalloc(sizeof (kthread_t), UMEM_NOFAIL);
154 kt->t_func = func;
155 kt->t_arg = arg;
156
157 /*
158 * The Solaris kernel stack size is 24k for x86/x86_64.
159 * The Linux kernel stack size is 8k for x86/x86_64.
160 *
161 * We reduce the default stack size in userspace, to ensure
162 * we observe stack overruns in user space as well as in
163 * kernel space. In practice we can't set the userspace stack
164 * size to 8k because differences in stack usage between kernel
165 * space and userspace could lead to spurious stack overflows
166 * (especially when debugging is enabled). Nevertheless, we try
167 * to set it to the lowest value that works (currently 8k*4).
168 * PTHREAD_STACK_MIN is the minimum stack required for a NULL
169 * procedure in user space and is added in to the stack
170 * requirements.
171 *
172 * Some buggy NPTL threading implementations include the
173 * guard area within the stack size allocations. In
174 * this case we allocate an extra page to account for the
175 * guard area since we only have two pages of usable stack
176 * on Linux.
177 */
178
179 stack = PTHREAD_STACK_MIN + MAX(stksize, STACK_SIZE) * 4;
180
181 VERIFY3S(pthread_attr_init(&attr), ==, 0);
182 VERIFY3S(pthread_attr_setstacksize(&attr, stack), ==, 0);
183 VERIFY3S(pthread_attr_setguardsize(&attr, PAGESIZE), ==, 0);
184 VERIFY3S(pthread_attr_setdetachstate(&attr, detachstate), ==, 0);
185
186 VERIFY3S(pthread_create(&kt->t_tid, &attr, &zk_thread_helper, kt),
187 ==, 0);
188
189 VERIFY3S(pthread_attr_destroy(&attr), ==, 0);
190
191 return (kt);
192 }
193
194 void
195 zk_thread_exit(void)
196 {
197 kthread_t *kt = curthread;
198
199 ASSERT(pthread_equal(kt->t_tid, pthread_self()));
200
201 umem_free(kt, sizeof (kthread_t));
202
203 pthread_mutex_lock(&kthread_lock);
204 kthread_nr--;
205 pthread_mutex_unlock(&kthread_lock);
206
207 pthread_cond_broadcast(&kthread_cond);
208 pthread_exit((void *)TS_MAGIC);
209 }
210
211 void
212 zk_thread_join(kt_did_t tid)
213 {
214 void *ret;
215
216 pthread_join((pthread_t)tid, &ret);
217 VERIFY3P(ret, ==, (void *)TS_MAGIC);
218 }
219
220 /*
221 * =========================================================================
222 * kstats
223 * =========================================================================
224 */
225 /*ARGSUSED*/
226 kstat_t *
227 kstat_create(const char *module, int instance, const char *name,
228 const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
229 {
230 return (NULL);
231 }
232
233 /*ARGSUSED*/
234 void
235 kstat_install(kstat_t *ksp)
236 {}
237
238 /*ARGSUSED*/
239 void
240 kstat_delete(kstat_t *ksp)
241 {}
242
243 /*ARGSUSED*/
244 void
245 kstat_waitq_enter(kstat_io_t *kiop)
246 {}
247
248 /*ARGSUSED*/
249 void
250 kstat_waitq_exit(kstat_io_t *kiop)
251 {}
252
253 /*ARGSUSED*/
254 void
255 kstat_runq_enter(kstat_io_t *kiop)
256 {}
257
258 /*ARGSUSED*/
259 void
260 kstat_runq_exit(kstat_io_t *kiop)
261 {}
262
263 /*ARGSUSED*/
264 void
265 kstat_waitq_to_runq(kstat_io_t *kiop)
266 {}
267
268 /*ARGSUSED*/
269 void
270 kstat_runq_back_to_waitq(kstat_io_t *kiop)
271 {}
272
273 void
274 kstat_set_raw_ops(kstat_t *ksp,
275 int (*headers)(char *buf, size_t size),
276 int (*data)(char *buf, size_t size, void *data),
277 void *(*addr)(kstat_t *ksp, loff_t index))
278 {}
279
280 /*
281 * =========================================================================
282 * mutexes
283 * =========================================================================
284 */
285
286 void
287 mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
288 {
289 ASSERT3S(type, ==, MUTEX_DEFAULT);
290 ASSERT3P(cookie, ==, NULL);
291 mp->m_owner = MTX_INIT;
292 mp->m_magic = MTX_MAGIC;
293 VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0);
294 }
295
296 void
297 mutex_destroy(kmutex_t *mp)
298 {
299 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
300 ASSERT3P(mp->m_owner, ==, MTX_INIT);
301 VERIFY3S(pthread_mutex_destroy(&(mp)->m_lock), ==, 0);
302 mp->m_owner = MTX_DEST;
303 mp->m_magic = 0;
304 }
305
306 void
307 mutex_enter(kmutex_t *mp)
308 {
309 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
310 ASSERT3P(mp->m_owner, !=, MTX_DEST);
311 ASSERT3P(mp->m_owner, !=, curthread);
312 VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0);
313 ASSERT3P(mp->m_owner, ==, MTX_INIT);
314 mp->m_owner = curthread;
315 }
316
317 int
318 mutex_tryenter(kmutex_t *mp)
319 {
320 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
321 ASSERT3P(mp->m_owner, !=, MTX_DEST);
322 if (0 == pthread_mutex_trylock(&mp->m_lock)) {
323 ASSERT3P(mp->m_owner, ==, MTX_INIT);
324 mp->m_owner = curthread;
325 return (1);
326 } else {
327 return (0);
328 }
329 }
330
331 void
332 mutex_exit(kmutex_t *mp)
333 {
334 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
335 ASSERT3P(mutex_owner(mp), ==, curthread);
336 mp->m_owner = MTX_INIT;
337 VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0);
338 }
339
340 void *
341 mutex_owner(kmutex_t *mp)
342 {
343 ASSERT3U(mp->m_magic, ==, MTX_MAGIC);
344 return (mp->m_owner);
345 }
346
347 int
348 mutex_held(kmutex_t *mp)
349 {
350 return (mp->m_owner == curthread);
351 }
352
353 /*
354 * =========================================================================
355 * rwlocks
356 * =========================================================================
357 */
358
359 void
360 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
361 {
362 ASSERT3S(type, ==, RW_DEFAULT);
363 ASSERT3P(arg, ==, NULL);
364 VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0);
365 rwlp->rw_owner = RW_INIT;
366 rwlp->rw_wr_owner = RW_INIT;
367 rwlp->rw_readers = 0;
368 rwlp->rw_magic = RW_MAGIC;
369 }
370
371 void
372 rw_destroy(krwlock_t *rwlp)
373 {
374 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
375
376 VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0);
377 rwlp->rw_magic = 0;
378 }
379
380 void
381 rw_enter(krwlock_t *rwlp, krw_t rw)
382 {
383 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
384 ASSERT3P(rwlp->rw_owner, !=, curthread);
385 ASSERT3P(rwlp->rw_wr_owner, !=, curthread);
386
387 if (rw == RW_READER) {
388 VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0);
389 ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
390
391 atomic_inc_uint(&rwlp->rw_readers);
392 } else {
393 VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0);
394 ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
395 ASSERT3U(rwlp->rw_readers, ==, 0);
396
397 rwlp->rw_wr_owner = curthread;
398 }
399
400 rwlp->rw_owner = curthread;
401 }
402
403 void
404 rw_exit(krwlock_t *rwlp)
405 {
406 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
407 ASSERT(RW_LOCK_HELD(rwlp));
408
409 if (RW_READ_HELD(rwlp))
410 atomic_dec_uint(&rwlp->rw_readers);
411 else
412 rwlp->rw_wr_owner = RW_INIT;
413
414 rwlp->rw_owner = RW_INIT;
415 VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0);
416 }
417
418 int
419 rw_tryenter(krwlock_t *rwlp, krw_t rw)
420 {
421 int rv;
422
423 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
424
425 if (rw == RW_READER)
426 rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
427 else
428 rv = pthread_rwlock_trywrlock(&rwlp->rw_lock);
429
430 if (rv == 0) {
431 ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT);
432
433 if (rw == RW_READER)
434 atomic_inc_uint(&rwlp->rw_readers);
435 else {
436 ASSERT3U(rwlp->rw_readers, ==, 0);
437 rwlp->rw_wr_owner = curthread;
438 }
439
440 rwlp->rw_owner = curthread;
441 return (1);
442 }
443
444 VERIFY3S(rv, ==, EBUSY);
445
446 return (0);
447 }
448
449 int
450 rw_tryupgrade(krwlock_t *rwlp)
451 {
452 ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC);
453
454 return (0);
455 }
456
457 /*
458 * =========================================================================
459 * condition variables
460 * =========================================================================
461 */
462
463 void
464 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
465 {
466 ASSERT3S(type, ==, CV_DEFAULT);
467 cv->cv_magic = CV_MAGIC;
468 VERIFY3S(pthread_cond_init(&cv->cv, NULL), ==, 0);
469 }
470
471 void
472 cv_destroy(kcondvar_t *cv)
473 {
474 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
475 VERIFY3S(pthread_cond_destroy(&cv->cv), ==, 0);
476 cv->cv_magic = 0;
477 }
478
479 void
480 cv_wait(kcondvar_t *cv, kmutex_t *mp)
481 {
482 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
483 ASSERT3P(mutex_owner(mp), ==, curthread);
484 mp->m_owner = MTX_INIT;
485 int ret = pthread_cond_wait(&cv->cv, &mp->m_lock);
486 if (ret != 0)
487 VERIFY3S(ret, ==, EINTR);
488 mp->m_owner = curthread;
489 }
490
491 clock_t
492 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
493 {
494 int error;
495 struct timeval tv;
496 timestruc_t ts;
497 clock_t delta;
498
499 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
500
501 top:
502 delta = abstime - ddi_get_lbolt();
503 if (delta <= 0)
504 return (-1);
505
506 VERIFY(gettimeofday(&tv, NULL) == 0);
507
508 ts.tv_sec = tv.tv_sec + delta / hz;
509 ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz);
510 if (ts.tv_nsec >= NANOSEC) {
511 ts.tv_sec++;
512 ts.tv_nsec -= NANOSEC;
513 }
514
515 ASSERT3P(mutex_owner(mp), ==, curthread);
516 mp->m_owner = MTX_INIT;
517 error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
518 mp->m_owner = curthread;
519
520 if (error == ETIMEDOUT)
521 return (-1);
522
523 if (error == EINTR)
524 goto top;
525
526 VERIFY3S(error, ==, 0);
527
528 return (1);
529 }
530
531 /*ARGSUSED*/
532 clock_t
533 cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
534 int flag)
535 {
536 int error;
537 timestruc_t ts;
538 hrtime_t delta;
539
540 ASSERT(flag == 0);
541
542 top:
543 delta = tim - gethrtime();
544 if (delta <= 0)
545 return (-1);
546
547 ts.tv_sec = delta / NANOSEC;
548 ts.tv_nsec = delta % NANOSEC;
549
550 ASSERT(mutex_owner(mp) == curthread);
551 mp->m_owner = NULL;
552 error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
553 mp->m_owner = curthread;
554
555 if (error == ETIME)
556 return (-1);
557
558 if (error == EINTR)
559 goto top;
560
561 ASSERT(error == 0);
562
563 return (1);
564 }
565
566 void
567 cv_signal(kcondvar_t *cv)
568 {
569 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
570 VERIFY3S(pthread_cond_signal(&cv->cv), ==, 0);
571 }
572
573 void
574 cv_broadcast(kcondvar_t *cv)
575 {
576 ASSERT3U(cv->cv_magic, ==, CV_MAGIC);
577 VERIFY3S(pthread_cond_broadcast(&cv->cv), ==, 0);
578 }
579
580 /*
581 * =========================================================================
582 * vnode operations
583 * =========================================================================
584 */
585 /*
586 * Note: for the xxxat() versions of these functions, we assume that the
587 * starting vp is always rootdir (which is true for spa_directory.c, the only
588 * ZFS consumer of these interfaces). We assert this is true, and then emulate
589 * them by adding '/' in front of the path.
590 */
591
592 /*ARGSUSED*/
593 int
594 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3)
595 {
596 int fd;
597 vnode_t *vp;
598 int old_umask = 0;
599 char *realpath;
600 struct stat64 st;
601 int err;
602
603 realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
604
605 /*
606 * If we're accessing a real disk from userland, we need to use
607 * the character interface to avoid caching. This is particularly
608 * important if we're trying to look at a real in-kernel storage
609 * pool from userland, e.g. via zdb, because otherwise we won't
610 * see the changes occurring under the segmap cache.
611 * On the other hand, the stupid character device returns zero
612 * for its size. So -- gag -- we open the block device to get
613 * its size, and remember it for subsequent VOP_GETATTR().
614 */
615 #if defined(__sun__) || defined(__sun)
616 if (strncmp(path, "/dev/", 5) == 0) {
617 #else
618 if (0) {
619 #endif
620 char *dsk;
621 fd = open64(path, O_RDONLY);
622 if (fd == -1) {
623 err = errno;
624 free(realpath);
625 return (err);
626 }
627 if (fstat64(fd, &st) == -1) {
628 err = errno;
629 close(fd);
630 free(realpath);
631 return (err);
632 }
633 close(fd);
634 (void) sprintf(realpath, "%s", path);
635 dsk = strstr(path, "/dsk/");
636 if (dsk != NULL)
637 (void) sprintf(realpath + (dsk - path) + 1, "r%s",
638 dsk + 1);
639 } else {
640 (void) sprintf(realpath, "%s", path);
641 if (!(flags & FCREAT) && stat64(realpath, &st) == -1) {
642 err = errno;
643 free(realpath);
644 return (err);
645 }
646 }
647
648 if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) {
649 #ifdef __linux__
650 flags |= O_DIRECT;
651 #endif
652 /* We shouldn't be writing to block devices in userspace */
653 VERIFY(!(flags & FWRITE));
654 }
655
656 if (flags & FCREAT)
657 old_umask = umask(0);
658
659 /*
660 * The construct 'flags - FREAD' conveniently maps combinations of
661 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR.
662 */
663 fd = open64(realpath, flags - FREAD, mode);
664 free(realpath);
665
666 if (flags & FCREAT)
667 (void) umask(old_umask);
668
669 if (fd == -1)
670 return (errno);
671
672 if (fstat64_blk(fd, &st) == -1) {
673 err = errno;
674 close(fd);
675 return (err);
676 }
677
678 (void) fcntl(fd, F_SETFD, FD_CLOEXEC);
679
680 *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL);
681
682 vp->v_fd = fd;
683 vp->v_size = st.st_size;
684 vp->v_path = spa_strdup(path);
685
686 return (0);
687 }
688
689 /*ARGSUSED*/
690 int
691 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2,
692 int x3, vnode_t *startvp, int fd)
693 {
694 char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL);
695 int ret;
696
697 ASSERT(startvp == rootdir);
698 (void) sprintf(realpath, "/%s", path);
699
700 /* fd ignored for now, need if want to simulate nbmand support */
701 ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3);
702
703 umem_free(realpath, strlen(path) + 2);
704
705 return (ret);
706 }
707
708 /*ARGSUSED*/
709 int
710 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset,
711 int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp)
712 {
713 ssize_t rc, done = 0, split;
714
715 if (uio == UIO_READ) {
716 rc = pread64(vp->v_fd, addr, len, offset);
717 } else {
718 /*
719 * To simulate partial disk writes, we split writes into two
720 * system calls so that the process can be killed in between.
721 */
722 int sectors = len >> SPA_MINBLOCKSHIFT;
723 split = (sectors > 0 ? rand() % sectors : 0) <<
724 SPA_MINBLOCKSHIFT;
725 rc = pwrite64(vp->v_fd, addr, split, offset);
726 if (rc != -1) {
727 done = rc;
728 rc = pwrite64(vp->v_fd, (char *)addr + split,
729 len - split, offset + split);
730 }
731 }
732
733 #ifdef __linux__
734 if (rc == -1 && errno == EINVAL) {
735 /*
736 * Under Linux, this most likely means an alignment issue
737 * (memory or disk) due to O_DIRECT, so we abort() in order to
738 * catch the offender.
739 */
740 abort();
741 }
742 #endif
743 if (rc == -1)
744 return (errno);
745
746 done += rc;
747
748 if (residp)
749 *residp = len - done;
750 else if (done != len)
751 return (EIO);
752 return (0);
753 }
754
755 void
756 vn_close(vnode_t *vp)
757 {
758 close(vp->v_fd);
759 spa_strfree(vp->v_path);
760 umem_free(vp, sizeof (vnode_t));
761 }
762
763 /*
764 * At a minimum we need to update the size since vdev_reopen()
765 * will no longer call vn_openat().
766 */
767 int
768 fop_getattr(vnode_t *vp, vattr_t *vap)
769 {
770 struct stat64 st;
771 int err;
772
773 if (fstat64_blk(vp->v_fd, &st) == -1) {
774 err = errno;
775 close(vp->v_fd);
776 return (err);
777 }
778
779 vap->va_size = st.st_size;
780 return (0);
781 }
782
783 /*
784 * =========================================================================
785 * Figure out which debugging statements to print
786 * =========================================================================
787 */
788
/* Comma-separated list of debug selectors; NULL when none were given. */
static char *dprintf_string;
/* Non-zero when every debug statement should be printed. */
static int dprintf_print_all;

/*
 * Return 1 when string is one of the comma-separated entries of
 * dprintf_string, otherwise 0.
 *
 * List format: file1.c,function_name1,file2.c,file3.c
 */
int
dprintf_find_string(const char *string)
{
	int len = strlen(string);
	char *entry = dprintf_string;

	while (entry != NULL) {
		char *comma;

		/* An entry matches only if it ends right after the name. */
		if (strncmp(entry, string, len) == 0) {
			if (entry[len] == ',' || entry[len] == '\0')
				return (1);
		}

		/* Move past the next comma, or stop at the end of the list. */
		comma = strchr(entry, ',');
		entry = (comma != NULL) ? comma + 1 : NULL;
	}

	return (0);
}
813
814 void
815 dprintf_setup(int *argc, char **argv)
816 {
817 int i, j;
818
819 /*
820 * Debugging can be specified two ways: by setting the
821 * environment variable ZFS_DEBUG, or by including a
822 * "debug=..." argument on the command line. The command
823 * line setting overrides the environment variable.
824 */
825
826 for (i = 1; i < *argc; i++) {
827 int len = strlen("debug=");
828 /* First look for a command line argument */
829 if (strncmp("debug=", argv[i], len) == 0) {
830 dprintf_string = argv[i] + len;
831 /* Remove from args */
832 for (j = i; j < *argc; j++)
833 argv[j] = argv[j+1];
834 argv[j] = NULL;
835 (*argc)--;
836 }
837 }
838
839 if (dprintf_string == NULL) {
840 /* Look for ZFS_DEBUG environment variable */
841 dprintf_string = getenv("ZFS_DEBUG");
842 }
843
844 /*
845 * Are we just turning on all debugging?
846 */
847 if (dprintf_find_string("on"))
848 dprintf_print_all = 1;
849 }
850
851 /*
852 * =========================================================================
853 * debug printfs
854 * =========================================================================
855 */
856 void
857 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
858 {
859 const char *newfile;
860 va_list adx;
861
862 /*
863 * Get rid of annoying "../common/" prefix to filename.
864 */
865 newfile = strrchr(file, '/');
866 if (newfile != NULL) {
867 newfile = newfile + 1; /* Get rid of leading / */
868 } else {
869 newfile = file;
870 }
871
872 if (dprintf_print_all ||
873 dprintf_find_string(newfile) ||
874 dprintf_find_string(func)) {
875 /* Print out just the function name if requested */
876 flockfile(stdout);
877 if (dprintf_find_string("pid"))
878 (void) printf("%d ", getpid());
879 if (dprintf_find_string("tid"))
880 (void) printf("%u ", (uint_t) pthread_self());
881 if (dprintf_find_string("cpu"))
882 (void) printf("%u ", getcpuid());
883 if (dprintf_find_string("time"))
884 (void) printf("%llu ", gethrtime());
885 if (dprintf_find_string("long"))
886 (void) printf("%s, line %d: ", newfile, line);
887 (void) printf("%s: ", func);
888 va_start(adx, fmt);
889 (void) vprintf(fmt, adx);
890 va_end(adx);
891 funlockfile(stdout);
892 }
893 }
894
895 /*
896 * =========================================================================
897 * cmn_err() and panic()
898 * =========================================================================
899 */
900 static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
901 static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
902
/*
 * Print "error: " followed by the formatted message and a newline to
 * stderr, then abort the process.
 */
void
vpanic(const char *fmt, va_list adx)
{
	(void) fprintf(stderr, "error: ");
	(void) vfprintf(stderr, fmt, adx);
	(void) fprintf(stderr, "\n");

	/* think of it as a "user-level crash dump" */
	abort();
}
912
/*
 * Variadic front end for vpanic().
 */
void
panic(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vpanic(fmt, args);
	va_end(args);
}
922
923 void
924 vcmn_err(int ce, const char *fmt, va_list adx)
925 {
926 if (ce == CE_PANIC)
927 vpanic(fmt, adx);
928 if (ce != CE_NOTE) { /* suppress noise in userland stress testing */
929 (void) fprintf(stderr, "%s", ce_prefix[ce]);
930 (void) vfprintf(stderr, fmt, adx);
931 (void) fprintf(stderr, "%s", ce_suffix[ce]);
932 }
933 }
934
/*
 * Variadic front end for vcmn_err().
 */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
945
946 /*
947 * =========================================================================
948 * kobj interfaces
949 * =========================================================================
950 */
951 struct _buf *
952 kobj_open_file(char *name)
953 {
954 struct _buf *file;
955 vnode_t *vp;
956
957 /* set vp as the _fd field of the file */
958 if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir,
959 -1) != 0)
960 return ((void *)-1UL);
961
962 file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL);
963 file->_fd = (intptr_t)vp;
964 return (file);
965 }
966
967 int
968 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
969 {
970 ssize_t resid;
971
972 vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off,
973 UIO_SYSSPACE, 0, 0, 0, &resid);
974
975 return (size - resid);
976 }
977
978 void
979 kobj_close_file(struct _buf *file)
980 {
981 vn_close((vnode_t *)file->_fd);
982 umem_free(file, sizeof (struct _buf));
983 }
984
985 int
986 kobj_get_filesize(struct _buf *file, uint64_t *size)
987 {
988 struct stat64 st;
989 vnode_t *vp = (vnode_t *)file->_fd;
990
991 if (fstat64(vp->v_fd, &st) == -1) {
992 vn_close(vp);
993 return (errno);
994 }
995 *size = st.st_size;
996 return (0);
997 }
998
999 /*
1000 * =========================================================================
1001 * misc routines
1002 * =========================================================================
1003 */
1004
1005 void
1006 delay(clock_t ticks)
1007 {
1008 poll(0, 0, ticks * (1000 / hz));
1009 }
1010
/*
 * Find the highest one bit set.
 * Returns the bit number + 1 of the highest set bit, or 0 when no bit is
 * set.  For a 64-bit argument the result is in the range 0..64.
 */
int
highbit64(uint64_t i)
{
	int h = 1;
	int shift;

	if (i == 0)
		return (0);

	/* Binary search: halve the candidate range on every step. */
	for (shift = 32; shift > 0; shift >>= 1) {
		if ((i >> shift) != 0) {
			h += shift;
			i >>= shift;
		}
	}

	return (h);
}
1043
/* Descriptors for /dev/random and /dev/urandom; -1 while not open. */
static int random_fd = -1;
static int urandom_fd = -1;
1045
1046 static int
1047 random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
1048 {
1049 size_t resid = len;
1050 ssize_t bytes;
1051
1052 ASSERT(fd != -1);
1053
1054 while (resid != 0) {
1055 bytes = read(fd, ptr, resid);
1056 ASSERT3S(bytes, >=, 0);
1057 ptr += bytes;
1058 resid -= bytes;
1059 }
1060
1061 return (0);
1062 }
1063
1064 int
1065 random_get_bytes(uint8_t *ptr, size_t len)
1066 {
1067 return (random_get_bytes_common(ptr, len, random_fd));
1068 }
1069
1070 int
1071 random_get_pseudo_bytes(uint8_t *ptr, size_t len)
1072 {
1073 return (random_get_bytes_common(ptr, len, urandom_fd));
1074 }
1075
/*
 * Convert the string hw_serial to an unsigned long in the given base.
 *
 * *result receives the converted value.  If nptr is not NULL, *nptr is set
 * to the first character that was not consumed.  Returns 0 on success or
 * the errno set by strtoul() (e.g. ERANGE on overflow).
 *
 * errno is cleared before the conversion so that a legitimate result of 0
 * is not reported as a leftover error from some earlier call, and so that
 * an overflow is reported even though its result is non-zero.
 */
int
ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
{
	char *end;

	errno = 0;
	*result = strtoul(hw_serial, &end, base);
	if (nptr != NULL)
		*nptr = end;

	return (errno);
}
1086
1087 int
1088 ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
1089 {
1090 char *end;
1091
1092 *result = strtoull(str, &end, base);
1093 if (*result == 0)
1094 return (errno);
1095 return (0);
1096 }
1097
1098 /*
1099 * =========================================================================
1100 * kernel emulation setup & teardown
1101 * =========================================================================
1102 */
/*
 * Allocation-failure callback registered in kernel_init(): report the
 * condition on stderr and abort.
 */
static int
umem_out_of_memory(void)
{
	const char *errmsg = "out of memory -- generating core dump\n";

	(void) fprintf(stderr, "%s", errmsg);
	abort();

	return (0);
}
1112
1113 void
1114 kernel_init(int mode)
1115 {
1116 extern uint_t rrw_tsd_key;
1117
1118 umem_nofail_callback(umem_out_of_memory);
1119
1120 physmem = sysconf(_SC_PHYS_PAGES);
1121
1122 dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
1123 (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
1124
1125 (void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
1126 (mode & FWRITE) ? gethostid() : 0);
1127
1128 VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1);
1129 VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1);
1130
1131 thread_init();
1132 system_taskq_init();
1133
1134 spa_init(mode);
1135
1136 tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
1137 }
1138
1139 void
1140 kernel_fini(void)
1141 {
1142 spa_fini();
1143
1144 system_taskq_fini();
1145 thread_fini();
1146
1147 close(random_fd);
1148 close(urandom_fd);
1149
1150 random_fd = -1;
1151 urandom_fd = -1;
1152 }
1153
1154 uid_t
1155 crgetuid(cred_t *cr)
1156 {
1157 return (0);
1158 }
1159
1160 uid_t
1161 crgetruid(cred_t *cr)
1162 {
1163 return (0);
1164 }
1165
1166 gid_t
1167 crgetgid(cred_t *cr)
1168 {
1169 return (0);
1170 }
1171
1172 int
1173 crgetngroups(cred_t *cr)
1174 {
1175 return (0);
1176 }
1177
1178 gid_t *
1179 crgetgroups(cred_t *cr)
1180 {
1181 return (NULL);
1182 }
1183
1184 int
1185 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
1186 {
1187 return (0);
1188 }
1189
1190 int
1191 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
1192 {
1193 return (0);
1194 }
1195
1196 int
1197 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
1198 {
1199 return (0);
1200 }
1201
1202 ksiddomain_t *
1203 ksid_lookupdomain(const char *dom)
1204 {
1205 ksiddomain_t *kd;
1206
1207 kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL);
1208 kd->kd_name = spa_strdup(dom);
1209 return (kd);
1210 }
1211
1212 void
1213 ksiddomain_rele(ksiddomain_t *ksid)
1214 {
1215 spa_strfree(ksid->kd_name);
1216 umem_free(ksid, sizeof (ksiddomain_t));
1217 }
1218
/*
 * Format into a newly allocated string using vasprintf() on a copy of
 * adx, so the caller's va_list is not consumed.  Allocation failure is
 * fatal.
 */
char *
kmem_vasprintf(const char *fmt, va_list adx)
{
	va_list adx_copy;
	char *buf;

	buf = NULL;

	va_copy(adx_copy, adx);
	VERIFY(vasprintf(&buf, fmt, adx_copy) != -1);
	va_end(adx_copy);

	return (buf);
}
1231
/*
 * Variadic form: format into a newly allocated string using vasprintf().
 * Allocation failure is fatal.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list adx;
	char *buf;

	buf = NULL;

	va_start(adx, fmt);
	VERIFY(vasprintf(&buf, fmt, adx) != -1);
	va_end(adx);

	return (buf);
}
1244
1245 /* ARGSUSED */
1246 int
1247 zfs_onexit_fd_hold(int fd, minor_t *minorp)
1248 {
1249 *minorp = 0;
1250 return (0);
1251 }
1252
/* On-exit stub: nothing to release. */
/* ARGSUSED */
void
zfs_onexit_fd_rele(int fd)
{
	/* intentionally empty */
}
1258
1259 /* ARGSUSED */
1260 int
1261 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
1262 uint64_t *action_handle)
1263 {
1264 return (0);
1265 }
1266
1267 /* ARGSUSED */
1268 int
1269 zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
1270 {
1271 return (0);
1272 }
1273
1274 /* ARGSUSED */
1275 int
1276 zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
1277 {
1278 return (0);
1279 }