]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
23 | */ | |
24 | ||
25 | #include <assert.h> | |
26 | #include <fcntl.h> | |
27 | #include <poll.h> | |
28 | #include <stdio.h> | |
29 | #include <stdlib.h> | |
30 | #include <string.h> | |
31 | #include <zlib.h> | |
32 | #include <sys/signal.h> | |
33 | #include <sys/spa.h> | |
34 | #include <sys/stat.h> | |
35 | #include <sys/processor.h> | |
36 | #include <sys/zfs_context.h> | |
37 | #include <sys/utsname.h> | |
38 | #include <sys/time.h> | |
39 | #include <sys/systeminfo.h> | |
40 | ||
41 | /* | |
42 | * Emulation of kernel services in userland. | |
43 | */ | |
44 | ||
/* NOTE(review): presumably the "assertions are non-fatal" switch - confirm. */
int aok;
/* Physical memory size in pages; filled in by kernel_init(). */
uint64_t physmem;
/*
 * Placeholder root vnode.  vn_openat() only compares a start vnode against
 * this value; the address itself is an arbitrary constant.
 */
vnode_t *rootdir = (vnode_t *)0xabcd1234;
/* Host id rendered as a decimal string by kernel_init(). */
char hw_serial[HW_HOSTID_LEN];

/* Fixed system identification reported by the userland emulation. */
struct utsname utsname = {
	"userland", "libzpool", "1", "1", "na"
};

/* this only exists to have its address taken */
struct proc p0;
56 | ||
57 | /* | |
58 | * ========================================================================= | |
59 | * threads | |
60 | * ========================================================================= | |
61 | */ | |
62 | ||
/* Broadcast by zk_thread_exit(); thread_fini() waits on it. */
pthread_cond_t kthread_cond = PTHREAD_COND_INITIALIZER;
/* Protects kthread_nr. */
pthread_mutex_t kthread_lock = PTHREAD_MUTEX_INITIALIZER;
/* Thread-specific-data key holding each thread's kthread_t. */
pthread_key_t kthread_key;
/* Number of emulated kernel threads currently accounted for. */
int kthread_nr = 0;
67 | ||
68 | static void | |
69 | thread_init(void) | |
70 | { | |
71 | kthread_t *kt; | |
72 | ||
73 | VERIFY3S(pthread_key_create(&kthread_key, NULL), ==, 0); | |
74 | ||
75 | /* Create entry for primary kthread */ | |
76 | kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); | |
77 | kt->t_tid = pthread_self(); | |
78 | kt->t_func = NULL; | |
79 | ||
80 | VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); | |
81 | ||
82 | /* Only the main thread should be running at the moment */ | |
83 | ASSERT3S(kthread_nr, ==, 0); | |
84 | kthread_nr = 1; | |
85 | } | |
86 | ||
87 | static void | |
88 | thread_fini(void) | |
89 | { | |
90 | kthread_t *kt = curthread; | |
91 | ||
92 | ASSERT(pthread_equal(kt->t_tid, pthread_self())); | |
93 | ASSERT3P(kt->t_func, ==, NULL); | |
94 | ||
95 | umem_free(kt, sizeof(kthread_t)); | |
96 | ||
97 | /* Wait for all threads to exit via thread_exit() */ | |
98 | VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); | |
99 | ||
100 | kthread_nr--; /* Main thread is exiting */ | |
101 | ||
102 | while (kthread_nr > 0) | |
103 | VERIFY3S(pthread_cond_wait(&kthread_cond, &kthread_lock), ==, | |
104 | 0); | |
105 | ||
106 | ASSERT3S(kthread_nr, ==, 0); | |
107 | VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); | |
108 | ||
109 | VERIFY3S(pthread_key_delete(kthread_key), ==, 0); | |
110 | } | |
111 | ||
112 | kthread_t * | |
113 | zk_thread_current(void) | |
114 | { | |
115 | kthread_t *kt = pthread_getspecific(kthread_key); | |
116 | ||
117 | ASSERT3P(kt, !=, NULL); | |
118 | ||
119 | return kt; | |
120 | } | |
121 | ||
122 | void * | |
123 | zk_thread_helper(void *arg) | |
124 | { | |
125 | kthread_t *kt = (kthread_t *) arg; | |
126 | ||
127 | VERIFY3S(pthread_setspecific(kthread_key, kt), ==, 0); | |
128 | ||
129 | VERIFY3S(pthread_mutex_lock(&kthread_lock), ==, 0); | |
130 | kthread_nr++; | |
131 | VERIFY3S(pthread_mutex_unlock(&kthread_lock), ==, 0); | |
132 | ||
133 | kt->t_tid = pthread_self(); | |
134 | ((thread_func_arg_t) kt->t_func)(kt->t_arg); | |
135 | ||
136 | /* Unreachable, thread must exit with thread_exit() */ | |
137 | abort(); | |
138 | ||
139 | return NULL; | |
140 | } | |
141 | ||
142 | kthread_t * | |
143 | zk_thread_create(caddr_t stk, size_t stksize, thread_func_t func, void *arg, | |
144 | size_t len, proc_t *pp, int state, pri_t pri, int detachstate) | |
145 | { | |
146 | kthread_t *kt; | |
147 | pthread_attr_t attr; | |
148 | size_t stack; | |
149 | ||
150 | ASSERT3S(state & ~TS_RUN, ==, 0); | |
151 | ||
152 | kt = umem_zalloc(sizeof(kthread_t), UMEM_NOFAIL); | |
153 | kt->t_func = func; | |
154 | kt->t_arg = arg; | |
155 | ||
156 | /* | |
157 | * The Solaris kernel stack size is 24k for x86/x86_64. | |
158 | * The Linux kernel stack size is 8k for x86/x86_64. | |
159 | * | |
160 | * We reduce the default stack size in userspace, to ensure | |
161 | * we observe stack overruns in user space as well as in | |
162 | * kernel space. In practice we can't set the userspace stack | |
163 | * size to 8k because differences in stack usage between kernel | |
164 | * space and userspace could lead to spurious stack overflows | |
165 | * (especially when debugging is enabled). Nevertheless, we try | |
166 | * to set it to the lowest value that works (currently 8k*4). | |
167 | * PTHREAD_STACK_MIN is the minimum stack required for a NULL | |
168 | * procedure in user space and is added in to the stack | |
169 | * requirements. | |
170 | * | |
171 | * Some buggy NPTL threading implementations include the | |
172 | * guard area within the stack size allocations. In | |
173 | * this case we allocate an extra page to account for the | |
174 | * guard area since we only have two pages of usable stack | |
175 | * on Linux. | |
176 | */ | |
177 | ||
178 | stack = PTHREAD_STACK_MIN + MAX(stksize, STACK_SIZE) * 4 + | |
179 | EXTRA_GUARD_BYTES; | |
180 | ||
181 | VERIFY3S(pthread_attr_init(&attr), ==, 0); | |
182 | VERIFY3S(pthread_attr_setstacksize(&attr, stack), ==, 0); | |
183 | VERIFY3S(pthread_attr_setguardsize(&attr, PAGESIZE), ==, 0); | |
184 | VERIFY3S(pthread_attr_setdetachstate(&attr, detachstate), ==, 0); | |
185 | ||
186 | VERIFY3S(pthread_create(&kt->t_tid, &attr, &zk_thread_helper, kt), | |
187 | ==, 0); | |
188 | ||
189 | VERIFY3S(pthread_attr_destroy(&attr), ==, 0); | |
190 | ||
191 | return kt; | |
192 | } | |
193 | ||
194 | void | |
195 | zk_thread_exit(void) | |
196 | { | |
197 | kthread_t *kt = curthread; | |
198 | ||
199 | ASSERT(pthread_equal(kt->t_tid, pthread_self())); | |
200 | ||
201 | umem_free(kt, sizeof(kthread_t)); | |
202 | ||
203 | pthread_mutex_lock(&kthread_lock); | |
204 | kthread_nr--; | |
205 | pthread_mutex_unlock(&kthread_lock); | |
206 | ||
207 | pthread_cond_broadcast(&kthread_cond); | |
208 | pthread_exit((void *)TS_MAGIC); | |
209 | } | |
210 | ||
211 | void | |
212 | zk_thread_join(kt_did_t tid) | |
213 | { | |
214 | void *ret; | |
215 | ||
216 | pthread_join((pthread_t)tid, &ret); | |
217 | VERIFY3P(ret, ==, (void *)TS_MAGIC); | |
218 | } | |
219 | ||
220 | /* | |
221 | * ========================================================================= | |
222 | * kstats | |
223 | * ========================================================================= | |
224 | */ | |
225 | /*ARGSUSED*/ | |
226 | kstat_t * | |
227 | kstat_create(char *module, int instance, char *name, char *class, | |
228 | uchar_t type, ulong_t ndata, uchar_t ks_flag) | |
229 | { | |
230 | return (NULL); | |
231 | } | |
232 | ||
233 | /*ARGSUSED*/ | |
234 | void | |
235 | kstat_install(kstat_t *ksp) | |
236 | {} | |
237 | ||
238 | /*ARGSUSED*/ | |
239 | void | |
240 | kstat_delete(kstat_t *ksp) | |
241 | {} | |
242 | ||
243 | /* | |
244 | * ========================================================================= | |
245 | * mutexes | |
246 | * ========================================================================= | |
247 | */ | |
248 | ||
249 | void | |
250 | mutex_init(kmutex_t *mp, char *name, int type, void *cookie) | |
251 | { | |
252 | ASSERT3S(type, ==, MUTEX_DEFAULT); | |
253 | ASSERT3P(cookie, ==, NULL); | |
254 | mp->m_owner = MTX_INIT; | |
255 | mp->m_magic = MTX_MAGIC; | |
256 | VERIFY3S(pthread_mutex_init(&mp->m_lock, NULL), ==, 0); | |
257 | } | |
258 | ||
259 | void | |
260 | mutex_destroy(kmutex_t *mp) | |
261 | { | |
262 | ASSERT3U(mp->m_magic, ==, MTX_MAGIC); | |
263 | ASSERT3P(mp->m_owner, ==, MTX_INIT); | |
264 | VERIFY3S(pthread_mutex_destroy(&(mp)->m_lock), ==, 0); | |
265 | mp->m_owner = MTX_DEST; | |
266 | mp->m_magic = 0; | |
267 | } | |
268 | ||
269 | void | |
270 | mutex_enter(kmutex_t *mp) | |
271 | { | |
272 | ASSERT3U(mp->m_magic, ==, MTX_MAGIC); | |
273 | ASSERT3P(mp->m_owner, !=, MTX_DEST); | |
274 | ASSERT3P(mp->m_owner, !=, curthread); | |
275 | VERIFY3S(pthread_mutex_lock(&mp->m_lock), ==, 0); | |
276 | ASSERT3P(mp->m_owner, ==, MTX_INIT); | |
277 | mp->m_owner = curthread; | |
278 | } | |
279 | ||
280 | int | |
281 | mutex_tryenter(kmutex_t *mp) | |
282 | { | |
283 | ASSERT3U(mp->m_magic, ==, MTX_MAGIC); | |
284 | ASSERT3P(mp->m_owner, !=, MTX_DEST); | |
285 | if (0 == pthread_mutex_trylock(&mp->m_lock)) { | |
286 | ASSERT3P(mp->m_owner, ==, MTX_INIT); | |
287 | mp->m_owner = curthread; | |
288 | return (1); | |
289 | } else { | |
290 | return (0); | |
291 | } | |
292 | } | |
293 | ||
294 | void | |
295 | mutex_exit(kmutex_t *mp) | |
296 | { | |
297 | ASSERT3U(mp->m_magic, ==, MTX_MAGIC); | |
298 | ASSERT3P(mutex_owner(mp), ==, curthread); | |
299 | mp->m_owner = MTX_INIT; | |
300 | VERIFY3S(pthread_mutex_unlock(&mp->m_lock), ==, 0); | |
301 | } | |
302 | ||
303 | void * | |
304 | mutex_owner(kmutex_t *mp) | |
305 | { | |
306 | ASSERT3U(mp->m_magic, ==, MTX_MAGIC); | |
307 | return (mp->m_owner); | |
308 | } | |
309 | ||
310 | int | |
311 | mutex_held(kmutex_t *mp) | |
312 | { | |
313 | return (mp->m_owner == curthread); | |
314 | } | |
315 | ||
316 | /* | |
317 | * ========================================================================= | |
318 | * rwlocks | |
319 | * ========================================================================= | |
320 | */ | |
321 | ||
322 | void | |
323 | rw_init(krwlock_t *rwlp, char *name, int type, void *arg) | |
324 | { | |
325 | ASSERT3S(type, ==, RW_DEFAULT); | |
326 | ASSERT3P(arg, ==, NULL); | |
327 | VERIFY3S(pthread_rwlock_init(&rwlp->rw_lock, NULL), ==, 0); | |
328 | rwlp->rw_owner = RW_INIT; | |
329 | rwlp->rw_wr_owner = RW_INIT; | |
330 | rwlp->rw_readers = 0; | |
331 | rwlp->rw_magic = RW_MAGIC; | |
332 | } | |
333 | ||
334 | void | |
335 | rw_destroy(krwlock_t *rwlp) | |
336 | { | |
337 | ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); | |
338 | ||
339 | VERIFY3S(pthread_rwlock_destroy(&rwlp->rw_lock), ==, 0); | |
340 | rwlp->rw_magic = 0; | |
341 | } | |
342 | ||
343 | void | |
344 | rw_enter(krwlock_t *rwlp, krw_t rw) | |
345 | { | |
346 | ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); | |
347 | ASSERT3P(rwlp->rw_owner, !=, curthread); | |
348 | ASSERT3P(rwlp->rw_wr_owner, !=, curthread); | |
349 | ||
350 | if (rw == RW_READER) { | |
351 | VERIFY3S(pthread_rwlock_rdlock(&rwlp->rw_lock), ==, 0); | |
352 | ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT); | |
353 | ||
354 | atomic_inc_uint(&rwlp->rw_readers); | |
355 | } else { | |
356 | VERIFY3S(pthread_rwlock_wrlock(&rwlp->rw_lock), ==, 0); | |
357 | ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT); | |
358 | ASSERT3U(rwlp->rw_readers, ==, 0); | |
359 | ||
360 | rwlp->rw_wr_owner = curthread; | |
361 | } | |
362 | ||
363 | rwlp->rw_owner = curthread; | |
364 | } | |
365 | ||
366 | void | |
367 | rw_exit(krwlock_t *rwlp) | |
368 | { | |
369 | ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); | |
370 | ASSERT(RW_LOCK_HELD(rwlp)); | |
371 | ||
372 | if (RW_READ_HELD(rwlp)) | |
373 | atomic_dec_uint(&rwlp->rw_readers); | |
374 | else | |
375 | rwlp->rw_wr_owner = RW_INIT; | |
376 | ||
377 | rwlp->rw_owner = RW_INIT; | |
378 | VERIFY3S(pthread_rwlock_unlock(&rwlp->rw_lock), ==, 0); | |
379 | } | |
380 | ||
381 | int | |
382 | rw_tryenter(krwlock_t *rwlp, krw_t rw) | |
383 | { | |
384 | int rv; | |
385 | ||
386 | ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); | |
387 | ||
388 | if (rw == RW_READER) | |
389 | rv = pthread_rwlock_tryrdlock(&rwlp->rw_lock); | |
390 | else | |
391 | rv = pthread_rwlock_trywrlock(&rwlp->rw_lock); | |
392 | ||
393 | if (rv == 0) { | |
394 | ASSERT3P(rwlp->rw_wr_owner, ==, RW_INIT); | |
395 | ||
396 | if (rw == RW_READER) | |
397 | atomic_inc_uint(&rwlp->rw_readers); | |
398 | else { | |
399 | ASSERT3U(rwlp->rw_readers, ==, 0); | |
400 | rwlp->rw_wr_owner = curthread; | |
401 | } | |
402 | ||
403 | rwlp->rw_owner = curthread; | |
404 | return (1); | |
405 | } | |
406 | ||
407 | VERIFY3S(rv, ==, EBUSY); | |
408 | ||
409 | return (0); | |
410 | } | |
411 | ||
412 | int | |
413 | rw_tryupgrade(krwlock_t *rwlp) | |
414 | { | |
415 | ASSERT3U(rwlp->rw_magic, ==, RW_MAGIC); | |
416 | ||
417 | return (0); | |
418 | } | |
419 | ||
420 | /* | |
421 | * ========================================================================= | |
422 | * condition variables | |
423 | * ========================================================================= | |
424 | */ | |
425 | ||
426 | void | |
427 | cv_init(kcondvar_t *cv, char *name, int type, void *arg) | |
428 | { | |
429 | ASSERT3S(type, ==, CV_DEFAULT); | |
430 | cv->cv_magic = CV_MAGIC; | |
431 | VERIFY3S(pthread_cond_init(&cv->cv, NULL), ==, 0); | |
432 | } | |
433 | ||
434 | void | |
435 | cv_destroy(kcondvar_t *cv) | |
436 | { | |
437 | ASSERT3U(cv->cv_magic, ==, CV_MAGIC); | |
438 | VERIFY3S(pthread_cond_destroy(&cv->cv), ==, 0); | |
439 | cv->cv_magic = 0; | |
440 | } | |
441 | ||
442 | void | |
443 | cv_wait(kcondvar_t *cv, kmutex_t *mp) | |
444 | { | |
445 | ASSERT3U(cv->cv_magic, ==, CV_MAGIC); | |
446 | ASSERT3P(mutex_owner(mp), ==, curthread); | |
447 | mp->m_owner = MTX_INIT; | |
448 | int ret = pthread_cond_wait(&cv->cv, &mp->m_lock); | |
449 | if (ret != 0) | |
450 | VERIFY3S(ret, ==, EINTR); | |
451 | mp->m_owner = curthread; | |
452 | } | |
453 | ||
454 | clock_t | |
455 | cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) | |
456 | { | |
457 | int error; | |
458 | struct timeval tv; | |
459 | timestruc_t ts; | |
460 | clock_t delta; | |
461 | ||
462 | ASSERT3U(cv->cv_magic, ==, CV_MAGIC); | |
463 | ||
464 | top: | |
465 | delta = abstime - ddi_get_lbolt(); | |
466 | if (delta <= 0) | |
467 | return (-1); | |
468 | ||
469 | VERIFY(gettimeofday(&tv, NULL) == 0); | |
470 | ||
471 | ts.tv_sec = tv.tv_sec + delta / hz; | |
472 | ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); | |
473 | if (ts.tv_nsec >= NANOSEC) { | |
474 | ts.tv_sec++; | |
475 | ts.tv_nsec -= NANOSEC; | |
476 | } | |
477 | ||
478 | ASSERT3P(mutex_owner(mp), ==, curthread); | |
479 | mp->m_owner = MTX_INIT; | |
480 | error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts); | |
481 | mp->m_owner = curthread; | |
482 | ||
483 | if (error == ETIMEDOUT) | |
484 | return (-1); | |
485 | ||
486 | if (error == EINTR) | |
487 | goto top; | |
488 | ||
489 | VERIFY3S(error, ==, 0); | |
490 | ||
491 | return (1); | |
492 | } | |
493 | ||
494 | void | |
495 | cv_signal(kcondvar_t *cv) | |
496 | { | |
497 | ASSERT3U(cv->cv_magic, ==, CV_MAGIC); | |
498 | VERIFY3S(pthread_cond_signal(&cv->cv), ==, 0); | |
499 | } | |
500 | ||
501 | void | |
502 | cv_broadcast(kcondvar_t *cv) | |
503 | { | |
504 | ASSERT3U(cv->cv_magic, ==, CV_MAGIC); | |
505 | VERIFY3S(pthread_cond_broadcast(&cv->cv), ==, 0); | |
506 | } | |
507 | ||
508 | /* | |
509 | * ========================================================================= | |
510 | * vnode operations | |
511 | * ========================================================================= | |
512 | */ | |
513 | /* | |
514 | * Note: for the xxxat() versions of these functions, we assume that the | |
515 | * starting vp is always rootdir (which is true for spa_directory.c, the only | |
516 | * ZFS consumer of these interfaces). We assert this is true, and then emulate | |
517 | * them by adding '/' in front of the path. | |
518 | */ | |
519 | ||
520 | /*ARGSUSED*/ | |
521 | int | |
522 | vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) | |
523 | { | |
524 | int fd; | |
525 | vnode_t *vp; | |
526 | int old_umask; | |
527 | char *realpath; | |
528 | struct stat64 st; | |
529 | int err; | |
530 | ||
531 | realpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); | |
532 | ||
533 | /* | |
534 | * If we're accessing a real disk from userland, we need to use | |
535 | * the character interface to avoid caching. This is particularly | |
536 | * important if we're trying to look at a real in-kernel storage | |
537 | * pool from userland, e.g. via zdb, because otherwise we won't | |
538 | * see the changes occurring under the segmap cache. | |
539 | * On the other hand, the stupid character device returns zero | |
540 | * for its size. So -- gag -- we open the block device to get | |
541 | * its size, and remember it for subsequent VOP_GETATTR(). | |
542 | */ | |
543 | #if defined(__sun__) || defined(__sun) | |
544 | if (strncmp(path, "/dev/", 5) == 0) { | |
545 | #else | |
546 | if (0) { | |
547 | #endif | |
548 | char *dsk; | |
549 | fd = open64(path, O_RDONLY); | |
550 | if (fd == -1) { | |
551 | err = errno; | |
552 | free(realpath); | |
553 | return (err); | |
554 | } | |
555 | if (fstat64(fd, &st) == -1) { | |
556 | err = errno; | |
557 | close(fd); | |
558 | free(realpath); | |
559 | return (err); | |
560 | } | |
561 | close(fd); | |
562 | (void) sprintf(realpath, "%s", path); | |
563 | dsk = strstr(path, "/dsk/"); | |
564 | if (dsk != NULL) | |
565 | (void) sprintf(realpath + (dsk - path) + 1, "r%s", | |
566 | dsk + 1); | |
567 | } else { | |
568 | (void) sprintf(realpath, "%s", path); | |
569 | if (!(flags & FCREAT) && stat64(realpath, &st) == -1) { | |
570 | err = errno; | |
571 | free(realpath); | |
572 | return (err); | |
573 | } | |
574 | } | |
575 | ||
576 | if (!(flags & FCREAT) && S_ISBLK(st.st_mode)) { | |
577 | #ifdef __linux__ | |
578 | flags |= O_DIRECT; | |
579 | #endif | |
580 | /* We shouldn't be writing to block devices in userspace */ | |
581 | VERIFY(!(flags & FWRITE)); | |
582 | } | |
583 | ||
584 | if (flags & FCREAT) | |
585 | old_umask = umask(0); | |
586 | ||
587 | /* | |
588 | * The construct 'flags - FREAD' conveniently maps combinations of | |
589 | * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. | |
590 | */ | |
591 | fd = open64(realpath, flags - FREAD, mode); | |
592 | free(realpath); | |
593 | ||
594 | if (flags & FCREAT) | |
595 | (void) umask(old_umask); | |
596 | ||
597 | if (fd == -1) | |
598 | return (errno); | |
599 | ||
600 | if (fstat64_blk(fd, &st) == -1) { | |
601 | err = errno; | |
602 | close(fd); | |
603 | return (err); | |
604 | } | |
605 | ||
606 | (void) fcntl(fd, F_SETFD, FD_CLOEXEC); | |
607 | ||
608 | *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); | |
609 | ||
610 | vp->v_fd = fd; | |
611 | vp->v_size = st.st_size; | |
612 | vp->v_path = spa_strdup(path); | |
613 | ||
614 | return (0); | |
615 | } | |
616 | ||
617 | /*ARGSUSED*/ | |
618 | int | |
619 | vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, | |
620 | int x3, vnode_t *startvp, int fd) | |
621 | { | |
622 | char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); | |
623 | int ret; | |
624 | ||
625 | ASSERT(startvp == rootdir); | |
626 | (void) sprintf(realpath, "/%s", path); | |
627 | ||
628 | /* fd ignored for now, need if want to simulate nbmand support */ | |
629 | ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); | |
630 | ||
631 | umem_free(realpath, strlen(path) + 2); | |
632 | ||
633 | return (ret); | |
634 | } | |
635 | ||
636 | /*ARGSUSED*/ | |
637 | int | |
638 | vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, | |
639 | int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) | |
640 | { | |
641 | ssize_t rc, done = 0, split; | |
642 | ||
643 | if (uio == UIO_READ) { | |
644 | rc = pread64(vp->v_fd, addr, len, offset); | |
645 | } else { | |
646 | /* | |
647 | * To simulate partial disk writes, we split writes into two | |
648 | * system calls so that the process can be killed in between. | |
649 | */ | |
650 | int sectors = len >> SPA_MINBLOCKSHIFT; | |
651 | split = (sectors > 0 ? rand() % sectors : 0) << | |
652 | SPA_MINBLOCKSHIFT; | |
653 | rc = pwrite64(vp->v_fd, addr, split, offset); | |
654 | if (rc != -1) { | |
655 | done = rc; | |
656 | rc = pwrite64(vp->v_fd, (char *)addr + split, | |
657 | len - split, offset + split); | |
658 | } | |
659 | } | |
660 | ||
661 | #ifdef __linux__ | |
662 | if (rc == -1 && errno == EINVAL) { | |
663 | /* | |
664 | * Under Linux, this most likely means an alignment issue | |
665 | * (memory or disk) due to O_DIRECT, so we abort() in order to | |
666 | * catch the offender. | |
667 | */ | |
668 | abort(); | |
669 | } | |
670 | #endif | |
671 | if (rc == -1) | |
672 | return (errno); | |
673 | ||
674 | done += rc; | |
675 | ||
676 | if (residp) | |
677 | *residp = len - done; | |
678 | else if (done != len) | |
679 | return (EIO); | |
680 | return (0); | |
681 | } | |
682 | ||
683 | void | |
684 | vn_close(vnode_t *vp) | |
685 | { | |
686 | close(vp->v_fd); | |
687 | spa_strfree(vp->v_path); | |
688 | umem_free(vp, sizeof (vnode_t)); | |
689 | } | |
690 | ||
691 | /* | |
692 | * At a minimum we need to update the size since vdev_reopen() | |
693 | * will no longer call vn_openat(). | |
694 | */ | |
695 | int | |
696 | fop_getattr(vnode_t *vp, vattr_t *vap) | |
697 | { | |
698 | struct stat64 st; | |
699 | int err; | |
700 | ||
701 | if (fstat64_blk(vp->v_fd, &st) == -1) { | |
702 | err = errno; | |
703 | close(vp->v_fd); | |
704 | return (err); | |
705 | } | |
706 | ||
707 | vap->va_size = st.st_size; | |
708 | return (0); | |
709 | } | |
710 | ||
711 | /* | |
712 | * ========================================================================= | |
713 | * Figure out which debugging statements to print | |
714 | * ========================================================================= | |
715 | */ | |
716 | ||
/*
 * Comma-separated list of debug selectors, taken from a "debug=" argument
 * or the ZFS_DEBUG environment variable by dprintf_setup(); NULL if unset.
 */
static char *dprintf_string;
/* Set by dprintf_setup() when the selector list contains "on". */
static int dprintf_print_all;
719 | ||
720 | int | |
721 | dprintf_find_string(const char *string) | |
722 | { | |
723 | char *tmp_str = dprintf_string; | |
724 | int len = strlen(string); | |
725 | ||
726 | /* | |
727 | * Find out if this is a string we want to print. | |
728 | * String format: file1.c,function_name1,file2.c,file3.c | |
729 | */ | |
730 | ||
731 | while (tmp_str != NULL) { | |
732 | if (strncmp(tmp_str, string, len) == 0 && | |
733 | (tmp_str[len] == ',' || tmp_str[len] == '\0')) | |
734 | return (1); | |
735 | tmp_str = strchr(tmp_str, ','); | |
736 | if (tmp_str != NULL) | |
737 | tmp_str++; /* Get rid of , */ | |
738 | } | |
739 | return (0); | |
740 | } | |
741 | ||
742 | void | |
743 | dprintf_setup(int *argc, char **argv) | |
744 | { | |
745 | int i, j; | |
746 | ||
747 | /* | |
748 | * Debugging can be specified two ways: by setting the | |
749 | * environment variable ZFS_DEBUG, or by including a | |
750 | * "debug=..." argument on the command line. The command | |
751 | * line setting overrides the environment variable. | |
752 | */ | |
753 | ||
754 | for (i = 1; i < *argc; i++) { | |
755 | int len = strlen("debug="); | |
756 | /* First look for a command line argument */ | |
757 | if (strncmp("debug=", argv[i], len) == 0) { | |
758 | dprintf_string = argv[i] + len; | |
759 | /* Remove from args */ | |
760 | for (j = i; j < *argc; j++) | |
761 | argv[j] = argv[j+1]; | |
762 | argv[j] = NULL; | |
763 | (*argc)--; | |
764 | } | |
765 | } | |
766 | ||
767 | if (dprintf_string == NULL) { | |
768 | /* Look for ZFS_DEBUG environment variable */ | |
769 | dprintf_string = getenv("ZFS_DEBUG"); | |
770 | } | |
771 | ||
772 | /* | |
773 | * Are we just turning on all debugging? | |
774 | */ | |
775 | if (dprintf_find_string("on")) | |
776 | dprintf_print_all = 1; | |
777 | } | |
778 | ||
779 | /* | |
780 | * ========================================================================= | |
781 | * debug printfs | |
782 | * ========================================================================= | |
783 | */ | |
784 | void | |
785 | __dprintf(const char *file, const char *func, int line, const char *fmt, ...) | |
786 | { | |
787 | const char *newfile; | |
788 | va_list adx; | |
789 | ||
790 | /* | |
791 | * Get rid of annoying "../common/" prefix to filename. | |
792 | */ | |
793 | newfile = strrchr(file, '/'); | |
794 | if (newfile != NULL) { | |
795 | newfile = newfile + 1; /* Get rid of leading / */ | |
796 | } else { | |
797 | newfile = file; | |
798 | } | |
799 | ||
800 | if (dprintf_print_all || | |
801 | dprintf_find_string(newfile) || | |
802 | dprintf_find_string(func)) { | |
803 | /* Print out just the function name if requested */ | |
804 | flockfile(stdout); | |
805 | if (dprintf_find_string("pid")) | |
806 | (void) printf("%d ", getpid()); | |
807 | if (dprintf_find_string("tid")) | |
808 | (void) printf("%u ", (uint_t) pthread_self()); | |
809 | if (dprintf_find_string("cpu")) | |
810 | (void) printf("%u ", getcpuid()); | |
811 | if (dprintf_find_string("time")) | |
812 | (void) printf("%llu ", gethrtime()); | |
813 | if (dprintf_find_string("long")) | |
814 | (void) printf("%s, line %d: ", newfile, line); | |
815 | (void) printf("%s: ", func); | |
816 | va_start(adx, fmt); | |
817 | (void) vprintf(fmt, adx); | |
818 | va_end(adx); | |
819 | funlockfile(stdout); | |
820 | } | |
821 | } | |
822 | ||
823 | /* | |
824 | * ========================================================================= | |
825 | * cmn_err() and panic() | |
826 | * ========================================================================= | |
827 | */ | |
/*
 * Text printed before and after a message by vcmn_err(), indexed by the
 * severity value passed as ce.  Both tables have CE_IGNORE rows.
 */
static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
830 | ||
/*
 * Print "error: " followed by the formatted message and a newline on
 * stderr, then abort the process.
 */
void
vpanic(const char *fmt, va_list adx)
{
	(void) fputs("error: ", stderr);
	(void) vfprintf(stderr, fmt, adx);
	(void) fputc('\n', stderr);

	/* think of it as a "user-level crash dump" */
	abort();
}
840 | ||
/* Variadic front end to vpanic(): format the message and abort. */
void
panic(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vpanic(fmt, args);
	va_end(args);
}
850 | ||
851 | void | |
852 | vcmn_err(int ce, const char *fmt, va_list adx) | |
853 | { | |
854 | if (ce == CE_PANIC) | |
855 | vpanic(fmt, adx); | |
856 | if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ | |
857 | (void) fprintf(stderr, "%s", ce_prefix[ce]); | |
858 | (void) vfprintf(stderr, fmt, adx); | |
859 | (void) fprintf(stderr, "%s", ce_suffix[ce]); | |
860 | } | |
861 | } | |
862 | ||
/* Variadic front end to vcmn_err(). */
/*PRINTFLIKE2*/
void
cmn_err(int ce, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vcmn_err(ce, fmt, args);
	va_end(args);
}
873 | ||
874 | /* | |
875 | * ========================================================================= | |
876 | * kobj interfaces | |
877 | * ========================================================================= | |
878 | */ | |
879 | struct _buf * | |
880 | kobj_open_file(char *name) | |
881 | { | |
882 | struct _buf *file; | |
883 | vnode_t *vp; | |
884 | ||
885 | /* set vp as the _fd field of the file */ | |
886 | if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, | |
887 | -1) != 0) | |
888 | return ((void *)-1UL); | |
889 | ||
890 | file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); | |
891 | file->_fd = (intptr_t)vp; | |
892 | return (file); | |
893 | } | |
894 | ||
895 | int | |
896 | kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) | |
897 | { | |
898 | ssize_t resid; | |
899 | ||
900 | vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, | |
901 | UIO_SYSSPACE, 0, 0, 0, &resid); | |
902 | ||
903 | return (size - resid); | |
904 | } | |
905 | ||
906 | void | |
907 | kobj_close_file(struct _buf *file) | |
908 | { | |
909 | vn_close((vnode_t *)file->_fd); | |
910 | umem_free(file, sizeof (struct _buf)); | |
911 | } | |
912 | ||
913 | int | |
914 | kobj_get_filesize(struct _buf *file, uint64_t *size) | |
915 | { | |
916 | struct stat64 st; | |
917 | vnode_t *vp = (vnode_t *)file->_fd; | |
918 | ||
919 | if (fstat64(vp->v_fd, &st) == -1) { | |
920 | vn_close(vp); | |
921 | return (errno); | |
922 | } | |
923 | *size = st.st_size; | |
924 | return (0); | |
925 | } | |
926 | ||
927 | /* | |
928 | * ========================================================================= | |
929 | * misc routines | |
930 | * ========================================================================= | |
931 | */ | |
932 | ||
933 | void | |
934 | delay(clock_t ticks) | |
935 | { | |
936 | poll(0, 0, ticks * (1000 / hz)); | |
937 | } | |
938 | ||
/*
 * Find highest one bit set.
 * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
 * High order bit is 31 (or 63 in _LP64 kernel).
 *
 * Implemented as a binary search: at each step, if anything is set above
 * the lower `width` bits, that width is added to the result and the value
 * is shifted down by it.
 */
int
highbit(ulong_t i)
{
	int pos = 1;
	int width;

	if (i == 0)
		return (0);
#ifdef _LP64
	if ((i >> 32) != 0) {
		pos += 32;
		i >>= 32;
	}
#endif
	for (width = 16; width >= 1; width >>= 1) {
		if ((i >> width) != 0) {
			pos += width;
			i >>= width;
		}
	}
	return (pos);
}
973 | ||
/*
 * Descriptors for /dev/random and /dev/urandom; opened by kernel_init(),
 * closed and reset to -1 by kernel_fini().
 */
static int random_fd = -1, urandom_fd = -1;
975 | ||
976 | static int | |
977 | random_get_bytes_common(uint8_t *ptr, size_t len, int fd) | |
978 | { | |
979 | size_t resid = len; | |
980 | ssize_t bytes; | |
981 | ||
982 | ASSERT(fd != -1); | |
983 | ||
984 | while (resid != 0) { | |
985 | bytes = read(fd, ptr, resid); | |
986 | ASSERT3S(bytes, >=, 0); | |
987 | ptr += bytes; | |
988 | resid -= bytes; | |
989 | } | |
990 | ||
991 | return (0); | |
992 | } | |
993 | ||
994 | int | |
995 | random_get_bytes(uint8_t *ptr, size_t len) | |
996 | { | |
997 | return (random_get_bytes_common(ptr, len, random_fd)); | |
998 | } | |
999 | ||
1000 | int | |
1001 | random_get_pseudo_bytes(uint8_t *ptr, size_t len) | |
1002 | { | |
1003 | return (random_get_bytes_common(ptr, len, urandom_fd)); | |
1004 | } | |
1005 | ||
/*
 * Convert the string hw_serial to an unsigned long in the given base.
 *
 * The converted value is stored in *result.  When nptr is non-NULL it
 * receives a pointer to the first character that was not converted.
 * Returns 0 on success or the errno set by strtoul() (e.g. ERANGE on
 * overflow).
 *
 * errno is cleared before the conversion so that a legitimate result of
 * 0 is not reported as whatever error an earlier call left behind.
 */
int
ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
{
	char *end;

	errno = 0;
	*result = strtoul(hw_serial, &end, base);
	if (nptr != NULL)
		*nptr = end;

	return (errno);
}
1016 | ||
/*
 * Convert the string str to an unsigned long long in the given base.
 *
 * The converted value is stored in *result.  When nptr is non-NULL it
 * receives a pointer to the first character that was not converted.
 * Returns 0 on success or the errno set by strtoull() (e.g. ERANGE on
 * overflow).
 *
 * errno is cleared before the conversion so that a legitimate result of
 * 0 is not reported as whatever error an earlier call left behind.
 */
int
ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
{
	char *end;

	errno = 0;
	*result = strtoull(str, &end, base);
	if (nptr != NULL)
		*nptr = end;

	return (errno);
}
1027 | ||
1028 | /* | |
1029 | * ========================================================================= | |
1030 | * kernel emulation setup & teardown | |
1031 | * ========================================================================= | |
1032 | */ | |
/*
 * Allocation-failure callback registered by kernel_init(): report the
 * condition on stderr and abort so a core dump is produced.
 */
static int
umem_out_of_memory(void)
{
	(void) fputs("out of memory -- generating core dump\n", stderr);
	abort();
	return (0);
}
1042 | ||
1043 | void | |
1044 | kernel_init(int mode) | |
1045 | { | |
1046 | umem_nofail_callback(umem_out_of_memory); | |
1047 | ||
1048 | physmem = sysconf(_SC_PHYS_PAGES); | |
1049 | ||
1050 | dprintf("physmem = %llu pages (%.2f GB)\n", physmem, | |
1051 | (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); | |
1052 | ||
1053 | (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", | |
1054 | (mode & FWRITE) ? gethostid() : 0); | |
1055 | ||
1056 | VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); | |
1057 | VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); | |
1058 | ||
1059 | thread_init(); | |
1060 | system_taskq_init(); | |
1061 | ||
1062 | spa_init(mode); | |
1063 | } | |
1064 | ||
1065 | void | |
1066 | kernel_fini(void) | |
1067 | { | |
1068 | spa_fini(); | |
1069 | ||
1070 | system_taskq_fini(); | |
1071 | thread_fini(); | |
1072 | ||
1073 | close(random_fd); | |
1074 | close(urandom_fd); | |
1075 | ||
1076 | random_fd = -1; | |
1077 | urandom_fd = -1; | |
1078 | } | |
1079 | ||
1080 | uid_t | |
1081 | crgetuid(cred_t *cr) | |
1082 | { | |
1083 | return (0); | |
1084 | } | |
1085 | ||
1086 | gid_t | |
1087 | crgetgid(cred_t *cr) | |
1088 | { | |
1089 | return (0); | |
1090 | } | |
1091 | ||
1092 | int | |
1093 | crgetngroups(cred_t *cr) | |
1094 | { | |
1095 | return (0); | |
1096 | } | |
1097 | ||
1098 | gid_t * | |
1099 | crgetgroups(cred_t *cr) | |
1100 | { | |
1101 | return (NULL); | |
1102 | } | |
1103 | ||
1104 | int | |
1105 | zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) | |
1106 | { | |
1107 | return (0); | |
1108 | } | |
1109 | ||
1110 | int | |
1111 | zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) | |
1112 | { | |
1113 | return (0); | |
1114 | } | |
1115 | ||
1116 | int | |
1117 | zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) | |
1118 | { | |
1119 | return (0); | |
1120 | } | |
1121 | ||
1122 | ksiddomain_t * | |
1123 | ksid_lookupdomain(const char *dom) | |
1124 | { | |
1125 | ksiddomain_t *kd; | |
1126 | ||
1127 | kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); | |
1128 | kd->kd_name = spa_strdup(dom); | |
1129 | return (kd); | |
1130 | } | |
1131 | ||
1132 | void | |
1133 | ksiddomain_rele(ksiddomain_t *ksid) | |
1134 | { | |
1135 | spa_strfree(ksid->kd_name); | |
1136 | umem_free(ksid, sizeof (ksiddomain_t)); | |
1137 | } | |
1138 | ||
/*
 * Format fmt with the arguments in adx into a newly allocated string, using
 * vasprintf(), and return it.  The formatting is done on a copy of adx, so
 * the caller's va_list is left untouched.  A vasprintf() failure is fatal
 * (VERIFY).
 */
char *
kmem_vasprintf(const char *fmt, va_list adx)
{
	va_list args;
	char *str = NULL;
	int rc;

	va_copy(args, adx);
	rc = vasprintf(&str, fmt, args);
	va_end(args);

	VERIFY(rc != -1);

	return (str);
}
1151 | ||
/*
 * printf-style formatting into a newly allocated string.  The variable
 * arguments are collected and handed to kmem_vasprintf(), which does the
 * formatting and treats a vasprintf() failure as fatal.
 */
char *
kmem_asprintf(const char *fmt, ...)
{
	va_list args;
	char *str;

	va_start(args, fmt);
	str = kmem_vasprintf(fmt, args);
	va_end(args);

	return (str);
}
1164 | ||
1165 | /* ARGSUSED */ | |
1166 | int | |
1167 | zfs_onexit_fd_hold(int fd, minor_t *minorp) | |
1168 | { | |
1169 | *minorp = 0; | |
1170 | return (0); | |
1171 | } | |
1172 | ||
/*
 * Userland stub: does nothing.
 */
/* ARGSUSED */
void
zfs_onexit_fd_rele(int fd)
{
	/* nothing to release */
}
1178 | ||
1179 | /* ARGSUSED */ | |
1180 | int | |
1181 | zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, | |
1182 | uint64_t *action_handle) | |
1183 | { | |
1184 | return (0); | |
1185 | } | |
1186 | ||
1187 | /* ARGSUSED */ | |
1188 | int | |
1189 | zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) | |
1190 | { | |
1191 | return (0); | |
1192 | } | |
1193 | ||
1194 | /* ARGSUSED */ | |
1195 | int | |
1196 | zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) | |
1197 | { | |
1198 | return (0); | |
1199 | } |