]>
Commit | Line | Data |
---|---|---|
5df7e9d8 MM |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
5df7e9d8 MM |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
0929c4de MA |
21 | /* |
22 | * Copyright (c) 2012, 2020 by Delphix. All rights reserved. | |
23 | */ | |
5df7e9d8 MM |
24 | |
25 | #include <sys/dataset_kstats.h> | |
26 | #include <sys/dbuf.h> | |
27 | #include <sys/dmu_traverse.h> | |
28 | #include <sys/dsl_dataset.h> | |
29 | #include <sys/dsl_prop.h> | |
30 | #include <sys/dsl_dir.h> | |
31 | #include <sys/zap.h> | |
32 | #include <sys/zfeature.h> | |
33 | #include <sys/zil_impl.h> | |
34 | #include <sys/dmu_tx.h> | |
35 | #include <sys/zio.h> | |
36 | #include <sys/zfs_rlock.h> | |
37 | #include <sys/spa_impl.h> | |
38 | #include <sys/zvol.h> | |
39 | #include <sys/zvol_impl.h> | |
40 | ||
41 | #include <linux/blkdev_compat.h> | |
42 | #include <linux/task_io_accounting_ops.h> | |
43 | ||
6f73d021 TH |
44 | #ifdef HAVE_BLK_MQ |
45 | #include <linux/blk-mq.h> | |
46 | #endif | |
47 | ||
48 | static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, | |
49 | struct request *rq, boolean_t force_sync); | |
50 | ||
18168da7 AZ |
51 | static unsigned int zvol_major = ZVOL_MAJOR; |
52 | static unsigned int zvol_request_sync = 0; | |
53 | static unsigned int zvol_prefetch_bytes = (128 * 1024); | |
54 | static unsigned long zvol_max_discard_blocks = 16384; | |
abdcef47 PH |
55 | |
56 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS | |
945e39fc | 57 | static unsigned int zvol_open_timeout_ms = 1000; |
abdcef47 | 58 | #endif |
5df7e9d8 | 59 | |
6f73d021 TH |
60 | static unsigned int zvol_threads = 0; |
61 | #ifdef HAVE_BLK_MQ | |
62 | static unsigned int zvol_blk_mq_threads = 0; | |
63 | static unsigned int zvol_blk_mq_actual_threads; | |
64 | static boolean_t zvol_use_blk_mq = B_FALSE; | |
65 | ||
66 | /* | |
67 | * The maximum number of volblocksize blocks to process per thread. Typically, | |
68 | * write heavy workloads preform better with higher values here, and read | |
69 | * heavy workloads preform better with lower values, but that's not a hard | |
70 | * and fast rule. It's basically a knob to tune between "less overhead with | |
71 | * less parallelism" and "more overhead, but more parallelism". | |
72 | * | |
73 | * '8' was chosen as a reasonable, balanced, default based off of sequential | |
74 | * read and write tests to a zvol in an NVMe pool (with 16 CPUs). | |
75 | */ | |
76 | static unsigned int zvol_blk_mq_blocks_per_thread = 8; | |
77 | #endif | |
78 | ||
/*
 * The 5.16 kernel renamed BLKDEV_MAX_RQ to BLKDEV_DEFAULT_RQ; provide the
 * new name when building against a kernel that only has the old one.
 */
#if !defined(BLKDEV_DEFAULT_RQ)
#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
#endif
83 | ||
/*
 * Complete an I/O, whichever form it arrived in.
 *
 * With blk-mq support compiled in, a non-NULL 'bio' is finished through
 * BIO_END_IO(); otherwise 'rq' is finished through blk_mq_end_request()
 * after translating 'error' with errno_to_bi_status().  Without blk-mq only
 * the bio form exists.  'zv' is accepted for symmetry but not used.
 */
#if defined(HAVE_BLK_MQ)
#define	END_IO(zv, bio, rq, error)					\
do {									\
	if ((bio) != NULL) {						\
		BIO_END_IO(bio, error);					\
	} else {							\
		blk_mq_end_request(rq, errno_to_bi_status(error));	\
	}								\
} while (0)
#else
#define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
#endif
98 | ||
#if defined(HAVE_BLK_MQ)
/* Requested blk-mq queue depth; defaults to the kernel's default. */
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
/* Depth actually programmed into each zvol's tag set. */
static unsigned int zvol_actual_blk_mq_queue_depth;
#endif
103 | ||
5df7e9d8 MM |
104 | struct zvol_state_os { |
105 | struct gendisk *zvo_disk; /* generic disk */ | |
106 | struct request_queue *zvo_queue; /* request queue */ | |
5df7e9d8 | 107 | dev_t zvo_dev; /* device id */ |
6f73d021 TH |
108 | |
109 | #ifdef HAVE_BLK_MQ | |
110 | struct blk_mq_tag_set tag_set; | |
111 | #endif | |
112 | ||
113 | /* Set from the global 'zvol_use_blk_mq' at zvol load */ | |
114 | boolean_t use_blk_mq; | |
5df7e9d8 MM |
115 | }; |
116 | ||
27218a32 | 117 | static taskq_t *zvol_taskq; |
5df7e9d8 MM |
118 | static struct ida zvol_ida; |
119 | ||
e439ee83 | 120 | typedef struct zv_request_stack { |
5df7e9d8 MM |
121 | zvol_state_t *zv; |
122 | struct bio *bio; | |
6f73d021 | 123 | struct request *rq; |
5df7e9d8 MM |
124 | } zv_request_t; |
125 | ||
6f73d021 TH |
126 | typedef struct zv_work { |
127 | struct request *rq; | |
128 | struct work_struct work; | |
129 | } zv_work_t; | |
130 | ||
e439ee83 CS |
131 | typedef struct zv_request_task { |
132 | zv_request_t zvr; | |
133 | taskq_ent_t ent; | |
134 | } zv_request_task_t; | |
135 | ||
136 | static zv_request_task_t * | |
137 | zv_request_task_create(zv_request_t zvr) | |
138 | { | |
139 | zv_request_task_t *task; | |
140 | task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); | |
141 | taskq_init_ent(&task->ent); | |
142 | task->zvr = zvr; | |
143 | return (task); | |
144 | } | |
145 | ||
146 | static void | |
147 | zv_request_task_free(zv_request_task_t *task) | |
148 | { | |
149 | kmem_free(task, sizeof (*task)); | |
150 | } | |
151 | ||
6f73d021 TH |
152 | #ifdef HAVE_BLK_MQ |
153 | ||
154 | /* | |
155 | * This is called when a new block multiqueue request comes in. A request | |
156 | * contains one or more BIOs. | |
157 | */ | |
158 | static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, | |
159 | const struct blk_mq_queue_data *bd) | |
160 | { | |
161 | struct request *rq = bd->rq; | |
162 | zvol_state_t *zv = rq->q->queuedata; | |
163 | ||
164 | /* Tell the kernel that we are starting to process this request */ | |
165 | blk_mq_start_request(rq); | |
166 | ||
167 | if (blk_rq_is_passthrough(rq)) { | |
168 | /* Skip non filesystem request */ | |
169 | blk_mq_end_request(rq, BLK_STS_IOERR); | |
170 | return (BLK_STS_IOERR); | |
171 | } | |
172 | ||
173 | zvol_request_impl(zv, NULL, rq, 0); | |
174 | ||
175 | /* Acknowledge to the kernel that we got this request */ | |
176 | return (BLK_STS_OK); | |
177 | } | |
178 | ||
179 | static struct blk_mq_ops zvol_blk_mq_queue_ops = { | |
180 | .queue_rq = zvol_mq_queue_rq, | |
181 | }; | |
182 | ||
183 | /* Initialize our blk-mq struct */ | |
184 | static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) | |
185 | { | |
186 | struct zvol_state_os *zso = zv->zv_zso; | |
187 | ||
188 | memset(&zso->tag_set, 0, sizeof (zso->tag_set)); | |
189 | ||
190 | /* Initialize tag set. */ | |
191 | zso->tag_set.ops = &zvol_blk_mq_queue_ops; | |
192 | zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; | |
193 | zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; | |
194 | zso->tag_set.numa_node = NUMA_NO_NODE; | |
195 | zso->tag_set.cmd_size = 0; | |
196 | ||
197 | /* | |
198 | * We need BLK_MQ_F_BLOCKING here since we do blocking calls in | |
199 | * zvol_request_impl() | |
200 | */ | |
201 | zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; | |
202 | zso->tag_set.driver_data = zv; | |
203 | ||
204 | return (blk_mq_alloc_tag_set(&zso->tag_set)); | |
205 | } | |
206 | #endif /* HAVE_BLK_MQ */ | |
207 | ||
5df7e9d8 MM |
208 | /* |
209 | * Given a path, return TRUE if path is a ZVOL. | |
210 | */ | |
1dccfd7a CS |
211 | boolean_t |
212 | zvol_os_is_zvol(const char *path) | |
5df7e9d8 | 213 | { |
b7281c88 | 214 | dev_t dev = 0; |
5df7e9d8 | 215 | |
b7281c88 | 216 | if (vdev_lookup_bdev(path, &dev) != 0) |
5df7e9d8 MM |
217 | return (B_FALSE); |
218 | ||
b7281c88 | 219 | if (MAJOR(dev) == zvol_major) |
5df7e9d8 MM |
220 | return (B_TRUE); |
221 | ||
222 | return (B_FALSE); | |
223 | } | |
224 | ||
5df7e9d8 | 225 | static void |
e439ee83 | 226 | zvol_write(zv_request_t *zvr) |
5df7e9d8 | 227 | { |
5df7e9d8 | 228 | struct bio *bio = zvr->bio; |
6f73d021 | 229 | struct request *rq = zvr->rq; |
1c2358c1 | 230 | int error = 0; |
d0cd9a5c | 231 | zfs_uio_t uio; |
5df7e9d8 | 232 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
233 | struct request_queue *q; |
234 | struct gendisk *disk; | |
235 | unsigned long start_time = 0; | |
236 | boolean_t acct = B_FALSE; | |
237 | ||
0b32d817 RM |
238 | ASSERT3P(zv, !=, NULL); |
239 | ASSERT3U(zv->zv_open_count, >, 0); | |
240 | ASSERT3P(zv->zv_zilog, !=, NULL); | |
5df7e9d8 | 241 | |
6f73d021 TH |
242 | q = zv->zv_zso->zvo_queue; |
243 | disk = zv->zv_zso->zvo_disk; | |
244 | ||
0929c4de | 245 | /* bio marked as FLUSH need to flush before write */ |
6f73d021 | 246 | if (io_is_flush(bio, rq)) |
0929c4de MA |
247 | zil_commit(zv->zv_zilog, ZVOL_OBJ); |
248 | ||
249 | /* Some requests are just for flush and nothing else. */ | |
6f73d021 | 250 | if (io_size(bio, rq) == 0) { |
0929c4de | 251 | rw_exit(&zv->zv_suspend_lock); |
6f73d021 | 252 | END_IO(zv, bio, rq, 0); |
0929c4de MA |
253 | return; |
254 | } | |
255 | ||
6f73d021 TH |
256 | zfs_uio_bvec_init(&uio, bio, rq); |
257 | ||
5df7e9d8 | 258 | ssize_t start_resid = uio.uio_resid; |
a970f059 | 259 | |
6f73d021 TH |
260 | /* |
261 | * With use_blk_mq, accounting is done by blk_mq_start_request() | |
262 | * and blk_mq_end_request(), so we can skip it here. | |
263 | */ | |
264 | if (bio) { | |
265 | acct = blk_queue_io_stat(q); | |
266 | if (acct) { | |
267 | start_time = blk_generic_start_io_acct(q, disk, WRITE, | |
268 | bio); | |
269 | } | |
270 | } | |
5df7e9d8 MM |
271 | |
272 | boolean_t sync = | |
6f73d021 | 273 | io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
5df7e9d8 | 274 | |
0929c4de MA |
275 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
276 | uio.uio_loffset, uio.uio_resid, RL_WRITER); | |
277 | ||
5df7e9d8 MM |
278 | uint64_t volsize = zv->zv_volsize; |
279 | while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { | |
280 | uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); | |
281 | uint64_t off = uio.uio_loffset; | |
282 | dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); | |
283 | ||
284 | if (bytes > volsize - off) /* don't write past the end */ | |
285 | bytes = volsize - off; | |
286 | ||
20f28785 | 287 | dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); |
5df7e9d8 MM |
288 | |
289 | /* This will only fail for ENOSPC */ | |
290 | error = dmu_tx_assign(tx, TXG_WAIT); | |
291 | if (error) { | |
292 | dmu_tx_abort(tx); | |
293 | break; | |
294 | } | |
295 | error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); | |
296 | if (error == 0) { | |
297 | zvol_log_write(zv, tx, off, bytes, sync); | |
298 | } | |
299 | dmu_tx_commit(tx); | |
300 | ||
301 | if (error) | |
302 | break; | |
303 | } | |
0929c4de | 304 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
305 | |
306 | int64_t nwritten = start_resid - uio.uio_resid; | |
4547fc4e | 307 | dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); |
5df7e9d8 MM |
308 | task_io_account_write(nwritten); |
309 | ||
310 | if (sync) | |
311 | zil_commit(zv->zv_zilog, ZVOL_OBJ); | |
312 | ||
313 | rw_exit(&zv->zv_suspend_lock); | |
a970f059 | 314 | |
6f73d021 | 315 | if (bio && acct) { |
a970f059 | 316 | blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); |
6f73d021 | 317 | } |
a970f059 | 318 | |
6f73d021 | 319 | END_IO(zv, bio, rq, -error); |
5df7e9d8 MM |
320 | } |
321 | ||
322 | static void | |
e439ee83 CS |
323 | zvol_write_task(void *arg) |
324 | { | |
325 | zv_request_task_t *task = arg; | |
326 | zvol_write(&task->zvr); | |
327 | zv_request_task_free(task); | |
328 | } | |
329 | ||
330 | static void | |
331 | zvol_discard(zv_request_t *zvr) | |
5df7e9d8 | 332 | { |
5df7e9d8 | 333 | struct bio *bio = zvr->bio; |
6f73d021 | 334 | struct request *rq = zvr->rq; |
5df7e9d8 | 335 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
336 | uint64_t start = io_offset(bio, rq); |
337 | uint64_t size = io_size(bio, rq); | |
5df7e9d8 MM |
338 | uint64_t end = start + size; |
339 | boolean_t sync; | |
340 | int error = 0; | |
341 | dmu_tx_t *tx; | |
6f73d021 TH |
342 | struct request_queue *q = zv->zv_zso->zvo_queue; |
343 | struct gendisk *disk = zv->zv_zso->zvo_disk; | |
344 | unsigned long start_time = 0; | |
5dd0f019 | 345 | boolean_t acct = B_FALSE; |
5df7e9d8 | 346 | |
0b32d817 RM |
347 | ASSERT3P(zv, !=, NULL); |
348 | ASSERT3U(zv->zv_open_count, >, 0); | |
349 | ASSERT3P(zv->zv_zilog, !=, NULL); | |
5df7e9d8 | 350 | |
6f73d021 TH |
351 | if (bio) { |
352 | acct = blk_queue_io_stat(q); | |
353 | if (acct) { | |
354 | start_time = blk_generic_start_io_acct(q, disk, WRITE, | |
355 | bio); | |
356 | } | |
357 | } | |
5df7e9d8 | 358 | |
6f73d021 | 359 | sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
5df7e9d8 MM |
360 | |
361 | if (end > zv->zv_volsize) { | |
362 | error = SET_ERROR(EIO); | |
363 | goto unlock; | |
364 | } | |
365 | ||
366 | /* | |
367 | * Align the request to volume block boundaries when a secure erase is | |
368 | * not required. This will prevent dnode_free_range() from zeroing out | |
369 | * the unaligned parts which is slow (read-modify-write) and useless | |
370 | * since we are not freeing any space by doing so. | |
371 | */ | |
6f73d021 | 372 | if (!io_is_secure_erase(bio, rq)) { |
5df7e9d8 MM |
373 | start = P2ROUNDUP(start, zv->zv_volblocksize); |
374 | end = P2ALIGN(end, zv->zv_volblocksize); | |
375 | size = end - start; | |
376 | } | |
377 | ||
378 | if (start >= end) | |
379 | goto unlock; | |
380 | ||
0929c4de MA |
381 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
382 | start, size, RL_WRITER); | |
383 | ||
5df7e9d8 MM |
384 | tx = dmu_tx_create(zv->zv_objset); |
385 | dmu_tx_mark_netfree(tx); | |
386 | error = dmu_tx_assign(tx, TXG_WAIT); | |
387 | if (error != 0) { | |
388 | dmu_tx_abort(tx); | |
389 | } else { | |
c3773de1 | 390 | zvol_log_truncate(zv, tx, start, size); |
5df7e9d8 MM |
391 | dmu_tx_commit(tx); |
392 | error = dmu_free_long_range(zv->zv_objset, | |
393 | ZVOL_OBJ, start, size); | |
394 | } | |
0929c4de | 395 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
396 | |
397 | if (error == 0 && sync) | |
398 | zil_commit(zv->zv_zilog, ZVOL_OBJ); | |
399 | ||
0929c4de | 400 | unlock: |
5df7e9d8 | 401 | rw_exit(&zv->zv_suspend_lock); |
a970f059 | 402 | |
6f73d021 TH |
403 | if (bio && acct) { |
404 | blk_generic_end_io_acct(q, disk, WRITE, bio, | |
405 | start_time); | |
406 | } | |
a970f059 | 407 | |
6f73d021 | 408 | END_IO(zv, bio, rq, -error); |
5df7e9d8 MM |
409 | } |
410 | ||
411 | static void | |
e439ee83 CS |
412 | zvol_discard_task(void *arg) |
413 | { | |
414 | zv_request_task_t *task = arg; | |
415 | zvol_discard(&task->zvr); | |
416 | zv_request_task_free(task); | |
417 | } | |
418 | ||
419 | static void | |
420 | zvol_read(zv_request_t *zvr) | |
5df7e9d8 | 421 | { |
5df7e9d8 | 422 | struct bio *bio = zvr->bio; |
6f73d021 | 423 | struct request *rq = zvr->rq; |
1c2358c1 | 424 | int error = 0; |
d0cd9a5c | 425 | zfs_uio_t uio; |
6f73d021 | 426 | boolean_t acct = B_FALSE; |
5df7e9d8 | 427 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
428 | struct request_queue *q; |
429 | struct gendisk *disk; | |
430 | unsigned long start_time = 0; | |
431 | ||
0b32d817 RM |
432 | ASSERT3P(zv, !=, NULL); |
433 | ASSERT3U(zv->zv_open_count, >, 0); | |
5df7e9d8 | 434 | |
6f73d021 TH |
435 | zfs_uio_bvec_init(&uio, bio, rq); |
436 | ||
437 | q = zv->zv_zso->zvo_queue; | |
438 | disk = zv->zv_zso->zvo_disk; | |
439 | ||
5df7e9d8 | 440 | ssize_t start_resid = uio.uio_resid; |
a970f059 | 441 | |
6f73d021 TH |
442 | /* |
443 | * When blk-mq is being used, accounting is done by | |
444 | * blk_mq_start_request() and blk_mq_end_request(). | |
445 | */ | |
446 | if (bio) { | |
447 | acct = blk_queue_io_stat(q); | |
448 | if (acct) | |
449 | start_time = blk_generic_start_io_acct(q, disk, READ, | |
450 | bio); | |
451 | } | |
5df7e9d8 | 452 | |
0929c4de MA |
453 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
454 | uio.uio_loffset, uio.uio_resid, RL_READER); | |
455 | ||
5df7e9d8 | 456 | uint64_t volsize = zv->zv_volsize; |
6f73d021 | 457 | |
5df7e9d8 MM |
458 | while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { |
459 | uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); | |
460 | ||
461 | /* don't read past the end */ | |
462 | if (bytes > volsize - uio.uio_loffset) | |
463 | bytes = volsize - uio.uio_loffset; | |
464 | ||
465 | error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); | |
466 | if (error) { | |
467 | /* convert checksum errors into IO errors */ | |
468 | if (error == ECKSUM) | |
469 | error = SET_ERROR(EIO); | |
470 | break; | |
471 | } | |
472 | } | |
0929c4de | 473 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
474 | |
475 | int64_t nread = start_resid - uio.uio_resid; | |
4547fc4e | 476 | dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); |
5df7e9d8 MM |
477 | task_io_account_read(nread); |
478 | ||
479 | rw_exit(&zv->zv_suspend_lock); | |
a970f059 | 480 | |
6f73d021 | 481 | if (bio && acct) { |
a970f059 | 482 | blk_generic_end_io_acct(q, disk, READ, bio, start_time); |
6f73d021 | 483 | } |
a970f059 | 484 | |
6f73d021 | 485 | END_IO(zv, bio, rq, -error); |
e439ee83 CS |
486 | } |
487 | ||
488 | static void | |
489 | zvol_read_task(void *arg) | |
490 | { | |
491 | zv_request_task_t *task = arg; | |
492 | zvol_read(&task->zvr); | |
493 | zv_request_task_free(task); | |
5df7e9d8 MM |
494 | } |
495 | ||
6f73d021 TH |
496 | |
497 | /* | |
498 | * Process a BIO or request | |
499 | * | |
500 | * Either 'bio' or 'rq' should be set depending on if we are processing a | |
501 | * bio or a request (both should not be set). | |
502 | * | |
503 | * force_sync: Set to 0 to defer processing to a background taskq | |
504 | * Set to 1 to process data synchronously | |
505 | */ | |
435a451e | 506 | static void |
6f73d021 TH |
507 | zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, |
508 | boolean_t force_sync) | |
5df7e9d8 | 509 | { |
5df7e9d8 | 510 | fstrans_cookie_t cookie = spl_fstrans_mark(); |
6f73d021 TH |
511 | uint64_t offset = io_offset(bio, rq); |
512 | uint64_t size = io_size(bio, rq); | |
513 | int rw = io_data_dir(bio, rq); | |
5df7e9d8 | 514 | |
60387fac | 515 | if (zvol_request_sync || zv->zv_threading == B_FALSE) |
6f73d021 TH |
516 | force_sync = 1; |
517 | ||
518 | zv_request_t zvr = { | |
519 | .zv = zv, | |
520 | .bio = bio, | |
521 | .rq = rq, | |
522 | }; | |
523 | ||
524 | if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { | |
525 | printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", | |
5df7e9d8 MM |
526 | zv->zv_zso->zvo_disk->disk_name, |
527 | (long long unsigned)offset, | |
528 | (long unsigned)size); | |
529 | ||
6f73d021 | 530 | END_IO(zv, bio, rq, -SET_ERROR(EIO)); |
5df7e9d8 MM |
531 | goto out; |
532 | } | |
533 | ||
e439ee83 CS |
534 | zv_request_task_t *task; |
535 | ||
5df7e9d8 | 536 | if (rw == WRITE) { |
5df7e9d8 | 537 | if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { |
6f73d021 | 538 | END_IO(zv, bio, rq, -SET_ERROR(EROFS)); |
5df7e9d8 MM |
539 | goto out; |
540 | } | |
541 | ||
542 | /* | |
0929c4de MA |
543 | * Prevents the zvol from being suspended, or the ZIL being |
544 | * concurrently opened. Will be released after the i/o | |
545 | * completes. | |
5df7e9d8 MM |
546 | */ |
547 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
548 | ||
549 | /* | |
550 | * Open a ZIL if this is the first time we have written to this | |
551 | * zvol. We protect zv->zv_zilog with zv_suspend_lock rather | |
552 | * than zv_state_lock so that we don't need to acquire an | |
553 | * additional lock in this path. | |
554 | */ | |
555 | if (zv->zv_zilog == NULL) { | |
556 | rw_exit(&zv->zv_suspend_lock); | |
557 | rw_enter(&zv->zv_suspend_lock, RW_WRITER); | |
558 | if (zv->zv_zilog == NULL) { | |
559 | zv->zv_zilog = zil_open(zv->zv_objset, | |
fb087146 | 560 | zvol_get_data, &zv->zv_kstat.dk_zil_sums); |
5df7e9d8 | 561 | zv->zv_flags |= ZVOL_WRITTEN_TO; |
93e36580 CS |
562 | /* replay / destroy done in zvol_create_minor */ |
563 | VERIFY0((zv->zv_zilog->zl_header->zh_flags & | |
564 | ZIL_REPLAY_NEEDED)); | |
5df7e9d8 MM |
565 | } |
566 | rw_downgrade(&zv->zv_suspend_lock); | |
567 | } | |
568 | ||
5df7e9d8 | 569 | /* |
0929c4de MA |
570 | * We don't want this thread to be blocked waiting for i/o to |
571 | * complete, so we instead wait from a taskq callback. The | |
572 | * i/o may be a ZIL write (via zil_commit()), or a read of an | |
573 | * indirect block, or a read of a data block (if this is a | |
574 | * partial-block write). We will indicate that the i/o is | |
6f73d021 | 575 | * complete by calling END_IO() from the taskq callback. |
0929c4de MA |
576 | * |
577 | * This design allows the calling thread to continue and | |
578 | * initiate more concurrent operations by calling | |
579 | * zvol_request() again. There are typically only a small | |
580 | * number of threads available to call zvol_request() (e.g. | |
581 | * one per iSCSI target), so keeping the latency of | |
582 | * zvol_request() low is important for performance. | |
583 | * | |
584 | * The zvol_request_sync module parameter allows this | |
585 | * behavior to be altered, for performance evaluation | |
586 | * purposes. If the callback blocks, setting | |
587 | * zvol_request_sync=1 will result in much worse performance. | |
588 | * | |
589 | * We can have up to zvol_threads concurrent i/o's being | |
590 | * processed for all zvols on the system. This is typically | |
591 | * a vast improvement over the zvol_request_sync=1 behavior | |
592 | * of one i/o at a time per zvol. However, an even better | |
593 | * design would be for zvol_request() to initiate the zio | |
594 | * directly, and then be notified by the zio_done callback, | |
6f73d021 | 595 | * which would call END_IO(). Unfortunately, the DMU/ZIL |
0929c4de MA |
596 | * interfaces lack this functionality (they block waiting for |
597 | * the i/o to complete). | |
5df7e9d8 | 598 | */ |
6f73d021 TH |
599 | if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { |
600 | if (force_sync) { | |
e439ee83 | 601 | zvol_discard(&zvr); |
0929c4de | 602 | } else { |
e439ee83 | 603 | task = zv_request_task_create(zvr); |
0929c4de | 604 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 605 | zvol_discard_task, task, 0, &task->ent); |
0929c4de | 606 | } |
5df7e9d8 | 607 | } else { |
6f73d021 | 608 | if (force_sync) { |
e439ee83 | 609 | zvol_write(&zvr); |
0929c4de | 610 | } else { |
e439ee83 | 611 | task = zv_request_task_create(zvr); |
0929c4de | 612 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 613 | zvol_write_task, task, 0, &task->ent); |
0929c4de | 614 | } |
5df7e9d8 MM |
615 | } |
616 | } else { | |
617 | /* | |
618 | * The SCST driver, and possibly others, may issue READ I/Os | |
619 | * with a length of zero bytes. These empty I/Os contain no | |
620 | * data and require no additional handling. | |
621 | */ | |
622 | if (size == 0) { | |
6f73d021 | 623 | END_IO(zv, bio, rq, 0); |
5df7e9d8 MM |
624 | goto out; |
625 | } | |
626 | ||
5df7e9d8 MM |
627 | rw_enter(&zv->zv_suspend_lock, RW_READER); |
628 | ||
0929c4de | 629 | /* See comment in WRITE case above. */ |
6f73d021 | 630 | if (force_sync) { |
e439ee83 | 631 | zvol_read(&zvr); |
0929c4de | 632 | } else { |
e439ee83 | 633 | task = zv_request_task_create(zvr); |
0929c4de | 634 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 635 | zvol_read_task, task, 0, &task->ent); |
0929c4de | 636 | } |
5df7e9d8 MM |
637 | } |
638 | ||
639 | out: | |
640 | spl_fstrans_unmark(cookie); | |
6f73d021 TH |
641 | } |
642 | ||
643 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS | |
644 | #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID | |
645 | static void | |
646 | zvol_submit_bio(struct bio *bio) | |
647 | #else | |
648 | static blk_qc_t | |
649 | zvol_submit_bio(struct bio *bio) | |
650 | #endif | |
651 | #else | |
652 | static MAKE_REQUEST_FN_RET | |
653 | zvol_request(struct request_queue *q, struct bio *bio) | |
654 | #endif | |
655 | { | |
656 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS | |
657 | #if defined(HAVE_BIO_BDEV_DISK) | |
658 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
659 | #else | |
660 | struct request_queue *q = bio->bi_disk->queue; | |
661 | #endif | |
662 | #endif | |
663 | zvol_state_t *zv = q->queuedata; | |
664 | ||
665 | zvol_request_impl(zv, bio, NULL, 0); | |
666 | #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ | |
667 | defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ | |
435a451e | 668 | !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) |
5df7e9d8 MM |
669 | return (BLK_QC_T_NONE); |
670 | #endif | |
671 | } | |
672 | ||
673 | static int | |
43e8f6e3 CK |
674 | #ifdef HAVE_BLK_MODE_T |
675 | zvol_open(struct gendisk *disk, blk_mode_t flag) | |
676 | #else | |
5df7e9d8 | 677 | zvol_open(struct block_device *bdev, fmode_t flag) |
43e8f6e3 | 678 | #endif |
5df7e9d8 MM |
679 | { |
680 | zvol_state_t *zv; | |
681 | int error = 0; | |
8a02d01e | 682 | boolean_t drop_suspend = B_FALSE; |
77e2756d BB |
683 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
684 | hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); | |
685 | hrtime_t start = gethrtime(); | |
5df7e9d8 | 686 | |
77e2756d BB |
687 | retry: |
688 | #endif | |
5df7e9d8 MM |
689 | rw_enter(&zvol_state_lock, RW_READER); |
690 | /* | |
691 | * Obtain a copy of private_data under the zvol_state_lock to make | |
692 | * sure that either the result of zvol free code path setting | |
43e8f6e3 | 693 | * disk->private_data to NULL is observed, or zvol_os_free() |
5df7e9d8 MM |
694 | * is not called on this zv because of the positive zv_open_count. |
695 | */ | |
43e8f6e3 CK |
696 | #ifdef HAVE_BLK_MODE_T |
697 | zv = disk->private_data; | |
698 | #else | |
5df7e9d8 | 699 | zv = bdev->bd_disk->private_data; |
43e8f6e3 | 700 | #endif |
5df7e9d8 MM |
701 | if (zv == NULL) { |
702 | rw_exit(&zvol_state_lock); | |
703 | return (SET_ERROR(-ENXIO)); | |
704 | } | |
705 | ||
8a02d01e BB |
706 | mutex_enter(&zv->zv_state_lock); |
707 | /* | |
708 | * Make sure zvol is not suspended during first open | |
709 | * (hold zv_suspend_lock) and respect proper lock acquisition | |
710 | * ordering - zv_suspend_lock before zv_state_lock | |
711 | */ | |
712 | if (zv->zv_open_count == 0) { | |
713 | if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { | |
714 | mutex_exit(&zv->zv_state_lock); | |
715 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
716 | mutex_enter(&zv->zv_state_lock); | |
717 | /* check to see if zv_suspend_lock is needed */ | |
718 | if (zv->zv_open_count != 0) { | |
719 | rw_exit(&zv->zv_suspend_lock); | |
720 | } else { | |
721 | drop_suspend = B_TRUE; | |
722 | } | |
723 | } else { | |
724 | drop_suspend = B_TRUE; | |
725 | } | |
726 | } | |
727 | rw_exit(&zvol_state_lock); | |
728 | ||
729 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
730 | ||
731 | if (zv->zv_open_count == 0) { | |
732 | boolean_t drop_namespace = B_FALSE; | |
733 | ||
734 | ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); | |
735 | ||
77e2756d BB |
736 | /* |
737 | * In all other call paths the spa_namespace_lock is taken | |
738 | * before the bdev->bd_mutex lock. However, on open(2) | |
739 | * the __blkdev_get() function calls fops->open() with the | |
740 | * bdev->bd_mutex lock held. This can result in a deadlock | |
741 | * when zvols from one pool are used as vdevs in another. | |
742 | * | |
743 | * To prevent a lock inversion deadlock we preemptively | |
744 | * take the spa_namespace_lock. Normally the lock will not | |
745 | * be contended and this is safe because spa_open_common() | |
746 | * handles the case where the caller already holds the | |
747 | * spa_namespace_lock. | |
748 | * | |
749 | * When the lock cannot be aquired after multiple retries | |
750 | * this must be the vdev on zvol deadlock case and we have | |
751 | * no choice but to return an error. For 5.12 and older | |
752 | * kernels returning -ERESTARTSYS will result in the | |
753 | * bdev->bd_mutex being dropped, then reacquired, and | |
754 | * fops->open() being called again. This process can be | |
755 | * repeated safely until both locks are acquired. For 5.13 | |
756 | * and newer the -ERESTARTSYS retry logic was removed from | |
757 | * the kernel so the only option is to return the error for | |
758 | * the caller to handle it. | |
759 | */ | |
8a02d01e BB |
760 | if (!mutex_owned(&spa_namespace_lock)) { |
761 | if (!mutex_tryenter(&spa_namespace_lock)) { | |
762 | mutex_exit(&zv->zv_state_lock); | |
763 | rw_exit(&zv->zv_suspend_lock); | |
77e2756d BB |
764 | |
765 | #ifdef HAVE_BLKDEV_GET_ERESTARTSYS | |
8a02d01e | 766 | schedule(); |
77e2756d | 767 | return (SET_ERROR(-ERESTARTSYS)); |
8a02d01e BB |
768 | #else |
769 | if ((gethrtime() - start) > timeout) | |
770 | return (SET_ERROR(-ERESTARTSYS)); | |
77e2756d | 771 | |
8a02d01e BB |
772 | schedule_timeout(MSEC_TO_TICK(10)); |
773 | goto retry; | |
77e2756d | 774 | #endif |
8a02d01e BB |
775 | } else { |
776 | drop_namespace = B_TRUE; | |
5df7e9d8 MM |
777 | } |
778 | } | |
5df7e9d8 | 779 | |
43e8f6e3 | 780 | error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); |
5df7e9d8 | 781 | |
8a02d01e BB |
782 | if (drop_namespace) |
783 | mutex_exit(&spa_namespace_lock); | |
5df7e9d8 MM |
784 | } |
785 | ||
8a02d01e | 786 | if (error == 0) { |
43e8f6e3 CK |
787 | if ((blk_mode_is_open_write(flag)) && |
788 | (zv->zv_flags & ZVOL_RDONLY)) { | |
8a02d01e BB |
789 | if (zv->zv_open_count == 0) |
790 | zvol_last_close(zv); | |
5df7e9d8 | 791 | |
8a02d01e BB |
792 | error = SET_ERROR(-EROFS); |
793 | } else { | |
794 | zv->zv_open_count++; | |
795 | } | |
796 | } | |
5df7e9d8 | 797 | |
5df7e9d8 MM |
798 | mutex_exit(&zv->zv_state_lock); |
799 | if (drop_suspend) | |
800 | rw_exit(&zv->zv_suspend_lock); | |
77e2756d | 801 | |
8a02d01e | 802 | if (error == 0) |
43e8f6e3 CK |
803 | #ifdef HAVE_BLK_MODE_T |
804 | disk_check_media_change(disk); | |
805 | #else | |
8a02d01e | 806 | zfs_check_media_change(bdev); |
43e8f6e3 | 807 | #endif |
8a02d01e BB |
808 | |
809 | return (error); | |
5df7e9d8 MM |
810 | } |
811 | ||
5df7e9d8 | 812 | static void |
43e8f6e3 CK |
813 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG |
814 | zvol_release(struct gendisk *disk) | |
815 | #else | |
816 | zvol_release(struct gendisk *disk, fmode_t unused) | |
817 | #endif | |
5df7e9d8 | 818 | { |
43e8f6e3 CK |
819 | #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) |
820 | (void) unused; | |
821 | #endif | |
5df7e9d8 MM |
822 | zvol_state_t *zv; |
823 | boolean_t drop_suspend = B_TRUE; | |
824 | ||
825 | rw_enter(&zvol_state_lock, RW_READER); | |
826 | zv = disk->private_data; | |
827 | ||
828 | mutex_enter(&zv->zv_state_lock); | |
0b32d817 | 829 | ASSERT3U(zv->zv_open_count, >, 0); |
5df7e9d8 MM |
830 | /* |
831 | * make sure zvol is not suspended during last close | |
832 | * (hold zv_suspend_lock) and respect proper lock acquisition | |
833 | * ordering - zv_suspend_lock before zv_state_lock | |
834 | */ | |
835 | if (zv->zv_open_count == 1) { | |
836 | if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { | |
837 | mutex_exit(&zv->zv_state_lock); | |
838 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
839 | mutex_enter(&zv->zv_state_lock); | |
840 | /* check to see if zv_suspend_lock is needed */ | |
841 | if (zv->zv_open_count != 1) { | |
842 | rw_exit(&zv->zv_suspend_lock); | |
843 | drop_suspend = B_FALSE; | |
844 | } | |
845 | } | |
846 | } else { | |
847 | drop_suspend = B_FALSE; | |
848 | } | |
849 | rw_exit(&zvol_state_lock); | |
850 | ||
851 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
5df7e9d8 MM |
852 | |
853 | zv->zv_open_count--; | |
0b32d817 RM |
854 | if (zv->zv_open_count == 0) { |
855 | ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); | |
5df7e9d8 | 856 | zvol_last_close(zv); |
0b32d817 | 857 | } |
5df7e9d8 MM |
858 | |
859 | mutex_exit(&zv->zv_state_lock); | |
860 | ||
861 | if (drop_suspend) | |
862 | rw_exit(&zv->zv_suspend_lock); | |
5df7e9d8 MM |
863 | } |
864 | ||
865 | static int | |
866 | zvol_ioctl(struct block_device *bdev, fmode_t mode, | |
867 | unsigned int cmd, unsigned long arg) | |
868 | { | |
869 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
870 | int error = 0; | |
871 | ||
872 | ASSERT3U(zv->zv_open_count, >, 0); | |
873 | ||
874 | switch (cmd) { | |
875 | case BLKFLSBUF: | |
7ac56b86 | 876 | #ifdef HAVE_FSYNC_BDEV |
5df7e9d8 | 877 | fsync_bdev(bdev); |
7ac56b86 CK |
878 | #elif defined(HAVE_SYNC_BLOCKDEV) |
879 | sync_blockdev(bdev); | |
880 | #else | |
881 | #error "Neither fsync_bdev() nor sync_blockdev() found" | |
882 | #endif | |
5df7e9d8 MM |
883 | invalidate_bdev(bdev); |
884 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
885 | ||
886 | if (!(zv->zv_flags & ZVOL_RDONLY)) | |
887 | txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); | |
888 | ||
889 | rw_exit(&zv->zv_suspend_lock); | |
890 | break; | |
891 | ||
892 | case BLKZNAME: | |
893 | mutex_enter(&zv->zv_state_lock); | |
894 | error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); | |
895 | mutex_exit(&zv->zv_state_lock); | |
896 | break; | |
897 | ||
898 | default: | |
899 | error = -ENOTTY; | |
900 | break; | |
901 | } | |
902 | ||
903 | return (SET_ERROR(error)); | |
904 | } | |
905 | ||
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: forwards every argument unchanged to
 * zvol_ioctl() and returns its result.
 */
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	int rc = zvol_ioctl(bdev, mode, cmd, arg);

	return (rc);
}
#else
/* Without CONFIG_COMPAT no compat handler is installed. */
#define	zvol_compat_ioctl	NULL
#endif
916 | ||
5df7e9d8 MM |
917 | static unsigned int |
918 | zvol_check_events(struct gendisk *disk, unsigned int clearing) | |
919 | { | |
920 | unsigned int mask = 0; | |
921 | ||
922 | rw_enter(&zvol_state_lock, RW_READER); | |
923 | ||
924 | zvol_state_t *zv = disk->private_data; | |
925 | if (zv != NULL) { | |
926 | mutex_enter(&zv->zv_state_lock); | |
927 | mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; | |
928 | zv->zv_changed = 0; | |
929 | mutex_exit(&zv->zv_state_lock); | |
930 | } | |
931 | ||
932 | rw_exit(&zvol_state_lock); | |
933 | ||
934 | return (mask); | |
935 | } | |
5df7e9d8 MM |
936 | |
937 | static int | |
938 | zvol_revalidate_disk(struct gendisk *disk) | |
939 | { | |
940 | rw_enter(&zvol_state_lock, RW_READER); | |
941 | ||
942 | zvol_state_t *zv = disk->private_data; | |
943 | if (zv != NULL) { | |
944 | mutex_enter(&zv->zv_state_lock); | |
945 | set_capacity(zv->zv_zso->zvo_disk, | |
946 | zv->zv_volsize >> SECTOR_BITS); | |
947 | mutex_exit(&zv->zv_state_lock); | |
948 | } | |
949 | ||
950 | rw_exit(&zvol_state_lock); | |
951 | ||
952 | return (0); | |
953 | } | |
954 | ||
1dccfd7a CS |
955 | int |
956 | zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) | |
5df7e9d8 | 957 | { |
1c0bbd52 | 958 | struct gendisk *disk = zv->zv_zso->zvo_disk; |
5df7e9d8 | 959 | |
19697e45 | 960 | #if defined(HAVE_REVALIDATE_DISK_SIZE) |
1c0bbd52 | 961 | revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); |
19697e45 | 962 | #elif defined(HAVE_REVALIDATE_DISK) |
1c0bbd52 | 963 | revalidate_disk(disk); |
19697e45 BB |
964 | #else |
965 | zvol_revalidate_disk(disk); | |
59b68723 | 966 | #endif |
5df7e9d8 MM |
967 | return (0); |
968 | } | |
969 | ||
1dccfd7a CS |
970 | void |
971 | zvol_os_clear_private(zvol_state_t *zv) | |
5df7e9d8 MM |
972 | { |
973 | /* | |
974 | * Cleared while holding zvol_state_lock as a writer | |
975 | * which will prevent zvol_open() from opening it. | |
976 | */ | |
977 | zv->zv_zso->zvo_disk->private_data = NULL; | |
978 | } | |
979 | ||
980 | /* | |
981 | * Provide a simple virtual geometry for legacy compatibility. For devices | |
982 | * smaller than 1 MiB a small head and sector count is used to allow very | |
983 | * tiny devices. For devices over 1 Mib a standard head and sector count | |
984 | * is used to keep the cylinders count reasonable. | |
985 | */ | |
986 | static int | |
987 | zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |
988 | { | |
989 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
990 | sector_t sectors; | |
991 | ||
992 | ASSERT3U(zv->zv_open_count, >, 0); | |
993 | ||
994 | sectors = get_capacity(zv->zv_zso->zvo_disk); | |
995 | ||
996 | if (sectors > 2048) { | |
997 | geo->heads = 16; | |
998 | geo->sectors = 63; | |
999 | } else { | |
1000 | geo->heads = 2; | |
1001 | geo->sectors = 4; | |
1002 | } | |
1003 | ||
1004 | geo->start = 0; | |
1005 | geo->cylinders = sectors / (geo->heads * geo->sectors); | |
1006 | ||
1007 | return (0); | |
1008 | } | |
1009 | ||
6f73d021 TH |
1010 | /* |
1011 | * Why have two separate block_device_operations structs? | |
1012 | * | |
1013 | * Normally we'd just have one, and assign 'submit_bio' as needed. However, | |
1014 | * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we | |
1015 | * can't just change submit_bio dynamically at runtime. So just create two | |
1016 | * separate structs to get around this. | |
1017 | */ | |
1018 | static const struct block_device_operations zvol_ops_blk_mq = { | |
1019 | .open = zvol_open, | |
1020 | .release = zvol_release, | |
1021 | .ioctl = zvol_ioctl, | |
1022 | .compat_ioctl = zvol_compat_ioctl, | |
1023 | .check_events = zvol_check_events, | |
1024 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK | |
1025 | .revalidate_disk = zvol_revalidate_disk, | |
1026 | #endif | |
1027 | .getgeo = zvol_getgeo, | |
1028 | .owner = THIS_MODULE, | |
1029 | }; | |
1030 | ||
18168da7 | 1031 | static const struct block_device_operations zvol_ops = { |
5df7e9d8 MM |
1032 | .open = zvol_open, |
1033 | .release = zvol_release, | |
1034 | .ioctl = zvol_ioctl, | |
1035 | .compat_ioctl = zvol_compat_ioctl, | |
5df7e9d8 | 1036 | .check_events = zvol_check_events, |
48c7b0e4 | 1037 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK |
5df7e9d8 | 1038 | .revalidate_disk = zvol_revalidate_disk, |
48c7b0e4 | 1039 | #endif |
5df7e9d8 MM |
1040 | .getgeo = zvol_getgeo, |
1041 | .owner = THIS_MODULE, | |
d817c171 | 1042 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS |
1b06b03a | 1043 | .submit_bio = zvol_submit_bio, |
d817c171 | 1044 | #endif |
5df7e9d8 MM |
1045 | }; |
1046 | ||
6f73d021 TH |
1047 | static int |
1048 | zvol_alloc_non_blk_mq(struct zvol_state_os *zso) | |
1049 | { | |
1050 | #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) | |
1051 | #if defined(HAVE_BLK_ALLOC_DISK) | |
1052 | zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); | |
1053 | if (zso->zvo_disk == NULL) | |
1054 | return (1); | |
1055 | ||
1056 | zso->zvo_disk->minors = ZVOL_MINORS; | |
1057 | zso->zvo_queue = zso->zvo_disk->queue; | |
1058 | #else | |
1059 | zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); | |
1060 | if (zso->zvo_queue == NULL) | |
1061 | return (1); | |
1062 | ||
1063 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1064 | if (zso->zvo_disk == NULL) { | |
1065 | blk_cleanup_queue(zso->zvo_queue); | |
1066 | return (1); | |
1067 | } | |
1068 | ||
1069 | zso->zvo_disk->queue = zso->zvo_queue; | |
1070 | #endif /* HAVE_BLK_ALLOC_DISK */ | |
1071 | #else | |
1072 | zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); | |
1073 | if (zso->zvo_queue == NULL) | |
1074 | return (1); | |
1075 | ||
1076 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1077 | if (zso->zvo_disk == NULL) { | |
1078 | blk_cleanup_queue(zso->zvo_queue); | |
1079 | return (1); | |
1080 | } | |
1081 | ||
1082 | zso->zvo_disk->queue = zso->zvo_queue; | |
1083 | #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ | |
1084 | return (0); | |
1085 | ||
1086 | } | |
1087 | ||
1088 | static int | |
1089 | zvol_alloc_blk_mq(zvol_state_t *zv) | |
1090 | { | |
1091 | #ifdef HAVE_BLK_MQ | |
1092 | struct zvol_state_os *zso = zv->zv_zso; | |
1093 | ||
1094 | /* Allocate our blk-mq tag_set */ | |
1095 | if (zvol_blk_mq_alloc_tag_set(zv) != 0) | |
1096 | return (1); | |
1097 | ||
1098 | #if defined(HAVE_BLK_ALLOC_DISK) | |
1099 | zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); | |
1100 | if (zso->zvo_disk == NULL) { | |
1101 | blk_mq_free_tag_set(&zso->tag_set); | |
1102 | return (1); | |
1103 | } | |
1104 | zso->zvo_queue = zso->zvo_disk->queue; | |
1105 | zso->zvo_disk->minors = ZVOL_MINORS; | |
1106 | #else | |
1107 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1108 | if (zso->zvo_disk == NULL) { | |
1109 | blk_cleanup_queue(zso->zvo_queue); | |
1110 | blk_mq_free_tag_set(&zso->tag_set); | |
1111 | return (1); | |
1112 | } | |
1113 | /* Allocate queue */ | |
1114 | zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); | |
1115 | if (IS_ERR(zso->zvo_queue)) { | |
1116 | blk_mq_free_tag_set(&zso->tag_set); | |
1117 | return (1); | |
1118 | } | |
1119 | ||
1120 | /* Our queue is now created, assign it to our disk */ | |
1121 | zso->zvo_disk->queue = zso->zvo_queue; | |
1122 | ||
1123 | #endif | |
1124 | #endif | |
1125 | return (0); | |
1126 | } | |
1127 | ||
5df7e9d8 MM |
1128 | /* |
1129 | * Allocate memory for a new zvol_state_t and setup the required | |
1130 | * request queue and generic disk structures for the block device. | |
1131 | */ | |
1132 | static zvol_state_t * | |
1133 | zvol_alloc(dev_t dev, const char *name) | |
1134 | { | |
1135 | zvol_state_t *zv; | |
68dde63d | 1136 | struct zvol_state_os *zso; |
5df7e9d8 | 1137 | uint64_t volmode; |
6f73d021 | 1138 | int ret; |
5df7e9d8 MM |
1139 | |
1140 | if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) | |
1141 | return (NULL); | |
1142 | ||
1143 | if (volmode == ZFS_VOLMODE_DEFAULT) | |
1144 | volmode = zvol_volmode; | |
1145 | ||
1146 | if (volmode == ZFS_VOLMODE_NONE) | |
1147 | return (NULL); | |
1148 | ||
1149 | zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); | |
68dde63d BB |
1150 | zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); |
1151 | zv->zv_zso = zso; | |
0ca45cb3 | 1152 | zv->zv_volmode = volmode; |
5df7e9d8 MM |
1153 | |
1154 | list_link_init(&zv->zv_next); | |
5df7e9d8 MM |
1155 | mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); |
1156 | ||
6f73d021 TH |
1157 | #ifdef HAVE_BLK_MQ |
1158 | zv->zv_zso->use_blk_mq = zvol_use_blk_mq; | |
1159 | #endif | |
1b06b03a | 1160 | |
6f73d021 TH |
1161 | /* |
1162 | * The block layer has 3 interfaces for getting BIOs: | |
1163 | * | |
1164 | * 1. blk-mq request queues (new) | |
1165 | * 2. submit_bio() (oldest) | |
1166 | * 3. regular request queues (old). | |
1167 | * | |
1168 | * Each of those interfaces has two permutations: | |
1169 | * | |
1170 | * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates | |
1171 | * both the disk and its queue (5.14 kernel or newer) | |
1172 | * | |
1173 | * b) We don't have blk_*alloc_disk(), and have to allocate the | |
1174 | * disk and the queue separately. (5.13 kernel or older) | |
1175 | */ | |
1176 | if (zv->zv_zso->use_blk_mq) { | |
1177 | ret = zvol_alloc_blk_mq(zv); | |
1178 | zso->zvo_disk->fops = &zvol_ops_blk_mq; | |
1179 | } else { | |
1180 | ret = zvol_alloc_non_blk_mq(zso); | |
1181 | zso->zvo_disk->fops = &zvol_ops; | |
1b06b03a | 1182 | } |
6f73d021 | 1183 | if (ret != 0) |
5df7e9d8 MM |
1184 | goto out_kmem; |
1185 | ||
68dde63d | 1186 | blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); |
5df7e9d8 MM |
1187 | |
1188 | /* Limit read-ahead to a single page to prevent over-prefetching. */ | |
68dde63d | 1189 | blk_queue_set_read_ahead(zso->zvo_queue, 1); |
5df7e9d8 | 1190 | |
6f73d021 TH |
1191 | if (!zv->zv_zso->use_blk_mq) { |
1192 | /* Disable write merging in favor of the ZIO pipeline. */ | |
1193 | blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); | |
1194 | } | |
5df7e9d8 | 1195 | |
ae1e40b3 BB |
1196 | /* Enable /proc/diskstats */ |
1197 | blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); | |
1198 | ||
68dde63d BB |
1199 | zso->zvo_queue->queuedata = zv; |
1200 | zso->zvo_dev = dev; | |
5df7e9d8 MM |
1201 | zv->zv_open_count = 0; |
1202 | strlcpy(zv->zv_name, name, MAXNAMELEN); | |
1203 | ||
2cc479d0 | 1204 | zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); |
5df7e9d8 MM |
1205 | rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); |
1206 | ||
68dde63d BB |
1207 | zso->zvo_disk->major = zvol_major; |
1208 | zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; | |
5df7e9d8 | 1209 | |
026f126b BB |
1210 | /* |
1211 | * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. | |
1212 | * This is accomplished by limiting the number of minors for the | |
1213 | * device to one and explicitly disabling partition scanning. | |
1214 | */ | |
5df7e9d8 | 1215 | if (volmode == ZFS_VOLMODE_DEV) { |
68dde63d | 1216 | zso->zvo_disk->minors = 1; |
026f126b BB |
1217 | zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; |
1218 | zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; | |
5df7e9d8 | 1219 | } |
026f126b | 1220 | |
68dde63d | 1221 | zso->zvo_disk->first_minor = (dev & MINORMASK); |
68dde63d | 1222 | zso->zvo_disk->private_data = zv; |
68dde63d | 1223 | snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", |
5df7e9d8 MM |
1224 | ZVOL_DEV_NAME, (dev & MINORMASK)); |
1225 | ||
1226 | return (zv); | |
1227 | ||
5df7e9d8 | 1228 | out_kmem: |
68dde63d | 1229 | kmem_free(zso, sizeof (struct zvol_state_os)); |
5df7e9d8 MM |
1230 | kmem_free(zv, sizeof (zvol_state_t)); |
1231 | return (NULL); | |
1232 | } | |
1233 | ||
1234 | /* | |
1235 | * Cleanup then free a zvol_state_t which was created by zvol_alloc(). | |
1236 | * At this time, the structure is not opened by anyone, is taken off | |
1237 | * the zvol_state_list, and has its private data set to NULL. | |
1238 | * The zvol_state_lock is dropped. | |
99573cc0 PS |
1239 | * |
1240 | * This function may take many milliseconds to complete (e.g. we've seen | |
1241 | * it take over 256ms), due to the calls to "blk_cleanup_queue" and | |
1242 | * "del_gendisk". Thus, consumers need to be careful to account for this | |
1243 | * latency when calling this function. | |
5df7e9d8 | 1244 | */ |
1dccfd7a CS |
1245 | void |
1246 | zvol_os_free(zvol_state_t *zv) | |
5df7e9d8 MM |
1247 | { |
1248 | ||
1249 | ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); | |
1250 | ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); | |
0b32d817 RM |
1251 | ASSERT0(zv->zv_open_count); |
1252 | ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); | |
5df7e9d8 MM |
1253 | |
1254 | rw_destroy(&zv->zv_suspend_lock); | |
2cc479d0 | 1255 | zfs_rangelock_fini(&zv->zv_rangelock); |
5df7e9d8 MM |
1256 | |
1257 | del_gendisk(zv->zv_zso->zvo_disk); | |
1b06b03a BB |
1258 | #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ |
1259 | defined(HAVE_BLK_ALLOC_DISK) | |
c26045b4 | 1260 | #if defined(HAVE_BLK_CLEANUP_DISK) |
1b06b03a | 1261 | blk_cleanup_disk(zv->zv_zso->zvo_disk); |
c26045b4 BB |
1262 | #else |
1263 | put_disk(zv->zv_zso->zvo_disk); | |
1264 | #endif | |
1b06b03a | 1265 | #else |
5df7e9d8 MM |
1266 | blk_cleanup_queue(zv->zv_zso->zvo_queue); |
1267 | put_disk(zv->zv_zso->zvo_disk); | |
1b06b03a | 1268 | #endif |
5df7e9d8 | 1269 | |
6f73d021 TH |
1270 | #ifdef HAVE_BLK_MQ |
1271 | if (zv->zv_zso->use_blk_mq) | |
1272 | blk_mq_free_tag_set(&zv->zv_zso->tag_set); | |
1273 | #endif | |
1274 | ||
5df7e9d8 MM |
1275 | ida_simple_remove(&zvol_ida, |
1276 | MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); | |
1277 | ||
1278 | mutex_destroy(&zv->zv_state_lock); | |
4547fc4e | 1279 | dataset_kstats_destroy(&zv->zv_kstat); |
5df7e9d8 MM |
1280 | |
1281 | kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); | |
1282 | kmem_free(zv, sizeof (zvol_state_t)); | |
1283 | } | |
1284 | ||
0ca45cb3 MM |
1285 | void |
1286 | zvol_wait_close(zvol_state_t *zv) | |
1287 | { | |
1288 | } | |
1289 | ||
5df7e9d8 MM |
1290 | /* |
1291 | * Create a block device minor node and setup the linkage between it | |
1292 | * and the specified volume. Once this function returns the block | |
1293 | * device is live and ready for use. | |
1294 | */ | |
1dccfd7a | 1295 | int |
ec213971 | 1296 | zvol_os_create_minor(const char *name) |
5df7e9d8 MM |
1297 | { |
1298 | zvol_state_t *zv; | |
1299 | objset_t *os; | |
1300 | dmu_object_info_t *doi; | |
1301 | uint64_t volsize; | |
1302 | uint64_t len; | |
1303 | unsigned minor = 0; | |
1304 | int error = 0; | |
1305 | int idx; | |
1306 | uint64_t hash = zvol_name_hash(name); | |
60387fac | 1307 | uint64_t volthreading; |
e197bb24 | 1308 | bool replayed_zil = B_FALSE; |
5df7e9d8 MM |
1309 | |
1310 | if (zvol_inhibit_dev) | |
1311 | return (0); | |
1312 | ||
1313 | idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); | |
1314 | if (idx < 0) | |
1315 | return (SET_ERROR(-idx)); | |
1316 | minor = idx << ZVOL_MINOR_BITS; | |
1317 | ||
1318 | zv = zvol_find_by_name_hash(name, hash, RW_NONE); | |
1319 | if (zv) { | |
1320 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1321 | mutex_exit(&zv->zv_state_lock); | |
1322 | ida_simple_remove(&zvol_ida, idx); | |
1323 | return (SET_ERROR(EEXIST)); | |
1324 | } | |
1325 | ||
1326 | doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); | |
1327 | ||
1328 | error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); | |
1329 | if (error) | |
1330 | goto out_doi; | |
1331 | ||
1332 | error = dmu_object_info(os, ZVOL_OBJ, doi); | |
1333 | if (error) | |
1334 | goto out_dmu_objset_disown; | |
1335 | ||
1336 | error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); | |
1337 | if (error) | |
1338 | goto out_dmu_objset_disown; | |
1339 | ||
1340 | zv = zvol_alloc(MKDEV(zvol_major, minor), name); | |
1341 | if (zv == NULL) { | |
1342 | error = SET_ERROR(EAGAIN); | |
1343 | goto out_dmu_objset_disown; | |
1344 | } | |
1345 | zv->zv_hash = hash; | |
1346 | ||
1347 | if (dmu_objset_is_snapshot(os)) | |
1348 | zv->zv_flags |= ZVOL_RDONLY; | |
1349 | ||
1350 | zv->zv_volblocksize = doi->doi_data_block_size; | |
1351 | zv->zv_volsize = volsize; | |
1352 | zv->zv_objset = os; | |
1353 | ||
60387fac AH |
1354 | /* Default */ |
1355 | zv->zv_threading = B_TRUE; | |
1356 | if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) | |
1357 | == 0) | |
1358 | zv->zv_threading = volthreading; | |
1359 | ||
5df7e9d8 MM |
1360 | set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); |
1361 | ||
1362 | blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, | |
1363 | (DMU_MAX_ACCESS / 4) >> 9); | |
6f73d021 TH |
1364 | |
1365 | if (zv->zv_zso->use_blk_mq) { | |
1366 | /* | |
1367 | * IO requests can be really big (1MB). When an IO request | |
1368 | * comes in, it is passed off to zvol_read() or zvol_write() | |
1369 | * in a new thread, where it is chunked up into 'volblocksize' | |
1370 | * sized pieces and processed. So for example, if the request | |
1371 | * is a 1MB write and your volblocksize is 128k, one zvol_write | |
1372 | * thread will take that request and sequentially do ten 128k | |
1373 | * IOs. This is due to the fact that the thread needs to lock | |
1374 | * each volblocksize sized block. So you might be wondering: | |
1375 | * "instead of passing the whole 1MB request to one thread, | |
1376 | * why not pass ten individual 128k chunks to ten threads and | |
1377 | * process the whole write in parallel?" The short answer is | |
1378 | * that there's a sweet spot number of chunks that balances | |
1379 | * the greater parallelism with the added overhead of more | |
1380 | * threads. The sweet spot can be different depending on if you | |
1381 | * have a read or write heavy workload. Writes typically want | |
1382 | * high chunk counts while reads typically want lower ones. On | |
1383 | * a test pool with 6 NVMe drives in a 3x 2-disk mirror | |
1384 | * configuration, with volblocksize=8k, the sweet spot for good | |
1385 | * sequential reads and writes was at 8 chunks. | |
1386 | */ | |
1387 | ||
1388 | /* | |
1389 | * Below we tell the kernel how big we want our requests | |
1390 | * to be. You would think that blk_queue_io_opt() would be | |
1391 | * used to do this since it is used to "set optimal request | |
1392 | * size for the queue", but that doesn't seem to do | |
1393 | * anything - the kernel still gives you huge requests | |
1394 | * with tons of little PAGE_SIZE segments contained within it. | |
1395 | * | |
1396 | * Knowing that the kernel will just give you PAGE_SIZE segments | |
1397 | * no matter what, you can say "ok, I want PAGE_SIZE byte | |
1398 | * segments, and I want 'N' of them per request", where N is | |
1399 | * the correct number of segments for the volblocksize and | |
1400 | * number of chunks you want. | |
1401 | */ | |
1402 | #ifdef HAVE_BLK_MQ | |
1403 | if (zvol_blk_mq_blocks_per_thread != 0) { | |
1404 | unsigned int chunks; | |
1405 | chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); | |
1406 | ||
1407 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, | |
1408 | PAGE_SIZE); | |
1409 | blk_queue_max_segments(zv->zv_zso->zvo_queue, | |
1410 | (zv->zv_volblocksize * chunks) / PAGE_SIZE); | |
1411 | } else { | |
1412 | /* | |
1413 | * Special case: zvol_blk_mq_blocks_per_thread = 0 | |
1414 | * Max everything out. | |
1415 | */ | |
1416 | blk_queue_max_segments(zv->zv_zso->zvo_queue, | |
1417 | UINT16_MAX); | |
1418 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, | |
1419 | UINT_MAX); | |
1420 | } | |
1421 | #endif | |
1422 | } else { | |
1423 | blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); | |
1424 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); | |
1425 | } | |
1426 | ||
5df7e9d8 MM |
1427 | blk_queue_physical_block_size(zv->zv_zso->zvo_queue, |
1428 | zv->zv_volblocksize); | |
1429 | blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); | |
1430 | blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, | |
1431 | (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); | |
1432 | blk_queue_discard_granularity(zv->zv_zso->zvo_queue, | |
1433 | zv->zv_volblocksize); | |
5e4aedac | 1434 | #ifdef QUEUE_FLAG_DISCARD |
5df7e9d8 | 1435 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); |
5e4aedac | 1436 | #endif |
5df7e9d8 MM |
1437 | #ifdef QUEUE_FLAG_NONROT |
1438 | blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); | |
1439 | #endif | |
1440 | #ifdef QUEUE_FLAG_ADD_RANDOM | |
1441 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); | |
1442 | #endif | |
1443 | /* This flag was introduced in kernel version 4.12. */ | |
1444 | #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH | |
1445 | blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); | |
1446 | #endif | |
1447 | ||
fb087146 AH |
1448 | ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); |
1449 | error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); | |
1450 | if (error) | |
1451 | goto out_dmu_objset_disown; | |
93e36580 | 1452 | ASSERT3P(zv->zv_zilog, ==, NULL); |
fb087146 | 1453 | zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); |
5df7e9d8 MM |
1454 | if (spa_writeable(dmu_objset_spa(os))) { |
1455 | if (zil_replay_disable) | |
e197bb24 | 1456 | replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); |
5df7e9d8 | 1457 | else |
e197bb24 | 1458 | replayed_zil = zil_replay(os, zv, zvol_replay_vector); |
5df7e9d8 | 1459 | } |
e197bb24 AS |
1460 | if (replayed_zil) |
1461 | zil_close(zv->zv_zilog); | |
93e36580 | 1462 | zv->zv_zilog = NULL; |
5df7e9d8 MM |
1463 | |
1464 | /* | |
1465 | * When udev detects the addition of the device it will immediately | |
1466 | * invoke blkid(8) to determine the type of content on the device. | |
1467 | * Prefetching the blocks commonly scanned by blkid(8) will speed | |
1468 | * up this process. | |
1469 | */ | |
8ef15f93 | 1470 | len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); |
5df7e9d8 MM |
1471 | if (len > 0) { |
1472 | dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); | |
1473 | dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, | |
1474 | ZIO_PRIORITY_SYNC_READ); | |
1475 | } | |
1476 | ||
1477 | zv->zv_objset = NULL; | |
1478 | out_dmu_objset_disown: | |
1479 | dmu_objset_disown(os, B_TRUE, FTAG); | |
1480 | out_doi: | |
1481 | kmem_free(doi, sizeof (dmu_object_info_t)); | |
1482 | ||
1483 | /* | |
1484 | * Keep in mind that once add_disk() is called, the zvol is | |
1485 | * announced to the world, and zvol_open()/zvol_release() can | |
1486 | * be called at any time. Incidentally, add_disk() itself calls | |
1487 | * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() | |
1488 | * directly as well. | |
1489 | */ | |
1490 | if (error == 0) { | |
1491 | rw_enter(&zvol_state_lock, RW_WRITER); | |
1492 | zvol_insert(zv); | |
1493 | rw_exit(&zvol_state_lock); | |
12fa250d RE |
1494 | #ifdef HAVE_ADD_DISK_RET |
1495 | error = add_disk(zv->zv_zso->zvo_disk); | |
1496 | #else | |
5df7e9d8 | 1497 | add_disk(zv->zv_zso->zvo_disk); |
12fa250d | 1498 | #endif |
5df7e9d8 MM |
1499 | } else { |
1500 | ida_simple_remove(&zvol_ida, idx); | |
1501 | } | |
1502 | ||
ec213971 | 1503 | return (error); |
5df7e9d8 MM |
1504 | } |
1505 | ||
1dccfd7a CS |
1506 | void |
1507 | zvol_os_rename_minor(zvol_state_t *zv, const char *newname) | |
5df7e9d8 MM |
1508 | { |
1509 | int readonly = get_disk_ro(zv->zv_zso->zvo_disk); | |
1510 | ||
1511 | ASSERT(RW_LOCK_HELD(&zvol_state_lock)); | |
1512 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1513 | ||
1514 | strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); | |
1515 | ||
1516 | /* move to new hashtable entry */ | |
1517 | zv->zv_hash = zvol_name_hash(zv->zv_name); | |
1518 | hlist_del(&zv->zv_hlink); | |
1519 | hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); | |
1520 | ||
1521 | /* | |
1522 | * The block device's read-only state is briefly changed causing | |
1523 | * a KOBJ_CHANGE uevent to be issued. This ensures udev detects | |
1524 | * the name change and fixes the symlinks. This does not change | |
1525 | * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never | |
1526 | * changes. This would normally be done using kobject_uevent() but | |
1527 | * that is a GPL-only symbol which is why we need this workaround. | |
1528 | */ | |
1529 | set_disk_ro(zv->zv_zso->zvo_disk, !readonly); | |
1530 | set_disk_ro(zv->zv_zso->zvo_disk, readonly); | |
1531 | } | |
1532 | ||
1dccfd7a CS |
1533 | void |
1534 | zvol_os_set_disk_ro(zvol_state_t *zv, int flags) | |
5df7e9d8 MM |
1535 | { |
1536 | ||
1537 | set_disk_ro(zv->zv_zso->zvo_disk, flags); | |
1538 | } | |
1539 | ||
1dccfd7a CS |
1540 | void |
1541 | zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) | |
5df7e9d8 MM |
1542 | { |
1543 | ||
1544 | set_capacity(zv->zv_zso->zvo_disk, capacity); | |
1545 | } | |
1546 | ||
5df7e9d8 MM |
1547 | int |
1548 | zvol_init(void) | |
1549 | { | |
1550 | int error; | |
6f73d021 TH |
1551 | |
1552 | /* | |
1553 | * zvol_threads is the module param the user passes in. | |
1554 | * | |
1555 | * zvol_actual_threads is what we use internally, since the user can | |
1556 | * pass zvol_thread = 0 to mean "use all the CPUs" (the default). | |
1557 | */ | |
1558 | static unsigned int zvol_actual_threads; | |
1559 | ||
1560 | if (zvol_threads == 0) { | |
1561 | /* | |
1562 | * See dde9380a1 for why 32 was chosen here. This should | |
1563 | * probably be refined to be some multiple of the number | |
1564 | * of CPUs. | |
1565 | */ | |
1566 | zvol_actual_threads = MAX(num_online_cpus(), 32); | |
1567 | } else { | |
1568 | zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); | |
1569 | } | |
5df7e9d8 MM |
1570 | |
1571 | error = register_blkdev(zvol_major, ZVOL_DRIVER); | |
1572 | if (error) { | |
1573 | printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); | |
1574 | return (error); | |
1575 | } | |
6f73d021 TH |
1576 | |
1577 | #ifdef HAVE_BLK_MQ | |
1578 | if (zvol_blk_mq_queue_depth == 0) { | |
1579 | zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; | |
1580 | } else { | |
1581 | zvol_actual_blk_mq_queue_depth = | |
1582 | MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); | |
1583 | } | |
1584 | ||
1585 | if (zvol_blk_mq_threads == 0) { | |
1586 | zvol_blk_mq_actual_threads = num_online_cpus(); | |
1587 | } else { | |
1588 | zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), | |
1589 | 1024); | |
1590 | } | |
1591 | #endif | |
1592 | zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, | |
1593 | zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); | |
5df7e9d8 MM |
1594 | if (zvol_taskq == NULL) { |
1595 | unregister_blkdev(zvol_major, ZVOL_DRIVER); | |
1596 | return (-ENOMEM); | |
1597 | } | |
6f73d021 | 1598 | |
5df7e9d8 | 1599 | zvol_init_impl(); |
5df7e9d8 | 1600 | ida_init(&zvol_ida); |
5df7e9d8 MM |
1601 | return (0); |
1602 | } | |
1603 | ||
1604 | void | |
1605 | zvol_fini(void) | |
1606 | { | |
5df7e9d8 | 1607 | zvol_fini_impl(); |
5df7e9d8 MM |
1608 | unregister_blkdev(zvol_major, ZVOL_DRIVER); |
1609 | taskq_destroy(zvol_taskq); | |
1610 | ida_destroy(&zvol_ida); | |
1611 | } | |
1612 | ||
1613 | /* BEGIN CSTYLED */ | |
1614 | module_param(zvol_inhibit_dev, uint, 0644); | |
1615 | MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); | |
1616 | ||
1617 | module_param(zvol_major, uint, 0444); | |
1618 | MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); | |
1619 | ||
1620 | module_param(zvol_threads, uint, 0444); | |
6f73d021 TH |
1621 | MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" |
1622 | "to 0 to use all active CPUs"); | |
5df7e9d8 MM |
1623 | |
1624 | module_param(zvol_request_sync, uint, 0644); | |
1625 | MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); | |
1626 | ||
1627 | module_param(zvol_max_discard_blocks, ulong, 0444); | |
1628 | MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); | |
1629 | ||
1630 | module_param(zvol_prefetch_bytes, uint, 0644); | |
1631 | MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); | |
1632 | ||
1633 | module_param(zvol_volmode, uint, 0644); | |
1634 | MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); | |
6f73d021 | 1635 | |
05c4710e TH |
1636 | #ifdef HAVE_BLK_MQ |
1637 | module_param(zvol_blk_mq_queue_depth, uint, 0644); | |
1638 | MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); | |
1639 | ||
1640 | module_param(zvol_use_blk_mq, uint, 0644); | |
1641 | MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); | |
1642 | ||
1643 | module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); | |
1644 | MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, | |
1645 | "Process volblocksize blocks per thread"); | |
1646 | #endif | |
1647 | ||
945e39fc PS |
1648 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
1649 | module_param(zvol_open_timeout_ms, uint, 0644); | |
1650 | MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); | |
1651 | #endif | |
1652 | ||
5df7e9d8 | 1653 | /* END CSTYLED */ |