]>
Commit | Line | Data |
---|---|---|
5df7e9d8 MM |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
5df7e9d8 MM |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
0929c4de MA |
21 | /* |
22 | * Copyright (c) 2012, 2020 by Delphix. All rights reserved. | |
23 | */ | |
5df7e9d8 MM |
24 | |
25 | #include <sys/dataset_kstats.h> | |
26 | #include <sys/dbuf.h> | |
27 | #include <sys/dmu_traverse.h> | |
28 | #include <sys/dsl_dataset.h> | |
29 | #include <sys/dsl_prop.h> | |
30 | #include <sys/dsl_dir.h> | |
31 | #include <sys/zap.h> | |
32 | #include <sys/zfeature.h> | |
33 | #include <sys/zil_impl.h> | |
34 | #include <sys/dmu_tx.h> | |
35 | #include <sys/zio.h> | |
36 | #include <sys/zfs_rlock.h> | |
37 | #include <sys/spa_impl.h> | |
38 | #include <sys/zvol.h> | |
39 | #include <sys/zvol_impl.h> | |
40 | ||
41 | #include <linux/blkdev_compat.h> | |
42 | #include <linux/task_io_accounting_ops.h> | |
43 | ||
6f73d021 TH |
44 | #ifdef HAVE_BLK_MQ |
45 | #include <linux/blk-mq.h> | |
46 | #endif | |
47 | ||
48 | static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, | |
49 | struct request *rq, boolean_t force_sync); | |
50 | ||
18168da7 AZ |
51 | static unsigned int zvol_major = ZVOL_MAJOR; |
52 | static unsigned int zvol_request_sync = 0; | |
53 | static unsigned int zvol_prefetch_bytes = (128 * 1024); | |
54 | static unsigned long zvol_max_discard_blocks = 16384; | |
abdcef47 PH |
55 | |
56 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS | |
945e39fc | 57 | static unsigned int zvol_open_timeout_ms = 1000; |
abdcef47 | 58 | #endif |
5df7e9d8 | 59 | |
6f73d021 TH |
60 | static unsigned int zvol_threads = 0; |
61 | #ifdef HAVE_BLK_MQ | |
62 | static unsigned int zvol_blk_mq_threads = 0; | |
63 | static unsigned int zvol_blk_mq_actual_threads; | |
64 | static boolean_t zvol_use_blk_mq = B_FALSE; | |
65 | ||
66 | /* | |
67 | * The maximum number of volblocksize blocks to process per thread. Typically, | |
68 | * write heavy workloads preform better with higher values here, and read | |
69 | * heavy workloads preform better with lower values, but that's not a hard | |
70 | * and fast rule. It's basically a knob to tune between "less overhead with | |
71 | * less parallelism" and "more overhead, but more parallelism". | |
72 | * | |
73 | * '8' was chosen as a reasonable, balanced, default based off of sequential | |
74 | * read and write tests to a zvol in an NVMe pool (with 16 CPUs). | |
75 | */ | |
76 | static unsigned int zvol_blk_mq_blocks_per_thread = 8; | |
77 | #endif | |
78 | ||
79 | #ifndef BLKDEV_DEFAULT_RQ | |
80 | /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ | |
81 | #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ | |
82 | #endif | |
83 | ||
/*
 * Complete an I/O.  When a bio is supplied it is finished with
 * BIO_END_IO(); otherwise (blk-mq builds only) the request is finished
 * with blk_mq_end_request().  Callers in this file pass a negated errno
 * (or 0) as 'error'.
 */
#ifdef HAVE_BLK_MQ
#define	END_IO(zv, bio, rq, error)  do { \
	if ((bio) != NULL) { \
		BIO_END_IO(bio, error); \
	} else { \
		blk_mq_end_request(rq, errno_to_bi_status(error)); \
	} \
} while (0)
#else
#define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
#endif

#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
/* Value handed to the tag set as queue_depth. */
static unsigned int zvol_actual_blk_mq_queue_depth;
#endif
103 | ||
5df7e9d8 MM |
104 | struct zvol_state_os { |
105 | struct gendisk *zvo_disk; /* generic disk */ | |
106 | struct request_queue *zvo_queue; /* request queue */ | |
5df7e9d8 | 107 | dev_t zvo_dev; /* device id */ |
6f73d021 TH |
108 | |
109 | #ifdef HAVE_BLK_MQ | |
110 | struct blk_mq_tag_set tag_set; | |
111 | #endif | |
112 | ||
113 | /* Set from the global 'zvol_use_blk_mq' at zvol load */ | |
114 | boolean_t use_blk_mq; | |
5df7e9d8 MM |
115 | }; |
116 | ||
27218a32 | 117 | static taskq_t *zvol_taskq; |
5df7e9d8 MM |
118 | static struct ida zvol_ida; |
119 | ||
e439ee83 | 120 | typedef struct zv_request_stack { |
5df7e9d8 MM |
121 | zvol_state_t *zv; |
122 | struct bio *bio; | |
6f73d021 | 123 | struct request *rq; |
5df7e9d8 MM |
124 | } zv_request_t; |
125 | ||
6f73d021 TH |
126 | typedef struct zv_work { |
127 | struct request *rq; | |
128 | struct work_struct work; | |
129 | } zv_work_t; | |
130 | ||
e439ee83 CS |
131 | typedef struct zv_request_task { |
132 | zv_request_t zvr; | |
133 | taskq_ent_t ent; | |
134 | } zv_request_task_t; | |
135 | ||
136 | static zv_request_task_t * | |
137 | zv_request_task_create(zv_request_t zvr) | |
138 | { | |
139 | zv_request_task_t *task; | |
140 | task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); | |
141 | taskq_init_ent(&task->ent); | |
142 | task->zvr = zvr; | |
143 | return (task); | |
144 | } | |
145 | ||
146 | static void | |
147 | zv_request_task_free(zv_request_task_t *task) | |
148 | { | |
149 | kmem_free(task, sizeof (*task)); | |
150 | } | |
151 | ||
6f73d021 TH |
152 | #ifdef HAVE_BLK_MQ |
153 | ||
154 | /* | |
155 | * This is called when a new block multiqueue request comes in. A request | |
156 | * contains one or more BIOs. | |
157 | */ | |
158 | static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, | |
159 | const struct blk_mq_queue_data *bd) | |
160 | { | |
161 | struct request *rq = bd->rq; | |
162 | zvol_state_t *zv = rq->q->queuedata; | |
163 | ||
164 | /* Tell the kernel that we are starting to process this request */ | |
165 | blk_mq_start_request(rq); | |
166 | ||
167 | if (blk_rq_is_passthrough(rq)) { | |
168 | /* Skip non filesystem request */ | |
169 | blk_mq_end_request(rq, BLK_STS_IOERR); | |
170 | return (BLK_STS_IOERR); | |
171 | } | |
172 | ||
173 | zvol_request_impl(zv, NULL, rq, 0); | |
174 | ||
175 | /* Acknowledge to the kernel that we got this request */ | |
176 | return (BLK_STS_OK); | |
177 | } | |
178 | ||
179 | static struct blk_mq_ops zvol_blk_mq_queue_ops = { | |
180 | .queue_rq = zvol_mq_queue_rq, | |
181 | }; | |
182 | ||
183 | /* Initialize our blk-mq struct */ | |
184 | static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) | |
185 | { | |
186 | struct zvol_state_os *zso = zv->zv_zso; | |
187 | ||
188 | memset(&zso->tag_set, 0, sizeof (zso->tag_set)); | |
189 | ||
190 | /* Initialize tag set. */ | |
191 | zso->tag_set.ops = &zvol_blk_mq_queue_ops; | |
192 | zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; | |
193 | zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; | |
194 | zso->tag_set.numa_node = NUMA_NO_NODE; | |
195 | zso->tag_set.cmd_size = 0; | |
196 | ||
197 | /* | |
198 | * We need BLK_MQ_F_BLOCKING here since we do blocking calls in | |
199 | * zvol_request_impl() | |
200 | */ | |
201 | zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; | |
202 | zso->tag_set.driver_data = zv; | |
203 | ||
204 | return (blk_mq_alloc_tag_set(&zso->tag_set)); | |
205 | } | |
206 | #endif /* HAVE_BLK_MQ */ | |
207 | ||
5df7e9d8 MM |
208 | /* |
209 | * Given a path, return TRUE if path is a ZVOL. | |
210 | */ | |
1dccfd7a CS |
211 | boolean_t |
212 | zvol_os_is_zvol(const char *path) | |
5df7e9d8 | 213 | { |
b7281c88 | 214 | dev_t dev = 0; |
5df7e9d8 | 215 | |
b7281c88 | 216 | if (vdev_lookup_bdev(path, &dev) != 0) |
5df7e9d8 MM |
217 | return (B_FALSE); |
218 | ||
b7281c88 | 219 | if (MAJOR(dev) == zvol_major) |
5df7e9d8 MM |
220 | return (B_TRUE); |
221 | ||
222 | return (B_FALSE); | |
223 | } | |
224 | ||
5df7e9d8 | 225 | static void |
e439ee83 | 226 | zvol_write(zv_request_t *zvr) |
5df7e9d8 | 227 | { |
5df7e9d8 | 228 | struct bio *bio = zvr->bio; |
6f73d021 | 229 | struct request *rq = zvr->rq; |
1c2358c1 | 230 | int error = 0; |
d0cd9a5c | 231 | zfs_uio_t uio; |
5df7e9d8 | 232 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
233 | struct request_queue *q; |
234 | struct gendisk *disk; | |
235 | unsigned long start_time = 0; | |
236 | boolean_t acct = B_FALSE; | |
237 | ||
0b32d817 RM |
238 | ASSERT3P(zv, !=, NULL); |
239 | ASSERT3U(zv->zv_open_count, >, 0); | |
240 | ASSERT3P(zv->zv_zilog, !=, NULL); | |
5df7e9d8 | 241 | |
6f73d021 TH |
242 | q = zv->zv_zso->zvo_queue; |
243 | disk = zv->zv_zso->zvo_disk; | |
244 | ||
0929c4de | 245 | /* bio marked as FLUSH need to flush before write */ |
6f73d021 | 246 | if (io_is_flush(bio, rq)) |
0929c4de MA |
247 | zil_commit(zv->zv_zilog, ZVOL_OBJ); |
248 | ||
249 | /* Some requests are just for flush and nothing else. */ | |
6f73d021 | 250 | if (io_size(bio, rq) == 0) { |
0929c4de | 251 | rw_exit(&zv->zv_suspend_lock); |
6f73d021 | 252 | END_IO(zv, bio, rq, 0); |
0929c4de MA |
253 | return; |
254 | } | |
255 | ||
6f73d021 TH |
256 | zfs_uio_bvec_init(&uio, bio, rq); |
257 | ||
5df7e9d8 | 258 | ssize_t start_resid = uio.uio_resid; |
a970f059 | 259 | |
6f73d021 TH |
260 | /* |
261 | * With use_blk_mq, accounting is done by blk_mq_start_request() | |
262 | * and blk_mq_end_request(), so we can skip it here. | |
263 | */ | |
264 | if (bio) { | |
265 | acct = blk_queue_io_stat(q); | |
266 | if (acct) { | |
267 | start_time = blk_generic_start_io_acct(q, disk, WRITE, | |
268 | bio); | |
269 | } | |
270 | } | |
5df7e9d8 MM |
271 | |
272 | boolean_t sync = | |
6f73d021 | 273 | io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
5df7e9d8 | 274 | |
0929c4de MA |
275 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
276 | uio.uio_loffset, uio.uio_resid, RL_WRITER); | |
277 | ||
5df7e9d8 MM |
278 | uint64_t volsize = zv->zv_volsize; |
279 | while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { | |
280 | uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); | |
281 | uint64_t off = uio.uio_loffset; | |
282 | dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); | |
283 | ||
284 | if (bytes > volsize - off) /* don't write past the end */ | |
285 | bytes = volsize - off; | |
286 | ||
20f28785 | 287 | dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); |
5df7e9d8 MM |
288 | |
289 | /* This will only fail for ENOSPC */ | |
290 | error = dmu_tx_assign(tx, TXG_WAIT); | |
291 | if (error) { | |
292 | dmu_tx_abort(tx); | |
293 | break; | |
294 | } | |
295 | error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); | |
296 | if (error == 0) { | |
297 | zvol_log_write(zv, tx, off, bytes, sync); | |
298 | } | |
299 | dmu_tx_commit(tx); | |
300 | ||
301 | if (error) | |
302 | break; | |
303 | } | |
0929c4de | 304 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
305 | |
306 | int64_t nwritten = start_resid - uio.uio_resid; | |
4547fc4e | 307 | dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); |
5df7e9d8 MM |
308 | task_io_account_write(nwritten); |
309 | ||
310 | if (sync) | |
311 | zil_commit(zv->zv_zilog, ZVOL_OBJ); | |
312 | ||
313 | rw_exit(&zv->zv_suspend_lock); | |
a970f059 | 314 | |
6f73d021 | 315 | if (bio && acct) { |
a970f059 | 316 | blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); |
6f73d021 | 317 | } |
a970f059 | 318 | |
6f73d021 | 319 | END_IO(zv, bio, rq, -error); |
5df7e9d8 MM |
320 | } |
321 | ||
322 | static void | |
e439ee83 CS |
323 | zvol_write_task(void *arg) |
324 | { | |
325 | zv_request_task_t *task = arg; | |
326 | zvol_write(&task->zvr); | |
327 | zv_request_task_free(task); | |
328 | } | |
329 | ||
330 | static void | |
331 | zvol_discard(zv_request_t *zvr) | |
5df7e9d8 | 332 | { |
5df7e9d8 | 333 | struct bio *bio = zvr->bio; |
6f73d021 | 334 | struct request *rq = zvr->rq; |
5df7e9d8 | 335 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
336 | uint64_t start = io_offset(bio, rq); |
337 | uint64_t size = io_size(bio, rq); | |
5df7e9d8 MM |
338 | uint64_t end = start + size; |
339 | boolean_t sync; | |
340 | int error = 0; | |
341 | dmu_tx_t *tx; | |
6f73d021 TH |
342 | struct request_queue *q = zv->zv_zso->zvo_queue; |
343 | struct gendisk *disk = zv->zv_zso->zvo_disk; | |
344 | unsigned long start_time = 0; | |
5dd0f019 | 345 | boolean_t acct = B_FALSE; |
5df7e9d8 | 346 | |
0b32d817 RM |
347 | ASSERT3P(zv, !=, NULL); |
348 | ASSERT3U(zv->zv_open_count, >, 0); | |
349 | ASSERT3P(zv->zv_zilog, !=, NULL); | |
5df7e9d8 | 350 | |
6f73d021 TH |
351 | if (bio) { |
352 | acct = blk_queue_io_stat(q); | |
353 | if (acct) { | |
354 | start_time = blk_generic_start_io_acct(q, disk, WRITE, | |
355 | bio); | |
356 | } | |
357 | } | |
5df7e9d8 | 358 | |
6f73d021 | 359 | sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
5df7e9d8 MM |
360 | |
361 | if (end > zv->zv_volsize) { | |
362 | error = SET_ERROR(EIO); | |
363 | goto unlock; | |
364 | } | |
365 | ||
366 | /* | |
367 | * Align the request to volume block boundaries when a secure erase is | |
368 | * not required. This will prevent dnode_free_range() from zeroing out | |
369 | * the unaligned parts which is slow (read-modify-write) and useless | |
370 | * since we are not freeing any space by doing so. | |
371 | */ | |
6f73d021 | 372 | if (!io_is_secure_erase(bio, rq)) { |
5df7e9d8 MM |
373 | start = P2ROUNDUP(start, zv->zv_volblocksize); |
374 | end = P2ALIGN(end, zv->zv_volblocksize); | |
375 | size = end - start; | |
376 | } | |
377 | ||
378 | if (start >= end) | |
379 | goto unlock; | |
380 | ||
0929c4de MA |
381 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
382 | start, size, RL_WRITER); | |
383 | ||
5df7e9d8 MM |
384 | tx = dmu_tx_create(zv->zv_objset); |
385 | dmu_tx_mark_netfree(tx); | |
386 | error = dmu_tx_assign(tx, TXG_WAIT); | |
387 | if (error != 0) { | |
388 | dmu_tx_abort(tx); | |
389 | } else { | |
390 | zvol_log_truncate(zv, tx, start, size, B_TRUE); | |
391 | dmu_tx_commit(tx); | |
392 | error = dmu_free_long_range(zv->zv_objset, | |
393 | ZVOL_OBJ, start, size); | |
394 | } | |
0929c4de | 395 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
396 | |
397 | if (error == 0 && sync) | |
398 | zil_commit(zv->zv_zilog, ZVOL_OBJ); | |
399 | ||
0929c4de | 400 | unlock: |
5df7e9d8 | 401 | rw_exit(&zv->zv_suspend_lock); |
a970f059 | 402 | |
6f73d021 TH |
403 | if (bio && acct) { |
404 | blk_generic_end_io_acct(q, disk, WRITE, bio, | |
405 | start_time); | |
406 | } | |
a970f059 | 407 | |
6f73d021 | 408 | END_IO(zv, bio, rq, -error); |
5df7e9d8 MM |
409 | } |
410 | ||
411 | static void | |
e439ee83 CS |
412 | zvol_discard_task(void *arg) |
413 | { | |
414 | zv_request_task_t *task = arg; | |
415 | zvol_discard(&task->zvr); | |
416 | zv_request_task_free(task); | |
417 | } | |
418 | ||
419 | static void | |
420 | zvol_read(zv_request_t *zvr) | |
5df7e9d8 | 421 | { |
5df7e9d8 | 422 | struct bio *bio = zvr->bio; |
6f73d021 | 423 | struct request *rq = zvr->rq; |
1c2358c1 | 424 | int error = 0; |
d0cd9a5c | 425 | zfs_uio_t uio; |
6f73d021 | 426 | boolean_t acct = B_FALSE; |
5df7e9d8 | 427 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
428 | struct request_queue *q; |
429 | struct gendisk *disk; | |
430 | unsigned long start_time = 0; | |
431 | ||
0b32d817 RM |
432 | ASSERT3P(zv, !=, NULL); |
433 | ASSERT3U(zv->zv_open_count, >, 0); | |
5df7e9d8 | 434 | |
6f73d021 TH |
435 | zfs_uio_bvec_init(&uio, bio, rq); |
436 | ||
437 | q = zv->zv_zso->zvo_queue; | |
438 | disk = zv->zv_zso->zvo_disk; | |
439 | ||
5df7e9d8 | 440 | ssize_t start_resid = uio.uio_resid; |
a970f059 | 441 | |
6f73d021 TH |
442 | /* |
443 | * When blk-mq is being used, accounting is done by | |
444 | * blk_mq_start_request() and blk_mq_end_request(). | |
445 | */ | |
446 | if (bio) { | |
447 | acct = blk_queue_io_stat(q); | |
448 | if (acct) | |
449 | start_time = blk_generic_start_io_acct(q, disk, READ, | |
450 | bio); | |
451 | } | |
5df7e9d8 | 452 | |
0929c4de MA |
453 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
454 | uio.uio_loffset, uio.uio_resid, RL_READER); | |
455 | ||
5df7e9d8 | 456 | uint64_t volsize = zv->zv_volsize; |
6f73d021 | 457 | |
5df7e9d8 MM |
458 | while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { |
459 | uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); | |
460 | ||
461 | /* don't read past the end */ | |
462 | if (bytes > volsize - uio.uio_loffset) | |
463 | bytes = volsize - uio.uio_loffset; | |
464 | ||
465 | error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); | |
466 | if (error) { | |
467 | /* convert checksum errors into IO errors */ | |
468 | if (error == ECKSUM) | |
469 | error = SET_ERROR(EIO); | |
470 | break; | |
471 | } | |
472 | } | |
0929c4de | 473 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
474 | |
475 | int64_t nread = start_resid - uio.uio_resid; | |
4547fc4e | 476 | dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); |
5df7e9d8 MM |
477 | task_io_account_read(nread); |
478 | ||
479 | rw_exit(&zv->zv_suspend_lock); | |
a970f059 | 480 | |
6f73d021 | 481 | if (bio && acct) { |
a970f059 | 482 | blk_generic_end_io_acct(q, disk, READ, bio, start_time); |
6f73d021 | 483 | } |
a970f059 | 484 | |
6f73d021 | 485 | END_IO(zv, bio, rq, -error); |
e439ee83 CS |
486 | } |
487 | ||
488 | static void | |
489 | zvol_read_task(void *arg) | |
490 | { | |
491 | zv_request_task_t *task = arg; | |
492 | zvol_read(&task->zvr); | |
493 | zv_request_task_free(task); | |
5df7e9d8 MM |
494 | } |
495 | ||
6f73d021 TH |
496 | |
497 | /* | |
498 | * Process a BIO or request | |
499 | * | |
500 | * Either 'bio' or 'rq' should be set depending on if we are processing a | |
501 | * bio or a request (both should not be set). | |
502 | * | |
503 | * force_sync: Set to 0 to defer processing to a background taskq | |
504 | * Set to 1 to process data synchronously | |
505 | */ | |
435a451e | 506 | static void |
6f73d021 TH |
507 | zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, |
508 | boolean_t force_sync) | |
5df7e9d8 | 509 | { |
5df7e9d8 | 510 | fstrans_cookie_t cookie = spl_fstrans_mark(); |
6f73d021 TH |
511 | uint64_t offset = io_offset(bio, rq); |
512 | uint64_t size = io_size(bio, rq); | |
513 | int rw = io_data_dir(bio, rq); | |
5df7e9d8 | 514 | |
6f73d021 TH |
515 | if (zvol_request_sync) |
516 | force_sync = 1; | |
517 | ||
518 | zv_request_t zvr = { | |
519 | .zv = zv, | |
520 | .bio = bio, | |
521 | .rq = rq, | |
522 | }; | |
523 | ||
524 | if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { | |
525 | printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", | |
5df7e9d8 MM |
526 | zv->zv_zso->zvo_disk->disk_name, |
527 | (long long unsigned)offset, | |
528 | (long unsigned)size); | |
529 | ||
6f73d021 | 530 | END_IO(zv, bio, rq, -SET_ERROR(EIO)); |
5df7e9d8 MM |
531 | goto out; |
532 | } | |
533 | ||
e439ee83 CS |
534 | zv_request_task_t *task; |
535 | ||
5df7e9d8 | 536 | if (rw == WRITE) { |
5df7e9d8 | 537 | if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { |
6f73d021 | 538 | END_IO(zv, bio, rq, -SET_ERROR(EROFS)); |
5df7e9d8 MM |
539 | goto out; |
540 | } | |
541 | ||
542 | /* | |
0929c4de MA |
543 | * Prevents the zvol from being suspended, or the ZIL being |
544 | * concurrently opened. Will be released after the i/o | |
545 | * completes. | |
5df7e9d8 MM |
546 | */ |
547 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
548 | ||
549 | /* | |
550 | * Open a ZIL if this is the first time we have written to this | |
551 | * zvol. We protect zv->zv_zilog with zv_suspend_lock rather | |
552 | * than zv_state_lock so that we don't need to acquire an | |
553 | * additional lock in this path. | |
554 | */ | |
555 | if (zv->zv_zilog == NULL) { | |
556 | rw_exit(&zv->zv_suspend_lock); | |
557 | rw_enter(&zv->zv_suspend_lock, RW_WRITER); | |
558 | if (zv->zv_zilog == NULL) { | |
559 | zv->zv_zilog = zil_open(zv->zv_objset, | |
fb087146 | 560 | zvol_get_data, &zv->zv_kstat.dk_zil_sums); |
5df7e9d8 | 561 | zv->zv_flags |= ZVOL_WRITTEN_TO; |
93e36580 CS |
562 | /* replay / destroy done in zvol_create_minor */ |
563 | VERIFY0((zv->zv_zilog->zl_header->zh_flags & | |
564 | ZIL_REPLAY_NEEDED)); | |
5df7e9d8 MM |
565 | } |
566 | rw_downgrade(&zv->zv_suspend_lock); | |
567 | } | |
568 | ||
5df7e9d8 | 569 | /* |
0929c4de MA |
570 | * We don't want this thread to be blocked waiting for i/o to |
571 | * complete, so we instead wait from a taskq callback. The | |
572 | * i/o may be a ZIL write (via zil_commit()), or a read of an | |
573 | * indirect block, or a read of a data block (if this is a | |
574 | * partial-block write). We will indicate that the i/o is | |
6f73d021 | 575 | * complete by calling END_IO() from the taskq callback. |
0929c4de MA |
576 | * |
577 | * This design allows the calling thread to continue and | |
578 | * initiate more concurrent operations by calling | |
579 | * zvol_request() again. There are typically only a small | |
580 | * number of threads available to call zvol_request() (e.g. | |
581 | * one per iSCSI target), so keeping the latency of | |
582 | * zvol_request() low is important for performance. | |
583 | * | |
584 | * The zvol_request_sync module parameter allows this | |
585 | * behavior to be altered, for performance evaluation | |
586 | * purposes. If the callback blocks, setting | |
587 | * zvol_request_sync=1 will result in much worse performance. | |
588 | * | |
589 | * We can have up to zvol_threads concurrent i/o's being | |
590 | * processed for all zvols on the system. This is typically | |
591 | * a vast improvement over the zvol_request_sync=1 behavior | |
592 | * of one i/o at a time per zvol. However, an even better | |
593 | * design would be for zvol_request() to initiate the zio | |
594 | * directly, and then be notified by the zio_done callback, | |
6f73d021 | 595 | * which would call END_IO(). Unfortunately, the DMU/ZIL |
0929c4de MA |
596 | * interfaces lack this functionality (they block waiting for |
597 | * the i/o to complete). | |
5df7e9d8 | 598 | */ |
6f73d021 TH |
599 | if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { |
600 | if (force_sync) { | |
e439ee83 | 601 | zvol_discard(&zvr); |
0929c4de | 602 | } else { |
e439ee83 | 603 | task = zv_request_task_create(zvr); |
0929c4de | 604 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 605 | zvol_discard_task, task, 0, &task->ent); |
0929c4de | 606 | } |
5df7e9d8 | 607 | } else { |
6f73d021 | 608 | if (force_sync) { |
e439ee83 | 609 | zvol_write(&zvr); |
0929c4de | 610 | } else { |
e439ee83 | 611 | task = zv_request_task_create(zvr); |
0929c4de | 612 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 613 | zvol_write_task, task, 0, &task->ent); |
0929c4de | 614 | } |
5df7e9d8 MM |
615 | } |
616 | } else { | |
617 | /* | |
618 | * The SCST driver, and possibly others, may issue READ I/Os | |
619 | * with a length of zero bytes. These empty I/Os contain no | |
620 | * data and require no additional handling. | |
621 | */ | |
622 | if (size == 0) { | |
6f73d021 | 623 | END_IO(zv, bio, rq, 0); |
5df7e9d8 MM |
624 | goto out; |
625 | } | |
626 | ||
5df7e9d8 MM |
627 | rw_enter(&zv->zv_suspend_lock, RW_READER); |
628 | ||
0929c4de | 629 | /* See comment in WRITE case above. */ |
6f73d021 | 630 | if (force_sync) { |
e439ee83 | 631 | zvol_read(&zvr); |
0929c4de | 632 | } else { |
e439ee83 | 633 | task = zv_request_task_create(zvr); |
0929c4de | 634 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 635 | zvol_read_task, task, 0, &task->ent); |
0929c4de | 636 | } |
5df7e9d8 MM |
637 | } |
638 | ||
639 | out: | |
640 | spl_fstrans_unmark(cookie); | |
6f73d021 TH |
641 | } |
642 | ||
643 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS | |
644 | #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID | |
645 | static void | |
646 | zvol_submit_bio(struct bio *bio) | |
647 | #else | |
648 | static blk_qc_t | |
649 | zvol_submit_bio(struct bio *bio) | |
650 | #endif | |
651 | #else | |
652 | static MAKE_REQUEST_FN_RET | |
653 | zvol_request(struct request_queue *q, struct bio *bio) | |
654 | #endif | |
655 | { | |
656 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS | |
657 | #if defined(HAVE_BIO_BDEV_DISK) | |
658 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
659 | #else | |
660 | struct request_queue *q = bio->bi_disk->queue; | |
661 | #endif | |
662 | #endif | |
663 | zvol_state_t *zv = q->queuedata; | |
664 | ||
665 | zvol_request_impl(zv, bio, NULL, 0); | |
666 | #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ | |
667 | defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ | |
435a451e | 668 | !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) |
5df7e9d8 MM |
669 | return (BLK_QC_T_NONE); |
670 | #endif | |
671 | } | |
672 | ||
673 | static int | |
674 | zvol_open(struct block_device *bdev, fmode_t flag) | |
675 | { | |
676 | zvol_state_t *zv; | |
677 | int error = 0; | |
8a02d01e | 678 | boolean_t drop_suspend = B_FALSE; |
77e2756d BB |
679 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
680 | hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); | |
681 | hrtime_t start = gethrtime(); | |
5df7e9d8 | 682 | |
77e2756d BB |
683 | retry: |
684 | #endif | |
5df7e9d8 MM |
685 | rw_enter(&zvol_state_lock, RW_READER); |
686 | /* | |
687 | * Obtain a copy of private_data under the zvol_state_lock to make | |
688 | * sure that either the result of zvol free code path setting | |
1dccfd7a | 689 | * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free() |
5df7e9d8 MM |
690 | * is not called on this zv because of the positive zv_open_count. |
691 | */ | |
692 | zv = bdev->bd_disk->private_data; | |
693 | if (zv == NULL) { | |
694 | rw_exit(&zvol_state_lock); | |
695 | return (SET_ERROR(-ENXIO)); | |
696 | } | |
697 | ||
8a02d01e BB |
698 | mutex_enter(&zv->zv_state_lock); |
699 | /* | |
700 | * Make sure zvol is not suspended during first open | |
701 | * (hold zv_suspend_lock) and respect proper lock acquisition | |
702 | * ordering - zv_suspend_lock before zv_state_lock | |
703 | */ | |
704 | if (zv->zv_open_count == 0) { | |
705 | if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { | |
706 | mutex_exit(&zv->zv_state_lock); | |
707 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
708 | mutex_enter(&zv->zv_state_lock); | |
709 | /* check to see if zv_suspend_lock is needed */ | |
710 | if (zv->zv_open_count != 0) { | |
711 | rw_exit(&zv->zv_suspend_lock); | |
712 | } else { | |
713 | drop_suspend = B_TRUE; | |
714 | } | |
715 | } else { | |
716 | drop_suspend = B_TRUE; | |
717 | } | |
718 | } | |
719 | rw_exit(&zvol_state_lock); | |
720 | ||
721 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
722 | ||
723 | if (zv->zv_open_count == 0) { | |
724 | boolean_t drop_namespace = B_FALSE; | |
725 | ||
726 | ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); | |
727 | ||
77e2756d BB |
728 | /* |
729 | * In all other call paths the spa_namespace_lock is taken | |
730 | * before the bdev->bd_mutex lock. However, on open(2) | |
731 | * the __blkdev_get() function calls fops->open() with the | |
732 | * bdev->bd_mutex lock held. This can result in a deadlock | |
733 | * when zvols from one pool are used as vdevs in another. | |
734 | * | |
735 | * To prevent a lock inversion deadlock we preemptively | |
736 | * take the spa_namespace_lock. Normally the lock will not | |
737 | * be contended and this is safe because spa_open_common() | |
738 | * handles the case where the caller already holds the | |
739 | * spa_namespace_lock. | |
740 | * | |
741 | * When the lock cannot be aquired after multiple retries | |
742 | * this must be the vdev on zvol deadlock case and we have | |
743 | * no choice but to return an error. For 5.12 and older | |
744 | * kernels returning -ERESTARTSYS will result in the | |
745 | * bdev->bd_mutex being dropped, then reacquired, and | |
746 | * fops->open() being called again. This process can be | |
747 | * repeated safely until both locks are acquired. For 5.13 | |
748 | * and newer the -ERESTARTSYS retry logic was removed from | |
749 | * the kernel so the only option is to return the error for | |
750 | * the caller to handle it. | |
751 | */ | |
8a02d01e BB |
752 | if (!mutex_owned(&spa_namespace_lock)) { |
753 | if (!mutex_tryenter(&spa_namespace_lock)) { | |
754 | mutex_exit(&zv->zv_state_lock); | |
755 | rw_exit(&zv->zv_suspend_lock); | |
77e2756d BB |
756 | |
757 | #ifdef HAVE_BLKDEV_GET_ERESTARTSYS | |
8a02d01e | 758 | schedule(); |
77e2756d | 759 | return (SET_ERROR(-ERESTARTSYS)); |
8a02d01e BB |
760 | #else |
761 | if ((gethrtime() - start) > timeout) | |
762 | return (SET_ERROR(-ERESTARTSYS)); | |
77e2756d | 763 | |
8a02d01e BB |
764 | schedule_timeout(MSEC_TO_TICK(10)); |
765 | goto retry; | |
77e2756d | 766 | #endif |
8a02d01e BB |
767 | } else { |
768 | drop_namespace = B_TRUE; | |
5df7e9d8 MM |
769 | } |
770 | } | |
5df7e9d8 | 771 | |
5df7e9d8 | 772 | error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); |
5df7e9d8 | 773 | |
8a02d01e BB |
774 | if (drop_namespace) |
775 | mutex_exit(&spa_namespace_lock); | |
5df7e9d8 MM |
776 | } |
777 | ||
8a02d01e BB |
778 | if (error == 0) { |
779 | if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { | |
780 | if (zv->zv_open_count == 0) | |
781 | zvol_last_close(zv); | |
5df7e9d8 | 782 | |
8a02d01e BB |
783 | error = SET_ERROR(-EROFS); |
784 | } else { | |
785 | zv->zv_open_count++; | |
786 | } | |
787 | } | |
5df7e9d8 | 788 | |
5df7e9d8 MM |
789 | mutex_exit(&zv->zv_state_lock); |
790 | if (drop_suspend) | |
791 | rw_exit(&zv->zv_suspend_lock); | |
77e2756d | 792 | |
8a02d01e BB |
793 | if (error == 0) |
794 | zfs_check_media_change(bdev); | |
795 | ||
796 | return (error); | |
5df7e9d8 MM |
797 | } |
798 | ||
5df7e9d8 | 799 | static void |
5df7e9d8 MM |
800 | zvol_release(struct gendisk *disk, fmode_t mode) |
801 | { | |
802 | zvol_state_t *zv; | |
803 | boolean_t drop_suspend = B_TRUE; | |
804 | ||
805 | rw_enter(&zvol_state_lock, RW_READER); | |
806 | zv = disk->private_data; | |
807 | ||
808 | mutex_enter(&zv->zv_state_lock); | |
0b32d817 | 809 | ASSERT3U(zv->zv_open_count, >, 0); |
5df7e9d8 MM |
810 | /* |
811 | * make sure zvol is not suspended during last close | |
812 | * (hold zv_suspend_lock) and respect proper lock acquisition | |
813 | * ordering - zv_suspend_lock before zv_state_lock | |
814 | */ | |
815 | if (zv->zv_open_count == 1) { | |
816 | if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { | |
817 | mutex_exit(&zv->zv_state_lock); | |
818 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
819 | mutex_enter(&zv->zv_state_lock); | |
820 | /* check to see if zv_suspend_lock is needed */ | |
821 | if (zv->zv_open_count != 1) { | |
822 | rw_exit(&zv->zv_suspend_lock); | |
823 | drop_suspend = B_FALSE; | |
824 | } | |
825 | } | |
826 | } else { | |
827 | drop_suspend = B_FALSE; | |
828 | } | |
829 | rw_exit(&zvol_state_lock); | |
830 | ||
831 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
5df7e9d8 MM |
832 | |
833 | zv->zv_open_count--; | |
0b32d817 RM |
834 | if (zv->zv_open_count == 0) { |
835 | ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); | |
5df7e9d8 | 836 | zvol_last_close(zv); |
0b32d817 | 837 | } |
5df7e9d8 MM |
838 | |
839 | mutex_exit(&zv->zv_state_lock); | |
840 | ||
841 | if (drop_suspend) | |
842 | rw_exit(&zv->zv_suspend_lock); | |
5df7e9d8 MM |
843 | } |
844 | ||
845 | static int | |
846 | zvol_ioctl(struct block_device *bdev, fmode_t mode, | |
847 | unsigned int cmd, unsigned long arg) | |
848 | { | |
849 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
850 | int error = 0; | |
851 | ||
852 | ASSERT3U(zv->zv_open_count, >, 0); | |
853 | ||
854 | switch (cmd) { | |
855 | case BLKFLSBUF: | |
856 | fsync_bdev(bdev); | |
857 | invalidate_bdev(bdev); | |
858 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
859 | ||
860 | if (!(zv->zv_flags & ZVOL_RDONLY)) | |
861 | txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); | |
862 | ||
863 | rw_exit(&zv->zv_suspend_lock); | |
864 | break; | |
865 | ||
866 | case BLKZNAME: | |
867 | mutex_enter(&zv->zv_state_lock); | |
868 | error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); | |
869 | mutex_exit(&zv->zv_state_lock); | |
870 | break; | |
871 | ||
872 | default: | |
873 | error = -ENOTTY; | |
874 | break; | |
875 | } | |
876 | ||
877 | return (SET_ERROR(error)); | |
878 | } | |
879 | ||
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: the arguments are passed through unchanged
 * to zvol_ioctl() and its result is returned as-is.
 */
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	int rc = zvol_ioctl(bdev, mode, cmd, arg);

	return (rc);
}
#else
/* Without CONFIG_COMPAT no compat handler is installed. */
#define	zvol_compat_ioctl	NULL
#endif
890 | ||
5df7e9d8 MM |
891 | static unsigned int |
892 | zvol_check_events(struct gendisk *disk, unsigned int clearing) | |
893 | { | |
894 | unsigned int mask = 0; | |
895 | ||
896 | rw_enter(&zvol_state_lock, RW_READER); | |
897 | ||
898 | zvol_state_t *zv = disk->private_data; | |
899 | if (zv != NULL) { | |
900 | mutex_enter(&zv->zv_state_lock); | |
901 | mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; | |
902 | zv->zv_changed = 0; | |
903 | mutex_exit(&zv->zv_state_lock); | |
904 | } | |
905 | ||
906 | rw_exit(&zvol_state_lock); | |
907 | ||
908 | return (mask); | |
909 | } | |
5df7e9d8 MM |
910 | |
911 | static int | |
912 | zvol_revalidate_disk(struct gendisk *disk) | |
913 | { | |
914 | rw_enter(&zvol_state_lock, RW_READER); | |
915 | ||
916 | zvol_state_t *zv = disk->private_data; | |
917 | if (zv != NULL) { | |
918 | mutex_enter(&zv->zv_state_lock); | |
919 | set_capacity(zv->zv_zso->zvo_disk, | |
920 | zv->zv_volsize >> SECTOR_BITS); | |
921 | mutex_exit(&zv->zv_state_lock); | |
922 | } | |
923 | ||
924 | rw_exit(&zvol_state_lock); | |
925 | ||
926 | return (0); | |
927 | } | |
928 | ||
1dccfd7a CS |
929 | int |
930 | zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) | |
5df7e9d8 | 931 | { |
1c0bbd52 | 932 | struct gendisk *disk = zv->zv_zso->zvo_disk; |
5df7e9d8 | 933 | |
19697e45 | 934 | #if defined(HAVE_REVALIDATE_DISK_SIZE) |
1c0bbd52 | 935 | revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); |
19697e45 | 936 | #elif defined(HAVE_REVALIDATE_DISK) |
1c0bbd52 | 937 | revalidate_disk(disk); |
19697e45 BB |
938 | #else |
939 | zvol_revalidate_disk(disk); | |
59b68723 | 940 | #endif |
5df7e9d8 MM |
941 | return (0); |
942 | } | |
943 | ||
1dccfd7a CS |
944 | void |
945 | zvol_os_clear_private(zvol_state_t *zv) | |
5df7e9d8 MM |
946 | { |
947 | /* | |
948 | * Cleared while holding zvol_state_lock as a writer | |
949 | * which will prevent zvol_open() from opening it. | |
950 | */ | |
951 | zv->zv_zso->zvo_disk->private_data = NULL; | |
952 | } | |
953 | ||
954 | /* | |
955 | * Provide a simple virtual geometry for legacy compatibility. For devices | |
956 | * smaller than 1 MiB a small head and sector count is used to allow very | |
957 | * tiny devices. For devices over 1 Mib a standard head and sector count | |
958 | * is used to keep the cylinders count reasonable. | |
959 | */ | |
960 | static int | |
961 | zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |
962 | { | |
963 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
964 | sector_t sectors; | |
965 | ||
966 | ASSERT3U(zv->zv_open_count, >, 0); | |
967 | ||
968 | sectors = get_capacity(zv->zv_zso->zvo_disk); | |
969 | ||
970 | if (sectors > 2048) { | |
971 | geo->heads = 16; | |
972 | geo->sectors = 63; | |
973 | } else { | |
974 | geo->heads = 2; | |
975 | geo->sectors = 4; | |
976 | } | |
977 | ||
978 | geo->start = 0; | |
979 | geo->cylinders = sectors / (geo->heads * geo->sectors); | |
980 | ||
981 | return (0); | |
982 | } | |
983 | ||
6f73d021 TH |
984 | /* |
985 | * Why have two separate block_device_operations structs? | |
986 | * | |
987 | * Normally we'd just have one, and assign 'submit_bio' as needed. However, | |
988 | * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we | |
989 | * can't just change submit_bio dynamically at runtime. So just create two | |
990 | * separate structs to get around this. | |
991 | */ | |
992 | static const struct block_device_operations zvol_ops_blk_mq = { | |
993 | .open = zvol_open, | |
994 | .release = zvol_release, | |
995 | .ioctl = zvol_ioctl, | |
996 | .compat_ioctl = zvol_compat_ioctl, | |
997 | .check_events = zvol_check_events, | |
998 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK | |
999 | .revalidate_disk = zvol_revalidate_disk, | |
1000 | #endif | |
1001 | .getgeo = zvol_getgeo, | |
1002 | .owner = THIS_MODULE, | |
1003 | }; | |
1004 | ||
18168da7 | 1005 | static const struct block_device_operations zvol_ops = { |
5df7e9d8 MM |
1006 | .open = zvol_open, |
1007 | .release = zvol_release, | |
1008 | .ioctl = zvol_ioctl, | |
1009 | .compat_ioctl = zvol_compat_ioctl, | |
5df7e9d8 | 1010 | .check_events = zvol_check_events, |
48c7b0e4 | 1011 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK |
5df7e9d8 | 1012 | .revalidate_disk = zvol_revalidate_disk, |
48c7b0e4 | 1013 | #endif |
5df7e9d8 MM |
1014 | .getgeo = zvol_getgeo, |
1015 | .owner = THIS_MODULE, | |
d817c171 | 1016 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS |
1b06b03a | 1017 | .submit_bio = zvol_submit_bio, |
d817c171 | 1018 | #endif |
5df7e9d8 MM |
1019 | }; |
1020 | ||
6f73d021 TH |
1021 | static int |
1022 | zvol_alloc_non_blk_mq(struct zvol_state_os *zso) | |
1023 | { | |
1024 | #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) | |
1025 | #if defined(HAVE_BLK_ALLOC_DISK) | |
1026 | zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); | |
1027 | if (zso->zvo_disk == NULL) | |
1028 | return (1); | |
1029 | ||
1030 | zso->zvo_disk->minors = ZVOL_MINORS; | |
1031 | zso->zvo_queue = zso->zvo_disk->queue; | |
1032 | #else | |
1033 | zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); | |
1034 | if (zso->zvo_queue == NULL) | |
1035 | return (1); | |
1036 | ||
1037 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1038 | if (zso->zvo_disk == NULL) { | |
1039 | blk_cleanup_queue(zso->zvo_queue); | |
1040 | return (1); | |
1041 | } | |
1042 | ||
1043 | zso->zvo_disk->queue = zso->zvo_queue; | |
1044 | #endif /* HAVE_BLK_ALLOC_DISK */ | |
1045 | #else | |
1046 | zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); | |
1047 | if (zso->zvo_queue == NULL) | |
1048 | return (1); | |
1049 | ||
1050 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1051 | if (zso->zvo_disk == NULL) { | |
1052 | blk_cleanup_queue(zso->zvo_queue); | |
1053 | return (1); | |
1054 | } | |
1055 | ||
1056 | zso->zvo_disk->queue = zso->zvo_queue; | |
1057 | #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ | |
1058 | return (0); | |
1059 | ||
1060 | } | |
1061 | ||
1062 | static int | |
1063 | zvol_alloc_blk_mq(zvol_state_t *zv) | |
1064 | { | |
1065 | #ifdef HAVE_BLK_MQ | |
1066 | struct zvol_state_os *zso = zv->zv_zso; | |
1067 | ||
1068 | /* Allocate our blk-mq tag_set */ | |
1069 | if (zvol_blk_mq_alloc_tag_set(zv) != 0) | |
1070 | return (1); | |
1071 | ||
1072 | #if defined(HAVE_BLK_ALLOC_DISK) | |
1073 | zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); | |
1074 | if (zso->zvo_disk == NULL) { | |
1075 | blk_mq_free_tag_set(&zso->tag_set); | |
1076 | return (1); | |
1077 | } | |
1078 | zso->zvo_queue = zso->zvo_disk->queue; | |
1079 | zso->zvo_disk->minors = ZVOL_MINORS; | |
1080 | #else | |
1081 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1082 | if (zso->zvo_disk == NULL) { | |
1083 | blk_cleanup_queue(zso->zvo_queue); | |
1084 | blk_mq_free_tag_set(&zso->tag_set); | |
1085 | return (1); | |
1086 | } | |
1087 | /* Allocate queue */ | |
1088 | zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); | |
1089 | if (IS_ERR(zso->zvo_queue)) { | |
1090 | blk_mq_free_tag_set(&zso->tag_set); | |
1091 | return (1); | |
1092 | } | |
1093 | ||
1094 | /* Our queue is now created, assign it to our disk */ | |
1095 | zso->zvo_disk->queue = zso->zvo_queue; | |
1096 | ||
1097 | #endif | |
1098 | #endif | |
1099 | return (0); | |
1100 | } | |
1101 | ||
5df7e9d8 MM |
1102 | /* |
1103 | * Allocate memory for a new zvol_state_t and setup the required | |
1104 | * request queue and generic disk structures for the block device. | |
1105 | */ | |
1106 | static zvol_state_t * | |
1107 | zvol_alloc(dev_t dev, const char *name) | |
1108 | { | |
1109 | zvol_state_t *zv; | |
68dde63d | 1110 | struct zvol_state_os *zso; |
5df7e9d8 | 1111 | uint64_t volmode; |
6f73d021 | 1112 | int ret; |
5df7e9d8 MM |
1113 | |
1114 | if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) | |
1115 | return (NULL); | |
1116 | ||
1117 | if (volmode == ZFS_VOLMODE_DEFAULT) | |
1118 | volmode = zvol_volmode; | |
1119 | ||
1120 | if (volmode == ZFS_VOLMODE_NONE) | |
1121 | return (NULL); | |
1122 | ||
1123 | zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); | |
68dde63d BB |
1124 | zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); |
1125 | zv->zv_zso = zso; | |
0ca45cb3 | 1126 | zv->zv_volmode = volmode; |
5df7e9d8 MM |
1127 | |
1128 | list_link_init(&zv->zv_next); | |
5df7e9d8 MM |
1129 | mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); |
1130 | ||
6f73d021 TH |
1131 | #ifdef HAVE_BLK_MQ |
1132 | zv->zv_zso->use_blk_mq = zvol_use_blk_mq; | |
1133 | #endif | |
1b06b03a | 1134 | |
6f73d021 TH |
1135 | /* |
1136 | * The block layer has 3 interfaces for getting BIOs: | |
1137 | * | |
1138 | * 1. blk-mq request queues (new) | |
1139 | * 2. submit_bio() (oldest) | |
1140 | * 3. regular request queues (old). | |
1141 | * | |
1142 | * Each of those interfaces has two permutations: | |
1143 | * | |
1144 | * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates | |
1145 | * both the disk and its queue (5.14 kernel or newer) | |
1146 | * | |
1147 | * b) We don't have blk_*alloc_disk(), and have to allocate the | |
1148 | * disk and the queue separately. (5.13 kernel or older) | |
1149 | */ | |
1150 | if (zv->zv_zso->use_blk_mq) { | |
1151 | ret = zvol_alloc_blk_mq(zv); | |
1152 | zso->zvo_disk->fops = &zvol_ops_blk_mq; | |
1153 | } else { | |
1154 | ret = zvol_alloc_non_blk_mq(zso); | |
1155 | zso->zvo_disk->fops = &zvol_ops; | |
1b06b03a | 1156 | } |
6f73d021 | 1157 | if (ret != 0) |
5df7e9d8 MM |
1158 | goto out_kmem; |
1159 | ||
68dde63d | 1160 | blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); |
5df7e9d8 MM |
1161 | |
1162 | /* Limit read-ahead to a single page to prevent over-prefetching. */ | |
68dde63d | 1163 | blk_queue_set_read_ahead(zso->zvo_queue, 1); |
5df7e9d8 | 1164 | |
6f73d021 TH |
1165 | if (!zv->zv_zso->use_blk_mq) { |
1166 | /* Disable write merging in favor of the ZIO pipeline. */ | |
1167 | blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); | |
1168 | } | |
5df7e9d8 | 1169 | |
ae1e40b3 BB |
1170 | /* Enable /proc/diskstats */ |
1171 | blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); | |
1172 | ||
68dde63d BB |
1173 | zso->zvo_queue->queuedata = zv; |
1174 | zso->zvo_dev = dev; | |
5df7e9d8 MM |
1175 | zv->zv_open_count = 0; |
1176 | strlcpy(zv->zv_name, name, MAXNAMELEN); | |
1177 | ||
2cc479d0 | 1178 | zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); |
5df7e9d8 MM |
1179 | rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); |
1180 | ||
68dde63d BB |
1181 | zso->zvo_disk->major = zvol_major; |
1182 | zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; | |
5df7e9d8 | 1183 | |
026f126b BB |
1184 | /* |
1185 | * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. | |
1186 | * This is accomplished by limiting the number of minors for the | |
1187 | * device to one and explicitly disabling partition scanning. | |
1188 | */ | |
5df7e9d8 | 1189 | if (volmode == ZFS_VOLMODE_DEV) { |
68dde63d | 1190 | zso->zvo_disk->minors = 1; |
026f126b BB |
1191 | zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; |
1192 | zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; | |
5df7e9d8 | 1193 | } |
026f126b | 1194 | |
68dde63d | 1195 | zso->zvo_disk->first_minor = (dev & MINORMASK); |
68dde63d | 1196 | zso->zvo_disk->private_data = zv; |
68dde63d | 1197 | snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", |
5df7e9d8 MM |
1198 | ZVOL_DEV_NAME, (dev & MINORMASK)); |
1199 | ||
1200 | return (zv); | |
1201 | ||
5df7e9d8 | 1202 | out_kmem: |
68dde63d | 1203 | kmem_free(zso, sizeof (struct zvol_state_os)); |
5df7e9d8 MM |
1204 | kmem_free(zv, sizeof (zvol_state_t)); |
1205 | return (NULL); | |
1206 | } | |
1207 | ||
/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk".  Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	/* The preconditions described above are asserted here. */
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	/*
	 * Remove the disk, then release the disk and its queue using the
	 * interface that matches how they were allocated on this kernel.
	 */
	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
    defined(HAVE_BLK_ALLOC_DISK)
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

#ifdef HAVE_BLK_MQ
	/* The tag set only exists for zvols that were set up with blk-mq. */
	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
#endif

	/* Return the minor index (minor >> ZVOL_MINOR_BITS) to zvol_ida. */
	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	/* Free the OS-specific state first; it is reached through zv. */
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}
1258 | ||
0ca45cb3 MM |
/*
 * Intentionally empty: this implementation performs no work here and
 * 'zv' is not used.
 */
void
zvol_wait_close(zvol_state_t *zv)
{
}
1263 | ||
5df7e9d8 MM |
1264 | /* |
1265 | * Create a block device minor node and setup the linkage between it | |
1266 | * and the specified volume. Once this function returns the block | |
1267 | * device is live and ready for use. | |
1268 | */ | |
1dccfd7a | 1269 | int |
ec213971 | 1270 | zvol_os_create_minor(const char *name) |
5df7e9d8 MM |
1271 | { |
1272 | zvol_state_t *zv; | |
1273 | objset_t *os; | |
1274 | dmu_object_info_t *doi; | |
1275 | uint64_t volsize; | |
1276 | uint64_t len; | |
1277 | unsigned minor = 0; | |
1278 | int error = 0; | |
1279 | int idx; | |
1280 | uint64_t hash = zvol_name_hash(name); | |
e197bb24 | 1281 | bool replayed_zil = B_FALSE; |
5df7e9d8 MM |
1282 | |
1283 | if (zvol_inhibit_dev) | |
1284 | return (0); | |
1285 | ||
1286 | idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); | |
1287 | if (idx < 0) | |
1288 | return (SET_ERROR(-idx)); | |
1289 | minor = idx << ZVOL_MINOR_BITS; | |
1290 | ||
1291 | zv = zvol_find_by_name_hash(name, hash, RW_NONE); | |
1292 | if (zv) { | |
1293 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1294 | mutex_exit(&zv->zv_state_lock); | |
1295 | ida_simple_remove(&zvol_ida, idx); | |
1296 | return (SET_ERROR(EEXIST)); | |
1297 | } | |
1298 | ||
1299 | doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); | |
1300 | ||
1301 | error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); | |
1302 | if (error) | |
1303 | goto out_doi; | |
1304 | ||
1305 | error = dmu_object_info(os, ZVOL_OBJ, doi); | |
1306 | if (error) | |
1307 | goto out_dmu_objset_disown; | |
1308 | ||
1309 | error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); | |
1310 | if (error) | |
1311 | goto out_dmu_objset_disown; | |
1312 | ||
1313 | zv = zvol_alloc(MKDEV(zvol_major, minor), name); | |
1314 | if (zv == NULL) { | |
1315 | error = SET_ERROR(EAGAIN); | |
1316 | goto out_dmu_objset_disown; | |
1317 | } | |
1318 | zv->zv_hash = hash; | |
1319 | ||
1320 | if (dmu_objset_is_snapshot(os)) | |
1321 | zv->zv_flags |= ZVOL_RDONLY; | |
1322 | ||
1323 | zv->zv_volblocksize = doi->doi_data_block_size; | |
1324 | zv->zv_volsize = volsize; | |
1325 | zv->zv_objset = os; | |
1326 | ||
1327 | set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); | |
1328 | ||
1329 | blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, | |
1330 | (DMU_MAX_ACCESS / 4) >> 9); | |
6f73d021 TH |
1331 | |
1332 | if (zv->zv_zso->use_blk_mq) { | |
1333 | /* | |
1334 | * IO requests can be really big (1MB). When an IO request | |
1335 | * comes in, it is passed off to zvol_read() or zvol_write() | |
1336 | * in a new thread, where it is chunked up into 'volblocksize' | |
1337 | * sized pieces and processed. So for example, if the request | |
1338 | * is a 1MB write and your volblocksize is 128k, one zvol_write | |
1339 | * thread will take that request and sequentially do ten 128k | |
1340 | * IOs. This is due to the fact that the thread needs to lock | |
1341 | * each volblocksize sized block. So you might be wondering: | |
1342 | * "instead of passing the whole 1MB request to one thread, | |
1343 | * why not pass ten individual 128k chunks to ten threads and | |
1344 | * process the whole write in parallel?" The short answer is | |
1345 | * that there's a sweet spot number of chunks that balances | |
1346 | * the greater parallelism with the added overhead of more | |
1347 | * threads. The sweet spot can be different depending on if you | |
1348 | * have a read or write heavy workload. Writes typically want | |
1349 | * high chunk counts while reads typically want lower ones. On | |
1350 | * a test pool with 6 NVMe drives in a 3x 2-disk mirror | |
1351 | * configuration, with volblocksize=8k, the sweet spot for good | |
1352 | * sequential reads and writes was at 8 chunks. | |
1353 | */ | |
1354 | ||
1355 | /* | |
1356 | * Below we tell the kernel how big we want our requests | |
1357 | * to be. You would think that blk_queue_io_opt() would be | |
1358 | * used to do this since it is used to "set optimal request | |
1359 | * size for the queue", but that doesn't seem to do | |
1360 | * anything - the kernel still gives you huge requests | |
1361 | * with tons of little PAGE_SIZE segments contained within it. | |
1362 | * | |
1363 | * Knowing that the kernel will just give you PAGE_SIZE segments | |
1364 | * no matter what, you can say "ok, I want PAGE_SIZE byte | |
1365 | * segments, and I want 'N' of them per request", where N is | |
1366 | * the correct number of segments for the volblocksize and | |
1367 | * number of chunks you want. | |
1368 | */ | |
1369 | #ifdef HAVE_BLK_MQ | |
1370 | if (zvol_blk_mq_blocks_per_thread != 0) { | |
1371 | unsigned int chunks; | |
1372 | chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); | |
1373 | ||
1374 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, | |
1375 | PAGE_SIZE); | |
1376 | blk_queue_max_segments(zv->zv_zso->zvo_queue, | |
1377 | (zv->zv_volblocksize * chunks) / PAGE_SIZE); | |
1378 | } else { | |
1379 | /* | |
1380 | * Special case: zvol_blk_mq_blocks_per_thread = 0 | |
1381 | * Max everything out. | |
1382 | */ | |
1383 | blk_queue_max_segments(zv->zv_zso->zvo_queue, | |
1384 | UINT16_MAX); | |
1385 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, | |
1386 | UINT_MAX); | |
1387 | } | |
1388 | #endif | |
1389 | } else { | |
1390 | blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); | |
1391 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); | |
1392 | } | |
1393 | ||
5df7e9d8 MM |
1394 | blk_queue_physical_block_size(zv->zv_zso->zvo_queue, |
1395 | zv->zv_volblocksize); | |
1396 | blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); | |
1397 | blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, | |
1398 | (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); | |
1399 | blk_queue_discard_granularity(zv->zv_zso->zvo_queue, | |
1400 | zv->zv_volblocksize); | |
5e4aedac | 1401 | #ifdef QUEUE_FLAG_DISCARD |
5df7e9d8 | 1402 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); |
5e4aedac | 1403 | #endif |
5df7e9d8 MM |
1404 | #ifdef QUEUE_FLAG_NONROT |
1405 | blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); | |
1406 | #endif | |
1407 | #ifdef QUEUE_FLAG_ADD_RANDOM | |
1408 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); | |
1409 | #endif | |
1410 | /* This flag was introduced in kernel version 4.12. */ | |
1411 | #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH | |
1412 | blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); | |
1413 | #endif | |
1414 | ||
fb087146 AH |
1415 | ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); |
1416 | error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); | |
1417 | if (error) | |
1418 | goto out_dmu_objset_disown; | |
93e36580 | 1419 | ASSERT3P(zv->zv_zilog, ==, NULL); |
fb087146 | 1420 | zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); |
5df7e9d8 MM |
1421 | if (spa_writeable(dmu_objset_spa(os))) { |
1422 | if (zil_replay_disable) | |
e197bb24 | 1423 | replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); |
5df7e9d8 | 1424 | else |
e197bb24 | 1425 | replayed_zil = zil_replay(os, zv, zvol_replay_vector); |
5df7e9d8 | 1426 | } |
e197bb24 AS |
1427 | if (replayed_zil) |
1428 | zil_close(zv->zv_zilog); | |
93e36580 | 1429 | zv->zv_zilog = NULL; |
5df7e9d8 MM |
1430 | |
1431 | /* | |
1432 | * When udev detects the addition of the device it will immediately | |
1433 | * invoke blkid(8) to determine the type of content on the device. | |
1434 | * Prefetching the blocks commonly scanned by blkid(8) will speed | |
1435 | * up this process. | |
1436 | */ | |
8ef15f93 | 1437 | len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); |
5df7e9d8 MM |
1438 | if (len > 0) { |
1439 | dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); | |
1440 | dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, | |
1441 | ZIO_PRIORITY_SYNC_READ); | |
1442 | } | |
1443 | ||
1444 | zv->zv_objset = NULL; | |
1445 | out_dmu_objset_disown: | |
1446 | dmu_objset_disown(os, B_TRUE, FTAG); | |
1447 | out_doi: | |
1448 | kmem_free(doi, sizeof (dmu_object_info_t)); | |
1449 | ||
1450 | /* | |
1451 | * Keep in mind that once add_disk() is called, the zvol is | |
1452 | * announced to the world, and zvol_open()/zvol_release() can | |
1453 | * be called at any time. Incidentally, add_disk() itself calls | |
1454 | * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() | |
1455 | * directly as well. | |
1456 | */ | |
1457 | if (error == 0) { | |
1458 | rw_enter(&zvol_state_lock, RW_WRITER); | |
1459 | zvol_insert(zv); | |
1460 | rw_exit(&zvol_state_lock); | |
12fa250d RE |
1461 | #ifdef HAVE_ADD_DISK_RET |
1462 | error = add_disk(zv->zv_zso->zvo_disk); | |
1463 | #else | |
5df7e9d8 | 1464 | add_disk(zv->zv_zso->zvo_disk); |
12fa250d | 1465 | #endif |
5df7e9d8 MM |
1466 | } else { |
1467 | ida_simple_remove(&zvol_ida, idx); | |
1468 | } | |
1469 | ||
ec213971 | 1470 | return (error); |
5df7e9d8 MM |
1471 | } |
1472 | ||
1dccfd7a CS |
1473 | void |
1474 | zvol_os_rename_minor(zvol_state_t *zv, const char *newname) | |
5df7e9d8 MM |
1475 | { |
1476 | int readonly = get_disk_ro(zv->zv_zso->zvo_disk); | |
1477 | ||
1478 | ASSERT(RW_LOCK_HELD(&zvol_state_lock)); | |
1479 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1480 | ||
1481 | strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); | |
1482 | ||
1483 | /* move to new hashtable entry */ | |
1484 | zv->zv_hash = zvol_name_hash(zv->zv_name); | |
1485 | hlist_del(&zv->zv_hlink); | |
1486 | hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); | |
1487 | ||
1488 | /* | |
1489 | * The block device's read-only state is briefly changed causing | |
1490 | * a KOBJ_CHANGE uevent to be issued. This ensures udev detects | |
1491 | * the name change and fixes the symlinks. This does not change | |
1492 | * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never | |
1493 | * changes. This would normally be done using kobject_uevent() but | |
1494 | * that is a GPL-only symbol which is why we need this workaround. | |
1495 | */ | |
1496 | set_disk_ro(zv->zv_zso->zvo_disk, !readonly); | |
1497 | set_disk_ro(zv->zv_zso->zvo_disk, readonly); | |
1498 | } | |
1499 | ||
1dccfd7a CS |
1500 | void |
1501 | zvol_os_set_disk_ro(zvol_state_t *zv, int flags) | |
5df7e9d8 MM |
1502 | { |
1503 | ||
1504 | set_disk_ro(zv->zv_zso->zvo_disk, flags); | |
1505 | } | |
1506 | ||
1dccfd7a CS |
1507 | void |
1508 | zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) | |
5df7e9d8 MM |
1509 | { |
1510 | ||
1511 | set_capacity(zv->zv_zso->zvo_disk, capacity); | |
1512 | } | |
1513 | ||
5df7e9d8 MM |
1514 | int |
1515 | zvol_init(void) | |
1516 | { | |
1517 | int error; | |
6f73d021 TH |
1518 | |
1519 | /* | |
1520 | * zvol_threads is the module param the user passes in. | |
1521 | * | |
1522 | * zvol_actual_threads is what we use internally, since the user can | |
1523 | * pass zvol_thread = 0 to mean "use all the CPUs" (the default). | |
1524 | */ | |
1525 | static unsigned int zvol_actual_threads; | |
1526 | ||
1527 | if (zvol_threads == 0) { | |
1528 | /* | |
1529 | * See dde9380a1 for why 32 was chosen here. This should | |
1530 | * probably be refined to be some multiple of the number | |
1531 | * of CPUs. | |
1532 | */ | |
1533 | zvol_actual_threads = MAX(num_online_cpus(), 32); | |
1534 | } else { | |
1535 | zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); | |
1536 | } | |
5df7e9d8 MM |
1537 | |
1538 | error = register_blkdev(zvol_major, ZVOL_DRIVER); | |
1539 | if (error) { | |
1540 | printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); | |
1541 | return (error); | |
1542 | } | |
6f73d021 TH |
1543 | |
1544 | #ifdef HAVE_BLK_MQ | |
1545 | if (zvol_blk_mq_queue_depth == 0) { | |
1546 | zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; | |
1547 | } else { | |
1548 | zvol_actual_blk_mq_queue_depth = | |
1549 | MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); | |
1550 | } | |
1551 | ||
1552 | if (zvol_blk_mq_threads == 0) { | |
1553 | zvol_blk_mq_actual_threads = num_online_cpus(); | |
1554 | } else { | |
1555 | zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), | |
1556 | 1024); | |
1557 | } | |
1558 | #endif | |
1559 | zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, | |
1560 | zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); | |
5df7e9d8 MM |
1561 | if (zvol_taskq == NULL) { |
1562 | unregister_blkdev(zvol_major, ZVOL_DRIVER); | |
1563 | return (-ENOMEM); | |
1564 | } | |
6f73d021 | 1565 | |
5df7e9d8 | 1566 | zvol_init_impl(); |
5df7e9d8 | 1567 | ida_init(&zvol_ida); |
5df7e9d8 MM |
1568 | return (0); |
1569 | } | |
1570 | ||
/*
 * Module teardown for zvols: run the common zvol teardown, unregister
 * the block major, destroy the zvol taskq, and destroy the zvol_ida
 * minor-number allocator.
 */
void
zvol_fini(void)
{
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}
1579 | ||
1580 | /* BEGIN CSTYLED */ | |
1581 | module_param(zvol_inhibit_dev, uint, 0644); | |
1582 | MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); | |
1583 | ||
1584 | module_param(zvol_major, uint, 0444); | |
1585 | MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); | |
1586 | ||
1587 | module_param(zvol_threads, uint, 0444); | |
6f73d021 TH |
1588 | MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" |
1589 | "to 0 to use all active CPUs"); | |
5df7e9d8 MM |
1590 | |
1591 | module_param(zvol_request_sync, uint, 0644); | |
1592 | MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); | |
1593 | ||
1594 | module_param(zvol_max_discard_blocks, ulong, 0444); | |
1595 | MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); | |
1596 | ||
1597 | module_param(zvol_prefetch_bytes, uint, 0644); | |
1598 | MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); | |
1599 | ||
1600 | module_param(zvol_volmode, uint, 0644); | |
1601 | MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); | |
6f73d021 TH |
1602 | |
1603 | #ifdef HAVE_BLK_MQ | |
1604 | module_param(zvol_blk_mq_queue_depth, uint, 0644); | |
1605 | MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); | |
1606 | ||
1607 | module_param(zvol_use_blk_mq, uint, 0644); | |
1608 | MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); | |
1609 | ||
1610 | module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); | |
1611 | MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, | |
1612 | "Process volblocksize blocks per thread"); | |
1613 | #endif | |
1614 | ||
945e39fc PS |
1615 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
1616 | module_param(zvol_open_timeout_ms, uint, 0644); | |
1617 | MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); | |
1618 | #endif | |
1619 | ||
5df7e9d8 | 1620 | /* END CSTYLED */ |