]>
Commit | Line | Data |
---|---|---|
5df7e9d8 MM |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
0929c4de MA |
21 | /* |
22 | * Copyright (c) 2012, 2020 by Delphix. All rights reserved. | |
23 | */ | |
5df7e9d8 MM |
24 | |
25 | #include <sys/dataset_kstats.h> | |
26 | #include <sys/dbuf.h> | |
27 | #include <sys/dmu_traverse.h> | |
28 | #include <sys/dsl_dataset.h> | |
29 | #include <sys/dsl_prop.h> | |
30 | #include <sys/dsl_dir.h> | |
31 | #include <sys/zap.h> | |
32 | #include <sys/zfeature.h> | |
33 | #include <sys/zil_impl.h> | |
34 | #include <sys/dmu_tx.h> | |
35 | #include <sys/zio.h> | |
36 | #include <sys/zfs_rlock.h> | |
37 | #include <sys/spa_impl.h> | |
38 | #include <sys/zvol.h> | |
39 | #include <sys/zvol_impl.h> | |
40 | ||
41 | #include <linux/blkdev_compat.h> | |
42 | #include <linux/task_io_accounting_ops.h> | |
43 | ||
6f73d021 TH |
44 | #ifdef HAVE_BLK_MQ |
45 | #include <linux/blk-mq.h> | |
46 | #endif | |
47 | ||
48 | static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, | |
49 | struct request *rq, boolean_t force_sync); | |
50 | ||
18168da7 AZ |
51 | static unsigned int zvol_major = ZVOL_MAJOR; |
52 | static unsigned int zvol_request_sync = 0; | |
53 | static unsigned int zvol_prefetch_bytes = (128 * 1024); | |
54 | static unsigned long zvol_max_discard_blocks = 16384; | |
abdcef47 PH |
55 | |
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
/*
 * How long (in milliseconds) zvol_open() keeps retrying to take the
 * spa_namespace_lock before it gives up and returns -ERESTARTSYS.
 */
static const unsigned int zvol_open_timeout_ms = 1000;
#endif
5df7e9d8 | 59 | |
6f73d021 TH |
60 | static unsigned int zvol_threads = 0; |
61 | #ifdef HAVE_BLK_MQ | |
62 | static unsigned int zvol_blk_mq_threads = 0; | |
63 | static unsigned int zvol_blk_mq_actual_threads; | |
64 | static boolean_t zvol_use_blk_mq = B_FALSE; | |
65 | ||
66 | /* | |
67 | * The maximum number of volblocksize blocks to process per thread. Typically, | |
68 | * write heavy workloads perform better with higher values here, and read | |
69 | * heavy workloads perform better with lower values, but that's not a hard | |
70 | * and fast rule. It's basically a knob to tune between "less overhead with | |
71 | * less parallelism" and "more overhead, but more parallelism". | |
72 | * | |
73 | * '8' was chosen as a reasonable, balanced, default based off of sequential | |
74 | * read and write tests to a zvol in an NVMe pool (with 16 CPUs). | |
75 | */ | |
76 | static unsigned int zvol_blk_mq_blocks_per_thread = 8; | |
77 | #endif | |
78 | ||
79 | #ifndef BLKDEV_DEFAULT_RQ | |
80 | /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ | |
81 | #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ | |
82 | #endif | |
83 | ||
84 | /* | |
85 | * Finalize our BIO or request. | |
86 | */ | |
#ifdef HAVE_BLK_MQ
/*
 * Complete whichever of 'bio' / 'rq' is in use.  A non-NULL bio is ended
 * with BIO_END_IO(); otherwise the blk-mq request is ended with the errno
 * translated to a blk_status_t.  'zv' is accepted but not used.
 */
#define	END_IO(zv, bio, rq, error)	do {				\
	if (bio) {							\
		BIO_END_IO(bio, error);					\
	} else {							\
		blk_mq_end_request(rq, errno_to_bi_status(error));	\
	}								\
} while (0)
#else
/* Without blk-mq only BIOs exist, so 'rq' is ignored. */
#define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
#endif
98 | ||
#ifdef HAVE_BLK_MQ
/* NOTE(review): presumably the requested blk-mq queue depth - confirm. */
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
/* Value used for the tag set's queue_depth. */
static unsigned int zvol_actual_blk_mq_queue_depth;
#endif
103 | ||
5df7e9d8 MM |
104 | struct zvol_state_os { |
105 | struct gendisk *zvo_disk; /* generic disk */ | |
106 | struct request_queue *zvo_queue; /* request queue */ | |
5df7e9d8 | 107 | dev_t zvo_dev; /* device id */ |
6f73d021 TH |
108 | |
109 | #ifdef HAVE_BLK_MQ | |
110 | struct blk_mq_tag_set tag_set; | |
111 | #endif | |
112 | ||
113 | /* Set from the global 'zvol_use_blk_mq' at zvol load */ | |
114 | boolean_t use_blk_mq; | |
5df7e9d8 MM |
115 | }; |
116 | ||
117 | taskq_t *zvol_taskq; | |
118 | static struct ida zvol_ida; | |
119 | ||
e439ee83 | 120 | typedef struct zv_request_stack { |
5df7e9d8 MM |
121 | zvol_state_t *zv; |
122 | struct bio *bio; | |
6f73d021 | 123 | struct request *rq; |
5df7e9d8 MM |
124 | } zv_request_t; |
125 | ||
6f73d021 TH |
126 | typedef struct zv_work { |
127 | struct request *rq; | |
128 | struct work_struct work; | |
129 | } zv_work_t; | |
130 | ||
e439ee83 CS |
131 | typedef struct zv_request_task { |
132 | zv_request_t zvr; | |
133 | taskq_ent_t ent; | |
134 | } zv_request_task_t; | |
135 | ||
136 | static zv_request_task_t * | |
137 | zv_request_task_create(zv_request_t zvr) | |
138 | { | |
139 | zv_request_task_t *task; | |
140 | task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); | |
141 | taskq_init_ent(&task->ent); | |
142 | task->zvr = zvr; | |
143 | return (task); | |
144 | } | |
145 | ||
146 | static void | |
147 | zv_request_task_free(zv_request_task_t *task) | |
148 | { | |
149 | kmem_free(task, sizeof (*task)); | |
150 | } | |
151 | ||
6f73d021 TH |
152 | #ifdef HAVE_BLK_MQ |
153 | ||
154 | /* | |
155 | * This is called when a new block multiqueue request comes in. A request | |
156 | * contains one or more BIOs. | |
157 | */ | |
158 | static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, | |
159 | const struct blk_mq_queue_data *bd) | |
160 | { | |
161 | struct request *rq = bd->rq; | |
162 | zvol_state_t *zv = rq->q->queuedata; | |
163 | ||
164 | /* Tell the kernel that we are starting to process this request */ | |
165 | blk_mq_start_request(rq); | |
166 | ||
167 | if (blk_rq_is_passthrough(rq)) { | |
168 | /* Skip non filesystem request */ | |
169 | blk_mq_end_request(rq, BLK_STS_IOERR); | |
170 | return (BLK_STS_IOERR); | |
171 | } | |
172 | ||
173 | zvol_request_impl(zv, NULL, rq, 0); | |
174 | ||
175 | /* Acknowledge to the kernel that we got this request */ | |
176 | return (BLK_STS_OK); | |
177 | } | |
178 | ||
179 | static struct blk_mq_ops zvol_blk_mq_queue_ops = { | |
180 | .queue_rq = zvol_mq_queue_rq, | |
181 | }; | |
182 | ||
183 | /* Initialize our blk-mq struct */ | |
184 | static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) | |
185 | { | |
186 | struct zvol_state_os *zso = zv->zv_zso; | |
187 | ||
188 | memset(&zso->tag_set, 0, sizeof (zso->tag_set)); | |
189 | ||
190 | /* Initialize tag set. */ | |
191 | zso->tag_set.ops = &zvol_blk_mq_queue_ops; | |
192 | zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; | |
193 | zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; | |
194 | zso->tag_set.numa_node = NUMA_NO_NODE; | |
195 | zso->tag_set.cmd_size = 0; | |
196 | ||
197 | /* | |
198 | * We need BLK_MQ_F_BLOCKING here since we do blocking calls in | |
199 | * zvol_request_impl() | |
200 | */ | |
201 | zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; | |
202 | zso->tag_set.driver_data = zv; | |
203 | ||
204 | return (blk_mq_alloc_tag_set(&zso->tag_set)); | |
205 | } | |
206 | #endif /* HAVE_BLK_MQ */ | |
207 | ||
5df7e9d8 MM |
208 | /* |
209 | * Given a path, return TRUE if path is a ZVOL. | |
210 | */ | |
1dccfd7a CS |
211 | boolean_t |
212 | zvol_os_is_zvol(const char *path) | |
5df7e9d8 | 213 | { |
b7281c88 | 214 | dev_t dev = 0; |
5df7e9d8 | 215 | |
b7281c88 | 216 | if (vdev_lookup_bdev(path, &dev) != 0) |
5df7e9d8 MM |
217 | return (B_FALSE); |
218 | ||
b7281c88 | 219 | if (MAJOR(dev) == zvol_major) |
5df7e9d8 MM |
220 | return (B_TRUE); |
221 | ||
222 | return (B_FALSE); | |
223 | } | |
224 | ||
5df7e9d8 | 225 | static void |
e439ee83 | 226 | zvol_write(zv_request_t *zvr) |
5df7e9d8 | 227 | { |
5df7e9d8 | 228 | struct bio *bio = zvr->bio; |
6f73d021 | 229 | struct request *rq = zvr->rq; |
1c2358c1 | 230 | int error = 0; |
d0cd9a5c | 231 | zfs_uio_t uio; |
5df7e9d8 | 232 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
233 | struct request_queue *q; |
234 | struct gendisk *disk; | |
235 | unsigned long start_time = 0; | |
236 | boolean_t acct = B_FALSE; | |
237 | ||
0b32d817 RM |
238 | ASSERT3P(zv, !=, NULL); |
239 | ASSERT3U(zv->zv_open_count, >, 0); | |
240 | ASSERT3P(zv->zv_zilog, !=, NULL); | |
5df7e9d8 | 241 | |
6f73d021 TH |
242 | q = zv->zv_zso->zvo_queue; |
243 | disk = zv->zv_zso->zvo_disk; | |
244 | ||
0929c4de | 245 | /* bio marked as FLUSH need to flush before write */ |
6f73d021 | 246 | if (io_is_flush(bio, rq)) |
0929c4de MA |
247 | zil_commit(zv->zv_zilog, ZVOL_OBJ); |
248 | ||
249 | /* Some requests are just for flush and nothing else. */ | |
6f73d021 | 250 | if (io_size(bio, rq) == 0) { |
0929c4de | 251 | rw_exit(&zv->zv_suspend_lock); |
6f73d021 | 252 | END_IO(zv, bio, rq, 0); |
0929c4de MA |
253 | return; |
254 | } | |
255 | ||
6f73d021 TH |
256 | zfs_uio_bvec_init(&uio, bio, rq); |
257 | ||
5df7e9d8 | 258 | ssize_t start_resid = uio.uio_resid; |
a970f059 | 259 | |
6f73d021 TH |
260 | /* |
261 | * With use_blk_mq, accounting is done by blk_mq_start_request() | |
262 | * and blk_mq_end_request(), so we can skip it here. | |
263 | */ | |
264 | if (bio) { | |
265 | acct = blk_queue_io_stat(q); | |
266 | if (acct) { | |
267 | start_time = blk_generic_start_io_acct(q, disk, WRITE, | |
268 | bio); | |
269 | } | |
270 | } | |
5df7e9d8 MM |
271 | |
272 | boolean_t sync = | |
6f73d021 | 273 | io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
5df7e9d8 | 274 | |
0929c4de MA |
275 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
276 | uio.uio_loffset, uio.uio_resid, RL_WRITER); | |
277 | ||
5df7e9d8 MM |
278 | uint64_t volsize = zv->zv_volsize; |
279 | while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { | |
280 | uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); | |
281 | uint64_t off = uio.uio_loffset; | |
282 | dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); | |
283 | ||
284 | if (bytes > volsize - off) /* don't write past the end */ | |
285 | bytes = volsize - off; | |
286 | ||
20f28785 | 287 | dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); |
5df7e9d8 MM |
288 | |
289 | /* This will only fail for ENOSPC */ | |
290 | error = dmu_tx_assign(tx, TXG_WAIT); | |
291 | if (error) { | |
292 | dmu_tx_abort(tx); | |
293 | break; | |
294 | } | |
295 | error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); | |
296 | if (error == 0) { | |
297 | zvol_log_write(zv, tx, off, bytes, sync); | |
298 | } | |
299 | dmu_tx_commit(tx); | |
300 | ||
301 | if (error) | |
302 | break; | |
303 | } | |
0929c4de | 304 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
305 | |
306 | int64_t nwritten = start_resid - uio.uio_resid; | |
4547fc4e | 307 | dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); |
5df7e9d8 MM |
308 | task_io_account_write(nwritten); |
309 | ||
310 | if (sync) | |
311 | zil_commit(zv->zv_zilog, ZVOL_OBJ); | |
312 | ||
313 | rw_exit(&zv->zv_suspend_lock); | |
a970f059 | 314 | |
6f73d021 | 315 | if (bio && acct) { |
a970f059 | 316 | blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); |
6f73d021 | 317 | } |
a970f059 | 318 | |
6f73d021 | 319 | END_IO(zv, bio, rq, -error); |
5df7e9d8 MM |
320 | } |
321 | ||
322 | static void | |
e439ee83 CS |
323 | zvol_write_task(void *arg) |
324 | { | |
325 | zv_request_task_t *task = arg; | |
326 | zvol_write(&task->zvr); | |
327 | zv_request_task_free(task); | |
328 | } | |
329 | ||
330 | static void | |
331 | zvol_discard(zv_request_t *zvr) | |
5df7e9d8 | 332 | { |
5df7e9d8 | 333 | struct bio *bio = zvr->bio; |
6f73d021 | 334 | struct request *rq = zvr->rq; |
5df7e9d8 | 335 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
336 | uint64_t start = io_offset(bio, rq); |
337 | uint64_t size = io_size(bio, rq); | |
5df7e9d8 MM |
338 | uint64_t end = start + size; |
339 | boolean_t sync; | |
340 | int error = 0; | |
341 | dmu_tx_t *tx; | |
6f73d021 TH |
342 | struct request_queue *q = zv->zv_zso->zvo_queue; |
343 | struct gendisk *disk = zv->zv_zso->zvo_disk; | |
344 | unsigned long start_time = 0; | |
345 | ||
346 | boolean_t acct = blk_queue_io_stat(q); | |
5df7e9d8 | 347 | |
0b32d817 RM |
348 | ASSERT3P(zv, !=, NULL); |
349 | ASSERT3U(zv->zv_open_count, >, 0); | |
350 | ASSERT3P(zv->zv_zilog, !=, NULL); | |
5df7e9d8 | 351 | |
6f73d021 TH |
352 | if (bio) { |
353 | acct = blk_queue_io_stat(q); | |
354 | if (acct) { | |
355 | start_time = blk_generic_start_io_acct(q, disk, WRITE, | |
356 | bio); | |
357 | } | |
358 | } | |
5df7e9d8 | 359 | |
6f73d021 | 360 | sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; |
5df7e9d8 MM |
361 | |
362 | if (end > zv->zv_volsize) { | |
363 | error = SET_ERROR(EIO); | |
364 | goto unlock; | |
365 | } | |
366 | ||
367 | /* | |
368 | * Align the request to volume block boundaries when a secure erase is | |
369 | * not required. This will prevent dnode_free_range() from zeroing out | |
370 | * the unaligned parts which is slow (read-modify-write) and useless | |
371 | * since we are not freeing any space by doing so. | |
372 | */ | |
6f73d021 | 373 | if (!io_is_secure_erase(bio, rq)) { |
5df7e9d8 MM |
374 | start = P2ROUNDUP(start, zv->zv_volblocksize); |
375 | end = P2ALIGN(end, zv->zv_volblocksize); | |
376 | size = end - start; | |
377 | } | |
378 | ||
379 | if (start >= end) | |
380 | goto unlock; | |
381 | ||
0929c4de MA |
382 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
383 | start, size, RL_WRITER); | |
384 | ||
5df7e9d8 MM |
385 | tx = dmu_tx_create(zv->zv_objset); |
386 | dmu_tx_mark_netfree(tx); | |
387 | error = dmu_tx_assign(tx, TXG_WAIT); | |
388 | if (error != 0) { | |
389 | dmu_tx_abort(tx); | |
390 | } else { | |
391 | zvol_log_truncate(zv, tx, start, size, B_TRUE); | |
392 | dmu_tx_commit(tx); | |
393 | error = dmu_free_long_range(zv->zv_objset, | |
394 | ZVOL_OBJ, start, size); | |
395 | } | |
0929c4de | 396 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
397 | |
398 | if (error == 0 && sync) | |
399 | zil_commit(zv->zv_zilog, ZVOL_OBJ); | |
400 | ||
0929c4de | 401 | unlock: |
5df7e9d8 | 402 | rw_exit(&zv->zv_suspend_lock); |
a970f059 | 403 | |
6f73d021 TH |
404 | if (bio && acct) { |
405 | blk_generic_end_io_acct(q, disk, WRITE, bio, | |
406 | start_time); | |
407 | } | |
a970f059 | 408 | |
6f73d021 | 409 | END_IO(zv, bio, rq, -error); |
5df7e9d8 MM |
410 | } |
411 | ||
412 | static void | |
e439ee83 CS |
413 | zvol_discard_task(void *arg) |
414 | { | |
415 | zv_request_task_t *task = arg; | |
416 | zvol_discard(&task->zvr); | |
417 | zv_request_task_free(task); | |
418 | } | |
419 | ||
420 | static void | |
421 | zvol_read(zv_request_t *zvr) | |
5df7e9d8 | 422 | { |
5df7e9d8 | 423 | struct bio *bio = zvr->bio; |
6f73d021 | 424 | struct request *rq = zvr->rq; |
1c2358c1 | 425 | int error = 0; |
d0cd9a5c | 426 | zfs_uio_t uio; |
6f73d021 | 427 | boolean_t acct = B_FALSE; |
5df7e9d8 | 428 | zvol_state_t *zv = zvr->zv; |
6f73d021 TH |
429 | struct request_queue *q; |
430 | struct gendisk *disk; | |
431 | unsigned long start_time = 0; | |
432 | ||
0b32d817 RM |
433 | ASSERT3P(zv, !=, NULL); |
434 | ASSERT3U(zv->zv_open_count, >, 0); | |
5df7e9d8 | 435 | |
6f73d021 TH |
436 | zfs_uio_bvec_init(&uio, bio, rq); |
437 | ||
438 | q = zv->zv_zso->zvo_queue; | |
439 | disk = zv->zv_zso->zvo_disk; | |
440 | ||
5df7e9d8 | 441 | ssize_t start_resid = uio.uio_resid; |
a970f059 | 442 | |
6f73d021 TH |
443 | /* |
444 | * When blk-mq is being used, accounting is done by | |
445 | * blk_mq_start_request() and blk_mq_end_request(). | |
446 | */ | |
447 | if (bio) { | |
448 | acct = blk_queue_io_stat(q); | |
449 | if (acct) | |
450 | start_time = blk_generic_start_io_acct(q, disk, READ, | |
451 | bio); | |
452 | } | |
5df7e9d8 | 453 | |
0929c4de MA |
454 | zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, |
455 | uio.uio_loffset, uio.uio_resid, RL_READER); | |
456 | ||
5df7e9d8 | 457 | uint64_t volsize = zv->zv_volsize; |
6f73d021 | 458 | |
5df7e9d8 MM |
459 | while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { |
460 | uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); | |
461 | ||
462 | /* don't read past the end */ | |
463 | if (bytes > volsize - uio.uio_loffset) | |
464 | bytes = volsize - uio.uio_loffset; | |
465 | ||
466 | error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); | |
467 | if (error) { | |
468 | /* convert checksum errors into IO errors */ | |
469 | if (error == ECKSUM) | |
470 | error = SET_ERROR(EIO); | |
471 | break; | |
472 | } | |
473 | } | |
0929c4de | 474 | zfs_rangelock_exit(lr); |
5df7e9d8 MM |
475 | |
476 | int64_t nread = start_resid - uio.uio_resid; | |
4547fc4e | 477 | dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); |
5df7e9d8 MM |
478 | task_io_account_read(nread); |
479 | ||
480 | rw_exit(&zv->zv_suspend_lock); | |
a970f059 | 481 | |
6f73d021 | 482 | if (bio && acct) { |
a970f059 | 483 | blk_generic_end_io_acct(q, disk, READ, bio, start_time); |
6f73d021 | 484 | } |
a970f059 | 485 | |
6f73d021 | 486 | END_IO(zv, bio, rq, -error); |
e439ee83 CS |
487 | } |
488 | ||
489 | static void | |
490 | zvol_read_task(void *arg) | |
491 | { | |
492 | zv_request_task_t *task = arg; | |
493 | zvol_read(&task->zvr); | |
494 | zv_request_task_free(task); | |
5df7e9d8 MM |
495 | } |
496 | ||
6f73d021 TH |
497 | |
498 | /* | |
499 | * Process a BIO or request | |
500 | * | |
501 | * Either 'bio' or 'rq' should be set depending on if we are processing a | |
502 | * bio or a request (both should not be set). | |
503 | * | |
504 | * force_sync: Set to 0 to defer processing to a background taskq | |
505 | * Set to 1 to process data synchronously | |
506 | */ | |
435a451e | 507 | static void |
6f73d021 TH |
508 | zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, |
509 | boolean_t force_sync) | |
5df7e9d8 | 510 | { |
5df7e9d8 | 511 | fstrans_cookie_t cookie = spl_fstrans_mark(); |
6f73d021 TH |
512 | uint64_t offset = io_offset(bio, rq); |
513 | uint64_t size = io_size(bio, rq); | |
514 | int rw = io_data_dir(bio, rq); | |
5df7e9d8 | 515 | |
6f73d021 TH |
516 | if (zvol_request_sync) |
517 | force_sync = 1; | |
518 | ||
519 | zv_request_t zvr = { | |
520 | .zv = zv, | |
521 | .bio = bio, | |
522 | .rq = rq, | |
523 | }; | |
524 | ||
525 | if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { | |
526 | printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", | |
5df7e9d8 MM |
527 | zv->zv_zso->zvo_disk->disk_name, |
528 | (long long unsigned)offset, | |
529 | (long unsigned)size); | |
530 | ||
6f73d021 | 531 | END_IO(zv, bio, rq, -SET_ERROR(EIO)); |
5df7e9d8 MM |
532 | goto out; |
533 | } | |
534 | ||
e439ee83 CS |
535 | zv_request_task_t *task; |
536 | ||
5df7e9d8 | 537 | if (rw == WRITE) { |
5df7e9d8 | 538 | if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { |
6f73d021 | 539 | END_IO(zv, bio, rq, -SET_ERROR(EROFS)); |
5df7e9d8 MM |
540 | goto out; |
541 | } | |
542 | ||
543 | /* | |
0929c4de MA |
544 | * Prevents the zvol from being suspended, or the ZIL being |
545 | * concurrently opened. Will be released after the i/o | |
546 | * completes. | |
5df7e9d8 MM |
547 | */ |
548 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
549 | ||
550 | /* | |
551 | * Open a ZIL if this is the first time we have written to this | |
552 | * zvol. We protect zv->zv_zilog with zv_suspend_lock rather | |
553 | * than zv_state_lock so that we don't need to acquire an | |
554 | * additional lock in this path. | |
555 | */ | |
556 | if (zv->zv_zilog == NULL) { | |
557 | rw_exit(&zv->zv_suspend_lock); | |
558 | rw_enter(&zv->zv_suspend_lock, RW_WRITER); | |
559 | if (zv->zv_zilog == NULL) { | |
560 | zv->zv_zilog = zil_open(zv->zv_objset, | |
561 | zvol_get_data); | |
562 | zv->zv_flags |= ZVOL_WRITTEN_TO; | |
93e36580 CS |
563 | /* replay / destroy done in zvol_create_minor */ |
564 | VERIFY0((zv->zv_zilog->zl_header->zh_flags & | |
565 | ZIL_REPLAY_NEEDED)); | |
5df7e9d8 MM |
566 | } |
567 | rw_downgrade(&zv->zv_suspend_lock); | |
568 | } | |
569 | ||
5df7e9d8 | 570 | /* |
0929c4de MA |
571 | * We don't want this thread to be blocked waiting for i/o to |
572 | * complete, so we instead wait from a taskq callback. The | |
573 | * i/o may be a ZIL write (via zil_commit()), or a read of an | |
574 | * indirect block, or a read of a data block (if this is a | |
575 | * partial-block write). We will indicate that the i/o is | |
6f73d021 | 576 | * complete by calling END_IO() from the taskq callback. |
0929c4de MA |
577 | * |
578 | * This design allows the calling thread to continue and | |
579 | * initiate more concurrent operations by calling | |
580 | * zvol_request() again. There are typically only a small | |
581 | * number of threads available to call zvol_request() (e.g. | |
582 | * one per iSCSI target), so keeping the latency of | |
583 | * zvol_request() low is important for performance. | |
584 | * | |
585 | * The zvol_request_sync module parameter allows this | |
586 | * behavior to be altered, for performance evaluation | |
587 | * purposes. If the callback blocks, setting | |
588 | * zvol_request_sync=1 will result in much worse performance. | |
589 | * | |
590 | * We can have up to zvol_threads concurrent i/o's being | |
591 | * processed for all zvols on the system. This is typically | |
592 | * a vast improvement over the zvol_request_sync=1 behavior | |
593 | * of one i/o at a time per zvol. However, an even better | |
594 | * design would be for zvol_request() to initiate the zio | |
595 | * directly, and then be notified by the zio_done callback, | |
6f73d021 | 596 | * which would call END_IO(). Unfortunately, the DMU/ZIL |
0929c4de MA |
597 | * interfaces lack this functionality (they block waiting for |
598 | * the i/o to complete). | |
5df7e9d8 | 599 | */ |
6f73d021 TH |
600 | if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { |
601 | if (force_sync) { | |
e439ee83 | 602 | zvol_discard(&zvr); |
0929c4de | 603 | } else { |
e439ee83 | 604 | task = zv_request_task_create(zvr); |
0929c4de | 605 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 606 | zvol_discard_task, task, 0, &task->ent); |
0929c4de | 607 | } |
5df7e9d8 | 608 | } else { |
6f73d021 | 609 | if (force_sync) { |
e439ee83 | 610 | zvol_write(&zvr); |
0929c4de | 611 | } else { |
e439ee83 | 612 | task = zv_request_task_create(zvr); |
0929c4de | 613 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 614 | zvol_write_task, task, 0, &task->ent); |
0929c4de | 615 | } |
5df7e9d8 MM |
616 | } |
617 | } else { | |
618 | /* | |
619 | * The SCST driver, and possibly others, may issue READ I/Os | |
620 | * with a length of zero bytes. These empty I/Os contain no | |
621 | * data and require no additional handling. | |
622 | */ | |
623 | if (size == 0) { | |
6f73d021 | 624 | END_IO(zv, bio, rq, 0); |
5df7e9d8 MM |
625 | goto out; |
626 | } | |
627 | ||
5df7e9d8 MM |
628 | rw_enter(&zv->zv_suspend_lock, RW_READER); |
629 | ||
0929c4de | 630 | /* See comment in WRITE case above. */ |
6f73d021 | 631 | if (force_sync) { |
e439ee83 | 632 | zvol_read(&zvr); |
0929c4de | 633 | } else { |
e439ee83 | 634 | task = zv_request_task_create(zvr); |
0929c4de | 635 | taskq_dispatch_ent(zvol_taskq, |
e439ee83 | 636 | zvol_read_task, task, 0, &task->ent); |
0929c4de | 637 | } |
5df7e9d8 MM |
638 | } |
639 | ||
640 | out: | |
641 | spl_fstrans_unmark(cookie); | |
6f73d021 TH |
642 | } |
643 | ||
644 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS | |
645 | #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID | |
646 | static void | |
647 | zvol_submit_bio(struct bio *bio) | |
648 | #else | |
649 | static blk_qc_t | |
650 | zvol_submit_bio(struct bio *bio) | |
651 | #endif | |
652 | #else | |
653 | static MAKE_REQUEST_FN_RET | |
654 | zvol_request(struct request_queue *q, struct bio *bio) | |
655 | #endif | |
656 | { | |
657 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS | |
658 | #if defined(HAVE_BIO_BDEV_DISK) | |
659 | struct request_queue *q = bio->bi_bdev->bd_disk->queue; | |
660 | #else | |
661 | struct request_queue *q = bio->bi_disk->queue; | |
662 | #endif | |
663 | #endif | |
664 | zvol_state_t *zv = q->queuedata; | |
665 | ||
666 | zvol_request_impl(zv, bio, NULL, 0); | |
667 | #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ | |
668 | defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ | |
435a451e | 669 | !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) |
5df7e9d8 MM |
670 | return (BLK_QC_T_NONE); |
671 | #endif | |
672 | } | |
673 | ||
674 | static int | |
675 | zvol_open(struct block_device *bdev, fmode_t flag) | |
676 | { | |
677 | zvol_state_t *zv; | |
678 | int error = 0; | |
8a02d01e | 679 | boolean_t drop_suspend = B_FALSE; |
77e2756d BB |
680 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
681 | hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); | |
682 | hrtime_t start = gethrtime(); | |
5df7e9d8 | 683 | |
77e2756d BB |
684 | retry: |
685 | #endif | |
5df7e9d8 MM |
686 | rw_enter(&zvol_state_lock, RW_READER); |
687 | /* | |
688 | * Obtain a copy of private_data under the zvol_state_lock to make | |
689 | * sure that either the result of zvol free code path setting | |
1dccfd7a | 690 | * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free() |
5df7e9d8 MM |
691 | * is not called on this zv because of the positive zv_open_count. |
692 | */ | |
693 | zv = bdev->bd_disk->private_data; | |
694 | if (zv == NULL) { | |
695 | rw_exit(&zvol_state_lock); | |
696 | return (SET_ERROR(-ENXIO)); | |
697 | } | |
698 | ||
8a02d01e BB |
699 | mutex_enter(&zv->zv_state_lock); |
700 | /* | |
701 | * Make sure zvol is not suspended during first open | |
702 | * (hold zv_suspend_lock) and respect proper lock acquisition | |
703 | * ordering - zv_suspend_lock before zv_state_lock | |
704 | */ | |
705 | if (zv->zv_open_count == 0) { | |
706 | if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { | |
707 | mutex_exit(&zv->zv_state_lock); | |
708 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
709 | mutex_enter(&zv->zv_state_lock); | |
710 | /* check to see if zv_suspend_lock is needed */ | |
711 | if (zv->zv_open_count != 0) { | |
712 | rw_exit(&zv->zv_suspend_lock); | |
713 | } else { | |
714 | drop_suspend = B_TRUE; | |
715 | } | |
716 | } else { | |
717 | drop_suspend = B_TRUE; | |
718 | } | |
719 | } | |
720 | rw_exit(&zvol_state_lock); | |
721 | ||
722 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
723 | ||
724 | if (zv->zv_open_count == 0) { | |
725 | boolean_t drop_namespace = B_FALSE; | |
726 | ||
727 | ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); | |
728 | ||
77e2756d BB |
729 | /* |
730 | * In all other call paths the spa_namespace_lock is taken | |
731 | * before the bdev->bd_mutex lock. However, on open(2) | |
732 | * the __blkdev_get() function calls fops->open() with the | |
733 | * bdev->bd_mutex lock held. This can result in a deadlock | |
734 | * when zvols from one pool are used as vdevs in another. | |
735 | * | |
736 | * To prevent a lock inversion deadlock we preemptively | |
737 | * take the spa_namespace_lock. Normally the lock will not | |
738 | * be contended and this is safe because spa_open_common() | |
739 | * handles the case where the caller already holds the | |
740 | * spa_namespace_lock. | |
741 | * | |
742 | * When the lock cannot be aquired after multiple retries | |
743 | * this must be the vdev on zvol deadlock case and we have | |
744 | * no choice but to return an error. For 5.12 and older | |
745 | * kernels returning -ERESTARTSYS will result in the | |
746 | * bdev->bd_mutex being dropped, then reacquired, and | |
747 | * fops->open() being called again. This process can be | |
748 | * repeated safely until both locks are acquired. For 5.13 | |
749 | * and newer the -ERESTARTSYS retry logic was removed from | |
750 | * the kernel so the only option is to return the error for | |
751 | * the caller to handle it. | |
752 | */ | |
8a02d01e BB |
753 | if (!mutex_owned(&spa_namespace_lock)) { |
754 | if (!mutex_tryenter(&spa_namespace_lock)) { | |
755 | mutex_exit(&zv->zv_state_lock); | |
756 | rw_exit(&zv->zv_suspend_lock); | |
77e2756d BB |
757 | |
758 | #ifdef HAVE_BLKDEV_GET_ERESTARTSYS | |
8a02d01e | 759 | schedule(); |
77e2756d | 760 | return (SET_ERROR(-ERESTARTSYS)); |
8a02d01e BB |
761 | #else |
762 | if ((gethrtime() - start) > timeout) | |
763 | return (SET_ERROR(-ERESTARTSYS)); | |
77e2756d | 764 | |
8a02d01e BB |
765 | schedule_timeout(MSEC_TO_TICK(10)); |
766 | goto retry; | |
77e2756d | 767 | #endif |
8a02d01e BB |
768 | } else { |
769 | drop_namespace = B_TRUE; | |
5df7e9d8 MM |
770 | } |
771 | } | |
5df7e9d8 | 772 | |
5df7e9d8 | 773 | error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); |
5df7e9d8 | 774 | |
8a02d01e BB |
775 | if (drop_namespace) |
776 | mutex_exit(&spa_namespace_lock); | |
5df7e9d8 MM |
777 | } |
778 | ||
8a02d01e BB |
779 | if (error == 0) { |
780 | if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { | |
781 | if (zv->zv_open_count == 0) | |
782 | zvol_last_close(zv); | |
5df7e9d8 | 783 | |
8a02d01e BB |
784 | error = SET_ERROR(-EROFS); |
785 | } else { | |
786 | zv->zv_open_count++; | |
787 | } | |
788 | } | |
5df7e9d8 | 789 | |
5df7e9d8 MM |
790 | mutex_exit(&zv->zv_state_lock); |
791 | if (drop_suspend) | |
792 | rw_exit(&zv->zv_suspend_lock); | |
77e2756d | 793 | |
8a02d01e BB |
794 | if (error == 0) |
795 | zfs_check_media_change(bdev); | |
796 | ||
797 | return (error); | |
5df7e9d8 MM |
798 | } |
799 | ||
5df7e9d8 | 800 | static void |
5df7e9d8 MM |
801 | zvol_release(struct gendisk *disk, fmode_t mode) |
802 | { | |
803 | zvol_state_t *zv; | |
804 | boolean_t drop_suspend = B_TRUE; | |
805 | ||
806 | rw_enter(&zvol_state_lock, RW_READER); | |
807 | zv = disk->private_data; | |
808 | ||
809 | mutex_enter(&zv->zv_state_lock); | |
0b32d817 | 810 | ASSERT3U(zv->zv_open_count, >, 0); |
5df7e9d8 MM |
811 | /* |
812 | * make sure zvol is not suspended during last close | |
813 | * (hold zv_suspend_lock) and respect proper lock acquisition | |
814 | * ordering - zv_suspend_lock before zv_state_lock | |
815 | */ | |
816 | if (zv->zv_open_count == 1) { | |
817 | if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { | |
818 | mutex_exit(&zv->zv_state_lock); | |
819 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
820 | mutex_enter(&zv->zv_state_lock); | |
821 | /* check to see if zv_suspend_lock is needed */ | |
822 | if (zv->zv_open_count != 1) { | |
823 | rw_exit(&zv->zv_suspend_lock); | |
824 | drop_suspend = B_FALSE; | |
825 | } | |
826 | } | |
827 | } else { | |
828 | drop_suspend = B_FALSE; | |
829 | } | |
830 | rw_exit(&zvol_state_lock); | |
831 | ||
832 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
5df7e9d8 MM |
833 | |
834 | zv->zv_open_count--; | |
0b32d817 RM |
835 | if (zv->zv_open_count == 0) { |
836 | ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); | |
5df7e9d8 | 837 | zvol_last_close(zv); |
0b32d817 | 838 | } |
5df7e9d8 MM |
839 | |
840 | mutex_exit(&zv->zv_state_lock); | |
841 | ||
842 | if (drop_suspend) | |
843 | rw_exit(&zv->zv_suspend_lock); | |
5df7e9d8 MM |
844 | } |
845 | ||
846 | static int | |
847 | zvol_ioctl(struct block_device *bdev, fmode_t mode, | |
848 | unsigned int cmd, unsigned long arg) | |
849 | { | |
850 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
851 | int error = 0; | |
852 | ||
853 | ASSERT3U(zv->zv_open_count, >, 0); | |
854 | ||
855 | switch (cmd) { | |
856 | case BLKFLSBUF: | |
857 | fsync_bdev(bdev); | |
858 | invalidate_bdev(bdev); | |
859 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
860 | ||
861 | if (!(zv->zv_flags & ZVOL_RDONLY)) | |
862 | txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); | |
863 | ||
864 | rw_exit(&zv->zv_suspend_lock); | |
865 | break; | |
866 | ||
867 | case BLKZNAME: | |
868 | mutex_enter(&zv->zv_state_lock); | |
869 | error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); | |
870 | mutex_exit(&zv->zv_state_lock); | |
871 | break; | |
872 | ||
873 | default: | |
874 | error = -ENOTTY; | |
875 | break; | |
876 | } | |
877 | ||
878 | return (SET_ERROR(error)); | |
879 | } | |
880 | ||
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: forwards every argument unchanged to
 * zvol_ioctl() and returns its result.
 */
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	int rc = zvol_ioctl(bdev, mode, cmd, arg);

	return (rc);
}
#else
/* Without CONFIG_COMPAT no compat handler is installed. */
#define	zvol_compat_ioctl	NULL
#endif
891 | ||
5df7e9d8 MM |
892 | static unsigned int |
893 | zvol_check_events(struct gendisk *disk, unsigned int clearing) | |
894 | { | |
895 | unsigned int mask = 0; | |
896 | ||
897 | rw_enter(&zvol_state_lock, RW_READER); | |
898 | ||
899 | zvol_state_t *zv = disk->private_data; | |
900 | if (zv != NULL) { | |
901 | mutex_enter(&zv->zv_state_lock); | |
902 | mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; | |
903 | zv->zv_changed = 0; | |
904 | mutex_exit(&zv->zv_state_lock); | |
905 | } | |
906 | ||
907 | rw_exit(&zvol_state_lock); | |
908 | ||
909 | return (mask); | |
910 | } | |
5df7e9d8 MM |
911 | |
912 | static int | |
913 | zvol_revalidate_disk(struct gendisk *disk) | |
914 | { | |
915 | rw_enter(&zvol_state_lock, RW_READER); | |
916 | ||
917 | zvol_state_t *zv = disk->private_data; | |
918 | if (zv != NULL) { | |
919 | mutex_enter(&zv->zv_state_lock); | |
920 | set_capacity(zv->zv_zso->zvo_disk, | |
921 | zv->zv_volsize >> SECTOR_BITS); | |
922 | mutex_exit(&zv->zv_state_lock); | |
923 | } | |
924 | ||
925 | rw_exit(&zvol_state_lock); | |
926 | ||
927 | return (0); | |
928 | } | |
929 | ||
1dccfd7a CS |
930 | int |
931 | zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) | |
5df7e9d8 | 932 | { |
1c0bbd52 | 933 | struct gendisk *disk = zv->zv_zso->zvo_disk; |
5df7e9d8 | 934 | |
19697e45 | 935 | #if defined(HAVE_REVALIDATE_DISK_SIZE) |
1c0bbd52 | 936 | revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); |
19697e45 | 937 | #elif defined(HAVE_REVALIDATE_DISK) |
1c0bbd52 | 938 | revalidate_disk(disk); |
19697e45 BB |
939 | #else |
940 | zvol_revalidate_disk(disk); | |
59b68723 | 941 | #endif |
5df7e9d8 MM |
942 | return (0); |
943 | } | |
944 | ||
1dccfd7a CS |
945 | void |
946 | zvol_os_clear_private(zvol_state_t *zv) | |
5df7e9d8 MM |
947 | { |
948 | /* | |
949 | * Cleared while holding zvol_state_lock as a writer | |
950 | * which will prevent zvol_open() from opening it. | |
951 | */ | |
952 | zv->zv_zso->zvo_disk->private_data = NULL; | |
953 | } | |
954 | ||
955 | /* | |
956 | * Provide a simple virtual geometry for legacy compatibility. For devices | |
957 | * smaller than 1 MiB a small head and sector count is used to allow very | |
958 | * tiny devices. For devices over 1 Mib a standard head and sector count | |
959 | * is used to keep the cylinders count reasonable. | |
960 | */ | |
961 | static int | |
962 | zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |
963 | { | |
964 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
965 | sector_t sectors; | |
966 | ||
967 | ASSERT3U(zv->zv_open_count, >, 0); | |
968 | ||
969 | sectors = get_capacity(zv->zv_zso->zvo_disk); | |
970 | ||
971 | if (sectors > 2048) { | |
972 | geo->heads = 16; | |
973 | geo->sectors = 63; | |
974 | } else { | |
975 | geo->heads = 2; | |
976 | geo->sectors = 4; | |
977 | } | |
978 | ||
979 | geo->start = 0; | |
980 | geo->cylinders = sectors / (geo->heads * geo->sectors); | |
981 | ||
982 | return (0); | |
983 | } | |
984 | ||
6f73d021 TH |
985 | /* |
986 | * Why have two separate block_device_operations structs? | |
987 | * | |
988 | * Normally we'd just have one, and assign 'submit_bio' as needed. However, | |
989 | * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we | |
990 | * can't just change submit_bio dynamically at runtime. So just create two | |
991 | * separate structs to get around this. | |
992 | */ | |
993 | static const struct block_device_operations zvol_ops_blk_mq = { | |
994 | .open = zvol_open, | |
995 | .release = zvol_release, | |
996 | .ioctl = zvol_ioctl, | |
997 | .compat_ioctl = zvol_compat_ioctl, | |
998 | .check_events = zvol_check_events, | |
999 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK | |
1000 | .revalidate_disk = zvol_revalidate_disk, | |
1001 | #endif | |
1002 | .getgeo = zvol_getgeo, | |
1003 | .owner = THIS_MODULE, | |
1004 | }; | |
1005 | ||
18168da7 | 1006 | static const struct block_device_operations zvol_ops = { |
5df7e9d8 MM |
1007 | .open = zvol_open, |
1008 | .release = zvol_release, | |
1009 | .ioctl = zvol_ioctl, | |
1010 | .compat_ioctl = zvol_compat_ioctl, | |
5df7e9d8 | 1011 | .check_events = zvol_check_events, |
48c7b0e4 | 1012 | #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK |
5df7e9d8 | 1013 | .revalidate_disk = zvol_revalidate_disk, |
48c7b0e4 | 1014 | #endif |
5df7e9d8 MM |
1015 | .getgeo = zvol_getgeo, |
1016 | .owner = THIS_MODULE, | |
d817c171 | 1017 | #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS |
1b06b03a | 1018 | .submit_bio = zvol_submit_bio, |
d817c171 | 1019 | #endif |
5df7e9d8 MM |
1020 | }; |
1021 | ||
6f73d021 TH |
1022 | static int |
1023 | zvol_alloc_non_blk_mq(struct zvol_state_os *zso) | |
1024 | { | |
1025 | #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) | |
1026 | #if defined(HAVE_BLK_ALLOC_DISK) | |
1027 | zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); | |
1028 | if (zso->zvo_disk == NULL) | |
1029 | return (1); | |
1030 | ||
1031 | zso->zvo_disk->minors = ZVOL_MINORS; | |
1032 | zso->zvo_queue = zso->zvo_disk->queue; | |
1033 | #else | |
1034 | zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); | |
1035 | if (zso->zvo_queue == NULL) | |
1036 | return (1); | |
1037 | ||
1038 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1039 | if (zso->zvo_disk == NULL) { | |
1040 | blk_cleanup_queue(zso->zvo_queue); | |
1041 | return (1); | |
1042 | } | |
1043 | ||
1044 | zso->zvo_disk->queue = zso->zvo_queue; | |
1045 | #endif /* HAVE_BLK_ALLOC_DISK */ | |
1046 | #else | |
1047 | zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); | |
1048 | if (zso->zvo_queue == NULL) | |
1049 | return (1); | |
1050 | ||
1051 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1052 | if (zso->zvo_disk == NULL) { | |
1053 | blk_cleanup_queue(zso->zvo_queue); | |
1054 | return (1); | |
1055 | } | |
1056 | ||
1057 | zso->zvo_disk->queue = zso->zvo_queue; | |
1058 | #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ | |
1059 | return (0); | |
1060 | ||
1061 | } | |
1062 | ||
1063 | static int | |
1064 | zvol_alloc_blk_mq(zvol_state_t *zv) | |
1065 | { | |
1066 | #ifdef HAVE_BLK_MQ | |
1067 | struct zvol_state_os *zso = zv->zv_zso; | |
1068 | ||
1069 | /* Allocate our blk-mq tag_set */ | |
1070 | if (zvol_blk_mq_alloc_tag_set(zv) != 0) | |
1071 | return (1); | |
1072 | ||
1073 | #if defined(HAVE_BLK_ALLOC_DISK) | |
1074 | zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); | |
1075 | if (zso->zvo_disk == NULL) { | |
1076 | blk_mq_free_tag_set(&zso->tag_set); | |
1077 | return (1); | |
1078 | } | |
1079 | zso->zvo_queue = zso->zvo_disk->queue; | |
1080 | zso->zvo_disk->minors = ZVOL_MINORS; | |
1081 | #else | |
1082 | zso->zvo_disk = alloc_disk(ZVOL_MINORS); | |
1083 | if (zso->zvo_disk == NULL) { | |
1084 | blk_cleanup_queue(zso->zvo_queue); | |
1085 | blk_mq_free_tag_set(&zso->tag_set); | |
1086 | return (1); | |
1087 | } | |
1088 | /* Allocate queue */ | |
1089 | zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); | |
1090 | if (IS_ERR(zso->zvo_queue)) { | |
1091 | blk_mq_free_tag_set(&zso->tag_set); | |
1092 | return (1); | |
1093 | } | |
1094 | ||
1095 | /* Our queue is now created, assign it to our disk */ | |
1096 | zso->zvo_disk->queue = zso->zvo_queue; | |
1097 | ||
1098 | #endif | |
1099 | #endif | |
1100 | return (0); | |
1101 | } | |
1102 | ||
5df7e9d8 MM |
1103 | /* |
1104 | * Allocate memory for a new zvol_state_t and setup the required | |
1105 | * request queue and generic disk structures for the block device. | |
1106 | */ | |
1107 | static zvol_state_t * | |
1108 | zvol_alloc(dev_t dev, const char *name) | |
1109 | { | |
1110 | zvol_state_t *zv; | |
68dde63d | 1111 | struct zvol_state_os *zso; |
5df7e9d8 | 1112 | uint64_t volmode; |
6f73d021 | 1113 | int ret; |
5df7e9d8 MM |
1114 | |
1115 | if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) | |
1116 | return (NULL); | |
1117 | ||
1118 | if (volmode == ZFS_VOLMODE_DEFAULT) | |
1119 | volmode = zvol_volmode; | |
1120 | ||
1121 | if (volmode == ZFS_VOLMODE_NONE) | |
1122 | return (NULL); | |
1123 | ||
1124 | zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); | |
68dde63d BB |
1125 | zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); |
1126 | zv->zv_zso = zso; | |
0ca45cb3 | 1127 | zv->zv_volmode = volmode; |
5df7e9d8 MM |
1128 | |
1129 | list_link_init(&zv->zv_next); | |
5df7e9d8 MM |
1130 | mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); |
1131 | ||
6f73d021 TH |
1132 | #ifdef HAVE_BLK_MQ |
1133 | zv->zv_zso->use_blk_mq = zvol_use_blk_mq; | |
1134 | #endif | |
1b06b03a | 1135 | |
6f73d021 TH |
1136 | /* |
1137 | * The block layer has 3 interfaces for getting BIOs: | |
1138 | * | |
1139 | * 1. blk-mq request queues (new) | |
1140 | * 2. submit_bio() (oldest) | |
1141 | * 3. regular request queues (old). | |
1142 | * | |
1143 | * Each of those interfaces has two permutations: | |
1144 | * | |
1145 | * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates | |
1146 | * both the disk and its queue (5.14 kernel or newer) | |
1147 | * | |
1148 | * b) We don't have blk_*alloc_disk(), and have to allocate the | |
1149 | * disk and the queue separately. (5.13 kernel or older) | |
1150 | */ | |
1151 | if (zv->zv_zso->use_blk_mq) { | |
1152 | ret = zvol_alloc_blk_mq(zv); | |
1153 | zso->zvo_disk->fops = &zvol_ops_blk_mq; | |
1154 | } else { | |
1155 | ret = zvol_alloc_non_blk_mq(zso); | |
1156 | zso->zvo_disk->fops = &zvol_ops; | |
1b06b03a | 1157 | } |
6f73d021 | 1158 | if (ret != 0) |
5df7e9d8 MM |
1159 | goto out_kmem; |
1160 | ||
68dde63d | 1161 | blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); |
5df7e9d8 MM |
1162 | |
1163 | /* Limit read-ahead to a single page to prevent over-prefetching. */ | |
68dde63d | 1164 | blk_queue_set_read_ahead(zso->zvo_queue, 1); |
5df7e9d8 | 1165 | |
6f73d021 TH |
1166 | if (!zv->zv_zso->use_blk_mq) { |
1167 | /* Disable write merging in favor of the ZIO pipeline. */ | |
1168 | blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); | |
1169 | } | |
5df7e9d8 | 1170 | |
ae1e40b3 BB |
1171 | /* Enable /proc/diskstats */ |
1172 | blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); | |
1173 | ||
68dde63d BB |
1174 | zso->zvo_queue->queuedata = zv; |
1175 | zso->zvo_dev = dev; | |
5df7e9d8 MM |
1176 | zv->zv_open_count = 0; |
1177 | strlcpy(zv->zv_name, name, MAXNAMELEN); | |
1178 | ||
2cc479d0 | 1179 | zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); |
5df7e9d8 MM |
1180 | rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); |
1181 | ||
68dde63d BB |
1182 | zso->zvo_disk->major = zvol_major; |
1183 | zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; | |
5df7e9d8 | 1184 | |
026f126b BB |
1185 | /* |
1186 | * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. | |
1187 | * This is accomplished by limiting the number of minors for the | |
1188 | * device to one and explicitly disabling partition scanning. | |
1189 | */ | |
5df7e9d8 | 1190 | if (volmode == ZFS_VOLMODE_DEV) { |
68dde63d | 1191 | zso->zvo_disk->minors = 1; |
026f126b BB |
1192 | zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; |
1193 | zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; | |
5df7e9d8 | 1194 | } |
026f126b | 1195 | |
68dde63d | 1196 | zso->zvo_disk->first_minor = (dev & MINORMASK); |
68dde63d | 1197 | zso->zvo_disk->private_data = zv; |
68dde63d | 1198 | snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", |
5df7e9d8 MM |
1199 | ZVOL_DEV_NAME, (dev & MINORMASK)); |
1200 | ||
1201 | return (zv); | |
1202 | ||
5df7e9d8 | 1203 | out_kmem: |
68dde63d | 1204 | kmem_free(zso, sizeof (struct zvol_state_os)); |
5df7e9d8 MM |
1205 | kmem_free(zv, sizeof (zvol_state_t)); |
1206 | return (NULL); | |
1207 | } | |
1208 | ||
1209 | /* | |
1210 | * Cleanup then free a zvol_state_t which was created by zvol_alloc(). | |
1211 | * At this time, the structure is not opened by anyone, is taken off | |
1212 | * the zvol_state_list, and has its private data set to NULL. | |
1213 | * The zvol_state_lock is dropped. | |
99573cc0 PS |
1214 | * |
1215 | * This function may take many milliseconds to complete (e.g. we've seen | |
1216 | * it take over 256ms), due to the calls to "blk_cleanup_queue" and | |
1217 | * "del_gendisk". Thus, consumers need to be careful to account for this | |
1218 | * latency when calling this function. | |
5df7e9d8 | 1219 | */ |
1dccfd7a CS |
1220 | void |
1221 | zvol_os_free(zvol_state_t *zv) | |
5df7e9d8 MM |
1222 | { |
1223 | ||
1224 | ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); | |
1225 | ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); | |
0b32d817 RM |
1226 | ASSERT0(zv->zv_open_count); |
1227 | ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); | |
5df7e9d8 MM |
1228 | |
1229 | rw_destroy(&zv->zv_suspend_lock); | |
2cc479d0 | 1230 | zfs_rangelock_fini(&zv->zv_rangelock); |
5df7e9d8 MM |
1231 | |
1232 | del_gendisk(zv->zv_zso->zvo_disk); | |
1b06b03a BB |
1233 | #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ |
1234 | defined(HAVE_BLK_ALLOC_DISK) | |
1235 | blk_cleanup_disk(zv->zv_zso->zvo_disk); | |
1236 | #else | |
5df7e9d8 MM |
1237 | blk_cleanup_queue(zv->zv_zso->zvo_queue); |
1238 | put_disk(zv->zv_zso->zvo_disk); | |
1b06b03a | 1239 | #endif |
5df7e9d8 | 1240 | |
6f73d021 TH |
1241 | #ifdef HAVE_BLK_MQ |
1242 | if (zv->zv_zso->use_blk_mq) | |
1243 | blk_mq_free_tag_set(&zv->zv_zso->tag_set); | |
1244 | #endif | |
1245 | ||
5df7e9d8 MM |
1246 | ida_simple_remove(&zvol_ida, |
1247 | MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); | |
1248 | ||
1249 | mutex_destroy(&zv->zv_state_lock); | |
4547fc4e | 1250 | dataset_kstats_destroy(&zv->zv_kstat); |
5df7e9d8 MM |
1251 | |
1252 | kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); | |
1253 | kmem_free(zv, sizeof (zvol_state_t)); | |
1254 | } | |
1255 | ||
0ca45cb3 MM |
1256 | void |
1257 | zvol_wait_close(zvol_state_t *zv) | |
1258 | { | |
1259 | } | |
1260 | ||
5df7e9d8 MM |
1261 | /* |
1262 | * Create a block device minor node and setup the linkage between it | |
1263 | * and the specified volume. Once this function returns the block | |
1264 | * device is live and ready for use. | |
1265 | */ | |
1dccfd7a | 1266 | int |
ec213971 | 1267 | zvol_os_create_minor(const char *name) |
5df7e9d8 MM |
1268 | { |
1269 | zvol_state_t *zv; | |
1270 | objset_t *os; | |
1271 | dmu_object_info_t *doi; | |
1272 | uint64_t volsize; | |
1273 | uint64_t len; | |
1274 | unsigned minor = 0; | |
1275 | int error = 0; | |
1276 | int idx; | |
1277 | uint64_t hash = zvol_name_hash(name); | |
1278 | ||
1279 | if (zvol_inhibit_dev) | |
1280 | return (0); | |
1281 | ||
1282 | idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); | |
1283 | if (idx < 0) | |
1284 | return (SET_ERROR(-idx)); | |
1285 | minor = idx << ZVOL_MINOR_BITS; | |
1286 | ||
1287 | zv = zvol_find_by_name_hash(name, hash, RW_NONE); | |
1288 | if (zv) { | |
1289 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1290 | mutex_exit(&zv->zv_state_lock); | |
1291 | ida_simple_remove(&zvol_ida, idx); | |
1292 | return (SET_ERROR(EEXIST)); | |
1293 | } | |
1294 | ||
1295 | doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); | |
1296 | ||
1297 | error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); | |
1298 | if (error) | |
1299 | goto out_doi; | |
1300 | ||
1301 | error = dmu_object_info(os, ZVOL_OBJ, doi); | |
1302 | if (error) | |
1303 | goto out_dmu_objset_disown; | |
1304 | ||
1305 | error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); | |
1306 | if (error) | |
1307 | goto out_dmu_objset_disown; | |
1308 | ||
1309 | zv = zvol_alloc(MKDEV(zvol_major, minor), name); | |
1310 | if (zv == NULL) { | |
1311 | error = SET_ERROR(EAGAIN); | |
1312 | goto out_dmu_objset_disown; | |
1313 | } | |
1314 | zv->zv_hash = hash; | |
1315 | ||
1316 | if (dmu_objset_is_snapshot(os)) | |
1317 | zv->zv_flags |= ZVOL_RDONLY; | |
1318 | ||
1319 | zv->zv_volblocksize = doi->doi_data_block_size; | |
1320 | zv->zv_volsize = volsize; | |
1321 | zv->zv_objset = os; | |
1322 | ||
1323 | set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); | |
1324 | ||
1325 | blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, | |
1326 | (DMU_MAX_ACCESS / 4) >> 9); | |
6f73d021 TH |
1327 | |
1328 | if (zv->zv_zso->use_blk_mq) { | |
1329 | /* | |
1330 | * IO requests can be really big (1MB). When an IO request | |
1331 | * comes in, it is passed off to zvol_read() or zvol_write() | |
1332 | * in a new thread, where it is chunked up into 'volblocksize' | |
1333 | * sized pieces and processed. So for example, if the request | |
1334 | * is a 1MB write and your volblocksize is 128k, one zvol_write | |
1335 | * thread will take that request and sequentially do ten 128k | |
1336 | * IOs. This is due to the fact that the thread needs to lock | |
1337 | * each volblocksize sized block. So you might be wondering: | |
1338 | * "instead of passing the whole 1MB request to one thread, | |
1339 | * why not pass ten individual 128k chunks to ten threads and | |
1340 | * process the whole write in parallel?" The short answer is | |
1341 | * that there's a sweet spot number of chunks that balances | |
1342 | * the greater parallelism with the added overhead of more | |
1343 | * threads. The sweet spot can be different depending on if you | |
1344 | * have a read or write heavy workload. Writes typically want | |
1345 | * high chunk counts while reads typically want lower ones. On | |
1346 | * a test pool with 6 NVMe drives in a 3x 2-disk mirror | |
1347 | * configuration, with volblocksize=8k, the sweet spot for good | |
1348 | * sequential reads and writes was at 8 chunks. | |
1349 | */ | |
1350 | ||
1351 | /* | |
1352 | * Below we tell the kernel how big we want our requests | |
1353 | * to be. You would think that blk_queue_io_opt() would be | |
1354 | * used to do this since it is used to "set optimal request | |
1355 | * size for the queue", but that doesn't seem to do | |
1356 | * anything - the kernel still gives you huge requests | |
1357 | * with tons of little PAGE_SIZE segments contained within it. | |
1358 | * | |
1359 | * Knowing that the kernel will just give you PAGE_SIZE segments | |
1360 | * no matter what, you can say "ok, I want PAGE_SIZE byte | |
1361 | * segments, and I want 'N' of them per request", where N is | |
1362 | * the correct number of segments for the volblocksize and | |
1363 | * number of chunks you want. | |
1364 | */ | |
1365 | #ifdef HAVE_BLK_MQ | |
1366 | if (zvol_blk_mq_blocks_per_thread != 0) { | |
1367 | unsigned int chunks; | |
1368 | chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); | |
1369 | ||
1370 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, | |
1371 | PAGE_SIZE); | |
1372 | blk_queue_max_segments(zv->zv_zso->zvo_queue, | |
1373 | (zv->zv_volblocksize * chunks) / PAGE_SIZE); | |
1374 | } else { | |
1375 | /* | |
1376 | * Special case: zvol_blk_mq_blocks_per_thread = 0 | |
1377 | * Max everything out. | |
1378 | */ | |
1379 | blk_queue_max_segments(zv->zv_zso->zvo_queue, | |
1380 | UINT16_MAX); | |
1381 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, | |
1382 | UINT_MAX); | |
1383 | } | |
1384 | #endif | |
1385 | } else { | |
1386 | blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); | |
1387 | blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); | |
1388 | } | |
1389 | ||
5df7e9d8 MM |
1390 | blk_queue_physical_block_size(zv->zv_zso->zvo_queue, |
1391 | zv->zv_volblocksize); | |
1392 | blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); | |
1393 | blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, | |
1394 | (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); | |
1395 | blk_queue_discard_granularity(zv->zv_zso->zvo_queue, | |
1396 | zv->zv_volblocksize); | |
5e4aedac | 1397 | #ifdef QUEUE_FLAG_DISCARD |
5df7e9d8 | 1398 | blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); |
5e4aedac | 1399 | #endif |
5df7e9d8 MM |
1400 | #ifdef QUEUE_FLAG_NONROT |
1401 | blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); | |
1402 | #endif | |
1403 | #ifdef QUEUE_FLAG_ADD_RANDOM | |
1404 | blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); | |
1405 | #endif | |
1406 | /* This flag was introduced in kernel version 4.12. */ | |
1407 | #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH | |
1408 | blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); | |
1409 | #endif | |
1410 | ||
93e36580 CS |
1411 | ASSERT3P(zv->zv_zilog, ==, NULL); |
1412 | zv->zv_zilog = zil_open(os, zvol_get_data); | |
5df7e9d8 MM |
1413 | if (spa_writeable(dmu_objset_spa(os))) { |
1414 | if (zil_replay_disable) | |
93e36580 | 1415 | zil_destroy(zv->zv_zilog, B_FALSE); |
5df7e9d8 MM |
1416 | else |
1417 | zil_replay(os, zv, zvol_replay_vector); | |
1418 | } | |
93e36580 CS |
1419 | zil_close(zv->zv_zilog); |
1420 | zv->zv_zilog = NULL; | |
4547fc4e AJ |
1421 | ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); |
1422 | dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); | |
5df7e9d8 MM |
1423 | |
1424 | /* | |
1425 | * When udev detects the addition of the device it will immediately | |
1426 | * invoke blkid(8) to determine the type of content on the device. | |
1427 | * Prefetching the blocks commonly scanned by blkid(8) will speed | |
1428 | * up this process. | |
1429 | */ | |
1430 | len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); | |
1431 | if (len > 0) { | |
1432 | dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); | |
1433 | dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, | |
1434 | ZIO_PRIORITY_SYNC_READ); | |
1435 | } | |
1436 | ||
1437 | zv->zv_objset = NULL; | |
1438 | out_dmu_objset_disown: | |
1439 | dmu_objset_disown(os, B_TRUE, FTAG); | |
1440 | out_doi: | |
1441 | kmem_free(doi, sizeof (dmu_object_info_t)); | |
1442 | ||
1443 | /* | |
1444 | * Keep in mind that once add_disk() is called, the zvol is | |
1445 | * announced to the world, and zvol_open()/zvol_release() can | |
1446 | * be called at any time. Incidentally, add_disk() itself calls | |
1447 | * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() | |
1448 | * directly as well. | |
1449 | */ | |
1450 | if (error == 0) { | |
1451 | rw_enter(&zvol_state_lock, RW_WRITER); | |
1452 | zvol_insert(zv); | |
1453 | rw_exit(&zvol_state_lock); | |
12fa250d RE |
1454 | #ifdef HAVE_ADD_DISK_RET |
1455 | error = add_disk(zv->zv_zso->zvo_disk); | |
1456 | #else | |
5df7e9d8 | 1457 | add_disk(zv->zv_zso->zvo_disk); |
12fa250d | 1458 | #endif |
5df7e9d8 MM |
1459 | } else { |
1460 | ida_simple_remove(&zvol_ida, idx); | |
1461 | } | |
1462 | ||
ec213971 | 1463 | return (error); |
5df7e9d8 MM |
1464 | } |
1465 | ||
1dccfd7a CS |
1466 | void |
1467 | zvol_os_rename_minor(zvol_state_t *zv, const char *newname) | |
5df7e9d8 MM |
1468 | { |
1469 | int readonly = get_disk_ro(zv->zv_zso->zvo_disk); | |
1470 | ||
1471 | ASSERT(RW_LOCK_HELD(&zvol_state_lock)); | |
1472 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1473 | ||
1474 | strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); | |
1475 | ||
1476 | /* move to new hashtable entry */ | |
1477 | zv->zv_hash = zvol_name_hash(zv->zv_name); | |
1478 | hlist_del(&zv->zv_hlink); | |
1479 | hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); | |
1480 | ||
1481 | /* | |
1482 | * The block device's read-only state is briefly changed causing | |
1483 | * a KOBJ_CHANGE uevent to be issued. This ensures udev detects | |
1484 | * the name change and fixes the symlinks. This does not change | |
1485 | * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never | |
1486 | * changes. This would normally be done using kobject_uevent() but | |
1487 | * that is a GPL-only symbol which is why we need this workaround. | |
1488 | */ | |
1489 | set_disk_ro(zv->zv_zso->zvo_disk, !readonly); | |
1490 | set_disk_ro(zv->zv_zso->zvo_disk, readonly); | |
1491 | } | |
1492 | ||
1dccfd7a CS |
1493 | void |
1494 | zvol_os_set_disk_ro(zvol_state_t *zv, int flags) | |
5df7e9d8 MM |
1495 | { |
1496 | ||
1497 | set_disk_ro(zv->zv_zso->zvo_disk, flags); | |
1498 | } | |
1499 | ||
1dccfd7a CS |
1500 | void |
1501 | zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) | |
5df7e9d8 MM |
1502 | { |
1503 | ||
1504 | set_capacity(zv->zv_zso->zvo_disk, capacity); | |
1505 | } | |
1506 | ||
5df7e9d8 MM |
1507 | int |
1508 | zvol_init(void) | |
1509 | { | |
1510 | int error; | |
6f73d021 TH |
1511 | |
1512 | /* | |
1513 | * zvol_threads is the module param the user passes in. | |
1514 | * | |
1515 | * zvol_actual_threads is what we use internally, since the user can | |
1516 | * pass zvol_thread = 0 to mean "use all the CPUs" (the default). | |
1517 | */ | |
1518 | static unsigned int zvol_actual_threads; | |
1519 | ||
1520 | if (zvol_threads == 0) { | |
1521 | /* | |
1522 | * See dde9380a1 for why 32 was chosen here. This should | |
1523 | * probably be refined to be some multiple of the number | |
1524 | * of CPUs. | |
1525 | */ | |
1526 | zvol_actual_threads = MAX(num_online_cpus(), 32); | |
1527 | } else { | |
1528 | zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); | |
1529 | } | |
5df7e9d8 MM |
1530 | |
1531 | error = register_blkdev(zvol_major, ZVOL_DRIVER); | |
1532 | if (error) { | |
1533 | printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); | |
1534 | return (error); | |
1535 | } | |
6f73d021 TH |
1536 | |
1537 | #ifdef HAVE_BLK_MQ | |
1538 | if (zvol_blk_mq_queue_depth == 0) { | |
1539 | zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; | |
1540 | } else { | |
1541 | zvol_actual_blk_mq_queue_depth = | |
1542 | MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); | |
1543 | } | |
1544 | ||
1545 | if (zvol_blk_mq_threads == 0) { | |
1546 | zvol_blk_mq_actual_threads = num_online_cpus(); | |
1547 | } else { | |
1548 | zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), | |
1549 | 1024); | |
1550 | } | |
1551 | #endif | |
1552 | zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, | |
1553 | zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); | |
5df7e9d8 MM |
1554 | if (zvol_taskq == NULL) { |
1555 | unregister_blkdev(zvol_major, ZVOL_DRIVER); | |
1556 | return (-ENOMEM); | |
1557 | } | |
6f73d021 | 1558 | |
5df7e9d8 | 1559 | zvol_init_impl(); |
5df7e9d8 | 1560 | ida_init(&zvol_ida); |
5df7e9d8 MM |
1561 | return (0); |
1562 | } | |
1563 | ||
1564 | void | |
1565 | zvol_fini(void) | |
1566 | { | |
5df7e9d8 | 1567 | zvol_fini_impl(); |
5df7e9d8 MM |
1568 | unregister_blkdev(zvol_major, ZVOL_DRIVER); |
1569 | taskq_destroy(zvol_taskq); | |
1570 | ida_destroy(&zvol_ida); | |
1571 | } | |
1572 | ||
1573 | /* BEGIN CSTYLED */ | |
1574 | module_param(zvol_inhibit_dev, uint, 0644); | |
1575 | MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); | |
1576 | ||
1577 | module_param(zvol_major, uint, 0444); | |
1578 | MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); | |
1579 | ||
1580 | module_param(zvol_threads, uint, 0444); | |
6f73d021 TH |
1581 | MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" |
1582 | "to 0 to use all active CPUs"); | |
5df7e9d8 MM |
1583 | |
1584 | module_param(zvol_request_sync, uint, 0644); | |
1585 | MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); | |
1586 | ||
1587 | module_param(zvol_max_discard_blocks, ulong, 0444); | |
1588 | MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); | |
1589 | ||
1590 | module_param(zvol_prefetch_bytes, uint, 0644); | |
1591 | MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); | |
1592 | ||
1593 | module_param(zvol_volmode, uint, 0644); | |
1594 | MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); | |
6f73d021 TH |
1595 | |
1596 | #ifdef HAVE_BLK_MQ | |
1597 | module_param(zvol_blk_mq_queue_depth, uint, 0644); | |
1598 | MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); | |
1599 | ||
1600 | module_param(zvol_use_blk_mq, uint, 0644); | |
1601 | MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); | |
1602 | ||
1603 | module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); | |
1604 | MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, | |
1605 | "Process volblocksize blocks per thread"); | |
1606 | #endif | |
1607 | ||
5df7e9d8 | 1608 | /* END CSTYLED */ |