]>
Commit | Line | Data |
---|---|---|
5df7e9d8 MM |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
5df7e9d8 MM |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
0929c4de MA |
21 | /* |
22 | * Copyright (c) 2012, 2020 by Delphix. All rights reserved. | |
23 | */ | |
5df7e9d8 MM |
24 | |
25 | #include <sys/dataset_kstats.h> | |
26 | #include <sys/dbuf.h> | |
27 | #include <sys/dmu_traverse.h> | |
28 | #include <sys/dsl_dataset.h> | |
29 | #include <sys/dsl_prop.h> | |
30 | #include <sys/dsl_dir.h> | |
31 | #include <sys/zap.h> | |
32 | #include <sys/zfeature.h> | |
33 | #include <sys/zil_impl.h> | |
34 | #include <sys/dmu_tx.h> | |
35 | #include <sys/zio.h> | |
36 | #include <sys/zfs_rlock.h> | |
37 | #include <sys/spa_impl.h> | |
38 | #include <sys/zvol.h> | |
39 | #include <sys/zvol_impl.h> | |
99741bde | 40 | #include <cityhash.h> |
5df7e9d8 MM |
41 | |
42 | #include <linux/blkdev_compat.h> | |
43 | #include <linux/task_io_accounting_ops.h> | |
44 | ||
6f73d021 TH |
45 | #ifdef HAVE_BLK_MQ |
46 | #include <linux/blk-mq.h> | |
47 | #endif | |
48 | ||
49 | static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, | |
50 | struct request *rq, boolean_t force_sync); | |
51 | ||
18168da7 AZ |
52 | static unsigned int zvol_major = ZVOL_MAJOR; |
53 | static unsigned int zvol_request_sync = 0; | |
54 | static unsigned int zvol_prefetch_bytes = (128 * 1024); | |
55 | static unsigned long zvol_max_discard_blocks = 16384; | |
abdcef47 | 56 | |
99741bde AH |
57 | /* |
58 | * Switch taskq at multiple of 512 MB offset. This can be set to a lower value | |
59 | * to utilize more threads for small files but may affect prefetch hits. | |
60 | */ | |
61 | #define ZVOL_TASKQ_OFFSET_SHIFT 29 | |
62 | ||
abdcef47 | 63 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
945e39fc | 64 | static unsigned int zvol_open_timeout_ms = 1000; |
abdcef47 | 65 | #endif |
5df7e9d8 | 66 | |
6f73d021 TH |
67 | static unsigned int zvol_threads = 0; |
68 | #ifdef HAVE_BLK_MQ | |
69 | static unsigned int zvol_blk_mq_threads = 0; | |
70 | static unsigned int zvol_blk_mq_actual_threads; | |
71 | static boolean_t zvol_use_blk_mq = B_FALSE; | |
72 | ||
73 | /* | |
74 | * The maximum number of volblocksize blocks to process per thread. Typically, | |
75 | * write heavy workloads preform better with higher values here, and read | |
76 | * heavy workloads preform better with lower values, but that's not a hard | |
77 | * and fast rule. It's basically a knob to tune between "less overhead with | |
78 | * less parallelism" and "more overhead, but more parallelism". | |
79 | * | |
80 | * '8' was chosen as a reasonable, balanced, default based off of sequential | |
81 | * read and write tests to a zvol in an NVMe pool (with 16 CPUs). | |
82 | */ | |
83 | static unsigned int zvol_blk_mq_blocks_per_thread = 8; | |
84 | #endif | |
85 | ||
c13400c9 RN |
86 | static unsigned int zvol_num_taskqs = 0; |
87 | ||
6f73d021 TH |
88 | #ifndef BLKDEV_DEFAULT_RQ |
89 | /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ | |
90 | #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ | |
91 | #endif | |
92 | ||
/*
 * Finalize our BIO or request.
 *
 * Exactly one of 'bio' and 'rq' is expected to be non-NULL (see
 * zvol_request_impl()).  A BIO is completed through BIO_END_IO(); a
 * blk-mq request is completed with 'error' translated from a negated
 * errno to a blk_status_t.  When built without blk-mq support only the
 * BIO form exists.  'zv' is unused but kept for a uniform call site.
 */
#ifdef HAVE_BLK_MQ
#define	END_IO(zv, bio, rq, error)  do { \
	if (bio) { \
		BIO_END_IO(bio, error); \
	} else { \
		blk_mq_end_request(rq, errno_to_bi_status(error)); \
	} \
} while (0)
#else
#define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
#endif
107 | ||
108 | #ifdef HAVE_BLK_MQ | |
109 | static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; | |
110 | static unsigned int zvol_actual_blk_mq_queue_depth; | |
111 | #endif | |
112 | ||
5df7e9d8 MM |
/*
 * Linux-specific per-zvol state, attached to the platform-independent
 * zvol_state_t (referenced here as zv->zv_zso).
 */
struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */

#ifdef HAVE_BLK_MQ
	/* blk-mq tag set, initialized by zvol_blk_mq_alloc_tag_set() */
	struct blk_mq_tag_set tag_set;
#endif

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};
125 | ||
99741bde AH |
/*
 * Set of taskqs over which zvol I/O is spread; a request is hashed to
 * one of them in zvol_request_impl().
 */
typedef struct zv_taskq {
	uint_t tqs_cnt;		/* number of entries in tqs_taskq */
	taskq_t **tqs_taskq;	/* array of taskq pointers */
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
5df7e9d8 MM |
131 | static struct ida zvol_ida; |
132 | ||
/*
 * One zvol I/O, carrying either a BIO or a blk-mq request for the
 * target volume (never both -- see zvol_request_impl()).
 */
typedef struct zv_request_stack {
	zvol_state_t	*zv;	/* target zvol */
	struct bio	*bio;	/* BIO, or NULL when rq is set */
	struct request	*rq;	/* blk-mq request, or NULL when bio is set */
} zv_request_t;
138 | ||
6f73d021 TH |
/* Pairs a blk-mq request with a work_struct for deferred handling. */
typedef struct zv_work {
	struct request  *rq;
	struct work_struct work;
} zv_work_t;
143 | ||
e439ee83 CS |
/*
 * Heap-allocated wrapper around a zv_request_t with the taskq entry
 * used to dispatch it (see zv_request_task_create()).
 */
typedef struct zv_request_task {
	zv_request_t zvr;	/* the request being serviced */
	taskq_ent_t ent;	/* entry for taskq_dispatch_ent() */
} zv_request_task_t;
148 | ||
149 | static zv_request_task_t * | |
150 | zv_request_task_create(zv_request_t zvr) | |
151 | { | |
152 | zv_request_task_t *task; | |
153 | task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); | |
154 | taskq_init_ent(&task->ent); | |
155 | task->zvr = zvr; | |
156 | return (task); | |
157 | } | |
158 | ||
159 | static void | |
160 | zv_request_task_free(zv_request_task_t *task) | |
161 | { | |
162 | kmem_free(task, sizeof (*task)); | |
163 | } | |
164 | ||
6f73d021 TH |
165 | #ifdef HAVE_BLK_MQ |
166 | ||
167 | /* | |
168 | * This is called when a new block multiqueue request comes in. A request | |
169 | * contains one or more BIOs. | |
170 | */ | |
171 | static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, | |
172 | const struct blk_mq_queue_data *bd) | |
173 | { | |
174 | struct request *rq = bd->rq; | |
175 | zvol_state_t *zv = rq->q->queuedata; | |
176 | ||
177 | /* Tell the kernel that we are starting to process this request */ | |
178 | blk_mq_start_request(rq); | |
179 | ||
180 | if (blk_rq_is_passthrough(rq)) { | |
181 | /* Skip non filesystem request */ | |
182 | blk_mq_end_request(rq, BLK_STS_IOERR); | |
183 | return (BLK_STS_IOERR); | |
184 | } | |
185 | ||
186 | zvol_request_impl(zv, NULL, rq, 0); | |
187 | ||
188 | /* Acknowledge to the kernel that we got this request */ | |
189 | return (BLK_STS_OK); | |
190 | } | |
191 | ||
/* blk-mq operations vector for zvols; only queue_rq is provided. */
static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};
195 | ||
196 | /* Initialize our blk-mq struct */ | |
197 | static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) | |
198 | { | |
199 | struct zvol_state_os *zso = zv->zv_zso; | |
200 | ||
201 | memset(&zso->tag_set, 0, sizeof (zso->tag_set)); | |
202 | ||
203 | /* Initialize tag set. */ | |
204 | zso->tag_set.ops = &zvol_blk_mq_queue_ops; | |
205 | zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; | |
206 | zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; | |
207 | zso->tag_set.numa_node = NUMA_NO_NODE; | |
208 | zso->tag_set.cmd_size = 0; | |
209 | ||
210 | /* | |
211 | * We need BLK_MQ_F_BLOCKING here since we do blocking calls in | |
212 | * zvol_request_impl() | |
213 | */ | |
214 | zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; | |
215 | zso->tag_set.driver_data = zv; | |
216 | ||
217 | return (blk_mq_alloc_tag_set(&zso->tag_set)); | |
218 | } | |
219 | #endif /* HAVE_BLK_MQ */ | |
220 | ||
5df7e9d8 MM |
221 | /* |
222 | * Given a path, return TRUE if path is a ZVOL. | |
223 | */ | |
1dccfd7a CS |
224 | boolean_t |
225 | zvol_os_is_zvol(const char *path) | |
5df7e9d8 | 226 | { |
b7281c88 | 227 | dev_t dev = 0; |
5df7e9d8 | 228 | |
b7281c88 | 229 | if (vdev_lookup_bdev(path, &dev) != 0) |
5df7e9d8 MM |
230 | return (B_FALSE); |
231 | ||
b7281c88 | 232 | if (MAJOR(dev) == zvol_major) |
5df7e9d8 MM |
233 | return (B_TRUE); |
234 | ||
235 | return (B_FALSE); | |
236 | } | |
237 | ||
/*
 * Service a write (and/or flush) request described by 'zvr'.  Runs
 * either inline or from a taskq callback (see zvol_request_impl()); in
 * both cases the dispatcher acquired zv_suspend_lock as reader, and it
 * is released here before the I/O is completed via END_IO().
 */
static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* bio marked as FLUSH need to flush before write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		END_IO(zv, bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	/* FUA writes, or sync=always datasets, must commit the ZIL below. */
	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/* Exclude overlapping writers for the affected byte range. */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		/* Cap each transaction at half the DMU access limit. */
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			/* Log to the ZIL for possible synchronous replay. */
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	/* Account for what was actually written (may be a partial write). */
	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}
334 | ||
335 | static void | |
e439ee83 CS |
336 | zvol_write_task(void *arg) |
337 | { | |
338 | zv_request_task_t *task = arg; | |
339 | zvol_write(&task->zvr); | |
340 | zv_request_task_free(task); | |
341 | } | |
342 | ||
/*
 * Service a discard/TRIM (or secure erase) request described by 'zvr'.
 * Frees the covered range via dmu_free_long_range().  The dispatcher
 * acquired zv_suspend_lock as reader; it is released here (via the
 * 'unlock' label) before completion.
 */
static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* blk-mq accounting is handled by blk_mq_{start,end}_request(). */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 *
	 * NOTE(review): both operands of P2ALIGN are uint64_t here, so the
	 * signed-truncation pitfall that motivated P2ALIGN_TYPED upstream
	 * does not apply -- confirm before changing the types.
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	/* Nothing left after alignment; complete with success. */
	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		/* Log the truncate first, then free the range itself. */
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	END_IO(zv, bio, rq, -error);
}
423 | ||
424 | static void | |
e439ee83 CS |
425 | zvol_discard_task(void *arg) |
426 | { | |
427 | zv_request_task_t *task = arg; | |
428 | zvol_discard(&task->zvr); | |
429 | zv_request_task_free(task); | |
430 | } | |
431 | ||
/*
 * Service a read request described by 'zvr'.  The dispatcher acquired
 * zv_suspend_lock as reader; it is released here before the I/O is
 * completed via END_IO().
 */
static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	/* Shared range lock: concurrent readers are fine. */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		/* Cap each DMU read at half the DMU access limit. */
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	/* Account for what was actually read (may be a partial read). */
	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}
500 | ||
501 | static void | |
502 | zvol_read_task(void *arg) | |
503 | { | |
504 | zv_request_task_t *task = arg; | |
505 | zvol_read(&task->zvr); | |
506 | zv_request_task_free(task); | |
5df7e9d8 MM |
507 | } |
508 | ||
6f73d021 TH |
509 | |
/*
 * Process a BIO or request
 *
 * Either 'bio' or 'rq' should be set depending on if we are processing a
 * bio or a request (both should not be set).
 *
 * force_sync: Set to 0 to defer processing to a background taskq
 *             Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw = io_data_dir(bio, rq);

	/*
	 * Module-wide zvol_request_sync, or per-volume zv_threading=off,
	 * both force inline processing.
	 */
	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	/* Reject data I/O extending beyond the end of the volume. */
	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		END_IO(zv, bio, rq, -SET_ERROR(EIO));
		goto out;
	}

	/*
	 * Select a taskq by hashing the zvol, the offset bucket (see
	 * ZVOL_TASKQ_OFFSET_SHIFT) and, for blk-mq, the hardware queue.
	 */
	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
#ifdef HAVE_BLK_MQ
	if (rq)
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#endif
	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue, 0);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			END_IO(zv, bio, rq, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			/* Upgrade to writer; re-check under the new lock. */
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			END_IO(zv, bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}
666 | ||
/*
 * BIO entry point.  Depending on the kernel version this is either the
 * block_device_operations submit_bio callback (possibly void-returning)
 * or a legacy make_request function.  All variants recover the zvol from
 * the queue and forward the BIO to zvol_request_impl(), which defers to
 * a taskq unless synchronous handling is forced.
 */
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
	/* Only kernels whose variant returns a value need the return. */
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
    defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
    !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}
696 | ||
/*
 * Open callback for the zvol block device.  On first open it takes
 * zv_suspend_lock (so the zvol cannot be suspended mid-open) and
 * pre-acquires spa_namespace_lock to sidestep a lock inversion against
 * __blkdev_get() (see the comment below).  Returns 0 on success or a
 * negative errno.
 */
static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	/* Bound the namespace-lock retry loop (see below). */
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			/* Drop/retake in the correct order, then re-check. */
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be aquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (SET_ERROR(-ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (SET_ERROR(-ERESTARTSYS));

				schedule_timeout(MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		/* Refuse writable opens of read-only zvols. */
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = SET_ERROR(-EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}
835 | ||
/*
 * Release (close) callback for the zvol block device.  Drops one open
 * reference; on the last close, zvol_last_close() is invoked while
 * zv_suspend_lock is held so the volume cannot be suspended mid-close.
 */
static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			/* Drop/retake in the correct order, then re-check. */
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}
888 | ||
889 | static int | |
890 | zvol_ioctl(struct block_device *bdev, fmode_t mode, | |
891 | unsigned int cmd, unsigned long arg) | |
892 | { | |
893 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
894 | int error = 0; | |
895 | ||
896 | ASSERT3U(zv->zv_open_count, >, 0); | |
897 | ||
898 | switch (cmd) { | |
899 | case BLKFLSBUF: | |
7ac56b86 | 900 | #ifdef HAVE_FSYNC_BDEV |
5df7e9d8 | 901 | fsync_bdev(bdev); |
7ac56b86 CK |
902 | #elif defined(HAVE_SYNC_BLOCKDEV) |
903 | sync_blockdev(bdev); | |
904 | #else | |
905 | #error "Neither fsync_bdev() nor sync_blockdev() found" | |
906 | #endif | |
5df7e9d8 MM |
907 | invalidate_bdev(bdev); |
908 | rw_enter(&zv->zv_suspend_lock, RW_READER); | |
909 | ||
910 | if (!(zv->zv_flags & ZVOL_RDONLY)) | |
911 | txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); | |
912 | ||
913 | rw_exit(&zv->zv_suspend_lock); | |
914 | break; | |
915 | ||
916 | case BLKZNAME: | |
917 | mutex_enter(&zv->zv_state_lock); | |
918 | error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); | |
919 | mutex_exit(&zv->zv_state_lock); | |
920 | break; | |
921 | ||
922 | default: | |
923 | error = -ENOTTY; | |
924 | break; | |
925 | } | |
926 | ||
927 | return (SET_ERROR(error)); | |
928 | } | |
929 | ||
#ifdef CONFIG_COMPAT
/*
 * 32-bit compat ioctl entry point: simply forwards to the native
 * zvol_ioctl() handler.  When the kernel is built without compat
 * support the operations table gets a NULL instead.
 */
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif
940 | ||
5df7e9d8 MM |
941 | static unsigned int |
942 | zvol_check_events(struct gendisk *disk, unsigned int clearing) | |
943 | { | |
944 | unsigned int mask = 0; | |
945 | ||
946 | rw_enter(&zvol_state_lock, RW_READER); | |
947 | ||
948 | zvol_state_t *zv = disk->private_data; | |
949 | if (zv != NULL) { | |
950 | mutex_enter(&zv->zv_state_lock); | |
951 | mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; | |
952 | zv->zv_changed = 0; | |
953 | mutex_exit(&zv->zv_state_lock); | |
954 | } | |
955 | ||
956 | rw_exit(&zvol_state_lock); | |
957 | ||
958 | return (mask); | |
959 | } | |
5df7e9d8 MM |
960 | |
961 | static int | |
962 | zvol_revalidate_disk(struct gendisk *disk) | |
963 | { | |
964 | rw_enter(&zvol_state_lock, RW_READER); | |
965 | ||
966 | zvol_state_t *zv = disk->private_data; | |
967 | if (zv != NULL) { | |
968 | mutex_enter(&zv->zv_state_lock); | |
969 | set_capacity(zv->zv_zso->zvo_disk, | |
970 | zv->zv_volsize >> SECTOR_BITS); | |
971 | mutex_exit(&zv->zv_state_lock); | |
972 | } | |
973 | ||
974 | rw_exit(&zvol_state_lock); | |
975 | ||
976 | return (0); | |
977 | } | |
978 | ||
1dccfd7a CS |
/*
 * Propagate a volume-size change to the kernel by revalidating the disk.
 * Which revalidation interface exists varies by kernel version, hence
 * the #if ladder.  The volsize argument itself is not consumed here;
 * zvol_revalidate_disk() reads the already-updated zv->zv_volsize.
 * Always returns 0.
 */
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}
993 | ||
1dccfd7a CS |
/*
 * Detach the gendisk from the zvol state so handlers that look at
 * disk->private_data (e.g. zvol_check_events) see NULL from now on.
 */
void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}
1003 | ||
1004 | /* | |
1005 | * Provide a simple virtual geometry for legacy compatibility. For devices | |
1006 | * smaller than 1 MiB a small head and sector count is used to allow very | |
1007 | * tiny devices. For devices over 1 Mib a standard head and sector count | |
1008 | * is used to keep the cylinders count reasonable. | |
1009 | */ | |
1010 | static int | |
1011 | zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) | |
1012 | { | |
1013 | zvol_state_t *zv = bdev->bd_disk->private_data; | |
1014 | sector_t sectors; | |
1015 | ||
1016 | ASSERT3U(zv->zv_open_count, >, 0); | |
1017 | ||
1018 | sectors = get_capacity(zv->zv_zso->zvo_disk); | |
1019 | ||
1020 | if (sectors > 2048) { | |
1021 | geo->heads = 16; | |
1022 | geo->sectors = 63; | |
1023 | } else { | |
1024 | geo->heads = 2; | |
1025 | geo->sectors = 4; | |
1026 | } | |
1027 | ||
1028 | geo->start = 0; | |
1029 | geo->cylinders = sectors / (geo->heads * geo->sectors); | |
1030 | ||
1031 | return (0); | |
1032 | } | |
1033 | ||
6f73d021 TH |
/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
 *
 * This is the blk-mq variant: note there is no .submit_bio member here;
 * requests reach us through the blk-mq tag set instead (see
 * zvol_alloc_blk_mq()).
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};
1054 | ||
/*
 * Non-blk-mq operations table.  Identical to zvol_ops_blk_mq except
 * that, on kernels where block_device_operations carries submit_bio,
 * BIOs are delivered directly via zvol_submit_bio.
 */
static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};
1070 | ||
6f73d021 TH |
/*
 * Allocate the gendisk and request queue for the non-blk-mq path.
 * Returns 0 on success, 1 on allocation failure.  The #if ladder covers
 * the different allocation primitives offered by different kernel
 * versions (see the interface summary comment in zvol_alloc()).
 */
static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	/* 5.14+: disk and queue allocated together, NULL on failure. */
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	/* Two-argument blk_alloc_disk() signals failure via ERR_PTR. */
	struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#else
	/* 5.13 and older: allocate queue and disk separately. */
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	/* No submit_bio in block_device_operations: legacy request queue. */
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
	return (0);

}
1121 | ||
/*
 * Allocate the gendisk and request queue for the blk-mq path and bind
 * them to this zvol's tag set.  Returns 0 on success, 1 on failure.
 * On kernels built without blk-mq support (no HAVE_BLK_MQ) the body
 * compiles away and the function trivially returns 0.
 */
static int
zvol_alloc_blk_mq(zvol_state_t *zv)
{
#ifdef HAVE_BLK_MQ
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	/* Two-argument blk_mq_alloc_disk() signals failure via ERR_PTR. */
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	/*
	 * NOTE(review): in this legacy branch blk_cleanup_queue() is called
	 * on zso->zvo_queue before blk_mq_init_queue() has assigned it (it
	 * is still NULL from the zalloc of zso), and the disk is not
	 * put_disk()'d if blk_mq_init_queue() fails.  Both look suspect —
	 * confirm against kernels that actually take this path.
	 */
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;

#endif
#endif
	return (0);
}
1172 | ||
5df7e9d8 MM |
1173 | /* |
1174 | * Allocate memory for a new zvol_state_t and setup the required | |
1175 | * request queue and generic disk structures for the block device. | |
1176 | */ | |
1177 | static zvol_state_t * | |
1178 | zvol_alloc(dev_t dev, const char *name) | |
1179 | { | |
1180 | zvol_state_t *zv; | |
68dde63d | 1181 | struct zvol_state_os *zso; |
5df7e9d8 | 1182 | uint64_t volmode; |
6f73d021 | 1183 | int ret; |
5df7e9d8 MM |
1184 | |
1185 | if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) | |
1186 | return (NULL); | |
1187 | ||
1188 | if (volmode == ZFS_VOLMODE_DEFAULT) | |
1189 | volmode = zvol_volmode; | |
1190 | ||
1191 | if (volmode == ZFS_VOLMODE_NONE) | |
1192 | return (NULL); | |
1193 | ||
1194 | zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); | |
68dde63d BB |
1195 | zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); |
1196 | zv->zv_zso = zso; | |
0ca45cb3 | 1197 | zv->zv_volmode = volmode; |
5df7e9d8 MM |
1198 | |
1199 | list_link_init(&zv->zv_next); | |
5df7e9d8 MM |
1200 | mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); |
1201 | ||
6f73d021 TH |
1202 | #ifdef HAVE_BLK_MQ |
1203 | zv->zv_zso->use_blk_mq = zvol_use_blk_mq; | |
1204 | #endif | |
1b06b03a | 1205 | |
6f73d021 TH |
1206 | /* |
1207 | * The block layer has 3 interfaces for getting BIOs: | |
1208 | * | |
1209 | * 1. blk-mq request queues (new) | |
1210 | * 2. submit_bio() (oldest) | |
1211 | * 3. regular request queues (old). | |
1212 | * | |
1213 | * Each of those interfaces has two permutations: | |
1214 | * | |
1215 | * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates | |
1216 | * both the disk and its queue (5.14 kernel or newer) | |
1217 | * | |
1218 | * b) We don't have blk_*alloc_disk(), and have to allocate the | |
1219 | * disk and the queue separately. (5.13 kernel or older) | |
1220 | */ | |
1221 | if (zv->zv_zso->use_blk_mq) { | |
1222 | ret = zvol_alloc_blk_mq(zv); | |
1223 | zso->zvo_disk->fops = &zvol_ops_blk_mq; | |
1224 | } else { | |
1225 | ret = zvol_alloc_non_blk_mq(zso); | |
1226 | zso->zvo_disk->fops = &zvol_ops; | |
1b06b03a | 1227 | } |
6f73d021 | 1228 | if (ret != 0) |
5df7e9d8 MM |
1229 | goto out_kmem; |
1230 | ||
68dde63d | 1231 | blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); |
5df7e9d8 MM |
1232 | |
1233 | /* Limit read-ahead to a single page to prevent over-prefetching. */ | |
68dde63d | 1234 | blk_queue_set_read_ahead(zso->zvo_queue, 1); |
5df7e9d8 | 1235 | |
6f73d021 TH |
1236 | if (!zv->zv_zso->use_blk_mq) { |
1237 | /* Disable write merging in favor of the ZIO pipeline. */ | |
1238 | blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); | |
1239 | } | |
5df7e9d8 | 1240 | |
ae1e40b3 BB |
1241 | /* Enable /proc/diskstats */ |
1242 | blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); | |
1243 | ||
68dde63d BB |
1244 | zso->zvo_queue->queuedata = zv; |
1245 | zso->zvo_dev = dev; | |
5df7e9d8 | 1246 | zv->zv_open_count = 0; |
cf331663 | 1247 | strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); |
5df7e9d8 | 1248 | |
2cc479d0 | 1249 | zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); |
5df7e9d8 MM |
1250 | rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); |
1251 | ||
68dde63d BB |
1252 | zso->zvo_disk->major = zvol_major; |
1253 | zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; | |
5df7e9d8 | 1254 | |
026f126b BB |
1255 | /* |
1256 | * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. | |
1257 | * This is accomplished by limiting the number of minors for the | |
1258 | * device to one and explicitly disabling partition scanning. | |
1259 | */ | |
5df7e9d8 | 1260 | if (volmode == ZFS_VOLMODE_DEV) { |
68dde63d | 1261 | zso->zvo_disk->minors = 1; |
026f126b BB |
1262 | zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; |
1263 | zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; | |
5df7e9d8 | 1264 | } |
026f126b | 1265 | |
68dde63d | 1266 | zso->zvo_disk->first_minor = (dev & MINORMASK); |
68dde63d | 1267 | zso->zvo_disk->private_data = zv; |
68dde63d | 1268 | snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", |
5df7e9d8 MM |
1269 | ZVOL_DEV_NAME, (dev & MINORMASK)); |
1270 | ||
1271 | return (zv); | |
1272 | ||
5df7e9d8 | 1273 | out_kmem: |
68dde63d | 1274 | kmem_free(zso, sizeof (struct zvol_state_os)); |
5df7e9d8 MM |
1275 | kmem_free(zv, sizeof (zvol_state_t)); |
1276 | return (NULL); | |
1277 | } | |
1278 | ||
/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
	/*
	 * The teardown calls mirror how the disk/queue pair was allocated:
	 * combined blk_*alloc_disk() kernels release both via one call,
	 * older kernels release queue and disk separately.
	 */
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

#ifdef HAVE_BLK_MQ
	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
#endif

	/* Return the minor-number slot reserved in zvol_os_create_minor(). */
	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}
1329 | ||
0ca45cb3 MM |
/*
 * Intentionally a no-op on Linux: this hook in the common zvol
 * interface has nothing to wait for in this port.
 */
void
zvol_wait_close(zvol_state_t *zv)
{
}
1334 | ||
5df7e9d8 MM |
/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume.  Once this function returns the block
 * device is live and ready for use.
 *
 * Returns 0 on success; EEXIST if a zvol with this name already exists,
 * EINVAL on minor-number overflow, EAGAIN if zvol_alloc() fails, or the
 * error from the underlying DMU/ZAP lookups.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	/* Reserve a minor-number slot; released on every error path below. */
	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);

	if (zv->zv_zso->use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do ten 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass ten individual 128k chunks to ten threads and
		 * process the whole write in parallel?" The short answer is
		 * that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on if you
		 * have a read or write heavy workload. Writes typically want
		 * high chunk counts while reads typically want lower ones. On
		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
#ifdef HAVE_BLK_MQ
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
			    PAGE_SIZE);
			blk_queue_max_segments(zv->zv_zso->zvo_queue,
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE);
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
			 */
			blk_queue_max_segments(zv->zv_zso->zvo_queue,
			    UINT16_MAX);
			blk_queue_max_segment_size(zv->zv_zso->zvo_queue,
			    UINT_MAX);
		}
#endif
	} else {
		blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
		blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	}

	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	/* Open the ZIL only to replay any pending log records, then close. */
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	/*
	 * The objset ownership taken above is released just below; clear
	 * the reference so the zvol does not keep a stale pointer.
	 */
	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
#ifdef HAVE_ADD_DISK_RET
		error = add_disk(zv->zv_zso->zvo_disk);
#else
		add_disk(zv->zv_zso->zvo_disk);
#endif
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}
1557 | ||
1dccfd7a CS |
1558 | void |
1559 | zvol_os_rename_minor(zvol_state_t *zv, const char *newname) | |
5df7e9d8 MM |
1560 | { |
1561 | int readonly = get_disk_ro(zv->zv_zso->zvo_disk); | |
1562 | ||
1563 | ASSERT(RW_LOCK_HELD(&zvol_state_lock)); | |
1564 | ASSERT(MUTEX_HELD(&zv->zv_state_lock)); | |
1565 | ||
1566 | strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); | |
1567 | ||
1568 | /* move to new hashtable entry */ | |
1569 | zv->zv_hash = zvol_name_hash(zv->zv_name); | |
1570 | hlist_del(&zv->zv_hlink); | |
1571 | hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); | |
1572 | ||
1573 | /* | |
1574 | * The block device's read-only state is briefly changed causing | |
1575 | * a KOBJ_CHANGE uevent to be issued. This ensures udev detects | |
1576 | * the name change and fixes the symlinks. This does not change | |
1577 | * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never | |
1578 | * changes. This would normally be done using kobject_uevent() but | |
1579 | * that is a GPL-only symbol which is why we need this workaround. | |
1580 | */ | |
1581 | set_disk_ro(zv->zv_zso->zvo_disk, !readonly); | |
1582 | set_disk_ro(zv->zv_zso->zvo_disk, readonly); | |
e36ff84c AS |
1583 | |
1584 | dataset_kstats_rename(&zv->zv_kstat, newname); | |
5df7e9d8 MM |
1585 | } |
1586 | ||
1dccfd7a CS |
/*
 * Set the gendisk's read-only flag.  This only touches the kernel's
 * view of the device; it does not modify zv->zv_flags.
 */
void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}
1593 | ||
1dccfd7a CS |
/*
 * Update the gendisk capacity; the unit is whatever set_capacity()
 * expects (512-byte sectors per the kernel block API).
 */
void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}
1600 | ||
5df7e9d8 MM |
/*
 * One-time module initialization: size and create the zvol taskqs,
 * register the block device major, and initialize the minor-number
 * allocator.  Returns 0 on success or a negative errno; all partially
 * acquired resources are released on failure.
 */
int
zvol_init(void)
{
	int error;

	/*
	 * zvol_threads is the module param the user passes in.
	 *
	 * zvol_actual_threads is what we use internally, since the user can
	 * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
	 */
	static unsigned int zvol_actual_threads;

	if (zvol_threads == 0) {
		/*
		 * See dde9380a1 for why 32 was chosen here. This should
		 * probably be refined to be some multiple of the number
		 * of CPUs.
		 */
		zvol_actual_threads = MAX(num_online_cpus(), 32);
	} else {
		/* Clamp the user's value to [1, 1024]. */
		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
	}

	/*
	 * Use at least 32 zvol_threads but for many core system,
	 * prefer 6 threads per taskq, but no more taskqs
	 * than threads in them on large systems.
	 *
	 *                 taskq   total
	 * cpus    taskqs  threads threads
	 * ------- ------- ------- -------
	 * 1       1       32       32
	 * 2       1       32       32
	 * 4       1       32       32
	 * 8       2       16       32
	 * 16      3       11       33
	 * 32      5       7        35
	 * 64      8       8        64
	 * 128     11      12       132
	 * 256     16      16       256
	 */
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
	if (num_tqs == 0) {
		num_tqs = 1 + num_online_cpus() / 6;
		while (num_tqs * num_tqs > zvol_actual_threads)
			num_tqs--;
	}
	/* Divide the threads among the taskqs, rounding up. */
	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;
	ztqs->tqs_cnt = num_tqs;
	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

#ifdef HAVE_BLK_MQ
	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}
#endif
	for (uint_t i = 0; i < num_tqs; i++) {
		char name[32];
		(void) snprintf(name, sizeof (name), "%s_tq-%u",
		    ZVOL_DRIVER, i);
		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
		    maxclsyspri, per_tq_thread, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
		if (ztqs->tqs_taskq[i] == NULL) {
			/* Unwind everything created so far, newest first. */
			for (int j = i - 1; j >= 0; j--)
				taskq_destroy(ztqs->tqs_taskq[j]);
			unregister_blkdev(zvol_major, ZVOL_DRIVER);
			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
			    sizeof (taskq_t *));
			ztqs->tqs_taskq = NULL;
			return (-ENOMEM);
		}
	}

	zvol_init_impl();
	ida_init(&zvol_ida);
	return (0);
}
1700 | ||
1701 | void | |
1702 | zvol_fini(void) | |
1703 | { | |
99741bde | 1704 | zv_taskq_t *ztqs = &zvol_taskqs; |
5df7e9d8 | 1705 | zvol_fini_impl(); |
5df7e9d8 | 1706 | unregister_blkdev(zvol_major, ZVOL_DRIVER); |
99741bde AH |
1707 | |
1708 | if (ztqs->tqs_taskq == NULL) { | |
1709 | ASSERT3U(ztqs->tqs_cnt, ==, 0); | |
1710 | } else { | |
1711 | for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { | |
1712 | ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); | |
1713 | taskq_destroy(ztqs->tqs_taskq[i]); | |
1714 | } | |
1715 | kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * | |
1716 | sizeof (taskq_t *)); | |
1717 | ztqs->tqs_taskq = NULL; | |
1718 | } | |
1719 | ||
5df7e9d8 MM |
1720 | ida_destroy(&zvol_ida); |
1721 | } | |
1722 | ||
1723 | /* BEGIN CSTYLED */ | |
1724 | module_param(zvol_inhibit_dev, uint, 0644); | |
1725 | MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); | |
1726 | ||
1727 | module_param(zvol_major, uint, 0444); | |
1728 | MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); | |
1729 | ||
1730 | module_param(zvol_threads, uint, 0444); | |
6f73d021 TH |
1731 | MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" |
1732 | "to 0 to use all active CPUs"); | |
5df7e9d8 MM |
1733 | |
1734 | module_param(zvol_request_sync, uint, 0644); | |
1735 | MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); | |
1736 | ||
1737 | module_param(zvol_max_discard_blocks, ulong, 0444); | |
1738 | MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); | |
1739 | ||
99741bde AH |
1740 | module_param(zvol_num_taskqs, uint, 0444); |
1741 | MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); | |
1742 | ||
5df7e9d8 MM |
1743 | module_param(zvol_prefetch_bytes, uint, 0644); |
1744 | MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); | |
1745 | ||
1746 | module_param(zvol_volmode, uint, 0644); | |
1747 | MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); | |
6f73d021 | 1748 | |
05c4710e TH |
1749 | #ifdef HAVE_BLK_MQ |
1750 | module_param(zvol_blk_mq_queue_depth, uint, 0644); | |
1751 | MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); | |
1752 | ||
1753 | module_param(zvol_use_blk_mq, uint, 0644); | |
1754 | MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); | |
1755 | ||
1756 | module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); | |
1757 | MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, | |
1758 | "Process volblocksize blocks per thread"); | |
1759 | #endif | |
1760 | ||
945e39fc PS |
1761 | #ifndef HAVE_BLKDEV_GET_ERESTARTSYS |
1762 | module_param(zvol_open_timeout_ms, uint, 0644); | |
1763 | MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); | |
1764 | #endif | |
1765 | ||
5df7e9d8 | 1766 | /* END CSTYLED */ |