// SPDX-License-Identifier: GPL-2.0
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include "null_blk.h"

#define CREATE_TRACE_POINTS
#include "null_blk_trace.h"

/* zone_size in MBs to sectors. */
#define ZONE_SIZE_SHIFT		11
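/*
 * The shift follows from the units: 1 MB is 2^20 bytes and a sector is
 * 2^9 bytes, so a zone_size of N MB is N << (20 - 9) = N << 11 sectors
 * (e.g. zone_size = 256 gives 524288 sectors).
 */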
static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
	return sect >> ilog2(dev->zone_size_sects);
}
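/*
 * Worked example: zone sizes are enforced to be a power of two, so the
 * division is a shift. With zone_size_sects = 524288 (a 256 MB zone),
 * ilog2() returns 19 and sector 1310720 maps to zone number 2.
 */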
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
{
	sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
	sector_t sector = 0;
	unsigned int i;

	if (!is_power_of_2(dev->zone_size)) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}
	if (dev->zone_size > dev->size) {
		pr_err("Zone size larger than device capacity\n");
		return -EINVAL;
	}

	if (!dev->zone_capacity)
		dev->zone_capacity = dev->zone_size;

	if (dev->zone_capacity > dev->zone_size) {
		pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
		       dev->zone_capacity, dev->zone_size);
		return -EINVAL;
	}

	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
	dev->nr_zones = dev_size >>
				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
				    GFP_KERNEL | __GFP_ZERO);
	if (!dev->zones)
		return -ENOMEM;
	/*
	 * With memory backing, the zone_lock spinlock needs to be temporarily
	 * released to avoid scheduling in atomic context. To guarantee zone
	 * information protection, use a bitmap to lock zones with
	 * wait_on_bit_lock_io(). Sleeping on the lock is OK as memory backing
	 * implies that the queue is marked with BLK_MQ_F_BLOCKING.
	 */
	spin_lock_init(&dev->zone_lock);
	if (dev->memory_backed) {
		dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL);
		if (!dev->zone_locks) {
			kvfree(dev->zones);
			return -ENOMEM;
		}
	}
	if (dev->zone_nr_conv >= dev->nr_zones) {
		dev->zone_nr_conv = dev->nr_zones - 1;
		pr_info("changed the number of conventional zones to %u\n",
			dev->zone_nr_conv);
	}
	/* Max active zones has to be < number of seq zones in order to be enforceable */
	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_active = 0;
		pr_info("zone_max_active limit disabled, limit >= zone count\n");
	}

	/* Max open zones has to be <= max active zones */
	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
		dev->zone_max_open = dev->zone_max_active;
		pr_info("changed the maximum number of open zones to %u\n",
			dev->zone_max_open);
	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_open = 0;
		pr_info("zone_max_open limit disabled, limit >= zone count\n");
	}
	for (i = 0; i < dev->zone_nr_conv; i++) {
		struct blk_zone *zone = &dev->zones[i];

		zone->start = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = zone->len;
		zone->wp = zone->start + zone->len;
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->cond = BLK_ZONE_COND_NOT_WP;

		sector += dev->zone_size_sects;
	}
	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		struct blk_zone *zone = &dev->zones[i];

		zone->start = zone->wp = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT;
		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone->cond = BLK_ZONE_COND_EMPTY;

		sector += dev->zone_size_sects;
	}
	q->limits.zoned = BLK_ZONED_HM;
	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);

	return 0;
}
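/*
 * Illustrative usage (not part of this file): a zoned nullb instance is
 * typically configured through the null_blk module parameters, e.g.
 *
 *   modprobe null_blk zoned=1 zone_size=256 zone_nr_conv=4 \
 *	zone_max_open=8 zone_max_active=16 memory_backed=1
 *
 * (sizes in MB, counts illustrative); the function above validates these
 * values and turns them into the dev->zones array.
 */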
int null_register_zoned_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;
	struct request_queue *q = nullb->q;

	if (queue_is_mq(q)) {
		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);

		if (ret)
			return ret;
	} else {
		blk_queue_chunk_sectors(q, dev->zone_size_sects);
		q->nr_zones = blkdev_nr_zones(nullb->disk);
	}

	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
	blk_queue_max_open_zones(q, dev->zone_max_open);
	blk_queue_max_active_zones(q, dev->zone_max_active);

	return 0;
}
void null_free_zoned_dev(struct nullb_device *dev)
{
	bitmap_free(dev->zone_locks);
	kvfree(dev->zones);
}
static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno)
{
	if (dev->memory_backed)
		wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE);
	spin_lock_irq(&dev->zone_lock);
}
static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno)
{
	spin_unlock_irq(&dev->zone_lock);

	if (dev->memory_backed)
		clear_and_wake_up_bit(zno, dev->zone_locks);
}
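/*
 * Note the lock ordering: the per-zone bit lock (which may sleep) is taken
 * before the zone_lock spinlock and released after it. A write to a
 * memory-backed zone can therefore drop the spinlock to allocate while
 * still holding the bit lock, keeping the zone state consistent.
 */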
int null_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nullb *nullb = disk->private_data;
	struct nullb_device *dev = nullb->dev;
	unsigned int first_zone, i, zno;
	struct blk_zone zone;
	int error;

	first_zone = null_zone_no(dev, sector);
	if (first_zone >= dev->nr_zones)
		return 0;

	nr_zones = min(nr_zones, dev->nr_zones - first_zone);
	trace_nullb_report_zones(nullb, nr_zones);

	zno = first_zone;
	for (i = 0; i < nr_zones; i++, zno++) {
		/*
		 * Stacked DM target drivers will remap the zone information by
		 * modifying the zone information passed to the report callback.
		 * So use a local copy to avoid corruption of the device zone
		 * array.
		 */
		null_lock_zone(dev, zno);
		memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone));
		null_unlock_zone(dev, zno);

		error = cb(&zone, i, data);
		if (error)
			return error;
	}

	return nr_zones;
}
/*
 * This is called in the case of memory backing from null_process_cmd()
 * with the target zone already locked.
 */
size_t null_zone_valid_read_len(struct nullb *nullb,
				sector_t sector, unsigned int len)
{
	struct nullb_device *dev = nullb->dev;
	struct blk_zone *zone = &dev->zones[null_zone_no(dev, sector)];
	unsigned int nr_sectors = len >> SECTOR_SHIFT;

	/* Read must be below the write pointer position */
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
	    sector + nr_sectors <= zone->wp)
		return len;

	if (sector > zone->wp)
		return 0;

	return (zone->wp - sector) << SECTOR_SHIFT;
}
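/*
 * Worked example: with the write pointer at sector 100, a 64 KB read
 * (128 sectors) starting at sector 50 is clamped to
 * (100 - 50) << 9 = 25600 bytes, i.e. only the 50 written sectors
 * below the write pointer are returned as valid data.
 */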
static blk_status_t null_close_zone(struct nullb_device *dev,
				    struct blk_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* close operation on closed is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	if (zone->wp == zone->start) {
		zone->cond = BLK_ZONE_COND_EMPTY;
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		dev->nr_zones_closed++;
	}

	return BLK_STS_OK;
}
static void null_close_first_imp_zone(struct nullb_device *dev)
{
	unsigned int i;

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
			null_close_zone(dev, &dev->zones[i]);
			return;
		}
	}
}
static blk_status_t null_check_active(struct nullb_device *dev)
{
	if (!dev->zone_max_active)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
			dev->nr_zones_closed < dev->zone_max_active)
		return BLK_STS_OK;

	return BLK_STS_ZONE_ACTIVE_RESOURCE;
}
static blk_status_t null_check_open(struct nullb_device *dev)
{
	if (!dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_imp_open) {
		if (null_check_active(dev) == BLK_STS_OK) {
			null_close_first_imp_zone(dev);
			return BLK_STS_OK;
		}
	}

	return BLK_STS_ZONE_OPEN_RESOURCE;
}
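/*
 * Accounting example: with zone_max_open = 2 and zone_max_active = 3, a
 * device with one explicitly and one implicitly open zone is at the open
 * limit. A request to open another zone first closes the implicitly open
 * zone (leaving 1 open, 2 active zones), so the new open succeeds while
 * both limits stay enforced.
 */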
/*
 * This function matches the manage open zone resources function in the ZBC standard,
 * with the addition of max active zones support (added in the ZNS standard).
 *
 * The function determines if a zone can transition to implicit open or explicit open,
 * while maintaining the max open zone (and max active zone) limit(s). It may close an
 * implicit open zone in order to make additional zone resources available.
 *
 * ZBC states that an implicit open zone shall be closed only if there is not
 * room within the open limit. However, with the addition of an active limit,
 * it is not certain that closing an implicit open zone will allow a new zone
 * to be opened, since we might already be at the active limit capacity.
 */
static blk_status_t null_check_zone_resources(struct nullb_device *dev,
					      struct blk_zone *zone)
{
	blk_status_t ret;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_active(dev);
		if (ret != BLK_STS_OK)
			return ret;
		fallthrough;
	case BLK_ZONE_COND_CLOSED:
		return null_check_open(dev);
	default:
		/* Should never be called for other states */
		WARN_ON(1);
		return BLK_STS_IOERR;
	}
}
static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
				    unsigned int nr_sectors, bool append)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	struct blk_zone *zone = &dev->zones[zno];
	blk_status_t ret;

	trace_nullb_zone_op(cmd, zno, zone->cond);

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);

	null_lock_zone(dev, zno);
	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* Cannot write to a full zone */
		ret = BLK_STS_IOERR;
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		break;
	default:
		/* Invalid zone condition */
		ret = BLK_STS_IOERR;
		goto unlock;
	}
	/*
	 * Regular writes must be at the write pointer position.
	 * Zone append writes are automatically issued at the write
	 * pointer and the position returned using the request or BIO
	 * sector.
	 */
	if (append) {
		sector = zone->wp;
		if (cmd->bio)
			cmd->bio->bi_iter.bi_sector = sector;
		else
			cmd->rq->__sector = sector;
	} else if (sector != zone->wp) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->wp + nr_sectors > zone->start + zone->capacity) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}
	if (zone->cond == BLK_ZONE_COND_CLOSED) {
		dev->nr_zones_closed--;
		dev->nr_zones_imp_open++;
	} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
		dev->nr_zones_imp_open++;
	}
	if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;
	/*
	 * Memory backing allocation may sleep: release the zone_lock spinlock
	 * to avoid scheduling in atomic context. Zone operation atomicity is
	 * still guaranteed through the zone_locks bitmap.
	 */
	if (dev->memory_backed)
		spin_unlock_irq(&dev->zone_lock);
	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	if (dev->memory_backed)
		spin_lock_irq(&dev->zone_lock);

	if (ret != BLK_STS_OK)
		goto unlock;

	zone->wp += nr_sectors;
	if (zone->wp == zone->start + zone->capacity) {
		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
			dev->nr_zones_exp_open--;
		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
			dev->nr_zones_imp_open--;
		zone->cond = BLK_ZONE_COND_FULL;
	}
	ret = BLK_STS_OK;

unlock:
	null_unlock_zone(dev, zno);

	return ret;
}
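/*
 * Zone append example: an append targeting any sector of a zone is written
 * at the current write pointer, and the actual start sector is passed back
 * through the request / BIO so the caller learns where its data landed.
 * Two appends to the same zone serialize on the zone lock and land at wp
 * and wp + nr_sectors respectively, both succeeding.
 */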
static blk_status_t null_open_zone(struct nullb_device *dev,
				   struct blk_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
		/* open operation on exp open is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EXP_OPEN;
	dev->nr_zones_exp_open++;

	return BLK_STS_OK;
}
static blk_status_t null_finish_zone(struct nullb_device *dev,
				     struct blk_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* finish operation on full is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			return ret;
		dev->nr_zones_closed--;
		break;
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = zone->start + zone->len;

	return BLK_STS_OK;
}
static blk_status_t null_reset_zone(struct nullb_device *dev,
				    struct blk_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		/* reset operation on empty is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
		break;
	default:
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;

	return BLK_STS_OK;
}
static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
				   sector_t sector)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zone_no;
	struct blk_zone *zone;
	blk_status_t ret;
	size_t i;

	if (op == REQ_OP_ZONE_RESET_ALL) {
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
			null_lock_zone(dev, i);
			zone = &dev->zones[i];
			if (zone->cond != BLK_ZONE_COND_EMPTY) {
				null_reset_zone(dev, zone);
				trace_nullb_zone_op(cmd, i, zone->cond);
			}
			null_unlock_zone(dev, i);
		}
		return BLK_STS_OK;
	}

	zone_no = null_zone_no(dev, sector);
	zone = &dev->zones[zone_no];

	null_lock_zone(dev, zone_no);

	switch (op) {
	case REQ_OP_ZONE_RESET:
		ret = null_reset_zone(dev, zone);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = null_open_zone(dev, zone);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = null_close_zone(dev, zone);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = null_finish_zone(dev, zone);
		break;
	default:
		ret = BLK_STS_NOTSUPP;
		break;
	}

	if (ret == BLK_STS_OK)
		trace_nullb_zone_op(cmd, zone_no, zone->cond);

	null_unlock_zone(dev, zone_no);

	return ret;
}
blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
				    sector_t sector, sector_t nr_sectors)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	blk_status_t sts;

	switch (op) {
	case REQ_OP_WRITE:
		sts = null_zone_write(cmd, sector, nr_sectors, false);
		break;
	case REQ_OP_ZONE_APPEND:
		sts = null_zone_write(cmd, sector, nr_sectors, true);
		break;
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		sts = null_zone_mgmt(cmd, op, sector);
		break;
	default:
		null_lock_zone(dev, zno);
		sts = null_process_cmd(cmd, op, sector, nr_sectors);
		null_unlock_zone(dev, zno);
	}

	return sts;
}