// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Implementation of a physical block-device target for Open-channel SSDs.
 *
 * pblk-init.c - pblk's initialization.
 */
#include "pblk.h"
#include "pblk-trace.h"
static unsigned int write_buffer_size;

module_param(write_buffer_size, uint, 0644);
MODULE_PARM_DESC(write_buffer_size, "number of entries in a write buffer");
struct pblk_global_caches {
	struct kmem_cache	*ws;
	struct kmem_cache	*rec;
	struct kmem_cache	*g_rq;
	struct kmem_cache	*w_rq;

	struct kref		kref;

	struct mutex		mutex;	/* Ensures consistency between
					 * caches and kref
					 */
};

static struct pblk_global_caches pblk_caches = {
	.mutex = __MUTEX_INITIALIZER(pblk_caches.mutex),
	.kref = KREF_INIT(0),
};
struct bio_set pblk_bio_set;
static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
			  struct bio *bio)
{
	int ret;

	/* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
	 * constraint. Writes can be of arbitrary size.
	 */
	if (bio_data_dir(bio) == READ) {
		blk_queue_split(q, &bio);
		ret = pblk_submit_read(pblk, bio);
		if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
			bio_put(bio);

		return ret;
	}

	/* Prevent deadlock in the case of a modest LUN configuration and large
	 * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
	 * available for user I/O.
	 */
	if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl))
		blk_queue_split(q, &bio);

	return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
}
static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
{
	struct pblk *pblk = q->queuedata;

	if (bio_op(bio) == REQ_OP_DISCARD) {
		pblk_discard(pblk, bio);
		if (!(bio->bi_opf & REQ_PREFLUSH)) {
			bio_endio(bio);
			return BLK_QC_T_NONE;
		}
	}

	switch (pblk_rw_io(q, pblk, bio)) {
	case NVM_IO_ERR:
		bio_io_error(bio);
		break;
	case NVM_IO_DONE:
		bio_endio(bio);
		break;
	}

	return BLK_QC_T_NONE;
}
static size_t pblk_trans_map_size(struct pblk *pblk)
{
	int entry_size = 8;

	if (pblk->addrf_len < 32)
		entry_size = 4;

	return entry_size * pblk->rl.nr_secs;
}
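/*
 * Illustrative sizing (hypothetical numbers, not from the original source):
 * with 4 KiB sectors, a 1 TiB instance maps 1 TiB / 4 KiB = 268,435,456
 * sectors. An address format narrower than 32 bits packs each L2P entry
 * into 4 bytes, so the table takes about 1 GiB of memory instead of the
 * 2 GiB needed with 8-byte entries.
 */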
#ifdef CONFIG_NVM_PBLK_DEBUG
static u32 pblk_l2p_crc(struct pblk *pblk)
{
	size_t map_size;
	u32 crc = ~(u32)0;

	map_size = pblk_trans_map_size(pblk);
	crc = crc32_le(crc, pblk->trans_map, map_size);
	return crc;
}
#endif
static void pblk_l2p_free(struct pblk *pblk)
{
	vfree(pblk->trans_map);
}
static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
{
	struct pblk_line *line = NULL;

	if (factory_init) {
		pblk_setup_uuid(pblk);
	} else {
		line = pblk_recov_l2p(pblk);
		if (IS_ERR(line)) {
			pblk_err(pblk, "could not recover l2p table\n");
			return -EFAULT;
		}
	}

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	/* Free full lines directly as GC has not been started yet */
	pblk_gc_free_full_lines(pblk);

	if (!line) {
		/* Configure next line for user data */
		line = pblk_line_get_first_data(pblk);
		if (!line)
			return -EFAULT;
	}

	return 0;
}
static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
{
	sector_t i;
	struct ppa_addr ppa;
	size_t map_size;
	int ret = 0;

	map_size = pblk_trans_map_size(pblk);
	pblk->trans_map = vmalloc(map_size);
	if (!pblk->trans_map)
		return -ENOMEM;

	pblk_ppa_set_empty(&ppa);

	for (i = 0; i < pblk->rl.nr_secs; i++)
		pblk_trans_map_set(pblk, i, ppa);

	ret = pblk_l2p_recover(pblk, factory_init);
	if (ret)
		vfree(pblk->trans_map);

	return ret;
}
static void pblk_rwb_free(struct pblk *pblk)
{
	if (pblk_rb_tear_down_check(&pblk->rwb))
		pblk_err(pblk, "write buffer error on tear down\n");

	pblk_rb_free(&pblk->rwb);
}
static int pblk_rwb_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	unsigned long buffer_size;
	int pgs_in_buffer, threshold;

	threshold = geo->mw_cunits * geo->all_luns;
	pgs_in_buffer = (max(geo->mw_cunits, geo->ws_opt) + geo->ws_opt)
								* geo->all_luns;

	if (write_buffer_size && (write_buffer_size > pgs_in_buffer))
		buffer_size = write_buffer_size;
	else
		buffer_size = pgs_in_buffer;

	return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs);
}
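/*
 * Illustrative sizing (hypothetical numbers, not from the original source):
 * with mw_cunits = 8, ws_opt = 8 and all_luns = 64, the default buffer
 * holds (max(8, 8) + 8) * 64 = 1024 entries, with a threshold of
 * 8 * 64 = 512 entries. write_buffer_size can only grow the buffer
 * beyond this default, never shrink it.
 */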
static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
			     struct nvm_addrf_12 *dst)
{
	struct nvm_addrf_12 *src = (struct nvm_addrf_12 *)&geo->addrf;
	int power_len;

	/* Re-calculate channel and lun format to adapt to configuration */
	power_len = get_count_order(geo->num_ch);
	if (1 << power_len != geo->num_ch) {
		pblk_err(pblk, "supports only power-of-two channel config.\n");
		return -EINVAL;
	}
	dst->ch_len = power_len;

	power_len = get_count_order(geo->num_lun);
	if (1 << power_len != geo->num_lun) {
		pblk_err(pblk, "supports only power-of-two LUN config.\n");
		return -EINVAL;
	}
	dst->lun_len = power_len;

	dst->blk_len = src->blk_len;
	dst->pg_len = src->pg_len;
	dst->pln_len = src->pln_len;
	dst->sec_len = src->sec_len;

	dst->sec_offset = 0;
	dst->pln_offset = dst->sec_len;
	dst->ch_offset = dst->pln_offset + dst->pln_len;
	dst->lun_offset = dst->ch_offset + dst->ch_len;
	dst->pg_offset = dst->lun_offset + dst->lun_len;
	dst->blk_offset = dst->pg_offset + dst->pg_len;

	dst->sec_mask = ((1ULL << dst->sec_len) - 1) << dst->sec_offset;
	dst->pln_mask = ((1ULL << dst->pln_len) - 1) << dst->pln_offset;
	dst->ch_mask = ((1ULL << dst->ch_len) - 1) << dst->ch_offset;
	dst->lun_mask = ((1ULL << dst->lun_len) - 1) << dst->lun_offset;
	dst->pg_mask = ((1ULL << dst->pg_len) - 1) << dst->pg_offset;
	dst->blk_mask = ((1ULL << dst->blk_len) - 1) << dst->blk_offset;

	return dst->blk_offset + src->blk_len;
}
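/*
 * Worked example (hypothetical 1.2 geometry, not from the original source):
 * sec_len = 2, pln_len = 1, num_ch = 16 (ch_len = 4), num_lun = 8
 * (lun_len = 3), pg_len = 9, blk_len = 12. The offsets stack as sec@0,
 * pln@2, ch@3, lun@7, pg@10, blk@19, and the function returns an address
 * length of 19 + 12 = 31 bits, which lets pblk_trans_map_size() use
 * 4-byte L2P entries.
 */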
static int pblk_set_addrf_20(struct nvm_geo *geo, struct nvm_addrf *adst,
			     struct pblk_addrf *udst)
{
	struct nvm_addrf *src = &geo->addrf;

	adst->ch_len = get_count_order(geo->num_ch);
	adst->lun_len = get_count_order(geo->num_lun);
	adst->chk_len = src->chk_len;
	adst->sec_len = src->sec_len;

	adst->sec_offset = 0;
	adst->ch_offset = adst->sec_len;
	adst->lun_offset = adst->ch_offset + adst->ch_len;
	adst->chk_offset = adst->lun_offset + adst->lun_len;

	adst->sec_mask = ((1ULL << adst->sec_len) - 1) << adst->sec_offset;
	adst->chk_mask = ((1ULL << adst->chk_len) - 1) << adst->chk_offset;
	adst->lun_mask = ((1ULL << adst->lun_len) - 1) << adst->lun_offset;
	adst->ch_mask = ((1ULL << adst->ch_len) - 1) << adst->ch_offset;

	udst->sec_stripe = geo->ws_opt;
	udst->ch_stripe = geo->num_ch;
	udst->lun_stripe = geo->num_lun;

	udst->sec_lun_stripe = udst->sec_stripe * udst->ch_stripe;
	udst->sec_ws_stripe = udst->sec_lun_stripe * udst->lun_stripe;

	return adst->chk_offset + adst->chk_len;
}
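/*
 * Worked example of the 2.0 striping above (hypothetical geometry, not
 * from the original source): with ws_opt = 4, num_ch = 8 and num_lun = 8,
 * one stripe across all channels covers sec_lun_stripe = 4 * 8 = 32
 * sectors, and a full write stripe covers sec_ws_stripe = 32 * 8 = 256
 * sectors before the mapping wraps back to the first LUN.
 */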
static int pblk_set_addrf(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int mod;

	switch (geo->version) {
	case NVM_OCSSD_SPEC_12:
		div_u64_rem(geo->clba, pblk->min_write_pgs, &mod);
		if (mod) {
			pblk_err(pblk, "bad configuration of sectors/pages\n");
			return -EINVAL;
		}

		pblk->addrf_len = pblk_set_addrf_12(pblk, geo,
							(void *)&pblk->addrf);
		break;
	case NVM_OCSSD_SPEC_20:
		pblk->addrf_len = pblk_set_addrf_20(geo, (void *)&pblk->addrf,
							&pblk->uaddrf);
		break;
	default:
		pblk_err(pblk, "OCSSD revision not supported (%d)\n",
								geo->version);
		return -EINVAL;
	}

	return 0;
}
static int pblk_create_global_caches(void)
{
	pblk_caches.ws = kmem_cache_create("pblk_blk_ws",
				sizeof(struct pblk_line_ws), 0, 0, NULL);
	if (!pblk_caches.ws)
		return -ENOMEM;

	pblk_caches.rec = kmem_cache_create("pblk_rec",
				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
	if (!pblk_caches.rec)
		goto fail_destroy_ws;

	pblk_caches.g_rq = kmem_cache_create("pblk_g_rq", pblk_g_rq_size,
				0, 0, NULL);
	if (!pblk_caches.g_rq)
		goto fail_destroy_rec;

	pblk_caches.w_rq = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
				0, 0, NULL);
	if (!pblk_caches.w_rq)
		goto fail_destroy_g_rq;

	return 0;

fail_destroy_g_rq:
	kmem_cache_destroy(pblk_caches.g_rq);
fail_destroy_rec:
	kmem_cache_destroy(pblk_caches.rec);
fail_destroy_ws:
	kmem_cache_destroy(pblk_caches.ws);

	return -ENOMEM;
}
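/*
 * The caches above are shared by all pblk instances on the host. The kref
 * in pblk_caches counts live users and the mutex keeps the cache pointers
 * and the refcount consistent, so the slabs are created on the first
 * pblk_get_global_caches() and torn down when the last instance calls
 * pblk_put_global_caches().
 */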
static int pblk_get_global_caches(void)
{
	int ret = 0;

	mutex_lock(&pblk_caches.mutex);

	if (kref_get_unless_zero(&pblk_caches.kref))
		goto out;

	ret = pblk_create_global_caches();
	if (!ret)
		kref_init(&pblk_caches.kref);

out:
	mutex_unlock(&pblk_caches.mutex);
	return ret;
}
static void pblk_destroy_global_caches(struct kref *ref)
{
	struct pblk_global_caches *c;

	c = container_of(ref, struct pblk_global_caches, kref);

	kmem_cache_destroy(c->ws);
	kmem_cache_destroy(c->rec);
	kmem_cache_destroy(c->g_rq);
	kmem_cache_destroy(c->w_rq);
}
static void pblk_put_global_caches(void)
{
	mutex_lock(&pblk_caches.mutex);
	kref_put(&pblk_caches.kref, pblk_destroy_global_caches);
	mutex_unlock(&pblk_caches.mutex);
}
static int pblk_core_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	int ret, max_write_ppas;

	atomic64_set(&pblk->user_wa, 0);
	atomic64_set(&pblk->pad_wa, 0);
	atomic64_set(&pblk->gc_wa, 0);
	pblk->user_rst_wa = 0;
	pblk->pad_rst_wa = 0;
	pblk->gc_rst_wa = 0;

	atomic64_set(&pblk->nr_flush, 0);
	pblk->nr_flush_rst = 0;

	pblk->min_write_pgs = geo->ws_opt;
	pblk->min_write_pgs_data = pblk->min_write_pgs;
	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);

	pblk->oob_meta_size = geo->sos;
	if (!pblk_is_oob_meta_supported(pblk)) {
		/* For drives which do not have the OOB metadata feature, we
		 * need to use so-called packed metadata in order to support
		 * the recovery feature. Packed metadata stores the same
		 * information as OOB metadata (the l2p table mapping), but in
		 * the form of a single page at the end of every write
		 * request.
		 */
		if (pblk->min_write_pgs
			* sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
			/* We want to keep all the packed metadata on a single
			 * page per write request. So we need to ensure that
			 * it will fit.
			 *
			 * This is more of a sanity check, since there is
			 * no device with such a big minimal write size
			 * (above 1 megabyte).
			 */
			pblk_err(pblk, "Not supported min write size\n");
			return -EINVAL;
		}
		/* For the packed metadata approach we do some simplification.
		 * On the read path we always issue requests whose size equals
		 * max_write_pgs, with all pages filled with user payload
		 * except the last page, which is filled with packed metadata.
		 */
		pblk->max_write_pgs = pblk->min_write_pgs;
		pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
	}

	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
								GFP_KERNEL);
	if (!pblk->pad_dist)
		return -ENOMEM;

	if (pblk_get_global_caches())
		goto fail_free_pad_dist;

	/* Internal bios can be at most the sectors signaled by the device. */
	ret = mempool_init_page_pool(&pblk->page_bio_pool, NVM_MAX_VLBA, 0);
	if (ret)
		goto free_global_caches;

	ret = mempool_init_slab_pool(&pblk->gen_ws_pool, PBLK_GEN_WS_POOL_SIZE,
				     pblk_caches.ws);
	if (ret)
		goto free_page_bio_pool;

	ret = mempool_init_slab_pool(&pblk->rec_pool, geo->all_luns,
				     pblk_caches.rec);
	if (ret)
		goto free_gen_ws_pool;

	ret = mempool_init_slab_pool(&pblk->r_rq_pool, geo->all_luns,
				     pblk_caches.g_rq);
	if (ret)
		goto free_rec_pool;

	ret = mempool_init_slab_pool(&pblk->e_rq_pool, geo->all_luns,
				     pblk_caches.g_rq);
	if (ret)
		goto free_r_rq_pool;

	ret = mempool_init_slab_pool(&pblk->w_rq_pool, geo->all_luns,
				     pblk_caches.w_rq);
	if (ret)
		goto free_e_rq_pool;

	pblk->close_wq = alloc_workqueue("pblk-close-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS);
	if (!pblk->close_wq)
		goto free_w_rq_pool;

	pblk->bb_wq = alloc_workqueue("pblk-bb-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->bb_wq)
		goto free_close_wq;

	pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
			WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!pblk->r_end_wq)
		goto free_bb_wq;

	if (pblk_set_addrf(pblk))
		goto free_r_end_wq;

	INIT_LIST_HEAD(&pblk->compl_list);
	INIT_LIST_HEAD(&pblk->resubmit_list);

	return 0;

free_r_end_wq:
	destroy_workqueue(pblk->r_end_wq);
free_bb_wq:
	destroy_workqueue(pblk->bb_wq);
free_close_wq:
	destroy_workqueue(pblk->close_wq);
free_w_rq_pool:
	mempool_exit(&pblk->w_rq_pool);
free_e_rq_pool:
	mempool_exit(&pblk->e_rq_pool);
free_r_rq_pool:
	mempool_exit(&pblk->r_rq_pool);
free_rec_pool:
	mempool_exit(&pblk->rec_pool);
free_gen_ws_pool:
	mempool_exit(&pblk->gen_ws_pool);
free_page_bio_pool:
	mempool_exit(&pblk->page_bio_pool);
free_global_caches:
	pblk_put_global_caches();
fail_free_pad_dist:
	kfree(pblk->pad_dist);
	return -ENOMEM;
}
static void pblk_core_free(struct pblk *pblk)
{
	if (pblk->close_wq)
		destroy_workqueue(pblk->close_wq);

	if (pblk->r_end_wq)
		destroy_workqueue(pblk->r_end_wq);

	if (pblk->bb_wq)
		destroy_workqueue(pblk->bb_wq);

	mempool_exit(&pblk->page_bio_pool);
	mempool_exit(&pblk->gen_ws_pool);
	mempool_exit(&pblk->rec_pool);
	mempool_exit(&pblk->r_rq_pool);
	mempool_exit(&pblk->e_rq_pool);
	mempool_exit(&pblk->w_rq_pool);

	pblk_put_global_caches();
	kfree(pblk->pad_dist);
}
static void pblk_line_mg_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	int i;

	kfree(l_mg->bb_template);
	kfree(l_mg->bb_aux);
	kfree(l_mg->vsc_list);

	for (i = 0; i < PBLK_DATA_LINES; i++) {
		kfree(l_mg->sline_meta[i]);
		pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type);
		kfree(l_mg->eline_meta[i]);
	}

	mempool_destroy(l_mg->bitmap_pool);
	kmem_cache_destroy(l_mg->bitmap_cache);
}
static void pblk_line_meta_free(struct pblk_line_mgmt *l_mg,
				struct pblk_line *line)
{
	struct pblk_w_err_gc *w_err_gc = line->w_err_gc;

	kfree(line->blk_bitmap);
	kfree(line->erase_bitmap);
	kfree(line->chks);

	pblk_mfree(w_err_gc->lba_list, l_mg->emeta_alloc_type);
	kfree(w_err_gc);
}
static void pblk_lines_free(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	int i;

	spin_lock(&l_mg->free_lock);
	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		pblk_line_free(line);
		pblk_line_meta_free(l_mg, line);
	}
	spin_unlock(&l_mg->free_lock);

	pblk_line_mg_free(pblk);

	kfree(pblk->luns);
	kfree(pblk->lines);
}
static int pblk_luns_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_lun *rlun;
	int i;

	/* TODO: Implement unbalanced LUN support */
	if (geo->num_lun < 0) {
		pblk_err(pblk, "unbalanced LUN config.\n");
		return -EINVAL;
	}

	pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
								GFP_KERNEL);
	if (!pblk->luns)
		return -ENOMEM;

	for (i = 0; i < geo->all_luns; i++) {
		/* Stripe across channels */
		int ch = i % geo->num_ch;
		int lun_raw = i / geo->num_ch;
		int lunid = lun_raw + ch * geo->num_lun;

		rlun = &pblk->luns[i];
		rlun->bppa = dev->luns[lunid];

		sema_init(&rlun->wr_sem, 1);
	}

	return 0;
}
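/*
 * Worked example of the channel striping above (hypothetical geometry,
 * not from the original source): with num_ch = 4 and num_lun = 8, logical
 * index i = 5 maps to ch = 1, lun_raw = 1, so lunid = 1 + 1 * 8 = 9.
 * Consecutive indices thus land on different channels, which spreads
 * parallel writes across the device.
 */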
/* See comment over struct line_emeta definition */
static unsigned int calc_emeta_len(struct pblk *pblk)
{
	struct pblk_line_meta *lm = &pblk->lm;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;

	/* Round to sector size so that lba_list starts on its own sector */
	lm->emeta_sec[1] = DIV_ROUND_UP(
			sizeof(struct line_emeta) + lm->blk_bitmap_len +
			sizeof(struct wa_counters), geo->csecs);
	lm->emeta_len[1] = lm->emeta_sec[1] * geo->csecs;

	/* Round to sector size so that vsc_list starts on its own sector */
	lm->dsec_per_line = lm->sec_per_line - lm->emeta_sec[0];
	lm->emeta_sec[2] = DIV_ROUND_UP(lm->dsec_per_line * sizeof(u64),
			geo->csecs);
	lm->emeta_len[2] = lm->emeta_sec[2] * geo->csecs;

	lm->emeta_sec[3] = DIV_ROUND_UP(l_mg->nr_lines * sizeof(u32),
			geo->csecs);
	lm->emeta_len[3] = lm->emeta_sec[3] * geo->csecs;

	lm->vsc_list_len = l_mg->nr_lines * sizeof(u32);

	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
}
static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	struct nvm_geo *geo = &dev->geo;
	sector_t provisioned;
	int sec_meta, blk_meta, clba;
	int minimum;

	if (geo->op == NVM_TARGET_DEFAULT_OP)
		pblk->op = PBLK_DEFAULT_OP;
	else
		pblk->op = geo->op;

	minimum = pblk_get_min_chks(pblk);
	provisioned = nr_free_chks;
	provisioned *= (100 - pblk->op);
	sector_div(provisioned, 100);

	if ((nr_free_chks - provisioned) < minimum) {
		if (geo->op != NVM_TARGET_DEFAULT_OP) {
			pblk_err(pblk, "OP too small to create a sane instance\n");
			return -EINTR;
		}

		/* If the user did not specify an OP value, and PBLK_DEFAULT_OP
		 * is not enough, calculate and set a sane value
		 */
		provisioned = nr_free_chks - minimum;
		pblk->op = (100 * minimum) / nr_free_chks;
		pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n",
				pblk->op);
	}

	pblk->op_blks = nr_free_chks - provisioned;

	/* Internally pblk manages all free blocks, but all calculations based
	 * on user capacity consider only provisioned blocks
	 */
	pblk->rl.total_blocks = nr_free_chks;
	pblk->rl.nr_secs = nr_free_chks * geo->clba;

	/* Consider sectors used for metadata */
	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);

	clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
	pblk->capacity = (provisioned - blk_meta) * clba;

	atomic_set(&pblk->rl.free_blocks, nr_free_chks);
	atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);

	return 0;
}
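/*
 * Worked example of the OP math above (hypothetical numbers, not from the
 * original source): with nr_free_chks = 1000 and pblk->op = 11, the user
 * sees provisioned = 1000 * (100 - 11) / 100 = 890 chunks, while
 * op_blks = 110 chunks stay reserved as over-provisioning headroom. If
 * 1000 - 890 fell below pblk_get_min_chks(), OP would instead be raised
 * to (100 * minimum) / 1000.
 */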
static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line,
				   struct nvm_chk_meta *meta)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, nr_bad_chks = 0;

	for (i = 0; i < lm->blk_per_line; i++) {
		struct pblk_lun *rlun = &pblk->luns[i];
		struct nvm_chk_meta *chunk;
		struct nvm_chk_meta *chunk_meta;
		struct ppa_addr ppa;
		int pos;

		ppa = rlun->bppa;
		pos = pblk_ppa_to_pos(geo, ppa);
		chunk = &line->chks[pos];

		ppa.m.chk = line->id;
		chunk_meta = pblk_chunk_get_off(pblk, meta, ppa);

		chunk->state = chunk_meta->state;
		chunk->type = chunk_meta->type;
		chunk->wi = chunk_meta->wi;
		chunk->slba = chunk_meta->slba;
		chunk->cnlb = chunk_meta->cnlb;
		chunk->wp = chunk_meta->wp;

		trace_pblk_chunk_state(pblk_disk_name(pblk), &ppa,
					chunk->state);

		if (chunk->type & NVM_CHK_TP_SZ_SPEC) {
			WARN_ONCE(1, "pblk: custom-sized chunks unsupported\n");
			continue;
		}

		if (!(chunk->state & NVM_CHK_ST_OFFLINE))
			continue;

		set_bit(pos, line->blk_bitmap);
		nr_bad_chks++;
	}

	return nr_bad_chks;
}
static long pblk_setup_line_meta(struct pblk *pblk, struct pblk_line *line,
				 void *chunk_meta, int line_id)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	long nr_bad_chks, chk_in_line;

	line->pblk = pblk;
	line->id = line_id;
	line->type = PBLK_LINETYPE_FREE;
	line->state = PBLK_LINESTATE_NEW;
	line->gc_group = PBLK_LINEGC_NONE;
	line->vsc = &l_mg->vsc_list[line_id];
	spin_lock_init(&line->lock);

	nr_bad_chks = pblk_setup_line_meta_chk(pblk, line, chunk_meta);

	chk_in_line = lm->blk_per_line - nr_bad_chks;
	if (nr_bad_chks < 0 || nr_bad_chks > lm->blk_per_line ||
					chk_in_line < lm->min_blk_line) {
		line->state = PBLK_LINESTATE_BAD;
		list_add_tail(&line->list, &l_mg->bad_list);
		return 0;
	}

	atomic_set(&line->blk_in_line, chk_in_line);
	list_add_tail(&line->list, &l_mg->free_list);
	l_mg->nr_free_lines++;

	return chk_in_line;
}
static int pblk_alloc_line_meta(struct pblk *pblk, struct pblk_line *line)
{
	struct pblk_line_meta *lm = &pblk->lm;

	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->blk_bitmap)
		return -ENOMEM;

	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
	if (!line->erase_bitmap)
		goto free_blk_bitmap;

	line->chks = kmalloc_array(lm->blk_per_line,
				   sizeof(struct nvm_chk_meta), GFP_KERNEL);
	if (!line->chks)
		goto free_erase_bitmap;

	line->w_err_gc = kzalloc(sizeof(struct pblk_w_err_gc), GFP_KERNEL);
	if (!line->w_err_gc)
		goto free_chks;

	return 0;

free_chks:
	kfree(line->chks);
free_erase_bitmap:
	kfree(line->erase_bitmap);
free_blk_bitmap:
	kfree(line->blk_bitmap);

	return -ENOMEM;
}
static int pblk_line_mg_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line_meta *lm = &pblk->lm;
	int i, bb_distance;

	l_mg->nr_lines = geo->num_chk;
	l_mg->log_line = l_mg->data_line = NULL;
	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
	l_mg->nr_free_lines = 0;
	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);

	INIT_LIST_HEAD(&l_mg->free_list);
	INIT_LIST_HEAD(&l_mg->corrupt_list);
	INIT_LIST_HEAD(&l_mg->bad_list);
	INIT_LIST_HEAD(&l_mg->gc_full_list);
	INIT_LIST_HEAD(&l_mg->gc_high_list);
	INIT_LIST_HEAD(&l_mg->gc_mid_list);
	INIT_LIST_HEAD(&l_mg->gc_low_list);
	INIT_LIST_HEAD(&l_mg->gc_empty_list);
	INIT_LIST_HEAD(&l_mg->gc_werr_list);

	INIT_LIST_HEAD(&l_mg->emeta_list);

	l_mg->gc_lists[0] = &l_mg->gc_werr_list;
	l_mg->gc_lists[1] = &l_mg->gc_high_list;
	l_mg->gc_lists[2] = &l_mg->gc_mid_list;
	l_mg->gc_lists[3] = &l_mg->gc_low_list;

	spin_lock_init(&l_mg->free_lock);
	spin_lock_init(&l_mg->close_lock);
	spin_lock_init(&l_mg->gc_lock);

	l_mg->vsc_list = kcalloc(l_mg->nr_lines, sizeof(__le32), GFP_KERNEL);
	if (!l_mg->vsc_list)
		goto fail;

	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_template)
		goto fail_free_vsc_list;

	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
	if (!l_mg->bb_aux)
		goto fail_free_bb_template;

	/* smeta is always small enough to fit on a kmalloc memory allocation,
	 * emeta depends on the number of LUNs allocated to the pblk instance
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		l_mg->sline_meta[i] = kmalloc(lm->smeta_len, GFP_KERNEL);
		if (!l_mg->sline_meta[i])
			goto fail_free_smeta;
	}

	l_mg->bitmap_cache = kmem_cache_create("pblk_lm_bitmap",
			lm->sec_bitmap_len, 0, 0, NULL);
	if (!l_mg->bitmap_cache)
		goto fail_free_smeta;

	/* the bitmap pool is used for both valid and map bitmaps */
	l_mg->bitmap_pool = mempool_create_slab_pool(PBLK_DATA_LINES * 2,
				l_mg->bitmap_cache);
	if (!l_mg->bitmap_pool)
		goto fail_destroy_bitmap_cache;

	/* emeta allocates three different buffers for managing metadata with
	 * in-memory and in-media layouts
	 */
	for (i = 0; i < PBLK_DATA_LINES; i++) {
		struct pblk_emeta *emeta;

		emeta = kmalloc(sizeof(struct pblk_emeta), GFP_KERNEL);
		if (!emeta)
			goto fail_free_emeta;

		if (lm->emeta_len[0] > KMALLOC_MAX_CACHE_SIZE) {
			l_mg->emeta_alloc_type = PBLK_VMALLOC_META;

			emeta->buf = vmalloc(lm->emeta_len[0]);
			if (!emeta->buf) {
				kfree(emeta);
				goto fail_free_emeta;
			}

			emeta->nr_entries = lm->emeta_sec[0];
			l_mg->eline_meta[i] = emeta;
		} else {
			l_mg->emeta_alloc_type = PBLK_KMALLOC_META;

			emeta->buf = kmalloc(lm->emeta_len[0], GFP_KERNEL);
			if (!emeta->buf) {
				kfree(emeta);
				goto fail_free_emeta;
			}

			emeta->nr_entries = lm->emeta_sec[0];
			l_mg->eline_meta[i] = emeta;
		}
	}

	for (i = 0; i < l_mg->nr_lines; i++)
		l_mg->vsc_list[i] = cpu_to_le32(EMPTY_ENTRY);

	bb_distance = (geo->all_luns) * geo->ws_opt;
	for (i = 0; i < lm->sec_per_line; i += bb_distance)
		bitmap_set(l_mg->bb_template, i, geo->ws_opt);

	return 0;

fail_free_emeta:
	while (--i >= 0) {
		if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
			vfree(l_mg->eline_meta[i]->buf);
		else
			kfree(l_mg->eline_meta[i]->buf);
		kfree(l_mg->eline_meta[i]);
	}

	mempool_destroy(l_mg->bitmap_pool);
fail_destroy_bitmap_cache:
	kmem_cache_destroy(l_mg->bitmap_cache);
fail_free_smeta:
	for (i = 0; i < PBLK_DATA_LINES; i++)
		kfree(l_mg->sline_meta[i]);
fail_free_bb_template:
	kfree(l_mg->bb_template);
fail_free_vsc_list:
	kfree(l_mg->vsc_list);
fail:
	return -ENOMEM;
}
static int pblk_line_meta_init(struct pblk *pblk)
{
	struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
	struct pblk_line_meta *lm = &pblk->lm;
	unsigned int smeta_len, emeta_len;
	int i;

	lm->sec_per_line = geo->clba * geo->all_luns;
	lm->blk_per_line = geo->all_luns;
	lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
	lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
	lm->mid_thrs = lm->sec_per_line / 2;
	lm->high_thrs = lm->sec_per_line / 4;
	lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;

	/* Calculate necessary pages for smeta. See comment over struct
	 * line_smeta definition
	 */
	i = 1;
add_smeta_page:
	lm->smeta_sec = i * geo->ws_opt;
	lm->smeta_len = lm->smeta_sec * geo->csecs;

	smeta_len = sizeof(struct line_smeta) + lm->lun_bitmap_len;
	if (smeta_len > lm->smeta_len) {
		i++;
		goto add_smeta_page;
	}

	/* Calculate necessary pages for emeta. See comment over struct
	 * line_emeta definition
	 */
	i = 1;
add_emeta_page:
	lm->emeta_sec[0] = i * geo->ws_opt;
	lm->emeta_len[0] = lm->emeta_sec[0] * geo->csecs;

	emeta_len = calc_emeta_len(pblk);
	if (emeta_len > lm->emeta_len[0]) {
		i++;
		goto add_emeta_page;
	}

	lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;

	lm->min_blk_line = 1;
	if (geo->all_luns > 1)
		lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
					lm->emeta_sec[0], geo->clba);

	if (lm->min_blk_line > lm->blk_per_line) {
		pblk_err(pblk, "config. not supported. Min. LUN in line:%d\n",
							lm->blk_per_line);
		return -EINVAL;
	}

	return 0;
}
static int pblk_lines_init(struct pblk *pblk)
{
	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
	struct pblk_line *line;
	void *chunk_meta;
	int nr_free_chks = 0;
	int i, ret;

	ret = pblk_line_meta_init(pblk);
	if (ret)
		return ret;

	ret = pblk_line_mg_init(pblk);
	if (ret)
		return ret;

	ret = pblk_luns_init(pblk);
	if (ret)
		goto fail_free_meta;

	chunk_meta = pblk_get_chunk_meta(pblk);
	if (IS_ERR(chunk_meta)) {
		ret = PTR_ERR(chunk_meta);
		goto fail_free_luns;
	}

	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
								GFP_KERNEL);
	if (!pblk->lines) {
		ret = -ENOMEM;
		goto fail_free_chunk_meta;
	}

	for (i = 0; i < l_mg->nr_lines; i++) {
		line = &pblk->lines[i];

		ret = pblk_alloc_line_meta(pblk, line);
		if (ret)
			goto fail_free_lines;

		nr_free_chks += pblk_setup_line_meta(pblk, line, chunk_meta, i);

		trace_pblk_line_state(pblk_disk_name(pblk), line->id,
								line->state);
	}

	if (!nr_free_chks) {
		pblk_err(pblk, "too many bad blocks to create a sane instance\n");
		ret = -EINTR;
		goto fail_free_lines;
	}

	ret = pblk_set_provision(pblk, nr_free_chks);
	if (ret)
		goto fail_free_lines;

	vfree(chunk_meta);
	return 0;

fail_free_lines:
	while (--i >= 0)
		pblk_line_meta_free(l_mg, &pblk->lines[i]);
	kfree(pblk->lines);
fail_free_chunk_meta:
	vfree(chunk_meta);
fail_free_luns:
	kfree(pblk->luns);
fail_free_meta:
	pblk_line_mg_free(pblk);

	return ret;
}
static int pblk_writer_init(struct pblk *pblk)
{
	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
	if (IS_ERR(pblk->writer_ts)) {
		int err = PTR_ERR(pblk->writer_ts);

		if (err != -EINTR)
			pblk_err(pblk, "could not allocate writer kthread (%d)\n",
					err);
		return err;
	}

	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));

	return 0;
}
static void pblk_writer_stop(struct pblk *pblk)
{
	/* The pipeline must be stopped and the write buffer emptied before the
	 * write thread is stopped
	 */
	WARN(pblk_rb_read_count(&pblk->rwb),
			"Stopping not fully persisted write buffer\n");

	WARN(pblk_rb_sync_count(&pblk->rwb),
			"Stopping not fully synced write buffer\n");

	del_timer_sync(&pblk->wtimer);
	if (pblk->writer_ts)
		kthread_stop(pblk->writer_ts);
}
static void pblk_free(struct pblk *pblk)
{
	pblk_lines_free(pblk);
	pblk_l2p_free(pblk);
	pblk_rwb_free(pblk);
	pblk_core_free(pblk);

	kfree(pblk);
}
static void pblk_tear_down(struct pblk *pblk, bool graceful)
{
	if (graceful)
		__pblk_pipeline_flush(pblk);
	__pblk_pipeline_stop(pblk);
	pblk_writer_stop(pblk);
	pblk_rb_sync_l2p(&pblk->rwb);
	pblk_rl_free(&pblk->rl);

	pblk_debug(pblk, "consistent tear down (graceful:%d)\n", graceful);
}
static void pblk_exit(void *private, bool graceful)
{
	struct pblk *pblk = private;

	pblk_gc_exit(pblk, graceful);
	pblk_tear_down(pblk, graceful);

#ifdef CONFIG_NVM_PBLK_DEBUG
	pblk_info(pblk, "exit: L2P CRC: %x\n", pblk_l2p_crc(pblk));
#endif

	pblk_free(pblk);
}
static sector_t pblk_capacity(void *private)
{
	struct pblk *pblk = private;

	return pblk->capacity * NR_PHY_IN_LOG;
}
static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
		       int flags)
{
	struct nvm_geo *geo = &dev->geo;
	struct request_queue *bqueue = dev->q;
	struct request_queue *tqueue = tdisk->queue;
	struct pblk *pblk;
	int ret;

	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
	if (!pblk)
		return ERR_PTR(-ENOMEM);

	pblk->dev = dev;
	pblk->disk = tdisk;
	pblk->state = PBLK_STATE_RUNNING;
	trace_pblk_state(pblk_disk_name(pblk), pblk->state);
	pblk->gc.gc_enabled = 0;

	if (!(geo->version == NVM_OCSSD_SPEC_12 ||
					geo->version == NVM_OCSSD_SPEC_20)) {
		pblk_err(pblk, "OCSSD version not supported (%u)\n",
							geo->version);
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	if (geo->ext) {
		pblk_err(pblk, "extended metadata not supported\n");
		kfree(pblk);
		return ERR_PTR(-EINVAL);
	}

	spin_lock_init(&pblk->resubmit_lock);
	spin_lock_init(&pblk->trans_lock);
	spin_lock_init(&pblk->lock);

#ifdef CONFIG_NVM_PBLK_DEBUG
	atomic_long_set(&pblk->inflight_writes, 0);
	atomic_long_set(&pblk->padded_writes, 0);
	atomic_long_set(&pblk->padded_wb, 0);
	atomic_long_set(&pblk->req_writes, 0);
	atomic_long_set(&pblk->sub_writes, 0);
	atomic_long_set(&pblk->sync_writes, 0);
	atomic_long_set(&pblk->inflight_reads, 0);
	atomic_long_set(&pblk->cache_reads, 0);
	atomic_long_set(&pblk->sync_reads, 0);
	atomic_long_set(&pblk->recov_writes, 0);
	atomic_long_set(&pblk->recov_gc_writes, 0);
	atomic_long_set(&pblk->recov_gc_reads, 0);
#endif

	atomic_long_set(&pblk->read_failed, 0);
	atomic_long_set(&pblk->read_empty, 0);
	atomic_long_set(&pblk->read_high_ecc, 0);
	atomic_long_set(&pblk->read_failed_gc, 0);
	atomic_long_set(&pblk->write_failed, 0);
	atomic_long_set(&pblk->erase_failed, 0);

	ret = pblk_core_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize core\n");
		goto fail;
	}

	ret = pblk_lines_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize lines\n");
		goto fail_free_core;
	}

	ret = pblk_rwb_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize write buffer\n");
		goto fail_free_lines;
	}

	ret = pblk_l2p_init(pblk, flags & NVM_TARGET_FACTORY);
	if (ret) {
		pblk_err(pblk, "could not initialize maps\n");
		goto fail_free_rwb;
	}

	ret = pblk_writer_init(pblk);
	if (ret) {
		if (ret != -EINTR)
			pblk_err(pblk, "could not initialize write thread\n");
		goto fail_free_l2p;
	}

	ret = pblk_gc_init(pblk);
	if (ret) {
		pblk_err(pblk, "could not initialize gc\n");
		goto fail_stop_writer;
	}

	/* inherit the size from the underlying device */
	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));

	blk_queue_write_cache(tqueue, true, false);

	tqueue->limits.discard_granularity = geo->clba * geo->csecs;
	tqueue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, tqueue);

	pblk_info(pblk, "luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
			geo->all_luns, pblk->l_mg.nr_lines,
			(unsigned long long)pblk->rl.nr_secs,
			pblk->rwb.nr_entries);

	wake_up_process(pblk->writer_ts);

	/* Check if we need to start GC */
	pblk_gc_should_kick(pblk);

	return pblk;

fail_stop_writer:
	pblk_writer_stop(pblk);
fail_free_l2p:
	pblk_l2p_free(pblk);
fail_free_rwb:
	pblk_rwb_free(pblk);
fail_free_lines:
	pblk_lines_free(pblk);
fail_free_core:
	pblk_core_free(pblk);
fail:
	kfree(pblk);
	return ERR_PTR(ret);
}
/* physical block device target */
static struct nvm_tgt_type tt_pblk = {
	.name		= "pblk",
	.version	= {1, 0, 0},

	.make_rq	= pblk_make_rq,
	.capacity	= pblk_capacity,

	.init		= pblk_init,
	.exit		= pblk_exit,

	.sysfs_init	= pblk_sysfs_init,
	.sysfs_exit	= pblk_sysfs_exit,
	.owner		= THIS_MODULE,
};
static int __init pblk_module_init(void)
{
	int ret;

	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
	if (ret)
		return ret;
	ret = nvm_register_tgt_type(&tt_pblk);
	if (ret)
		bioset_exit(&pblk_bio_set);
	return ret;
}
static void pblk_module_exit(void)
{
	bioset_exit(&pblk_bio_set);
	nvm_unregister_tgt_type(&tt_pblk);
}
module_init(pblk_module_init);
module_exit(pblk_module_exit);
MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
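/*
 * Example of instantiating this target from user space (hypothetical device
 * and instance names; exact flags depend on the nvme-cli version):
 *
 *   nvme lnvm create -d nvme0n1 -n pblk0 -t pblk --lun-begin=0 --lun-end=3
 *
 * This would expose /dev/pblk0 as a regular block device backed by LUNs 0-3
 * of the open-channel drive nvme0n1.
 */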