1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "spdk/likely.h"
35 #include "spdk/stdinc.h"
36 #include "spdk/nvme.h"
37 #include "spdk/io_channel.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/string.h"
40 #include "spdk_internal/log.h"
41 #include "spdk/ftl.h"
42
43 #include "ftl_core.h"
44 #include "ftl_band.h"
45 #include "ftl_io.h"
46 #include "ftl_anm.h"
47 #include "ftl_rwb.h"
48 #include "ftl_debug.h"
49 #include "ftl_reloc.h"
50
51 struct ftl_wptr {
52 /* Owner device */
53 struct spdk_ftl_dev *dev;
54
55 /* Current PPA */
56 struct ftl_ppa ppa;
57
58 /* Band currently being written to */
59 struct ftl_band *band;
60
61 /* Current logical block's offset */
62 uint64_t offset;
63
64 /* Current erase block */
65 struct ftl_chunk *chunk;
66
67 /* Pending IO queue */
68 TAILQ_HEAD(, ftl_io) pending_queue;
69
70 /* List link */
71 LIST_ENTRY(ftl_wptr) list_entry;
72
73 /*
74 * If set up in direct mode, there will be no offset or band state update after IO.
75 * The PPA is not assigned by wptr, and is instead taken directly from the request.
76 */
77 bool direct_mode;
78 };
79
80 struct ftl_flush {
81 /* Owner device */
82 struct spdk_ftl_dev *dev;
83
84 /* Number of batches to wait for */
85 size_t num_req;
86
87 /* Callback */
88 struct {
89 spdk_ftl_fn fn;
90 void *ctx;
91 } cb;
92
93 /* Batch bitmap */
94 struct spdk_bit_array *bmap;
95
96 /* List link */
97 LIST_ENTRY(ftl_flush) list_entry;
98 };
99
100 static int
101 ftl_rwb_flags_from_io(const struct ftl_io *io)
102 {
103 int valid_flags = FTL_IO_INTERNAL | FTL_IO_WEAK | FTL_IO_PAD;
104 return io->flags & valid_flags;
105 }
106
107 static int
108 ftl_rwb_entry_weak(const struct ftl_rwb_entry *entry)
109 {
110 return entry->flags & FTL_IO_WEAK;
111 }
112
113 static void
114 ftl_wptr_free(struct ftl_wptr *wptr)
115 {
116 if (!wptr) {
117 return;
118 }
119
120 free(wptr);
121 }
122
123 static void
124 ftl_remove_wptr(struct ftl_wptr *wptr)
125 {
126 LIST_REMOVE(wptr, list_entry);
127 ftl_wptr_free(wptr);
128 }
129
130 static void
131 ftl_io_cmpl_cb(void *arg, const struct spdk_nvme_cpl *status)
132 {
133 struct ftl_io *io = arg;
134
135 if (spdk_nvme_cpl_is_error(status)) {
136 ftl_io_process_error(io, status);
137 }
138
139 ftl_trace_completion(io->dev, io, FTL_TRACE_COMPLETION_DISK);
140
141 ftl_io_dec_req(io);
142
143 if (ftl_io_done(io)) {
144 ftl_io_complete(io);
145 }
146 }
147
148 static void
149 ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band)
150 {
151 struct ftl_wptr *wptr = NULL;
152
153 LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
154 if (wptr->band == band) {
155 break;
156 }
157 }
158
159 /* If the band already has the high_prio flag set, other writes must */
160 /* have failed earlier, so it's already taken care of. */
161 if (band->high_prio) {
162 assert(wptr == NULL);
163 return;
164 }
165
166 ftl_band_write_failed(band);
167 ftl_remove_wptr(wptr);
168 }
169
170 static struct ftl_wptr *
171 ftl_wptr_from_band(struct ftl_band *band)
172 {
173 struct spdk_ftl_dev *dev = band->dev;
174 struct ftl_wptr *wptr = NULL;
175
176 LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
177 if (wptr->band == band) {
178 return wptr;
179 }
180 }
181
182 return NULL;
183 }
184
185 static void
186 ftl_md_write_fail(struct ftl_io *io, int status)
187 {
188 struct ftl_band *band = io->band;
189 struct ftl_wptr *wptr;
190 char buf[128];
191
192 wptr = ftl_wptr_from_band(band);
193
194 SPDK_ERRLOG("Metadata write failed @ppa: %s, status: %d\n",
195 ftl_ppa2str(wptr->ppa, buf, sizeof(buf)), status);
196
197 ftl_halt_writes(io->dev, band);
198 }
199
200 static void
201 ftl_md_write_cb(struct ftl_io *io, void *arg, int status)
202 {
203 struct spdk_ftl_dev *dev = io->dev;
204 struct ftl_nv_cache *nv_cache = &dev->nv_cache;
205 struct ftl_wptr *wptr;
206 struct spdk_bdev *bdev;
207
208 wptr = ftl_wptr_from_band(io->band);
209
210 if (status) {
211 ftl_md_write_fail(io, status);
212 return;
213 }
214
215 ftl_band_set_next_state(io->band);
216 if (io->band->state == FTL_BAND_STATE_CLOSED) {
217 if (nv_cache->bdev_desc) {
218 bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
219
220 pthread_spin_lock(&nv_cache->lock);
221 nv_cache->num_available += ftl_band_user_lbks(io->band);
222
223 if (spdk_unlikely(nv_cache->num_available > spdk_bdev_get_num_blocks(bdev))) {
224 nv_cache->num_available = spdk_bdev_get_num_blocks(bdev);
225 }
226 pthread_spin_unlock(&nv_cache->lock);
227 }
228
229 ftl_remove_wptr(wptr);
230 }
231 }
232
233 static int
234 ftl_ppa_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa)
235 {
236 struct spdk_ftl_dev *dev = io->dev;
237 size_t lbk_cnt, max_lbks;
238
239 assert(ftl_io_mode_ppa(io));
240 assert(io->iov_pos < io->iov_cnt);
241
242 if (io->pos == 0) {
243 *ppa = io->ppa;
244 } else {
245 *ppa = ftl_band_next_xfer_ppa(io->band, io->ppa, io->pos);
246 }
247
248 assert(!ftl_ppa_invalid(*ppa));
249
250 /* Metadata has to be read in the way it's written (jumping across */
251 /* the chunks in xfer_size increments) */
252 if (io->flags & FTL_IO_MD) {
253 max_lbks = dev->xfer_size - (ppa->lbk % dev->xfer_size);
254 lbk_cnt = spdk_min(ftl_io_iovec_len_left(io), max_lbks);
255 assert(ppa->lbk / dev->xfer_size == (ppa->lbk + lbk_cnt - 1) / dev->xfer_size);
256 } else {
257 lbk_cnt = ftl_io_iovec_len_left(io);
258 }
259
260 return lbk_cnt;
261 }
262
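/*
 * Worked example of the chunk-boundary clamp above, assuming a hypothetical
 * xfer_size of 16 lbks. With ppa->lbk == 20 and 32 lbks left in the iovec:
 *   max_lbks = 16 - (20 % 16) = 12
 *   lbk_cnt  = spdk_min(32, 12) = 12
 * so the metadata read covers lbks 20..31 and never crosses an xfer_size-aligned
 * boundary, matching the order in which the metadata was written.
 */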
263 static int
264 ftl_wptr_close_band(struct ftl_wptr *wptr)
265 {
266 struct ftl_band *band = wptr->band;
267
268 ftl_band_set_state(band, FTL_BAND_STATE_CLOSING);
269 band->tail_md_ppa = wptr->ppa;
270
271 return ftl_band_write_tail_md(band, ftl_md_write_cb);
272 }
273
274 static int
275 ftl_wptr_open_band(struct ftl_wptr *wptr)
276 {
277 struct ftl_band *band = wptr->band;
278
279 assert(ftl_band_chunk_is_first(band, wptr->chunk));
280 assert(band->lba_map.num_vld == 0);
281
282 ftl_band_clear_lba_map(band);
283
284 assert(band->state == FTL_BAND_STATE_PREP);
285 ftl_band_set_state(band, FTL_BAND_STATE_OPENING);
286
287 return ftl_band_write_head_md(band, ftl_md_write_cb);
288 }
289
290 static int
291 ftl_submit_erase(struct ftl_io *io)
292 {
293 struct spdk_ftl_dev *dev = io->dev;
294 struct ftl_band *band = io->band;
295 struct ftl_ppa ppa = io->ppa;
296 struct ftl_chunk *chunk;
297 uint64_t ppa_packed;
298 int rc = 0;
299 size_t i;
300
301 for (i = 0; i < io->lbk_cnt; ++i) {
302 if (i != 0) {
303 chunk = ftl_band_next_chunk(band, ftl_band_chunk_from_ppa(band, ppa));
304 assert(chunk->state == FTL_CHUNK_STATE_CLOSED ||
305 chunk->state == FTL_CHUNK_STATE_VACANT);
306 ppa = chunk->start_ppa;
307 }
308
309 assert(ppa.lbk == 0);
310 ppa_packed = ftl_ppa_addr_pack(dev, ppa);
311
312 ftl_trace_submission(dev, io, ppa, 1);
313 rc = spdk_nvme_ocssd_ns_cmd_vector_reset(dev->ns, ftl_get_write_qpair(dev),
314 &ppa_packed, 1, NULL, ftl_io_cmpl_cb, io);
315 if (spdk_unlikely(rc)) {
316 ftl_io_fail(io, rc);
317 SPDK_ERRLOG("Vector reset failed with status: %d\n", rc);
318 break;
319 }
320
321 ftl_io_inc_req(io);
322 ftl_io_advance(io, 1);
323 }
324
325 if (ftl_io_done(io)) {
326 ftl_io_complete(io);
327 }
328
329 return rc;
330 }
331
332 static void
333 _ftl_io_erase(void *ctx)
334 {
335 ftl_io_erase((struct ftl_io *)ctx);
336 }
337
338 static bool
339 ftl_check_core_thread(const struct spdk_ftl_dev *dev)
340 {
341 return dev->core_thread.thread == spdk_get_thread();
342 }
343
344 static bool
345 ftl_check_read_thread(const struct spdk_ftl_dev *dev)
346 {
347 return dev->read_thread.thread == spdk_get_thread();
348 }
349
350 int
351 ftl_io_erase(struct ftl_io *io)
352 {
353 struct spdk_ftl_dev *dev = io->dev;
354
355 if (ftl_check_core_thread(dev)) {
356 return ftl_submit_erase(io);
357 }
358
359 spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_erase, io);
360 return 0;
361 }
362
363 static struct ftl_band *
364 ftl_next_write_band(struct spdk_ftl_dev *dev)
365 {
366 struct ftl_band *band;
367
368 band = LIST_FIRST(&dev->free_bands);
369 if (!band) {
370 return NULL;
371 }
372 assert(band->state == FTL_BAND_STATE_FREE);
373
374 if (ftl_band_erase(band)) {
375 /* TODO: handle erase failure */
376 return NULL;
377 }
378
379 return band;
380 }
381
382 static struct ftl_band *
383 ftl_next_wptr_band(struct spdk_ftl_dev *dev)
384 {
385 struct ftl_band *band;
386
387 if (!dev->next_band) {
388 band = ftl_next_write_band(dev);
389 } else {
390 assert(dev->next_band->state == FTL_BAND_STATE_PREP);
391 band = dev->next_band;
392 dev->next_band = NULL;
393 }
394
395 return band;
396 }
397
398 static struct ftl_wptr *
399 ftl_wptr_init(struct ftl_band *band)
400 {
401 struct spdk_ftl_dev *dev = band->dev;
402 struct ftl_wptr *wptr;
403
404 wptr = calloc(1, sizeof(*wptr));
405 if (!wptr) {
406 return NULL;
407 }
408
409 wptr->dev = dev;
410 wptr->band = band;
411 wptr->chunk = CIRCLEQ_FIRST(&band->chunks);
412 wptr->ppa = wptr->chunk->start_ppa;
413 TAILQ_INIT(&wptr->pending_queue);
414
415 return wptr;
416 }
417
418 static int
419 ftl_add_direct_wptr(struct ftl_band *band)
420 {
421 struct spdk_ftl_dev *dev = band->dev;
422 struct ftl_wptr *wptr;
423
424 assert(band->state == FTL_BAND_STATE_OPEN);
425
426 wptr = ftl_wptr_init(band);
427 if (!wptr) {
428 return -1;
429 }
430
431 wptr->direct_mode = true;
432
433 if (ftl_band_alloc_lba_map(band)) {
434 ftl_wptr_free(wptr);
435 return -1;
436 }
437
438 LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
439
440 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id);
441 ftl_trace_write_band(dev, band);
442 return 0;
443 }
444
445 static void
446 ftl_close_direct_wptr(struct ftl_band *band)
447 {
448 struct ftl_wptr *wptr = ftl_wptr_from_band(band);
449
450 assert(wptr->direct_mode);
451 assert(band->state == FTL_BAND_STATE_CLOSED);
452
453 ftl_band_release_lba_map(band);
454
455 ftl_remove_wptr(wptr);
456 }
457
458 int
459 ftl_band_set_direct_access(struct ftl_band *band, bool access)
460 {
461 if (access) {
462 return ftl_add_direct_wptr(band);
463 } else {
464 ftl_close_direct_wptr(band);
465 return 0;
466 }
467 }
468
469 static int
470 ftl_add_wptr(struct spdk_ftl_dev *dev)
471 {
472 struct ftl_band *band;
473 struct ftl_wptr *wptr;
474
475 band = ftl_next_wptr_band(dev);
476 if (!band) {
477 return -1;
478 }
479
480 wptr = ftl_wptr_init(band);
481 if (!wptr) {
482 return -1;
483 }
484
485 if (ftl_band_write_prep(band)) {
486 ftl_wptr_free(wptr);
487 return -1;
488 }
489
490 LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
491
492 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id);
493 ftl_trace_write_band(dev, band);
494 return 0;
495 }
496
497 static void
498 ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size)
499 {
500 struct ftl_band *band = wptr->band;
501 struct spdk_ftl_dev *dev = wptr->dev;
502 struct spdk_ftl_conf *conf = &dev->conf;
503 size_t next_thld;
504
505 if (spdk_unlikely(wptr->direct_mode)) {
506 return;
507 }
508
509 wptr->offset += xfer_size;
510 next_thld = (ftl_band_num_usable_lbks(band) * conf->band_thld) / 100;
511
512 if (ftl_band_full(band, wptr->offset)) {
513 ftl_band_set_state(band, FTL_BAND_STATE_FULL);
514 }
515
516 wptr->chunk->busy = true;
517 wptr->ppa = ftl_band_next_xfer_ppa(band, wptr->ppa, xfer_size);
518 wptr->chunk = ftl_band_next_operational_chunk(band, wptr->chunk);
519
520 assert(!ftl_ppa_invalid(wptr->ppa));
521
522 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: grp:%d, pu:%d chunk:%d, lbk:%u\n",
523 wptr->ppa.grp, wptr->ppa.pu, wptr->ppa.chk, wptr->ppa.lbk);
524
525 if (wptr->offset >= next_thld && !dev->next_band) {
526 dev->next_band = ftl_next_write_band(dev);
527 }
528 }
529
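/*
 * Worked example of the next_thld calculation above, using hypothetical values:
 * with 10000 usable lbks per band and conf->band_thld == 95,
 *   next_thld = (10000 * 95) / 100 = 9500
 * so once the write pointer's offset reaches 9500 lbks, the next free band is
 * picked and erased ahead of time (dev->next_band), hiding the erase latency
 * from subsequent writes.
 */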
530 static size_t
531 ftl_wptr_user_lbks_left(const struct ftl_wptr *wptr)
532 {
533 return ftl_band_user_lbks_left(wptr->band, wptr->offset);
534 }
535
536 static int
537 ftl_wptr_ready(struct ftl_wptr *wptr)
538 {
539 struct ftl_band *band = wptr->band;
540
541 /* TODO: add handling of empty bands */
542
543 if (spdk_unlikely(!ftl_chunk_is_writable(wptr->chunk))) {
544                 /* Erasing the band may fail after it was assigned to the wptr. */
545 if (spdk_unlikely(wptr->chunk->state == FTL_CHUNK_STATE_BAD)) {
546 ftl_wptr_advance(wptr, wptr->dev->xfer_size);
547 }
548 return 0;
549 }
550
551 /* If we're in the process of writing metadata, wait till it is */
552 /* completed. */
553 /* TODO: we should probably change bands once we're writing tail md */
554 if (ftl_band_state_changing(band)) {
555 return 0;
556 }
557
558 if (band->state == FTL_BAND_STATE_FULL) {
559 if (ftl_wptr_close_band(wptr)) {
560 /* TODO: need recovery here */
561 assert(false);
562 }
563 return 0;
564 }
565
566 if (band->state != FTL_BAND_STATE_OPEN) {
567 if (ftl_wptr_open_band(wptr)) {
568 /* TODO: need recovery here */
569 assert(false);
570 }
571 return 0;
572 }
573
574 return 1;
575 }
576
577 static const struct spdk_ftl_limit *
578 ftl_get_limit(const struct spdk_ftl_dev *dev, int type)
579 {
580 assert(type < SPDK_FTL_LIMIT_MAX);
581 return &dev->conf.defrag.limits[type];
582 }
583
584 static bool
585 ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
586 {
587 struct ftl_ppa ppa;
588
589 /* If the LBA is invalid don't bother checking the md and l2p */
590 if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) {
591 return false;
592 }
593
594 ppa = ftl_l2p_get(dev, entry->lba);
595 if (!(ftl_ppa_cached(ppa) && ppa.offset == entry->pos)) {
596 return false;
597 }
598
599 return true;
600 }
601
602 static void
603 ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
604 {
605 pthread_spin_lock(&entry->lock);
606
607 if (!ftl_rwb_entry_valid(entry)) {
608 goto unlock;
609 }
610
611 /* If the l2p wasn't updated and still points at the entry, fill it with the */
612 /* on-disk PPA and clear the cache status bit. Otherwise, skip the l2p update */
613 /* and just clear the cache status. */
614 if (!ftl_cache_lba_valid(dev, entry)) {
615 goto clear;
616 }
617
618 ftl_l2p_set(dev, entry->lba, entry->ppa);
619 clear:
620 ftl_rwb_entry_invalidate(entry);
621 unlock:
622 pthread_spin_unlock(&entry->lock);
623 }
624
625 static struct ftl_rwb_entry *
626 ftl_acquire_entry(struct spdk_ftl_dev *dev, int flags)
627 {
628 struct ftl_rwb_entry *entry;
629
630 entry = ftl_rwb_acquire(dev->rwb, ftl_rwb_type_from_flags(flags));
631 if (!entry) {
632 return NULL;
633 }
634
635 ftl_evict_cache_entry(dev, entry);
636
637 entry->flags = flags;
638 return entry;
639 }
640
641 static void
642 ftl_rwb_pad(struct spdk_ftl_dev *dev, size_t size)
643 {
644 struct ftl_rwb_entry *entry;
645 int flags = FTL_IO_PAD | FTL_IO_INTERNAL;
646
647 for (size_t i = 0; i < size; ++i) {
648 entry = ftl_acquire_entry(dev, flags);
649 if (!entry) {
650 break;
651 }
652
653 entry->lba = FTL_LBA_INVALID;
654 entry->ppa = ftl_to_ppa(FTL_PPA_INVALID);
655 memset(entry->data, 0, FTL_BLOCK_SIZE);
656 ftl_rwb_push(entry);
657 }
658 }
659
660 static void
661 ftl_remove_free_bands(struct spdk_ftl_dev *dev)
662 {
663 while (!LIST_EMPTY(&dev->free_bands)) {
664 LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry);
665 }
666
667 dev->next_band = NULL;
668 }
669
670 static void
671 ftl_wptr_process_shutdown(struct ftl_wptr *wptr)
672 {
673 struct spdk_ftl_dev *dev = wptr->dev;
674 size_t size = ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_INTERNAL) +
675 ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_USER);
676 size_t num_active = dev->xfer_size * ftl_rwb_get_active_batches(dev->rwb);
677 size_t band_length, rwb_free_space, pad_length;
678
679 num_active = num_active ? num_active : dev->xfer_size;
680 if (size >= num_active) {
681 return;
682 }
683
684         /* If we reach this point we need to remove the free bands */
685         /* and pad the current wptr band to the end */
686 if (ftl_rwb_get_active_batches(dev->rwb) <= 1) {
687 ftl_remove_free_bands(dev);
688 }
689
690 band_length = ftl_wptr_user_lbks_left(wptr);
691 rwb_free_space = ftl_rwb_size(dev->rwb) - size;
692 pad_length = spdk_min(band_length, rwb_free_space);
693
694 /* Pad write buffer until band is full */
695 ftl_rwb_pad(dev, pad_length);
696 }
697
698 static int
699 ftl_shutdown_complete(struct spdk_ftl_dev *dev)
700 {
701 return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) &&
702 LIST_EMPTY(&dev->wptr_list);
703 }
704
705 void
706 ftl_apply_limits(struct spdk_ftl_dev *dev)
707 {
708 const struct spdk_ftl_limit *limit;
709 struct ftl_stats *stats = &dev->stats;
710 size_t rwb_limit[FTL_RWB_TYPE_MAX];
711 int i;
712
713 ftl_rwb_get_limits(dev->rwb, rwb_limit);
714
715 /* Clear existing limit */
716 dev->limit = SPDK_FTL_LIMIT_MAX;
717
718 for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) {
719 limit = ftl_get_limit(dev, i);
720
721 if (dev->num_free <= limit->thld) {
722 rwb_limit[FTL_RWB_TYPE_USER] =
723 (limit->limit * ftl_rwb_entry_cnt(dev->rwb)) / 100;
724 stats->limits[i]++;
725 dev->limit = i;
726 goto apply;
727 }
728 }
729
730 /* Clear the limits, since we don't need to apply them anymore */
731 rwb_limit[FTL_RWB_TYPE_USER] = ftl_rwb_entry_cnt(dev->rwb);
732 apply:
733 ftl_trace_limits(dev, rwb_limit, dev->num_free);
734 ftl_rwb_set_limits(dev->rwb, rwb_limit);
735 }
736
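/*
 * Worked example of the limit applied above, using hypothetical values: with an
 * rwb of 1024 entries and a defrag limit entry of { .thld = 20, .limit = 50 },
 * once dev->num_free drops to 20 bands or fewer,
 *   rwb_limit[FTL_RWB_TYPE_USER] = (50 * 1024) / 100 = 512
 * i.e. user writes may occupy at most half of the write buffer, throttling the
 * host while relocation catches up. Once enough bands are free again, the limit
 * is reset to the full 1024 entries.
 */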
737 static int
738 ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
739 {
740 struct ftl_band *band = ftl_band_from_ppa(dev, ppa);
741 struct ftl_lba_map *lba_map = &band->lba_map;
742 uint64_t offset;
743
744 offset = ftl_band_lbkoff_from_ppa(band, ppa);
745
746         /* The bit might already be cleared if two writes are scheduled to the */
747         /* same LBA at the same time */
748 if (spdk_bit_array_get(lba_map->vld, offset)) {
749 assert(lba_map->num_vld > 0);
750 spdk_bit_array_clear(lba_map->vld, offset);
751 lba_map->num_vld--;
752 return 1;
753 }
754
755 return 0;
756 }
757
758 int
759 ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
760 {
761 struct ftl_band *band;
762 int rc;
763
764 assert(!ftl_ppa_cached(ppa));
765 band = ftl_band_from_ppa(dev, ppa);
766
767 pthread_spin_lock(&band->lba_map.lock);
768 rc = ftl_invalidate_addr_unlocked(dev, ppa);
769 pthread_spin_unlock(&band->lba_map.lock);
770
771 return rc;
772 }
773
774 static int
775 ftl_read_retry(int rc)
776 {
777 return rc == -EAGAIN;
778 }
779
780 static int
781 ftl_read_canceled(int rc)
782 {
783 return rc == -EFAULT || rc == 0;
784 }
785
786 static void
787 ftl_add_to_retry_queue(struct ftl_io *io)
788 {
789 if (!(io->flags & FTL_IO_RETRY)) {
790 io->flags |= FTL_IO_RETRY;
791 TAILQ_INSERT_TAIL(&io->dev->retry_queue, io, retry_entry);
792 }
793 }
794
795 static int
796 ftl_ppa_cache_read(struct ftl_io *io, uint64_t lba,
797 struct ftl_ppa ppa, void *buf)
798 {
799 struct ftl_rwb *rwb = io->dev->rwb;
800 struct ftl_rwb_entry *entry;
801 struct ftl_ppa nppa;
802 int rc = 0;
803
804 entry = ftl_rwb_entry_from_offset(rwb, ppa.offset);
805 pthread_spin_lock(&entry->lock);
806
807 nppa = ftl_l2p_get(io->dev, lba);
808 if (ppa.ppa != nppa.ppa) {
809 rc = -1;
810 goto out;
811 }
812
813 memcpy(buf, entry->data, FTL_BLOCK_SIZE);
814 out:
815 pthread_spin_unlock(&entry->lock);
816 return rc;
817 }
818
819 static int
820 ftl_lba_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa)
821 {
822 struct spdk_ftl_dev *dev = io->dev;
823 struct ftl_ppa next_ppa;
824 size_t i;
825
826 *ppa = ftl_l2p_get(dev, ftl_io_current_lba(io));
827
828 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read ppa:%lx, lba:%lu\n",
829 ppa->ppa, ftl_io_current_lba(io));
830
831         /* If the PPA is invalid, skip it (the buffer should already be zeroed) */
832 if (ftl_ppa_invalid(*ppa)) {
833 return -EFAULT;
834 }
835
836 if (ftl_ppa_cached(*ppa)) {
837 if (!ftl_ppa_cache_read(io, ftl_io_current_lba(io), *ppa, ftl_io_iovec_addr(io))) {
838 return 0;
839 }
840
841 /* If the state changed, we have to re-read the l2p */
842 return -EAGAIN;
843 }
844
845 for (i = 1; i < ftl_io_iovec_len_left(io); ++i) {
846 next_ppa = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i));
847
848 if (ftl_ppa_invalid(next_ppa) || ftl_ppa_cached(next_ppa)) {
849 break;
850 }
851
852 if (ftl_ppa_addr_pack(dev, *ppa) + i != ftl_ppa_addr_pack(dev, next_ppa)) {
853 break;
854 }
855 }
856
857 return i;
858 }
859
860 static int
861 ftl_submit_read(struct ftl_io *io)
862 {
863 struct spdk_ftl_dev *dev = io->dev;
864 struct ftl_ppa ppa;
865 int rc = 0, lbk_cnt;
866
867 assert(LIST_EMPTY(&io->children));
868
869 while (io->pos < io->lbk_cnt) {
870 if (ftl_io_mode_ppa(io)) {
871 lbk_cnt = rc = ftl_ppa_read_next_ppa(io, &ppa);
872 } else {
873 lbk_cnt = rc = ftl_lba_read_next_ppa(io, &ppa);
874 }
875
876                 /* We might need to retry the read from scratch (e.g. */
877                 /* because a write was under way and completed before */
878                 /* we could read it from the rwb) */
879 if (ftl_read_retry(rc)) {
880 continue;
881 }
882
883 /* We don't have to schedule the read, as it was read from cache */
884 if (ftl_read_canceled(rc)) {
885 ftl_io_advance(io, 1);
886 ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID :
887 FTL_TRACE_COMPLETION_CACHE);
888 rc = 0;
889 continue;
890 }
891
892 assert(lbk_cnt > 0);
893
894 ftl_trace_submission(dev, io, ppa, lbk_cnt);
895 rc = spdk_nvme_ns_cmd_read(dev->ns, ftl_get_read_qpair(dev),
896 ftl_io_iovec_addr(io),
897 ftl_ppa_addr_pack(io->dev, ppa), lbk_cnt,
898 ftl_io_cmpl_cb, io, 0);
899 if (spdk_unlikely(rc)) {
900 if (rc == -ENOMEM) {
901 ftl_add_to_retry_queue(io);
902 } else {
903 ftl_io_fail(io, rc);
904 }
905 break;
906 }
907
908 ftl_io_inc_req(io);
909 ftl_io_advance(io, lbk_cnt);
910 }
911
912 /* If we didn't have to read anything from the device, */
913 /* complete the request right away */
914 if (ftl_io_done(io)) {
915 ftl_io_complete(io);
916 }
917
918 return rc;
919 }
920
921 static void
922 ftl_complete_flush(struct ftl_flush *flush)
923 {
924 assert(flush->num_req == 0);
925 LIST_REMOVE(flush, list_entry);
926
927 flush->cb.fn(flush->cb.ctx, 0);
928
929 spdk_bit_array_free(&flush->bmap);
930 free(flush);
931 }
932
933 static void
934 ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_rwb_batch *batch)
935 {
936 struct ftl_flush *flush, *tflush;
937 size_t offset;
938
939 LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) {
940 offset = ftl_rwb_batch_get_offset(batch);
941
942 if (spdk_bit_array_get(flush->bmap, offset)) {
943 spdk_bit_array_clear(flush->bmap, offset);
944 if (!(--flush->num_req)) {
945 ftl_complete_flush(flush);
946 }
947 }
948 }
949 }
950
951 static uint64_t
952 ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_lbks)
953 {
954 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
955 uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID;
956
957 cache_size = spdk_bdev_get_num_blocks(bdev);
958
959 pthread_spin_lock(&nv_cache->lock);
960 if (spdk_unlikely(nv_cache->num_available == 0)) {
961 goto out;
962 }
963
964 num_available = spdk_min(nv_cache->num_available, *num_lbks);
965 if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) {
966 *num_lbks = cache_size - nv_cache->current_addr;
967 } else {
968 *num_lbks = num_available;
969 }
970
971 cache_addr = nv_cache->current_addr;
972 nv_cache->current_addr += *num_lbks;
973 nv_cache->num_available -= *num_lbks;
974
975 if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) {
976 nv_cache->current_addr = 0;
977 }
978 out:
979 pthread_spin_unlock(&nv_cache->lock);
980 return cache_addr;
981 }
982
983 static struct ftl_io *
984 ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_lbks)
985 {
986 struct ftl_io_init_opts opts = {
987 .dev = parent->dev,
988 .parent = parent,
989 .data = ftl_io_iovec_addr(parent),
990 .lbk_cnt = num_lbks,
991 .flags = FTL_IO_CACHE,
992 };
993
994 return ftl_io_init_internal(&opts);
995 }
996
997 static void
998 ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
999 {
1000 struct ftl_io *io = cb_arg;
1001
1002 if (spdk_unlikely(!success)) {
1003 SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->ppa.ppa);
1004 io->status = -EIO;
1005 }
1006
1007 ftl_io_dec_req(io);
1008 if (ftl_io_done(io)) {
1009 ftl_io_complete(io);
1010 }
1011
1012 spdk_bdev_free_io(bdev_io);
1013 }
1014
1015 static void
1016 ftl_submit_nv_cache(void *ctx)
1017 {
1018 struct ftl_io *io = ctx;
1019 struct spdk_ftl_dev *dev = io->dev;
1020 struct spdk_thread *thread;
1021 struct ftl_io_channel *ioch;
1022 int rc;
1023
1024 ioch = spdk_io_channel_get_ctx(io->ioch);
1025 thread = spdk_io_channel_get_thread(io->ioch);
1026
1027 rc = spdk_bdev_write_blocks(dev->nv_cache.bdev_desc, ioch->cache_ioch,
1028 ftl_io_iovec_addr(io), io->ppa.ppa, io->lbk_cnt,
1029 ftl_nv_cache_submit_cb, io);
1030 if (rc == -ENOMEM) {
1031 spdk_thread_send_msg(thread, ftl_submit_nv_cache, io);
1032 return;
1033 } else if (rc) {
1034 SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n",
1035 spdk_strerror(-rc), io->ppa.ppa, io->lbk_cnt);
1036 io->status = -EIO;
1037 ftl_io_complete(io);
1038 return;
1039 }
1040
1041 ftl_io_advance(io, io->lbk_cnt);
1042 ftl_io_inc_req(io);
1043 }
1044
1045 static void
1046 _ftl_write_nv_cache(void *ctx)
1047 {
1048 struct ftl_io *child, *io = ctx;
1049 struct spdk_ftl_dev *dev = io->dev;
1050 struct spdk_thread *thread;
1051 uint64_t num_lbks;
1052
1053 thread = spdk_io_channel_get_thread(io->ioch);
1054
1055 while (io->pos < io->lbk_cnt) {
1056 num_lbks = ftl_io_iovec_len_left(io);
1057
1058 child = ftl_alloc_io_nv_cache(io, num_lbks);
1059 if (spdk_unlikely(!child)) {
1060 spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
1061 return;
1062 }
1063
1064                 /* Reserve an area in the non-volatile cache */
1065 child->ppa.ppa = ftl_reserve_nv_cache(&dev->nv_cache, &num_lbks);
1066 if (child->ppa.ppa == FTL_LBA_INVALID) {
1067 ftl_io_free(child);
1068 spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
1069 break;
1070 }
1071
1072 /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */
1073 if (spdk_unlikely(num_lbks != ftl_io_iovec_len_left(io))) {
1074 ftl_io_shrink_iovec(child, num_lbks);
1075 }
1076
1077 ftl_submit_nv_cache(child);
1078 }
1079
1080 if (ftl_io_done(io)) {
1081 ftl_io_complete(io);
1082 }
1083 }
1084
1085 static void
1086 ftl_write_nv_cache(struct ftl_io *parent)
1087 {
1088 ftl_io_reset(parent);
1089 parent->flags |= FTL_IO_CACHE;
1090 _ftl_write_nv_cache(parent);
1091 }
1092
1093 static void
1094 ftl_write_fail(struct ftl_io *io, int status)
1095 {
1096 struct ftl_rwb_batch *batch = io->rwb_batch;
1097 struct spdk_ftl_dev *dev = io->dev;
1098 struct ftl_rwb_entry *entry;
1099 struct ftl_band *band;
1100 char buf[128];
1101
1102 entry = ftl_rwb_batch_first_entry(batch);
1103
1104 band = ftl_band_from_ppa(io->dev, entry->ppa);
1105 SPDK_ERRLOG("Write failed @ppa: %s, status: %d\n",
1106 ftl_ppa2str(entry->ppa, buf, sizeof(buf)), status);
1107
1108         /* Close the band and halt the wptr and defrag */
1109 ftl_halt_writes(dev, band);
1110
1111 ftl_rwb_foreach(entry, batch) {
1112 /* Invalidate meta set by process_writes() */
1113 ftl_invalidate_addr(dev, entry->ppa);
1114 }
1115
1116         /* Reset the batch back to the RWB to resend it later */
1117 ftl_rwb_batch_revert(batch);
1118 }
1119
1120 static void
1121 ftl_write_cb(struct ftl_io *io, void *arg, int status)
1122 {
1123 struct spdk_ftl_dev *dev = io->dev;
1124 struct ftl_rwb_batch *batch = io->rwb_batch;
1125 struct ftl_rwb_entry *entry;
1126
1127 if (status) {
1128 ftl_write_fail(io, status);
1129 return;
1130 }
1131
1132 assert(io->lbk_cnt == dev->xfer_size);
1133 ftl_rwb_foreach(entry, batch) {
1134 if (!(io->flags & FTL_IO_MD) && !(entry->flags & FTL_IO_PAD)) {
1135 /* Verify that the LBA is set for user lbks */
1136 assert(entry->lba != FTL_LBA_INVALID);
1137 }
1138
1139 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lu, lba:%lu\n",
1140 entry->ppa.ppa, entry->lba);
1141 }
1142
1143 ftl_process_flush(dev, batch);
1144 ftl_rwb_batch_release(batch);
1145 }
1146
1147 static void
1148 ftl_update_rwb_stats(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry)
1149 {
1150 if (!ftl_rwb_entry_internal(entry)) {
1151 dev->stats.write_user++;
1152 }
1153 dev->stats.write_total++;
1154 }
1155
1156 static void
1157 ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry,
1158 struct ftl_ppa ppa)
1159 {
1160 struct ftl_ppa prev_ppa;
1161 struct ftl_rwb_entry *prev;
1162 struct ftl_band *band;
1163 int valid;
1164
1165 prev_ppa = ftl_l2p_get(dev, entry->lba);
1166 if (ftl_ppa_invalid(prev_ppa)) {
1167 ftl_l2p_set(dev, entry->lba, ppa);
1168 return;
1169 }
1170
1171         /* If the L2P's PPA is different from what we expected, we don't need to */
1172         /* do anything (someone's already overwritten our data). */
1173 if (ftl_rwb_entry_weak(entry) && !ftl_ppa_cmp(prev_ppa, entry->ppa)) {
1174 return;
1175 }
1176
1177 if (ftl_ppa_cached(prev_ppa)) {
1178 assert(!ftl_rwb_entry_weak(entry));
1179 prev = ftl_rwb_entry_from_offset(dev->rwb, prev_ppa.offset);
1180 pthread_spin_lock(&prev->lock);
1181
1182 /* Re-read the L2P under the lock to protect against updates */
1183 /* to this LBA from other threads */
1184 prev_ppa = ftl_l2p_get(dev, entry->lba);
1185
1186 /* If the entry is no longer in cache, another write has been */
1187 /* scheduled in the meantime, so we have to invalidate its LBA */
1188 if (!ftl_ppa_cached(prev_ppa)) {
1189 ftl_invalidate_addr(dev, prev_ppa);
1190 }
1191
1192 /* If previous entry is part of cache, remove and invalidate it */
1193 if (ftl_rwb_entry_valid(prev)) {
1194 ftl_invalidate_addr(dev, prev->ppa);
1195 ftl_rwb_entry_invalidate(prev);
1196 }
1197
1198 ftl_l2p_set(dev, entry->lba, ppa);
1199 pthread_spin_unlock(&prev->lock);
1200 return;
1201 }
1202
1203         /* Lock the band containing the previous PPA. This ensures atomic changes to */
1204         /* the L2P as well as the metadata. The valid bits in the metadata are used to */
1205         /* check the validity of weak writes. */
1206 band = ftl_band_from_ppa(dev, prev_ppa);
1207 pthread_spin_lock(&band->lba_map.lock);
1208
1209 valid = ftl_invalidate_addr_unlocked(dev, prev_ppa);
1210
1211 /* If the address has been invalidated already, we don't want to update */
1212 /* the L2P for weak writes, as it means the write is no longer valid. */
1213 if (!ftl_rwb_entry_weak(entry) || valid) {
1214 ftl_l2p_set(dev, entry->lba, ppa);
1215 }
1216
1217 pthread_spin_unlock(&band->lba_map.lock);
1218 }
1219
1220 static struct ftl_io *
1221 ftl_io_init_child_write(struct ftl_io *parent, struct ftl_ppa ppa,
1222 void *data, void *md, ftl_io_fn cb)
1223 {
1224 struct ftl_io *io;
1225 struct spdk_ftl_dev *dev = parent->dev;
1226 struct ftl_io_init_opts opts = {
1227 .dev = dev,
1228 .io = NULL,
1229 .parent = parent,
1230 .rwb_batch = NULL,
1231 .band = parent->band,
1232 .size = sizeof(struct ftl_io),
1233 .flags = 0,
1234 .type = FTL_IO_WRITE,
1235 .lbk_cnt = dev->xfer_size,
1236 .cb_fn = cb,
1237 .data = data,
1238 .md = md,
1239 };
1240
1241 io = ftl_io_init_internal(&opts);
1242 if (!io) {
1243 return NULL;
1244 }
1245
1246 io->ppa = ppa;
1247
1248 return io;
1249 }
1250
1251 static void
1252 ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status)
1253 {
1254 struct ftl_chunk *chunk;
1255
1256 chunk = ftl_band_chunk_from_ppa(io->band, io->ppa);
1257 chunk->busy = false;
1258 }
1259
1260 static int
1261 ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io, int lbk_cnt)
1262 {
1263 struct spdk_ftl_dev *dev = io->dev;
1264 struct ftl_io *child;
1265 int rc;
1266 struct ftl_ppa ppa;
1267
1268 if (spdk_likely(!wptr->direct_mode)) {
1269 ppa = wptr->ppa;
1270 } else {
1271 assert(io->flags & FTL_IO_DIRECT_ACCESS);
1272 assert(io->ppa.chk == wptr->band->id);
1273 ppa = io->ppa;
1274 }
1275
1276         /* Split the IO into child requests and release the chunk immediately after the child is completed */
1277 child = ftl_io_init_child_write(io, ppa, ftl_io_iovec_addr(io),
1278 ftl_io_get_md(io), ftl_io_child_write_cb);
1279 if (!child) {
1280 return -EAGAIN;
1281 }
1282
1283 rc = spdk_nvme_ns_cmd_write_with_md(dev->ns, ftl_get_write_qpair(dev),
1284 ftl_io_iovec_addr(child), child->md,
1285 ftl_ppa_addr_pack(dev, ppa),
1286 lbk_cnt, ftl_io_cmpl_cb, child, 0, 0, 0);
1287 if (rc) {
1288 ftl_io_fail(child, rc);
1289 ftl_io_complete(child);
1290                 SPDK_ERRLOG("spdk_nvme_ns_cmd_write_with_md failed with status:%d, ppa:%lu\n",
1291 rc, ppa.ppa);
1292
1293 return -EIO;
1294 }
1295
1296 ftl_io_inc_req(child);
1297 ftl_io_advance(child, lbk_cnt);
1298
1299 return 0;
1300 }
1301
1302 static int
1303 ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io)
1304 {
1305 struct spdk_ftl_dev *dev = io->dev;
1306 int rc = 0;
1307
1308 assert(io->lbk_cnt % dev->xfer_size == 0);
1309
1310 while (io->iov_pos < io->iov_cnt) {
1311                 /* There are no ordering guarantees between completions on the NVMe IO submission queue, */
1312                 /* so wait until the chunk is not busy before submitting another write */
1313 if (wptr->chunk->busy) {
1314 TAILQ_INSERT_TAIL(&wptr->pending_queue, io, retry_entry);
1315 rc = -EAGAIN;
1316 break;
1317 }
1318
1319 rc = ftl_submit_child_write(wptr, io, dev->xfer_size);
1320 if (spdk_unlikely(rc)) {
1321 if (rc == -EAGAIN) {
1322 TAILQ_INSERT_TAIL(&wptr->pending_queue, io, retry_entry);
1323 } else {
1324 ftl_io_fail(io, rc);
1325 }
1326 break;
1327 }
1328
1329 ftl_trace_submission(dev, io, wptr->ppa, dev->xfer_size);
1330 ftl_wptr_advance(wptr, dev->xfer_size);
1331 }
1332
1333 if (ftl_io_done(io)) {
1334 /* Parent IO will complete after all children are completed */
1335 ftl_io_complete(io);
1336 }
1337
1338 return rc;
1339 }
1340
1341 static void
1342 ftl_flush_pad_batch(struct spdk_ftl_dev *dev)
1343 {
1344 struct ftl_rwb *rwb = dev->rwb;
1345 size_t size, num_entries;
1346
1347 size = ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_INTERNAL) +
1348 ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_USER);
1349
1350 /* There must be something in the RWB, otherwise the flush */
1351 /* wouldn't be waiting for anything */
1352 assert(size > 0);
1353
1354         /* Only add padding when there are fewer than xfer_size */
1355         /* entries in the buffer. Otherwise we just have to wait */
1356         /* for the entries to become ready. */
1357 num_entries = ftl_rwb_get_active_batches(dev->rwb) * dev->xfer_size;
1358 if (size < num_entries) {
1359 ftl_rwb_pad(dev, num_entries - (size % num_entries));
1360 }
1361 }
1362
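/*
 * Worked example of the padding arithmetic above, assuming a hypothetical
 * xfer_size of 16 and a single active batch. With 5 entries acquired in the rwb:
 *   num_entries = 1 * 16 = 16
 *   padding     = 16 - (5 % 16) = 11
 * so 11 zeroed PAD entries are pushed, the batch reaches a full xfer_size and
 * the queued flush can complete without waiting for further user writes.
 */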
1363 static int
1364 ftl_wptr_process_writes(struct ftl_wptr *wptr)
1365 {
1366 struct spdk_ftl_dev *dev = wptr->dev;
1367 struct ftl_rwb_batch *batch;
1368 struct ftl_rwb_entry *entry;
1369 struct ftl_io *io;
1370 struct ftl_ppa ppa, prev_ppa;
1371
1372 if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) {
1373 io = TAILQ_FIRST(&wptr->pending_queue);
1374 TAILQ_REMOVE(&wptr->pending_queue, io, retry_entry);
1375
1376 if (ftl_submit_write(wptr, io) == -EAGAIN) {
1377 return 0;
1378 }
1379 }
1380
1381 /* Make sure the band is prepared for writing */
1382 if (!ftl_wptr_ready(wptr)) {
1383 return 0;
1384 }
1385
1386 if (dev->halt) {
1387 ftl_wptr_process_shutdown(wptr);
1388 }
1389
1390 batch = ftl_rwb_pop(dev->rwb);
1391 if (!batch) {
1392 /* If there are queued flush requests we need to pad the RWB to */
1393 /* force out remaining entries */
1394 if (!LIST_EMPTY(&dev->flush_list)) {
1395 ftl_flush_pad_batch(dev);
1396 }
1397
1398 return 0;
1399 }
1400
1401 io = ftl_io_rwb_init(dev, wptr->band, batch, ftl_write_cb);
1402 if (!io) {
1403 goto error;
1404 }
1405
1406 ppa = wptr->ppa;
1407 ftl_rwb_foreach(entry, batch) {
1408 entry->ppa = ppa;
1409
1410 if (entry->lba != FTL_LBA_INVALID) {
1411 pthread_spin_lock(&entry->lock);
1412 prev_ppa = ftl_l2p_get(dev, entry->lba);
1413
1414 /* If the l2p was updated in the meantime, don't update band's metadata */
1415 if (ftl_ppa_cached(prev_ppa) && prev_ppa.offset == entry->pos) {
1416 /* Setting entry's cache bit needs to be done after metadata */
1417 /* within the band is updated to make sure that writes */
1418 /* invalidating the entry clear the metadata as well */
1419 ftl_band_set_addr(wptr->band, entry->lba, entry->ppa);
1420 ftl_rwb_entry_set_valid(entry);
1421 }
1422 pthread_spin_unlock(&entry->lock);
1423 }
1424
1425 ftl_trace_rwb_pop(dev, entry);
1426 ftl_update_rwb_stats(dev, entry);
1427
1428 ppa = ftl_band_next_ppa(wptr->band, ppa, 1);
1429 }
1430
1431 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lx, %lx\n", wptr->ppa.ppa,
1432 ftl_ppa_addr_pack(dev, wptr->ppa));
1433
1434 if (ftl_submit_write(wptr, io)) {
1435 /* TODO: we need some recovery here */
1436 assert(0 && "Write submit failed");
1437 if (ftl_io_done(io)) {
1438 ftl_io_free(io);
1439 }
1440 }
1441
1442 return dev->xfer_size;
1443 error:
1444 ftl_rwb_batch_revert(batch);
1445 return 0;
1446 }
1447
1448 static int
1449 ftl_process_writes(struct spdk_ftl_dev *dev)
1450 {
1451 struct ftl_wptr *wptr, *twptr;
1452 size_t num_active = 0;
1453 enum ftl_band_state state;
1454
1455 LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) {
1456 ftl_wptr_process_writes(wptr);
1457 state = wptr->band->state;
1458
1459 if (state != FTL_BAND_STATE_FULL &&
1460 state != FTL_BAND_STATE_CLOSING &&
1461 state != FTL_BAND_STATE_CLOSED) {
1462 num_active++;
1463 }
1464 }
1465
1466 if (num_active < 1) {
1467 ftl_add_wptr(dev);
1468 }
1469
1470 return 0;
1471 }
1472
1473 static void
1474 ftl_rwb_entry_fill(struct ftl_rwb_entry *entry, struct ftl_io *io)
1475 {
1476 struct ftl_band *band;
1477
1478 memcpy(entry->data, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE);
1479
1480 if (ftl_rwb_entry_weak(entry)) {
1481 band = ftl_band_from_ppa(io->dev, io->ppa);
1482 entry->ppa = ftl_band_next_ppa(band, io->ppa, io->pos);
1483 }
1484
1485 entry->trace = io->trace;
1486 entry->lba = ftl_io_current_lba(io);
1487
1488 if (entry->md) {
1489 memcpy(entry->md, &entry->lba, sizeof(entry->lba));
1490 }
1491 }
1492
1493 static int
1494 ftl_rwb_fill(struct ftl_io *io)
1495 {
1496 struct spdk_ftl_dev *dev = io->dev;
1497 struct ftl_rwb_entry *entry;
1498 struct ftl_ppa ppa = { .cached = 1 };
1499 int flags = ftl_rwb_flags_from_io(io);
1500
1501 while (io->pos < io->lbk_cnt) {
1502 if (ftl_io_current_lba(io) == FTL_LBA_INVALID) {
1503 ftl_io_advance(io, 1);
1504 continue;
1505 }
1506
1507 entry = ftl_acquire_entry(dev, flags);
1508 if (!entry) {
1509 return -EAGAIN;
1510 }
1511
1512 ftl_rwb_entry_fill(entry, io);
1513
1514 ppa.offset = entry->pos;
1515
1516 ftl_trace_rwb_fill(dev, io);
1517 ftl_update_l2p(dev, entry, ppa);
1518 ftl_io_advance(io, 1);
1519
1520                 /* Needs to be done after the L2P is updated to avoid a race with the */
1521                 /* write completion callback in case it's processed before the */
1522                 /* L2P is set by ftl_update_l2p(). */
1523 ftl_rwb_push(entry);
1524 }
1525
1526 if (ftl_io_done(io)) {
1527 if (dev->nv_cache.bdev_desc) {
1528 ftl_write_nv_cache(io);
1529 } else {
1530 ftl_io_complete(io);
1531 }
1532 }
1533
1534 return 0;
1535 }
1536
1537 static bool
1538 ftl_dev_needs_defrag(struct spdk_ftl_dev *dev)
1539 {
1540 const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START);
1541
1542 if (ftl_reloc_is_halted(dev->reloc)) {
1543 return false;
1544 }
1545
1546 if (dev->df_band) {
1547 return false;
1548 }
1549
1550 if (dev->num_free <= limit->thld) {
1551 return true;
1552 }
1553
1554 return false;
1555 }
1556
1557 static double
1558 ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid)
1559 {
1560 size_t usable, valid, invalid;
1561 double vld_ratio;
1562
1563 /* If the band doesn't have any usable lbks it's of no use */
1564 usable = ftl_band_num_usable_lbks(band);
1565 if (usable == 0) {
1566 return 0.0;
1567 }
1568
1569 valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld;
1570 invalid = usable - valid;
1571
1572 /* Add one to avoid division by 0 */
1573 vld_ratio = (double)invalid / (double)(valid + 1);
1574 return vld_ratio * ftl_band_age(band);
1575 }
1576
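/*
 * Worked example of the merit formula above, using hypothetical numbers: for a
 * band with 10000 usable lbks, 2000 still-valid lbks and an age of 100,
 *   invalid   = 10000 - 2000 = 8000
 *   vld_ratio = 8000 / (2000 + 1) ~= 4.0
 *   merit     ~= 4.0 * 100 = 400
 * Mostly-invalid, older bands therefore score higher and are preferred for defrag.
 */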
1577 static bool
1578 ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev)
1579 {
1580 struct spdk_ftl_conf *conf = &dev->conf;
1581 size_t thld_vld;
1582
1583 /* If we're in dire need of free bands, every band is worth defragging */
1584 if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) {
1585 return true;
1586 }
1587
1588 thld_vld = (ftl_band_num_usable_lbks(band) * conf->defrag.invalid_thld) / 100;
1589
1590 return band->merit > ftl_band_calc_merit(band, &thld_vld);
1591 }
1592
1593 static struct ftl_band *
1594 ftl_select_defrag_band(struct spdk_ftl_dev *dev)
1595 {
1596 struct ftl_band *band, *mband = NULL;
1597 double merit = 0;
1598
1599 LIST_FOREACH(band, &dev->shut_bands, list_entry) {
1600 assert(band->state == FTL_BAND_STATE_CLOSED);
1601 band->merit = ftl_band_calc_merit(band, NULL);
1602 if (band->merit > merit) {
1603 merit = band->merit;
1604 mband = band;
1605 }
1606 }
1607
1608 if (mband && !ftl_band_needs_defrag(mband, dev)) {
1609 mband = NULL;
1610 }
1611
1612 return mband;
1613 }
1614
1615 static void
1616 ftl_process_relocs(struct spdk_ftl_dev *dev)
1617 {
1618 struct ftl_band *band;
1619
1620 if (ftl_dev_needs_defrag(dev)) {
1621 band = dev->df_band = ftl_select_defrag_band(dev);
1622
1623 if (band) {
1624 ftl_reloc_add(dev->reloc, band, 0, ftl_num_band_lbks(dev), 0);
1625 ftl_trace_defrag_band(dev, band);
1626 }
1627 }
1628
1629 ftl_reloc(dev->reloc);
1630 }
1631
1632 int
1633 ftl_current_limit(const struct spdk_ftl_dev *dev)
1634 {
1635 return dev->limit;
1636 }
1637
1638 void
1639 spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs)
1640 {
1641 attrs->uuid = dev->uuid;
1642 attrs->lbk_cnt = dev->num_lbas;
1643 attrs->lbk_size = FTL_BLOCK_SIZE;
1644 attrs->range = dev->range;
1645 attrs->cache_bdev_desc = dev->nv_cache.bdev_desc;
1646 attrs->allow_open_bands = dev->conf.allow_open_bands;
1647 attrs->num_chunks = dev->geo.num_chk;
1648 attrs->chunk_size = dev->geo.clba;
1649 }
1650
1651 static void
1652 _ftl_io_write(void *ctx)
1653 {
1654 ftl_io_write((struct ftl_io *)ctx);
1655 }
1656
1657 static int
1658 ftl_rwb_fill_leaf(struct ftl_io *io)
1659 {
1660 int rc;
1661
1662 rc = ftl_rwb_fill(io);
1663 if (rc == -EAGAIN) {
1664 spdk_thread_send_msg(spdk_io_channel_get_thread(io->ioch),
1665 _ftl_io_write, io);
1666 return 0;
1667 }
1668
1669 return rc;
1670 }
1671
1672 static int
1673 ftl_submit_write_leaf(struct ftl_io *io)
1674 {
1675 int rc;
1676
1677 rc = ftl_submit_write(ftl_wptr_from_band(io->band), io);
1678 if (rc == -EAGAIN) {
1679 /* EAGAIN means that the request was put on the pending queue */
1680 return 0;
1681 }
1682
1683 return rc;
1684 }
1685
1686 void
1687 ftl_io_write(struct ftl_io *io)
1688 {
1689 struct spdk_ftl_dev *dev = io->dev;
1690
1691 /* For normal IOs we just need to copy the data onto the rwb */
1692 if (!(io->flags & FTL_IO_MD)) {
1693 ftl_io_call_foreach_child(io, ftl_rwb_fill_leaf);
1694 } else {
1695                 /* Metadata has its own buffer, so it doesn't have to be copied; just */
1696                 /* send it to the core thread and schedule the write immediately */
1697 if (ftl_check_core_thread(dev)) {
1698 ftl_io_call_foreach_child(io, ftl_submit_write_leaf);
1699 } else {
1700 spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io);
1701 }
1702 }
1703 }
1704
1705 int
1706 spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1707 struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1708 {
1709 struct ftl_io *io;
1710
1711 if (iov_cnt == 0) {
1712 return -EINVAL;
1713 }
1714
1715 if (lba_cnt == 0) {
1716 return -EINVAL;
1717 }
1718
1719 if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1720 return -EINVAL;
1721 }
1722
1723 if (!dev->initialized) {
1724 return -EBUSY;
1725 }
1726
1727 io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE);
1728 if (!io) {
1729 return -ENOMEM;
1730 }
1731
1732 ftl_io_write(io);
1733
1734 return 0;
1735 }
1736
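/*
 * Minimal usage sketch for the API above; the buffer, device handle, io channel
 * and callback are hypothetical and shown only to illustrate the contract:
 *
 *   static void write_done(void *ctx, int status) { assert(status == 0); }
 *
 *   struct iovec iov = { .iov_base = buf, .iov_len = 8 * FTL_BLOCK_SIZE };
 *   int rc = spdk_ftl_write(dev, ioch, 0, 8, &iov, 1, write_done, NULL);
 *
 * lba_cnt must equal ftl_iovec_num_lbks(iov, iov_cnt), i.e. the iovec must
 * describe exactly lba_cnt FTL_BLOCK_SIZE-sized blocks, otherwise -EINVAL is
 * returned; -EBUSY is returned if the device is not fully initialized yet.
 */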
1737 static int
1738 ftl_io_read_leaf(struct ftl_io *io)
1739 {
1740 int rc;
1741
1742 rc = ftl_submit_read(io);
1743 if (rc == -ENOMEM) {
1744                 /* ENOMEM means that the request was put on the retry queue */
1745 return 0;
1746 }
1747
1748 return rc;
1749 }
1750
1751 static void
1752 _ftl_io_read(void *arg)
1753 {
1754 ftl_io_read((struct ftl_io *)arg);
1755 }
1756
1757 void
1758 ftl_io_read(struct ftl_io *io)
1759 {
1760 struct spdk_ftl_dev *dev = io->dev;
1761
1762 if (ftl_check_read_thread(dev)) {
1763 ftl_io_call_foreach_child(io, ftl_io_read_leaf);
1764 } else {
1765 spdk_thread_send_msg(ftl_get_read_thread(dev), _ftl_io_read, io);
1766 }
1767 }
1768
1769 int
1770 spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1771 struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1772 {
1773 struct ftl_io *io;
1774
1775 if (iov_cnt == 0) {
1776 return -EINVAL;
1777 }
1778
1779 if (lba_cnt == 0) {
1780 return -EINVAL;
1781 }
1782
1783 if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1784 return -EINVAL;
1785 }
1786
1787 if (!dev->initialized) {
1788 return -EBUSY;
1789 }
1790
1791 io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ);
1792 if (!io) {
1793 return -ENOMEM;
1794 }
1795
1796 ftl_io_read(io);
1797 return 0;
1798 }
1799
1800 static struct ftl_flush *
1801 ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1802 {
1803 struct ftl_flush *flush;
1804 struct ftl_rwb *rwb = dev->rwb;
1805
1806 flush = calloc(1, sizeof(*flush));
1807 if (!flush) {
1808 return NULL;
1809 }
1810
1811 flush->bmap = spdk_bit_array_create(ftl_rwb_num_batches(rwb));
1812 if (!flush->bmap) {
1813 goto error;
1814 }
1815
1816 flush->dev = dev;
1817 flush->cb.fn = cb_fn;
1818 flush->cb.ctx = cb_arg;
1819
1820 return flush;
1821 error:
1822 free(flush);
1823 return NULL;
1824 }
1825
1826 static void
1827 _ftl_flush(void *ctx)
1828 {
1829 struct ftl_flush *flush = ctx;
1830 struct spdk_ftl_dev *dev = flush->dev;
1831 struct ftl_rwb *rwb = dev->rwb;
1832 struct ftl_rwb_batch *batch;
1833
1834 /* Attach flush object to all non-empty batches */
1835 ftl_rwb_foreach_batch(batch, rwb) {
1836 if (!ftl_rwb_batch_empty(batch)) {
1837 spdk_bit_array_set(flush->bmap, ftl_rwb_batch_get_offset(batch));
1838 flush->num_req++;
1839 }
1840 }
1841
1842 LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry);
1843
1844 /* If the RWB was already empty, the flush can be completed right away */
1845 if (!flush->num_req) {
1846 ftl_complete_flush(flush);
1847 }
1848 }
1849
1850 int
1851 spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1852 {
1853 struct ftl_flush *flush;
1854
1855 if (!dev->initialized) {
1856 return -EBUSY;
1857 }
1858
1859 flush = ftl_flush_init(dev, cb_fn, cb_arg);
1860 if (!flush) {
1861 return -ENOMEM;
1862 }
1863
1864 spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush);
1865 return 0;
1866 }
1867
1868 void
1869 ftl_process_anm_event(struct ftl_anm_event *event)
1870 {
1871 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Unconsumed ANM received for dev: %p...\n", event->dev);
1872 ftl_anm_event_complete(event);
1873 }
1874
1875 static void
1876 ftl_process_retry_queue(struct spdk_ftl_dev *dev)
1877 {
1878 struct ftl_io *io;
1879 int rc;
1880
1881 while (!TAILQ_EMPTY(&dev->retry_queue)) {
1882 io = TAILQ_FIRST(&dev->retry_queue);
1883
1884 /* Retry only if IO is still healthy */
1885 if (spdk_likely(io->status == 0)) {
1886 rc = ftl_submit_read(io);
1887 if (rc == -ENOMEM) {
1888 break;
1889 }
1890 }
1891
1892 io->flags &= ~FTL_IO_RETRY;
1893 TAILQ_REMOVE(&dev->retry_queue, io, retry_entry);
1894
1895 if (ftl_io_done(io)) {
1896 ftl_io_complete(io);
1897 }
1898 }
1899 }
1900
1901 int
1902 ftl_task_read(void *ctx)
1903 {
1904 struct ftl_thread *thread = ctx;
1905 struct spdk_ftl_dev *dev = thread->dev;
1906 struct spdk_nvme_qpair *qpair = ftl_get_read_qpair(dev);
1907 size_t num_completed;
1908
1909 if (dev->halt) {
1910 if (ftl_shutdown_complete(dev)) {
1911 spdk_poller_unregister(&thread->poller);
1912 return 0;
1913 }
1914 }
1915
1916 num_completed = spdk_nvme_qpair_process_completions(qpair, 0);
1917
1918 if (num_completed && !TAILQ_EMPTY(&dev->retry_queue)) {
1919 ftl_process_retry_queue(dev);
1920 }
1921
1922 return num_completed;
1923 }
1924
1925 int
1926 ftl_task_core(void *ctx)
1927 {
1928 struct ftl_thread *thread = ctx;
1929 struct spdk_ftl_dev *dev = thread->dev;
1930 struct spdk_nvme_qpair *qpair = ftl_get_write_qpair(dev);
1931
1932 if (dev->halt) {
1933 if (ftl_shutdown_complete(dev)) {
1934 spdk_poller_unregister(&thread->poller);
1935 return 0;
1936 }
1937 }
1938
1939 ftl_process_writes(dev);
1940 spdk_nvme_qpair_process_completions(qpair, 0);
1941 ftl_process_relocs(dev);
1942
1943 return 0;
1944 }
1945
1946 SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE)