1 /*-
2 * BSD LICENSE
3 *
4 * Copyright (c) Intel Corporation.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include "spdk/likely.h"
35 #include "spdk/stdinc.h"
36 #include "spdk/nvme.h"
37 #include "spdk/io_channel.h"
38 #include "spdk/bdev_module.h"
39 #include "spdk/string.h"
40 #include "spdk_internal/log.h"
41 #include "spdk/ftl.h"
42
43 #include "ftl_core.h"
44 #include "ftl_band.h"
45 #include "ftl_io.h"
46 #include "ftl_anm.h"
47 #include "ftl_rwb.h"
48 #include "ftl_debug.h"
49 #include "ftl_reloc.h"
50
51 struct ftl_wptr {
52 /* Owner device */
53 struct spdk_ftl_dev *dev;
54
55 /* Current PPA */
56 struct ftl_ppa ppa;
57
58 /* Band currently being written to */
59 struct ftl_band *band;
60
61 /* Current logical block's offset */
62 uint64_t offset;
63
64 /* Current erase block */
65 struct ftl_chunk *chunk;
66
67 /* Pending IO queue */
68 TAILQ_HEAD(, ftl_io) pending_queue;
69
70 /* List link */
71 LIST_ENTRY(ftl_wptr) list_entry;
72
73 /*
74 * If set up in direct mode, there will be no offset or band state update after IO.
75 * The PPA is not assigned by wptr, and is instead taken directly from the request.
76 */
77 bool direct_mode;
78 };
79
80 struct ftl_flush {
81 /* Owner device */
82 struct spdk_ftl_dev *dev;
83
84 /* Number of batches to wait for */
85 size_t num_req;
86
87 /* Callback */
88 struct {
89 spdk_ftl_fn fn;
90 void *ctx;
91 } cb;
92
93 /* Batch bitmap */
94 struct spdk_bit_array *bmap;
95
96 /* List link */
97 LIST_ENTRY(ftl_flush) list_entry;
98 };
99
100 static int
101 ftl_rwb_flags_from_io(const struct ftl_io *io)
102 {
103 int valid_flags = FTL_IO_INTERNAL | FTL_IO_WEAK | FTL_IO_PAD;
104 return io->flags & valid_flags;
105 }
106
107 static int
108 ftl_rwb_entry_weak(const struct ftl_rwb_entry *entry)
109 {
110 return entry->flags & FTL_IO_WEAK;
111 }
112
113 static void
114 ftl_wptr_free(struct ftl_wptr *wptr)
115 {
116 if (!wptr) {
117 return;
118 }
119
120 free(wptr);
121 }
122
123 static void
124 ftl_remove_wptr(struct ftl_wptr *wptr)
125 {
126 LIST_REMOVE(wptr, list_entry);
127 ftl_wptr_free(wptr);
128 }
129
130 static void
131 ftl_io_cmpl_cb(void *arg, const struct spdk_nvme_cpl *status)
132 {
133 struct ftl_io *io = arg;
134
135 if (spdk_nvme_cpl_is_error(status)) {
136 ftl_io_process_error(io, status);
137 }
138
139 ftl_trace_completion(io->dev, io, FTL_TRACE_COMPLETION_DISK);
140
141 ftl_io_dec_req(io);
142
143 if (ftl_io_done(io)) {
144 ftl_io_complete(io);
145 }
146 }
147
148 static void
149 ftl_halt_writes(struct spdk_ftl_dev *dev, struct ftl_band *band)
150 {
151 struct ftl_wptr *wptr = NULL;
152
153 LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
154 if (wptr->band == band) {
155 break;
156 }
157 }
158
159 /* If the band already has the high_prio flag set, other writes must */
160 /* have failed earlier, so it's already taken care of. */
161 if (band->high_prio) {
162 assert(wptr == NULL);
163 return;
164 }
165
166 ftl_band_write_failed(band);
167 ftl_remove_wptr(wptr);
168 }
169
170 static struct ftl_wptr *
171 ftl_wptr_from_band(struct ftl_band *band)
172 {
173 struct spdk_ftl_dev *dev = band->dev;
174 struct ftl_wptr *wptr = NULL;
175
176 LIST_FOREACH(wptr, &dev->wptr_list, list_entry) {
177 if (wptr->band == band) {
178 return wptr;
179 }
180 }
181
182 return NULL;
183 }
184
185 static void
186 ftl_md_write_fail(struct ftl_io *io, int status)
187 {
188 struct ftl_band *band = io->band;
189 struct ftl_wptr *wptr;
190 char buf[128];
191
192 wptr = ftl_wptr_from_band(band);
193
194 SPDK_ERRLOG("Metadata write failed @ppa: %s, status: %d\n",
195 ftl_ppa2str(wptr->ppa, buf, sizeof(buf)), status);
196
197 ftl_halt_writes(io->dev, band);
198 }
199
200 static void
201 ftl_md_write_cb(struct ftl_io *io, void *arg, int status)
202 {
203 struct spdk_ftl_dev *dev = io->dev;
204 struct ftl_nv_cache *nv_cache = &dev->nv_cache;
205 struct ftl_wptr *wptr;
206 struct spdk_bdev *bdev;
207
208 wptr = ftl_wptr_from_band(io->band);
209
210 if (status) {
211 ftl_md_write_fail(io, status);
212 return;
213 }
214
215 ftl_band_set_next_state(io->band);
216 if (io->band->state == FTL_BAND_STATE_CLOSED) {
217 if (nv_cache->bdev_desc) {
218 bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
219
220 pthread_spin_lock(&nv_cache->lock);
221 nv_cache->num_available += ftl_band_user_lbks(io->band);
222
223 if (spdk_unlikely(nv_cache->num_available > spdk_bdev_get_num_blocks(bdev))) {
224 nv_cache->num_available = spdk_bdev_get_num_blocks(bdev);
225 }
226 pthread_spin_unlock(&nv_cache->lock);
227 }
228
229 ftl_remove_wptr(wptr);
230 }
231 }
232
233 static int
234 ftl_ppa_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa)
235 {
236 struct spdk_ftl_dev *dev = io->dev;
237 size_t lbk_cnt, max_lbks;
238
239 assert(ftl_io_mode_ppa(io));
240 assert(io->iov_pos < io->iov_cnt);
241
242 if (io->pos == 0) {
243 *ppa = io->ppa;
244 } else {
245 *ppa = ftl_band_next_xfer_ppa(io->band, io->ppa, io->pos);
246 }
247
248 assert(!ftl_ppa_invalid(*ppa));
249
250 /* Metadata has to be read in the way it's written (jumping across */
251 /* the chunks in xfer_size increments) */
252 if (io->flags & FTL_IO_MD) {
253 max_lbks = dev->xfer_size - (ppa->lbk % dev->xfer_size);
254 lbk_cnt = spdk_min(ftl_io_iovec_len_left(io), max_lbks);
255 assert(ppa->lbk / dev->xfer_size == (ppa->lbk + lbk_cnt - 1) / dev->xfer_size);
256 } else {
257 lbk_cnt = ftl_io_iovec_len_left(io);
258 }
259
260 return lbk_cnt;
261 }
262
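/*
 * Worked example of the chunk-boundary clamp above, assuming a hypothetical
 * xfer_size of 16 lbks. With ppa->lbk == 20 and 32 lbks left in the iovec:
 *   max_lbks = 16 - (20 % 16) = 12
 *   lbk_cnt  = spdk_min(32, 12) = 12
 * so the metadata read covers lbks 20..31 and never crosses an xfer_size-aligned
 * boundary, matching the order in which the metadata was written.
 */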
263 static int
264 ftl_wptr_close_band(struct ftl_wptr *wptr)
265 {
266 struct ftl_band *band = wptr->band;
267
268 ftl_band_set_state(band, FTL_BAND_STATE_CLOSING);
269 band->tail_md_ppa = wptr->ppa;
270
271 return ftl_band_write_tail_md(band, ftl_md_write_cb);
272 }
273
274 static int
275 ftl_wptr_open_band(struct ftl_wptr *wptr)
276 {
277 struct ftl_band *band = wptr->band;
278
279 assert(ftl_band_chunk_is_first(band, wptr->chunk));
280 assert(band->lba_map.num_vld == 0);
281
282 ftl_band_clear_lba_map(band);
283
284 assert(band->state == FTL_BAND_STATE_PREP);
285 ftl_band_set_state(band, FTL_BAND_STATE_OPENING);
286
287 return ftl_band_write_head_md(band, ftl_md_write_cb);
288 }
289
290 static int
291 ftl_submit_erase(struct ftl_io *io)
292 {
293 struct spdk_ftl_dev *dev = io->dev;
294 struct ftl_band *band = io->band;
295 struct ftl_ppa ppa = io->ppa;
296 struct ftl_chunk *chunk;
297 uint64_t ppa_packed;
298 int rc = 0;
299 size_t i;
300
301 for (i = 0; i < io->lbk_cnt; ++i) {
302 if (i != 0) {
303 chunk = ftl_band_next_chunk(band, ftl_band_chunk_from_ppa(band, ppa));
304 assert(chunk->state == FTL_CHUNK_STATE_CLOSED ||
305 chunk->state == FTL_CHUNK_STATE_VACANT);
306 ppa = chunk->start_ppa;
307 }
308
309 assert(ppa.lbk == 0);
310 ppa_packed = ftl_ppa_addr_pack(dev, ppa);
311
312 ftl_trace_submission(dev, io, ppa, 1);
313 rc = spdk_nvme_ocssd_ns_cmd_vector_reset(dev->ns, ftl_get_write_qpair(dev),
314 &ppa_packed, 1, NULL, ftl_io_cmpl_cb, io);
315 if (spdk_unlikely(rc)) {
316 ftl_io_fail(io, rc);
317 SPDK_ERRLOG("Vector reset failed with status: %d\n", rc);
318 break;
319 }
320
321 ftl_io_inc_req(io);
322 ftl_io_advance(io, 1);
323 }
324
325 if (ftl_io_done(io)) {
326 ftl_io_complete(io);
327 }
328
329 return rc;
330 }
331
332 static void
333 _ftl_io_erase(void *ctx)
334 {
335 ftl_io_erase((struct ftl_io *)ctx);
336 }
337
338 static bool
339 ftl_check_core_thread(const struct spdk_ftl_dev *dev)
340 {
341 return dev->core_thread.thread == spdk_get_thread();
342 }
343
344 static bool
345 ftl_check_read_thread(const struct spdk_ftl_dev *dev)
346 {
347 return dev->read_thread.thread == spdk_get_thread();
348 }
349
350 int
351 ftl_io_erase(struct ftl_io *io)
352 {
353 struct spdk_ftl_dev *dev = io->dev;
354
355 if (ftl_check_core_thread(dev)) {
356 return ftl_submit_erase(io);
357 }
358
359 spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_erase, io);
360 return 0;
361 }
362
363 static struct ftl_band *
364 ftl_next_write_band(struct spdk_ftl_dev *dev)
365 {
366 struct ftl_band *band;
367
368 band = LIST_FIRST(&dev->free_bands);
369 if (!band) {
370 return NULL;
371 }
372 assert(band->state == FTL_BAND_STATE_FREE);
373
374 if (ftl_band_erase(band)) {
375 /* TODO: handle erase failure */
376 return NULL;
377 }
378
379 return band;
380 }
381
382 static struct ftl_band *
383 ftl_next_wptr_band(struct spdk_ftl_dev *dev)
384 {
385 struct ftl_band *band;
386
387 if (!dev->next_band) {
388 band = ftl_next_write_band(dev);
389 } else {
390 assert(dev->next_band->state == FTL_BAND_STATE_PREP);
391 band = dev->next_band;
392 dev->next_band = NULL;
393 }
394
395 return band;
396 }
397
398 static struct ftl_wptr *
399 ftl_wptr_init(struct ftl_band *band)
400 {
401 struct spdk_ftl_dev *dev = band->dev;
402 struct ftl_wptr *wptr;
403
404 wptr = calloc(1, sizeof(*wptr));
405 if (!wptr) {
406 return NULL;
407 }
408
409 wptr->dev = dev;
410 wptr->band = band;
411 wptr->chunk = CIRCLEQ_FIRST(&band->chunks);
412 wptr->ppa = wptr->chunk->start_ppa;
413 TAILQ_INIT(&wptr->pending_queue);
414
415 return wptr;
416 }
417
418 static int
419 ftl_add_direct_wptr(struct ftl_band *band)
420 {
421 struct spdk_ftl_dev *dev = band->dev;
422 struct ftl_wptr *wptr;
423
424 assert(band->state == FTL_BAND_STATE_OPEN);
425
426 wptr = ftl_wptr_init(band);
427 if (!wptr) {
428 return -1;
429 }
430
431 wptr->direct_mode = true;
432
433 if (ftl_band_alloc_lba_map(band)) {
434 ftl_wptr_free(wptr);
435 return -1;
436 }
437
438 LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
439
440 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: direct band %u\n", band->id);
441 ftl_trace_write_band(dev, band);
442 return 0;
443 }
444
445 static void
446 ftl_close_direct_wptr(struct ftl_band *band)
447 {
448 struct ftl_wptr *wptr = ftl_wptr_from_band(band);
449
450 assert(wptr->direct_mode);
451 assert(band->state == FTL_BAND_STATE_CLOSED);
452
453 ftl_band_release_lba_map(band);
454
455 ftl_remove_wptr(wptr);
456 }
457
458 int
459 ftl_band_set_direct_access(struct ftl_band *band, bool access)
460 {
461 if (access) {
462 return ftl_add_direct_wptr(band);
463 } else {
464 ftl_close_direct_wptr(band);
465 return 0;
466 }
467 }
468
469 static int
470 ftl_add_wptr(struct spdk_ftl_dev *dev)
471 {
472 struct ftl_band *band;
473 struct ftl_wptr *wptr;
474
475 band = ftl_next_wptr_band(dev);
476 if (!band) {
477 return -1;
478 }
479
480 wptr = ftl_wptr_init(band);
481 if (!wptr) {
482 return -1;
483 }
484
485 if (ftl_band_write_prep(band)) {
486 ftl_wptr_free(wptr);
487 return -1;
488 }
489
490 LIST_INSERT_HEAD(&dev->wptr_list, wptr, list_entry);
491
492 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: band %u\n", band->id);
493 ftl_trace_write_band(dev, band);
494 return 0;
495 }
496
497 static void
498 ftl_wptr_advance(struct ftl_wptr *wptr, size_t xfer_size)
499 {
500 struct ftl_band *band = wptr->band;
501 struct spdk_ftl_dev *dev = wptr->dev;
502 struct spdk_ftl_conf *conf = &dev->conf;
503 size_t next_thld;
504
505 if (spdk_unlikely(wptr->direct_mode)) {
506 return;
507 }
508
509 wptr->offset += xfer_size;
510 next_thld = (ftl_band_num_usable_lbks(band) * conf->band_thld) / 100;
511
512 if (ftl_band_full(band, wptr->offset)) {
513 ftl_band_set_state(band, FTL_BAND_STATE_FULL);
514 }
515
516 wptr->chunk->busy = true;
517 wptr->ppa = ftl_band_next_xfer_ppa(band, wptr->ppa, xfer_size);
518 wptr->chunk = ftl_band_next_operational_chunk(band, wptr->chunk);
519
520 assert(!ftl_ppa_invalid(wptr->ppa));
521
522 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "wptr: grp:%d, pu:%d chunk:%d, lbk:%u\n",
523 wptr->ppa.grp, wptr->ppa.pu, wptr->ppa.chk, wptr->ppa.lbk);
524
525 if (wptr->offset >= next_thld && !dev->next_band) {
526 dev->next_band = ftl_next_write_band(dev);
527 }
528 }
529
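/*
 * Worked example of the next_thld calculation above, using hypothetical values:
 * with 10000 usable lbks per band and conf->band_thld == 95,
 *   next_thld = (10000 * 95) / 100 = 9500
 * so once the write pointer's offset reaches 9500 lbks, the next free band is
 * picked and erased ahead of time (dev->next_band), hiding the erase latency
 * from subsequent writes.
 */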
530 static size_t
531 ftl_wptr_user_lbks_left(const struct ftl_wptr *wptr)
532 {
533 return ftl_band_user_lbks_left(wptr->band, wptr->offset);
534 }
535
536 static int
537 ftl_wptr_ready(struct ftl_wptr *wptr)
538 {
539 struct ftl_band *band = wptr->band;
540
541 /* TODO: add handling of empty bands */
542
543 if (spdk_unlikely(!ftl_chunk_is_writable(wptr->chunk))) {
544                 /* Erasing the band may fail after it was assigned to the wptr. */
545 if (spdk_unlikely(wptr->chunk->state == FTL_CHUNK_STATE_BAD)) {
546 ftl_wptr_advance(wptr, wptr->dev->xfer_size);
547 }
548 return 0;
549 }
550
551 /* If we're in the process of writing metadata, wait till it is */
552 /* completed. */
553 /* TODO: we should probably change bands once we're writing tail md */
554 if (ftl_band_state_changing(band)) {
555 return 0;
556 }
557
558 if (band->state == FTL_BAND_STATE_FULL) {
559 if (ftl_wptr_close_band(wptr)) {
560 /* TODO: need recovery here */
561 assert(false);
562 }
563 return 0;
564 }
565
566 if (band->state != FTL_BAND_STATE_OPEN) {
567 if (ftl_wptr_open_band(wptr)) {
568 /* TODO: need recovery here */
569 assert(false);
570 }
571 return 0;
572 }
573
574 return 1;
575 }
576
577 static const struct spdk_ftl_limit *
578 ftl_get_limit(const struct spdk_ftl_dev *dev, int type)
579 {
580 assert(type < SPDK_FTL_LIMIT_MAX);
581 return &dev->conf.defrag.limits[type];
582 }
583
584 static bool
585 ftl_cache_lba_valid(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
586 {
587 struct ftl_ppa ppa;
588
589 /* If the LBA is invalid don't bother checking the md and l2p */
590 if (spdk_unlikely(entry->lba == FTL_LBA_INVALID)) {
591 return false;
592 }
593
594 ppa = ftl_l2p_get(dev, entry->lba);
595 if (!(ftl_ppa_cached(ppa) && ppa.offset == entry->pos)) {
596 return false;
597 }
598
599 return true;
600 }
601
602 static void
603 ftl_evict_cache_entry(struct spdk_ftl_dev *dev, struct ftl_rwb_entry *entry)
604 {
605 pthread_spin_lock(&entry->lock);
606
607 if (!ftl_rwb_entry_valid(entry)) {
608 goto unlock;
609 }
610
611 /* If the l2p wasn't updated and still points at the entry, fill it with the */
612 /* on-disk PPA and clear the cache status bit. Otherwise, skip the l2p update */
613 /* and just clear the cache status. */
614 if (!ftl_cache_lba_valid(dev, entry)) {
615 goto clear;
616 }
617
618 ftl_l2p_set(dev, entry->lba, entry->ppa);
619 clear:
620 ftl_rwb_entry_invalidate(entry);
621 unlock:
622 pthread_spin_unlock(&entry->lock);
623 }
624
625 static struct ftl_rwb_entry *
626 ftl_acquire_entry(struct spdk_ftl_dev *dev, int flags)
627 {
628 struct ftl_rwb_entry *entry;
629
630 entry = ftl_rwb_acquire(dev->rwb, ftl_rwb_type_from_flags(flags));
631 if (!entry) {
632 return NULL;
633 }
634
635 ftl_evict_cache_entry(dev, entry);
636
637 entry->flags = flags;
638 return entry;
639 }
640
641 static void
642 ftl_rwb_pad(struct spdk_ftl_dev *dev, size_t size)
643 {
644 struct ftl_rwb_entry *entry;
645 int flags = FTL_IO_PAD | FTL_IO_INTERNAL;
646
647 for (size_t i = 0; i < size; ++i) {
648 entry = ftl_acquire_entry(dev, flags);
649 if (!entry) {
650 break;
651 }
652
653 entry->lba = FTL_LBA_INVALID;
654 entry->ppa = ftl_to_ppa(FTL_PPA_INVALID);
655 memset(entry->data, 0, FTL_BLOCK_SIZE);
656 ftl_rwb_push(entry);
657 }
658 }
659
660 static void
661 ftl_remove_free_bands(struct spdk_ftl_dev *dev)
662 {
663 while (!LIST_EMPTY(&dev->free_bands)) {
664 LIST_REMOVE(LIST_FIRST(&dev->free_bands), list_entry);
665 }
666
667 dev->next_band = NULL;
668 }
669
670 static void
671 ftl_wptr_process_shutdown(struct ftl_wptr *wptr)
672 {
673 struct spdk_ftl_dev *dev = wptr->dev;
674 size_t size = ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_INTERNAL) +
675 ftl_rwb_num_acquired(dev->rwb, FTL_RWB_TYPE_USER);
676 size_t num_active = dev->xfer_size * ftl_rwb_get_active_batches(dev->rwb);
677 size_t band_length, rwb_free_space, pad_length;
678
679 num_active = num_active ? num_active : dev->xfer_size;
680 if (size >= num_active) {
681 return;
682 }
683
684         /* If we reach this point we need to remove the free bands */
685         /* and pad the current wptr band to the end */
686 if (ftl_rwb_get_active_batches(dev->rwb) <= 1) {
687 ftl_remove_free_bands(dev);
688 }
689
690 band_length = ftl_wptr_user_lbks_left(wptr);
691 rwb_free_space = ftl_rwb_size(dev->rwb) - size;
692 pad_length = spdk_min(band_length, rwb_free_space);
693
694 /* Pad write buffer until band is full */
695 ftl_rwb_pad(dev, pad_length);
696 }
697
698 static int
699 ftl_shutdown_complete(struct spdk_ftl_dev *dev)
700 {
701 return !__atomic_load_n(&dev->num_inflight, __ATOMIC_SEQ_CST) &&
702 LIST_EMPTY(&dev->wptr_list);
703 }
704
705 void
706 ftl_apply_limits(struct spdk_ftl_dev *dev)
707 {
708 const struct spdk_ftl_limit *limit;
709 struct ftl_stats *stats = &dev->stats;
710 size_t rwb_limit[FTL_RWB_TYPE_MAX];
711 int i;
712
713 ftl_rwb_get_limits(dev->rwb, rwb_limit);
714
715 /* Clear existing limit */
716 dev->limit = SPDK_FTL_LIMIT_MAX;
717
718 for (i = SPDK_FTL_LIMIT_CRIT; i < SPDK_FTL_LIMIT_MAX; ++i) {
719 limit = ftl_get_limit(dev, i);
720
721 if (dev->num_free <= limit->thld) {
722 rwb_limit[FTL_RWB_TYPE_USER] =
723 (limit->limit * ftl_rwb_entry_cnt(dev->rwb)) / 100;
724 stats->limits[i]++;
725 dev->limit = i;
726 goto apply;
727 }
728 }
729
730 /* Clear the limits, since we don't need to apply them anymore */
731 rwb_limit[FTL_RWB_TYPE_USER] = ftl_rwb_entry_cnt(dev->rwb);
732 apply:
733 ftl_trace_limits(dev, rwb_limit, dev->num_free);
734 ftl_rwb_set_limits(dev->rwb, rwb_limit);
735 }
736
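/*
 * Worked example of the limit applied above, using hypothetical values: with an
 * rwb of 1024 entries and a defrag limit entry of { .thld = 20, .limit = 50 },
 * once dev->num_free drops to 20 bands or fewer,
 *   rwb_limit[FTL_RWB_TYPE_USER] = (50 * 1024) / 100 = 512
 * i.e. user writes may occupy at most half of the write buffer, throttling the
 * host while relocation catches up. Once enough bands are free again, the limit
 * is reset to the full 1024 entries.
 */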
737 static int
738 ftl_invalidate_addr_unlocked(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
739 {
740 struct ftl_band *band = ftl_band_from_ppa(dev, ppa);
741 struct ftl_lba_map *lba_map = &band->lba_map;
742 uint64_t offset;
743
744 offset = ftl_band_lbkoff_from_ppa(band, ppa);
745
746         /* The bit might already be cleared if two writes are scheduled to the */
747         /* same LBA at the same time */
748 if (spdk_bit_array_get(lba_map->vld, offset)) {
749 assert(lba_map->num_vld > 0);
750 spdk_bit_array_clear(lba_map->vld, offset);
751 lba_map->num_vld--;
752 return 1;
753 }
754
755 return 0;
756 }
757
758 int
759 ftl_invalidate_addr(struct spdk_ftl_dev *dev, struct ftl_ppa ppa)
760 {
761 struct ftl_band *band;
762 int rc;
763
764 assert(!ftl_ppa_cached(ppa));
765 band = ftl_band_from_ppa(dev, ppa);
766
767 pthread_spin_lock(&band->lba_map.lock);
768 rc = ftl_invalidate_addr_unlocked(dev, ppa);
769 pthread_spin_unlock(&band->lba_map.lock);
770
771 return rc;
772 }
773
774 static int
775 ftl_read_retry(int rc)
776 {
777 return rc == -EAGAIN;
778 }
779
780 static int
781 ftl_read_canceled(int rc)
782 {
783 return rc == -EFAULT || rc == 0;
784 }
785
786 static void
787 ftl_add_to_retry_queue(struct ftl_io *io)
788 {
789 if (!(io->flags & FTL_IO_RETRY)) {
790 io->flags |= FTL_IO_RETRY;
791 TAILQ_INSERT_TAIL(&io->dev->retry_queue, io, retry_entry);
792 }
793 }
794
795 static int
796 ftl_ppa_cache_read(struct ftl_io *io, uint64_t lba,
797 struct ftl_ppa ppa, void *buf)
798 {
799 struct ftl_rwb *rwb = io->dev->rwb;
800 struct ftl_rwb_entry *entry;
801 struct ftl_ppa nppa;
802 int rc = 0;
803
804 entry = ftl_rwb_entry_from_offset(rwb, ppa.offset);
805 pthread_spin_lock(&entry->lock);
806
807 nppa = ftl_l2p_get(io->dev, lba);
808 if (ppa.ppa != nppa.ppa) {
809 rc = -1;
810 goto out;
811 }
812
813 memcpy(buf, entry->data, FTL_BLOCK_SIZE);
814 out:
815 pthread_spin_unlock(&entry->lock);
816 return rc;
817 }
818
819 static int
820 ftl_lba_read_next_ppa(struct ftl_io *io, struct ftl_ppa *ppa)
821 {
822 struct spdk_ftl_dev *dev = io->dev;
823 struct ftl_ppa next_ppa;
824 size_t i;
825
826 *ppa = ftl_l2p_get(dev, ftl_io_current_lba(io));
827
828 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Read ppa:%lx, lba:%lu\n",
829 ppa->ppa, ftl_io_current_lba(io));
830
831         /* If the PPA is invalid, skip it (the buffer should already be zeroed) */
832 if (ftl_ppa_invalid(*ppa)) {
833 return -EFAULT;
834 }
835
836 if (ftl_ppa_cached(*ppa)) {
837 if (!ftl_ppa_cache_read(io, ftl_io_current_lba(io), *ppa, ftl_io_iovec_addr(io))) {
838 return 0;
839 }
840
841 /* If the state changed, we have to re-read the l2p */
842 return -EAGAIN;
843 }
844
845 for (i = 1; i < ftl_io_iovec_len_left(io); ++i) {
846 next_ppa = ftl_l2p_get(dev, ftl_io_get_lba(io, io->pos + i));
847
848 if (ftl_ppa_invalid(next_ppa) || ftl_ppa_cached(next_ppa)) {
849 break;
850 }
851
852 if (ftl_ppa_addr_pack(dev, *ppa) + i != ftl_ppa_addr_pack(dev, next_ppa)) {
853 break;
854 }
855 }
856
857 return i;
858 }
859
860 static int
861 ftl_submit_read(struct ftl_io *io)
862 {
863 struct spdk_ftl_dev *dev = io->dev;
864 struct ftl_ppa ppa;
865 int rc = 0, lbk_cnt;
866
867 assert(LIST_EMPTY(&io->children));
868
869 while (io->pos < io->lbk_cnt) {
870 if (ftl_io_mode_ppa(io)) {
871 lbk_cnt = rc = ftl_ppa_read_next_ppa(io, &ppa);
872 } else {
873 lbk_cnt = rc = ftl_lba_read_next_ppa(io, &ppa);
874 }
875
876                 /* We might need to retry the read from scratch (e.g. */
877                 /* because a write was under way and completed before */
878                 /* we could read it from the rwb) */
879 if (ftl_read_retry(rc)) {
880 continue;
881 }
882
883 /* We don't have to schedule the read, as it was read from cache */
884 if (ftl_read_canceled(rc)) {
885 ftl_io_advance(io, 1);
886 ftl_trace_completion(io->dev, io, rc ? FTL_TRACE_COMPLETION_INVALID :
887 FTL_TRACE_COMPLETION_CACHE);
888 rc = 0;
889 continue;
890 }
891
892 assert(lbk_cnt > 0);
893
894 ftl_trace_submission(dev, io, ppa, lbk_cnt);
895 rc = spdk_nvme_ns_cmd_read(dev->ns, ftl_get_read_qpair(dev),
896 ftl_io_iovec_addr(io),
897 ftl_ppa_addr_pack(io->dev, ppa), lbk_cnt,
898 ftl_io_cmpl_cb, io, 0);
899 if (spdk_unlikely(rc)) {
900 if (rc == -ENOMEM) {
901 ftl_add_to_retry_queue(io);
902 } else {
903 ftl_io_fail(io, rc);
904 }
905 break;
906 }
907
908 ftl_io_inc_req(io);
909 ftl_io_advance(io, lbk_cnt);
910 }
911
912 /* If we didn't have to read anything from the device, */
913 /* complete the request right away */
914 if (ftl_io_done(io)) {
915 ftl_io_complete(io);
916 }
917
918 return rc;
919 }
920
921 static void
922 ftl_complete_flush(struct ftl_flush *flush)
923 {
924 assert(flush->num_req == 0);
925 LIST_REMOVE(flush, list_entry);
926
927 flush->cb.fn(flush->cb.ctx, 0);
928
929 spdk_bit_array_free(&flush->bmap);
930 free(flush);
931 }
932
933 static void
934 ftl_process_flush(struct spdk_ftl_dev *dev, struct ftl_rwb_batch *batch)
935 {
936 struct ftl_flush *flush, *tflush;
937 size_t offset;
938
939 LIST_FOREACH_SAFE(flush, &dev->flush_list, list_entry, tflush) {
940 offset = ftl_rwb_batch_get_offset(batch);
941
942 if (spdk_bit_array_get(flush->bmap, offset)) {
943 spdk_bit_array_clear(flush->bmap, offset);
944 if (!(--flush->num_req)) {
945 ftl_complete_flush(flush);
946 }
947 }
948 }
949 }
950
951 static uint64_t
952 ftl_reserve_nv_cache(struct ftl_nv_cache *nv_cache, size_t *num_lbks)
953 {
954 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(nv_cache->bdev_desc);
955 uint64_t num_available, cache_size, cache_addr = FTL_LBA_INVALID;
956
957 cache_size = spdk_bdev_get_num_blocks(bdev);
958
959 pthread_spin_lock(&nv_cache->lock);
960 if (spdk_unlikely(nv_cache->num_available == 0)) {
961 goto out;
962 }
963
964 num_available = spdk_min(nv_cache->num_available, *num_lbks);
965 if (spdk_unlikely(nv_cache->current_addr + num_available > cache_size)) {
966 *num_lbks = cache_size - nv_cache->current_addr;
967 } else {
968 *num_lbks = num_available;
969 }
970
971 cache_addr = nv_cache->current_addr;
972 nv_cache->current_addr += *num_lbks;
973 nv_cache->num_available -= *num_lbks;
974
975 if (nv_cache->current_addr == spdk_bdev_get_num_blocks(bdev)) {
976 nv_cache->current_addr = 0;
977 }
978 out:
979 pthread_spin_unlock(&nv_cache->lock);
980 return cache_addr;
981 }
982
983 static struct ftl_io *
984 ftl_alloc_io_nv_cache(struct ftl_io *parent, size_t num_lbks)
985 {
986 struct ftl_io_init_opts opts = {
987 .dev = parent->dev,
988 .parent = parent,
989 .data = ftl_io_iovec_addr(parent),
990 .lbk_cnt = num_lbks,
991 .flags = FTL_IO_CACHE,
992 };
993
994 return ftl_io_init_internal(&opts);
995 }
996
997 static void
998 ftl_nv_cache_submit_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
999 {
1000 struct ftl_io *io = cb_arg;
1001
1002 if (spdk_unlikely(!success)) {
1003 SPDK_ERRLOG("Non-volatile cache write failed at %"PRIx64"\n", io->ppa.ppa);
1004 io->status = -EIO;
1005 }
1006
1007 ftl_io_dec_req(io);
1008 if (ftl_io_done(io)) {
1009 ftl_io_complete(io);
1010 }
1011
1012 spdk_bdev_free_io(bdev_io);
1013 }
1014
1015 static void
1016 ftl_submit_nv_cache(void *ctx)
1017 {
1018 struct ftl_io *io = ctx;
1019 struct spdk_ftl_dev *dev = io->dev;
1020 struct spdk_thread *thread;
1021 struct ftl_io_channel *ioch;
1022 int rc;
1023
1024 ioch = spdk_io_channel_get_ctx(io->ioch);
1025 thread = spdk_io_channel_get_thread(io->ioch);
1026
1027 rc = spdk_bdev_write_blocks(dev->nv_cache.bdev_desc, ioch->cache_ioch,
1028 ftl_io_iovec_addr(io), io->ppa.ppa, io->lbk_cnt,
1029 ftl_nv_cache_submit_cb, io);
1030 if (rc == -ENOMEM) {
1031 spdk_thread_send_msg(thread, ftl_submit_nv_cache, io);
1032 return;
1033 } else if (rc) {
1034 SPDK_ERRLOG("Write to persistent cache failed: %s (%"PRIu64", %"PRIu64")\n",
1035 spdk_strerror(-rc), io->ppa.ppa, io->lbk_cnt);
1036 io->status = -EIO;
1037 ftl_io_complete(io);
1038 return;
1039 }
1040
1041 ftl_io_advance(io, io->lbk_cnt);
1042 ftl_io_inc_req(io);
1043 }
1044
1045 static void
1046 _ftl_write_nv_cache(void *ctx)
1047 {
1048 struct ftl_io *child, *io = ctx;
1049 struct spdk_ftl_dev *dev = io->dev;
1050 struct spdk_thread *thread;
1051 uint64_t num_lbks;
1052
1053 thread = spdk_io_channel_get_thread(io->ioch);
1054
1055 while (io->pos < io->lbk_cnt) {
1056 num_lbks = ftl_io_iovec_len_left(io);
1057
1058 child = ftl_alloc_io_nv_cache(io, num_lbks);
1059 if (spdk_unlikely(!child)) {
1060 spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
1061 return;
1062 }
1063
1064                 /* Reserve an area in the non-volatile cache */
1065 child->ppa.ppa = ftl_reserve_nv_cache(&dev->nv_cache, &num_lbks);
1066 if (child->ppa.ppa == FTL_LBA_INVALID) {
1067 ftl_io_free(child);
1068 spdk_thread_send_msg(thread, _ftl_write_nv_cache, io);
1069 break;
1070 }
1071
1072 /* Shrink the IO if there isn't enough room in the cache to fill the whole iovec */
1073 if (spdk_unlikely(num_lbks != ftl_io_iovec_len_left(io))) {
1074 ftl_io_shrink_iovec(child, num_lbks);
1075 }
1076
1077 ftl_submit_nv_cache(child);
1078 }
1079
1080 if (ftl_io_done(io)) {
1081 ftl_io_complete(io);
1082 }
1083 }
1084
1085 static void
1086 ftl_write_nv_cache(struct ftl_io *parent)
1087 {
1088 ftl_io_reset(parent);
1089 parent->flags |= FTL_IO_CACHE;
1090 _ftl_write_nv_cache(parent);
1091 }
1092
1093 static void
1094 ftl_write_fail(struct ftl_io *io, int status)
1095 {
1096 struct ftl_rwb_batch *batch = io->rwb_batch;
1097 struct spdk_ftl_dev *dev = io->dev;
1098 struct ftl_rwb_entry *entry;
1099 struct ftl_band *band;
1100 char buf[128];
1101
1102 entry = ftl_rwb_batch_first_entry(batch);
1103
1104 band = ftl_band_from_ppa(io->dev, entry->ppa);
1105 SPDK_ERRLOG("Write failed @ppa: %s, status: %d\n",
1106 ftl_ppa2str(entry->ppa, buf, sizeof(buf)), status);
1107
1108         /* Close the band and halt the wptr and defrag */
1109 ftl_halt_writes(dev, band);
1110
1111 ftl_rwb_foreach(entry, batch) {
1112 /* Invalidate meta set by process_writes() */
1113 ftl_invalidate_addr(dev, entry->ppa);
1114 }
1115
1116         /* Reset the batch back to the RWB to resend it later */
1117 ftl_rwb_batch_revert(batch);
1118 }
1119
1120 static void
1121 ftl_write_cb(struct ftl_io *io, void *arg, int status)
1122 {
1123 struct spdk_ftl_dev *dev = io->dev;
1124 struct ftl_rwb_batch *batch = io->rwb_batch;
1125 struct ftl_rwb_entry *entry;
1126
1127 if (status) {
1128 ftl_write_fail(io, status);
1129 return;
1130 }
1131
1132 assert(io->lbk_cnt == dev->xfer_size);
1133 ftl_rwb_foreach(entry, batch) {
1134 if (!(io->flags & FTL_IO_MD) && !(entry->flags & FTL_IO_PAD)) {
1135 /* Verify that the LBA is set for user lbks */
1136 assert(entry->lba != FTL_LBA_INVALID);
1137 }
1138
1139 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lu, lba:%lu\n",
1140 entry->ppa.ppa, entry->lba);
1141 }
1142
1143 ftl_process_flush(dev, batch);
1144 ftl_rwb_batch_release(batch);
1145 }
1146
1147 static void
1148 ftl_update_rwb_stats(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry)
1149 {
1150 if (!ftl_rwb_entry_internal(entry)) {
1151 dev->stats.write_user++;
1152 }
1153 dev->stats.write_total++;
1154 }
1155
1156 static void
1157 ftl_update_l2p(struct spdk_ftl_dev *dev, const struct ftl_rwb_entry *entry,
1158 struct ftl_ppa ppa)
1159 {
1160 struct ftl_ppa prev_ppa;
1161 struct ftl_rwb_entry *prev;
1162 struct ftl_band *band;
1163 int valid;
1164
1165 prev_ppa = ftl_l2p_get(dev, entry->lba);
1166 if (ftl_ppa_invalid(prev_ppa)) {
1167 ftl_l2p_set(dev, entry->lba, ppa);
1168 return;
1169 }
1170
1171         /* If the L2P's PPA is different from what we expected, we don't need to */
1172         /* do anything (someone's already overwritten our data). */
1173 if (ftl_rwb_entry_weak(entry) && !ftl_ppa_cmp(prev_ppa, entry->ppa)) {
1174 return;
1175 }
1176
1177 if (ftl_ppa_cached(prev_ppa)) {
1178 assert(!ftl_rwb_entry_weak(entry));
1179 prev = ftl_rwb_entry_from_offset(dev->rwb, prev_ppa.offset);
1180 pthread_spin_lock(&prev->lock);
1181
1182 /* Re-read the L2P under the lock to protect against updates */
1183 /* to this LBA from other threads */
1184 prev_ppa = ftl_l2p_get(dev, entry->lba);
1185
1186 /* If the entry is no longer in cache, another write has been */
1187 /* scheduled in the meantime, so we have to invalidate its LBA */
1188 if (!ftl_ppa_cached(prev_ppa)) {
1189 ftl_invalidate_addr(dev, prev_ppa);
1190 }
1191
1192 /* If previous entry is part of cache, remove and invalidate it */
1193 if (ftl_rwb_entry_valid(prev)) {
1194 ftl_invalidate_addr(dev, prev->ppa);
1195 ftl_rwb_entry_invalidate(prev);
1196 }
1197
1198 ftl_l2p_set(dev, entry->lba, ppa);
1199 pthread_spin_unlock(&prev->lock);
1200 return;
1201 }
1202
1203         /* Lock the band containing the previous PPA. This ensures atomic changes to */
1204         /* the L2P as well as the metadata. The valid bits in the metadata are used to */
1205         /* check the validity of weak writes. */
1206 band = ftl_band_from_ppa(dev, prev_ppa);
1207 pthread_spin_lock(&band->lba_map.lock);
1208
1209 valid = ftl_invalidate_addr_unlocked(dev, prev_ppa);
1210
1211 /* If the address has been invalidated already, we don't want to update */
1212 /* the L2P for weak writes, as it means the write is no longer valid. */
1213 if (!ftl_rwb_entry_weak(entry) || valid) {
1214 ftl_l2p_set(dev, entry->lba, ppa);
1215 }
1216
1217 pthread_spin_unlock(&band->lba_map.lock);
1218 }
1219
1220 static struct ftl_io *
1221 ftl_io_init_child_write(struct ftl_io *parent, struct ftl_ppa ppa,
1222 void *data, void *md, ftl_io_fn cb)
1223 {
1224 struct ftl_io *io;
1225 struct spdk_ftl_dev *dev = parent->dev;
1226 struct ftl_io_init_opts opts = {
1227 .dev = dev,
1228 .io = NULL,
1229 .parent = parent,
1230 .rwb_batch = NULL,
1231 .band = parent->band,
1232 .size = sizeof(struct ftl_io),
1233 .flags = 0,
1234 .type = FTL_IO_WRITE,
1235 .lbk_cnt = dev->xfer_size,
1236 .cb_fn = cb,
1237 .data = data,
1238 .md = md,
1239 };
1240
1241 io = ftl_io_init_internal(&opts);
1242 if (!io) {
1243 return NULL;
1244 }
1245
1246 io->ppa = ppa;
1247
1248 return io;
1249 }
1250
1251 static void
1252 ftl_io_child_write_cb(struct ftl_io *io, void *ctx, int status)
1253 {
1254 struct ftl_chunk *chunk;
1255
1256 chunk = ftl_band_chunk_from_ppa(io->band, io->ppa);
1257 chunk->busy = false;
1258 }
1259
1260 static int
1261 ftl_submit_child_write(struct ftl_wptr *wptr, struct ftl_io *io, int lbk_cnt)
1262 {
1263 struct spdk_ftl_dev *dev = io->dev;
1264 struct ftl_io *child;
1265 int rc;
1266 struct ftl_ppa ppa;
1267
1268 if (spdk_likely(!wptr->direct_mode)) {
1269 ppa = wptr->ppa;
1270 } else {
1271 assert(io->flags & FTL_IO_DIRECT_ACCESS);
1272 assert(io->ppa.chk == wptr->band->id);
1273 ppa = io->ppa;
1274 }
1275
1276         /* Split the IO into child requests and release the chunk immediately after the child is completed */
1277 child = ftl_io_init_child_write(io, ppa, ftl_io_iovec_addr(io),
1278 ftl_io_get_md(io), ftl_io_child_write_cb);
1279 if (!child) {
1280 return -EAGAIN;
1281 }
1282
1283 rc = spdk_nvme_ns_cmd_write_with_md(dev->ns, ftl_get_write_qpair(dev),
1284 ftl_io_iovec_addr(child), child->md,
1285 ftl_ppa_addr_pack(dev, ppa),
1286 lbk_cnt, ftl_io_cmpl_cb, child, 0, 0, 0);
1287 if (rc) {
1288 ftl_io_fail(child, rc);
1289 ftl_io_complete(child);
1290                 SPDK_ERRLOG("spdk_nvme_ns_cmd_write_with_md failed with status:%d, ppa:%lu\n",
1291 rc, ppa.ppa);
1292
1293 return -EIO;
1294 }
1295
1296 ftl_io_inc_req(child);
1297 ftl_io_advance(child, lbk_cnt);
1298
1299 return 0;
1300 }
1301
1302 static int
1303 ftl_submit_write(struct ftl_wptr *wptr, struct ftl_io *io)
1304 {
1305 struct spdk_ftl_dev *dev = io->dev;
1306 int rc = 0;
1307
1308 assert(io->lbk_cnt % dev->xfer_size == 0);
1309
1310 while (io->iov_pos < io->iov_cnt) {
1311                 /* There are no ordering guarantees between completions on the NVMe IO submission queue, */
1312                 /* so wait until the chunk is not busy before submitting another write */
1313 if (wptr->chunk->busy) {
1314 TAILQ_INSERT_TAIL(&wptr->pending_queue, io, retry_entry);
1315 rc = -EAGAIN;
1316 break;
1317 }
1318
1319 rc = ftl_submit_child_write(wptr, io, dev->xfer_size);
1320 if (spdk_unlikely(rc)) {
1321 if (rc == -EAGAIN) {
1322 TAILQ_INSERT_TAIL(&wptr->pending_queue, io, retry_entry);
1323 } else {
1324 ftl_io_fail(io, rc);
1325 }
1326 break;
1327 }
1328
1329 ftl_trace_submission(dev, io, wptr->ppa, dev->xfer_size);
1330 ftl_wptr_advance(wptr, dev->xfer_size);
1331 }
1332
1333 if (ftl_io_done(io)) {
1334 /* Parent IO will complete after all children are completed */
1335 ftl_io_complete(io);
1336 }
1337
1338 return rc;
1339 }
1340
1341 static void
1342 ftl_flush_pad_batch(struct spdk_ftl_dev *dev)
1343 {
1344 struct ftl_rwb *rwb = dev->rwb;
1345 size_t size, num_entries;
1346
1347 size = ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_INTERNAL) +
1348 ftl_rwb_num_acquired(rwb, FTL_RWB_TYPE_USER);
1349
1350 /* There must be something in the RWB, otherwise the flush */
1351 /* wouldn't be waiting for anything */
1352 assert(size > 0);
1353
1354         /* Only add padding when there are fewer than xfer_size */
1355         /* entries in the buffer. Otherwise we just have to wait */
1356         /* for the entries to become ready. */
1357 num_entries = ftl_rwb_get_active_batches(dev->rwb) * dev->xfer_size;
1358 if (size < num_entries) {
1359 ftl_rwb_pad(dev, num_entries - (size % num_entries));
1360 }
1361 }
1362
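/*
 * Worked example of the padding arithmetic above, assuming a hypothetical
 * xfer_size of 16 and a single active batch. With 5 entries acquired in the rwb:
 *   num_entries = 1 * 16 = 16
 *   padding     = 16 - (5 % 16) = 11
 * so 11 zeroed PAD entries are pushed, the batch reaches a full xfer_size and
 * the queued flush can complete without waiting for further user writes.
 */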
1363 static int
1364 ftl_wptr_process_writes(struct ftl_wptr *wptr)
1365 {
1366 struct spdk_ftl_dev *dev = wptr->dev;
1367 struct ftl_rwb_batch *batch;
1368 struct ftl_rwb_entry *entry;
1369 struct ftl_io *io;
1370 struct ftl_ppa ppa, prev_ppa;
1371
1372 if (spdk_unlikely(!TAILQ_EMPTY(&wptr->pending_queue))) {
1373 io = TAILQ_FIRST(&wptr->pending_queue);
1374 TAILQ_REMOVE(&wptr->pending_queue, io, retry_entry);
1375
1376 if (ftl_submit_write(wptr, io) == -EAGAIN) {
1377 return 0;
1378 }
1379 }
1380
1381 /* Make sure the band is prepared for writing */
1382 if (!ftl_wptr_ready(wptr)) {
1383 return 0;
1384 }
1385
1386 if (dev->halt) {
1387 ftl_wptr_process_shutdown(wptr);
1388 }
1389
1390 batch = ftl_rwb_pop(dev->rwb);
1391 if (!batch) {
1392 /* If there are queued flush requests we need to pad the RWB to */
1393 /* force out remaining entries */
1394 if (!LIST_EMPTY(&dev->flush_list)) {
1395 ftl_flush_pad_batch(dev);
1396 }
1397
1398 return 0;
1399 }
1400
1401 io = ftl_io_rwb_init(dev, wptr->band, batch, ftl_write_cb);
1402 if (!io) {
1403 goto error;
1404 }
1405
1406 ppa = wptr->ppa;
1407 ftl_rwb_foreach(entry, batch) {
1408 entry->ppa = ppa;
1409
1410 if (entry->lba != FTL_LBA_INVALID) {
1411 pthread_spin_lock(&entry->lock);
1412 prev_ppa = ftl_l2p_get(dev, entry->lba);
1413
1414 /* If the l2p was updated in the meantime, don't update band's metadata */
1415 if (ftl_ppa_cached(prev_ppa) && prev_ppa.offset == entry->pos) {
1416 /* Setting entry's cache bit needs to be done after metadata */
1417 /* within the band is updated to make sure that writes */
1418 /* invalidating the entry clear the metadata as well */
1419 ftl_band_set_addr(wptr->band, entry->lba, entry->ppa);
1420 ftl_rwb_entry_set_valid(entry);
1421 }
1422 pthread_spin_unlock(&entry->lock);
1423 }
1424
1425 ftl_trace_rwb_pop(dev, entry);
1426 ftl_update_rwb_stats(dev, entry);
1427
1428 ppa = ftl_band_next_ppa(wptr->band, ppa, 1);
1429 }
1430
1431 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Write ppa:%lx, %lx\n", wptr->ppa.ppa,
1432 ftl_ppa_addr_pack(dev, wptr->ppa));
1433
1434 if (ftl_submit_write(wptr, io)) {
1435 /* TODO: we need some recovery here */
1436 assert(0 && "Write submit failed");
1437 if (ftl_io_done(io)) {
1438 ftl_io_free(io);
1439 }
1440 }
1441
1442 return dev->xfer_size;
1443 error:
1444 ftl_rwb_batch_revert(batch);
1445 return 0;
1446 }
1447
1448 static int
1449 ftl_process_writes(struct spdk_ftl_dev *dev)
1450 {
1451 struct ftl_wptr *wptr, *twptr;
1452 size_t num_active = 0;
1453 enum ftl_band_state state;
1454
1455 LIST_FOREACH_SAFE(wptr, &dev->wptr_list, list_entry, twptr) {
1456 ftl_wptr_process_writes(wptr);
1457 state = wptr->band->state;
1458
1459 if (state != FTL_BAND_STATE_FULL &&
1460 state != FTL_BAND_STATE_CLOSING &&
1461 state != FTL_BAND_STATE_CLOSED) {
1462 num_active++;
1463 }
1464 }
1465
1466 if (num_active < 1) {
1467 ftl_add_wptr(dev);
1468 }
1469
1470 return 0;
1471 }
1472
1473 static void
1474 ftl_rwb_entry_fill(struct ftl_rwb_entry *entry, struct ftl_io *io)
1475 {
1476 struct ftl_band *band;
1477
1478 memcpy(entry->data, ftl_io_iovec_addr(io), FTL_BLOCK_SIZE);
1479
1480 if (ftl_rwb_entry_weak(entry)) {
1481 band = ftl_band_from_ppa(io->dev, io->ppa);
1482 entry->ppa = ftl_band_next_ppa(band, io->ppa, io->pos);
1483 }
1484
1485 entry->trace = io->trace;
1486 entry->lba = ftl_io_current_lba(io);
1487
1488 if (entry->md) {
1489 memcpy(entry->md, &entry->lba, sizeof(entry->lba));
1490 }
1491 }
1492
1493 static int
1494 ftl_rwb_fill(struct ftl_io *io)
1495 {
1496 struct spdk_ftl_dev *dev = io->dev;
1497 struct ftl_rwb_entry *entry;
1498 struct ftl_ppa ppa = { .cached = 1 };
1499 int flags = ftl_rwb_flags_from_io(io);
1500
1501 while (io->pos < io->lbk_cnt) {
1502 if (ftl_io_current_lba(io) == FTL_LBA_INVALID) {
1503 ftl_io_advance(io, 1);
1504 continue;
1505 }
1506
1507 entry = ftl_acquire_entry(dev, flags);
1508 if (!entry) {
1509 return -EAGAIN;
1510 }
1511
1512 ftl_rwb_entry_fill(entry, io);
1513
1514 ppa.offset = entry->pos;
1515
1516 ftl_trace_rwb_fill(dev, io);
1517 ftl_update_l2p(dev, entry, ppa);
1518 ftl_io_advance(io, 1);
1519
1520                 /* Needs to be done after the L2P is updated to avoid a race with the */
1521                 /* write completion callback in case it's processed before the */
1522                 /* L2P is set by ftl_update_l2p(). */
1523 ftl_rwb_push(entry);
1524 }
1525
1526 if (ftl_io_done(io)) {
1527 if (dev->nv_cache.bdev_desc) {
1528 ftl_write_nv_cache(io);
1529 } else {
1530 ftl_io_complete(io);
1531 }
1532 }
1533
1534 return 0;
1535 }
1536
1537 static bool
1538 ftl_dev_needs_defrag(struct spdk_ftl_dev *dev)
1539 {
1540 const struct spdk_ftl_limit *limit = ftl_get_limit(dev, SPDK_FTL_LIMIT_START);
1541
1542 if (ftl_reloc_is_halted(dev->reloc)) {
1543 return false;
1544 }
1545
1546 if (dev->df_band) {
1547 return false;
1548 }
1549
1550 if (dev->num_free <= limit->thld) {
1551 return true;
1552 }
1553
1554 return false;
1555 }
1556
1557 static double
1558 ftl_band_calc_merit(struct ftl_band *band, size_t *threshold_valid)
1559 {
1560 size_t usable, valid, invalid;
1561 double vld_ratio;
1562
1563 /* If the band doesn't have any usable lbks it's of no use */
1564 usable = ftl_band_num_usable_lbks(band);
1565 if (usable == 0) {
1566 return 0.0;
1567 }
1568
1569 valid = threshold_valid ? (usable - *threshold_valid) : band->lba_map.num_vld;
1570 invalid = usable - valid;
1571
1572 /* Add one to avoid division by 0 */
1573 vld_ratio = (double)invalid / (double)(valid + 1);
1574 return vld_ratio * ftl_band_age(band);
1575 }
1576
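/*
 * Worked example of the merit formula above, using hypothetical numbers: for a
 * band with 10000 usable lbks, 2000 still-valid lbks and an age of 100,
 *   invalid   = 10000 - 2000 = 8000
 *   vld_ratio = 8000 / (2000 + 1) ~= 4.0
 *   merit     ~= 4.0 * 100 = 400
 * Mostly-invalid, older bands therefore score higher and are preferred for defrag.
 */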
1577 static bool
1578 ftl_band_needs_defrag(struct ftl_band *band, struct spdk_ftl_dev *dev)
1579 {
1580 struct spdk_ftl_conf *conf = &dev->conf;
1581 size_t thld_vld;
1582
1583 /* If we're in dire need of free bands, every band is worth defragging */
1584 if (ftl_current_limit(dev) == SPDK_FTL_LIMIT_CRIT) {
1585 return true;
1586 }
1587
1588 thld_vld = (ftl_band_num_usable_lbks(band) * conf->defrag.invalid_thld) / 100;
1589
1590 return band->merit > ftl_band_calc_merit(band, &thld_vld);
1591 }
1592
1593 static struct ftl_band *
1594 ftl_select_defrag_band(struct spdk_ftl_dev *dev)
1595 {
1596 struct ftl_band *band, *mband = NULL;
1597 double merit = 0;
1598
1599 LIST_FOREACH(band, &dev->shut_bands, list_entry) {
1600 assert(band->state == FTL_BAND_STATE_CLOSED);
1601 band->merit = ftl_band_calc_merit(band, NULL);
1602 if (band->merit > merit) {
1603 merit = band->merit;
1604 mband = band;
1605 }
1606 }
1607
1608 if (mband && !ftl_band_needs_defrag(mband, dev)) {
1609 mband = NULL;
1610 }
1611
1612 return mband;
1613 }
1614
1615 static void
1616 ftl_process_relocs(struct spdk_ftl_dev *dev)
1617 {
1618 struct ftl_band *band;
1619
1620 if (ftl_dev_needs_defrag(dev)) {
1621 band = dev->df_band = ftl_select_defrag_band(dev);
1622
1623 if (band) {
1624 ftl_reloc_add(dev->reloc, band, 0, ftl_num_band_lbks(dev), 0);
1625 ftl_trace_defrag_band(dev, band);
1626 }
1627 }
1628
1629 ftl_reloc(dev->reloc);
1630 }
1631
1632 int
1633 ftl_current_limit(const struct spdk_ftl_dev *dev)
1634 {
1635 return dev->limit;
1636 }
1637
1638 void
1639 spdk_ftl_dev_get_attrs(const struct spdk_ftl_dev *dev, struct spdk_ftl_attrs *attrs)
1640 {
1641 attrs->uuid = dev->uuid;
1642 attrs->lbk_cnt = dev->num_lbas;
1643 attrs->lbk_size = FTL_BLOCK_SIZE;
1644 attrs->range = dev->range;
1645 attrs->cache_bdev_desc = dev->nv_cache.bdev_desc;
1646 attrs->allow_open_bands = dev->conf.allow_open_bands;
1647 attrs->num_chunks = dev->geo.num_chk;
1648 attrs->chunk_size = dev->geo.clba;
1649 }
1650
1651 static void
1652 _ftl_io_write(void *ctx)
1653 {
1654 ftl_io_write((struct ftl_io *)ctx);
1655 }
1656
1657 static int
1658 ftl_rwb_fill_leaf(struct ftl_io *io)
1659 {
1660 int rc;
1661
1662 rc = ftl_rwb_fill(io);
1663 if (rc == -EAGAIN) {
1664 spdk_thread_send_msg(spdk_io_channel_get_thread(io->ioch),
1665 _ftl_io_write, io);
1666 return 0;
1667 }
1668
1669 return rc;
1670 }
1671
1672 static int
1673 ftl_submit_write_leaf(struct ftl_io *io)
1674 {
1675 int rc;
1676
1677 rc = ftl_submit_write(ftl_wptr_from_band(io->band), io);
1678 if (rc == -EAGAIN) {
1679 /* EAGAIN means that the request was put on the pending queue */
1680 return 0;
1681 }
1682
1683 return rc;
1684 }
1685
1686 void
1687 ftl_io_write(struct ftl_io *io)
1688 {
1689 struct spdk_ftl_dev *dev = io->dev;
1690
1691 /* For normal IOs we just need to copy the data onto the rwb */
1692 if (!(io->flags & FTL_IO_MD)) {
1693 ftl_io_call_foreach_child(io, ftl_rwb_fill_leaf);
1694 } else {
1695                 /* Metadata has its own buffer, so it doesn't have to be copied; just */
1696                 /* send it to the core thread and schedule the write immediately */
1697 if (ftl_check_core_thread(dev)) {
1698 ftl_io_call_foreach_child(io, ftl_submit_write_leaf);
1699 } else {
1700 spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_io_write, io);
1701 }
1702 }
1703 }
1704
1705 int
1706 spdk_ftl_write(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1707 struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1708 {
1709 struct ftl_io *io;
1710
1711 if (iov_cnt == 0) {
1712 return -EINVAL;
1713 }
1714
1715 if (lba_cnt == 0) {
1716 return -EINVAL;
1717 }
1718
1719 if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1720 return -EINVAL;
1721 }
1722
1723 if (!dev->initialized) {
1724 return -EBUSY;
1725 }
1726
1727 io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_WRITE);
1728 if (!io) {
1729 return -ENOMEM;
1730 }
1731
1732 ftl_io_write(io);
1733
1734 return 0;
1735 }
1736
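/*
 * Minimal usage sketch for the API above; the buffer, device handle, io channel
 * and callback are hypothetical and shown only to illustrate the contract:
 *
 *   static void write_done(void *ctx, int status) { assert(status == 0); }
 *
 *   struct iovec iov = { .iov_base = buf, .iov_len = 8 * FTL_BLOCK_SIZE };
 *   int rc = spdk_ftl_write(dev, ioch, 0, 8, &iov, 1, write_done, NULL);
 *
 * lba_cnt must equal ftl_iovec_num_lbks(iov, iov_cnt), i.e. the iovec must
 * describe exactly lba_cnt FTL_BLOCK_SIZE-sized blocks, otherwise -EINVAL is
 * returned; -EBUSY is returned if the device is not fully initialized yet.
 */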
1737 static int
1738 ftl_io_read_leaf(struct ftl_io *io)
1739 {
1740 int rc;
1741
1742 rc = ftl_submit_read(io);
1743 if (rc == -ENOMEM) {
1744                 /* ENOMEM means that the request was put on the retry queue */
1745 return 0;
1746 }
1747
1748 return rc;
1749 }
1750
1751 static void
1752 _ftl_io_read(void *arg)
1753 {
1754 ftl_io_read((struct ftl_io *)arg);
1755 }
1756
1757 void
1758 ftl_io_read(struct ftl_io *io)
1759 {
1760 struct spdk_ftl_dev *dev = io->dev;
1761
1762 if (ftl_check_read_thread(dev)) {
1763 ftl_io_call_foreach_child(io, ftl_io_read_leaf);
1764 } else {
1765 spdk_thread_send_msg(ftl_get_read_thread(dev), _ftl_io_read, io);
1766 }
1767 }
1768
1769 int
1770 spdk_ftl_read(struct spdk_ftl_dev *dev, struct spdk_io_channel *ch, uint64_t lba, size_t lba_cnt,
1771 struct iovec *iov, size_t iov_cnt, spdk_ftl_fn cb_fn, void *cb_arg)
1772 {
1773 struct ftl_io *io;
1774
1775 if (iov_cnt == 0) {
1776 return -EINVAL;
1777 }
1778
1779 if (lba_cnt == 0) {
1780 return -EINVAL;
1781 }
1782
1783 if (lba_cnt != ftl_iovec_num_lbks(iov, iov_cnt)) {
1784 return -EINVAL;
1785 }
1786
1787 if (!dev->initialized) {
1788 return -EBUSY;
1789 }
1790
1791 io = ftl_io_user_init(ch, lba, lba_cnt, iov, iov_cnt, cb_fn, cb_arg, FTL_IO_READ);
1792 if (!io) {
1793 return -ENOMEM;
1794 }
1795
1796 ftl_io_read(io);
1797 return 0;
1798 }
1799
1800 static struct ftl_flush *
1801 ftl_flush_init(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1802 {
1803 struct ftl_flush *flush;
1804 struct ftl_rwb *rwb = dev->rwb;
1805
1806 flush = calloc(1, sizeof(*flush));
1807 if (!flush) {
1808 return NULL;
1809 }
1810
1811 flush->bmap = spdk_bit_array_create(ftl_rwb_num_batches(rwb));
1812 if (!flush->bmap) {
1813 goto error;
1814 }
1815
1816 flush->dev = dev;
1817 flush->cb.fn = cb_fn;
1818 flush->cb.ctx = cb_arg;
1819
1820 return flush;
1821 error:
1822 free(flush);
1823 return NULL;
1824 }
1825
1826 static void
1827 _ftl_flush(void *ctx)
1828 {
1829 struct ftl_flush *flush = ctx;
1830 struct spdk_ftl_dev *dev = flush->dev;
1831 struct ftl_rwb *rwb = dev->rwb;
1832 struct ftl_rwb_batch *batch;
1833
1834 /* Attach flush object to all non-empty batches */
1835 ftl_rwb_foreach_batch(batch, rwb) {
1836 if (!ftl_rwb_batch_empty(batch)) {
1837 spdk_bit_array_set(flush->bmap, ftl_rwb_batch_get_offset(batch));
1838 flush->num_req++;
1839 }
1840 }
1841
1842 LIST_INSERT_HEAD(&dev->flush_list, flush, list_entry);
1843
1844 /* If the RWB was already empty, the flush can be completed right away */
1845 if (!flush->num_req) {
1846 ftl_complete_flush(flush);
1847 }
1848 }
1849
1850 int
1851 spdk_ftl_flush(struct spdk_ftl_dev *dev, spdk_ftl_fn cb_fn, void *cb_arg)
1852 {
1853 struct ftl_flush *flush;
1854
1855 if (!dev->initialized) {
1856 return -EBUSY;
1857 }
1858
1859 flush = ftl_flush_init(dev, cb_fn, cb_arg);
1860 if (!flush) {
1861 return -ENOMEM;
1862 }
1863
1864 spdk_thread_send_msg(ftl_get_core_thread(dev), _ftl_flush, flush);
1865 return 0;
1866 }
1867
1868 void
1869 ftl_process_anm_event(struct ftl_anm_event *event)
1870 {
1871 SPDK_DEBUGLOG(SPDK_LOG_FTL_CORE, "Unconsumed ANM received for dev: %p...\n", event->dev);
1872 ftl_anm_event_complete(event);
1873 }
1874
1875 static void
1876 ftl_process_retry_queue(struct spdk_ftl_dev *dev)
1877 {
1878 struct ftl_io *io;
1879 int rc;
1880
1881 while (!TAILQ_EMPTY(&dev->retry_queue)) {
1882 io = TAILQ_FIRST(&dev->retry_queue);
1883
1884 /* Retry only if IO is still healthy */
1885 if (spdk_likely(io->status == 0)) {
1886 rc = ftl_submit_read(io);
1887 if (rc == -ENOMEM) {
1888 break;
1889 }
1890 }
1891
1892 io->flags &= ~FTL_IO_RETRY;
1893 TAILQ_REMOVE(&dev->retry_queue, io, retry_entry);
1894
1895 if (ftl_io_done(io)) {
1896 ftl_io_complete(io);
1897 }
1898 }
1899 }
1900
1901 int
1902 ftl_task_read(void *ctx)
1903 {
1904 struct ftl_thread *thread = ctx;
1905 struct spdk_ftl_dev *dev = thread->dev;
1906 struct spdk_nvme_qpair *qpair = ftl_get_read_qpair(dev);
1907 size_t num_completed;
1908
1909 if (dev->halt) {
1910 if (ftl_shutdown_complete(dev)) {
1911 spdk_poller_unregister(&thread->poller);
1912 return 0;
1913 }
1914 }
1915
1916 num_completed = spdk_nvme_qpair_process_completions(qpair, 0);
1917
1918 if (num_completed && !TAILQ_EMPTY(&dev->retry_queue)) {
1919 ftl_process_retry_queue(dev);
1920 }
1921
1922 return num_completed;
1923 }
1924
1925 int
1926 ftl_task_core(void *ctx)
1927 {
1928 struct ftl_thread *thread = ctx;
1929 struct spdk_ftl_dev *dev = thread->dev;
1930 struct spdk_nvme_qpair *qpair = ftl_get_write_qpair(dev);
1931
1932 if (dev->halt) {
1933 if (ftl_shutdown_complete(dev)) {
1934 spdk_poller_unregister(&thread->poller);
1935 return 0;
1936 }
1937 }
1938
1939 ftl_process_writes(dev);
1940 spdk_nvme_qpair_process_completions(qpair, 0);
1941 ftl_process_relocs(dev);
1942
1943 return 0;
1944 }
1945
1946 SPDK_LOG_REGISTER_COMPONENT("ftl_core", SPDK_LOG_FTL_CORE)