/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/iomap.h>
#include "internal.h"

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_HZP;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}

struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0)
		return ERR_PTR(rc);
	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc = 0;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);

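/*
 * Illustrative sketch only (not part of this file): a filesystem typically
 * wires dax_do_io() into its ->direct_IO method and falls back to the normal
 * block device path for non-DAX inodes.  The names example_direct_IO and
 * example_get_block below are hypothetical placeholders for the filesystem's
 * own callbacks.
 *
 *	static ssize_t example_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, example_get_block,
 *					NULL, DIO_LOCKING);
 *		return blockdev_direct_IO(iocb, inode, iter, example_get_block);
 *	}
 */
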
/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked. The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked. The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked. The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree and, if it is a locked exceptional
 * entry, wait for it to become unlocked before returning it.  The caller
 * must call put_unlocked_mapping_entry() if it decides not to lock the
 * entry, or put_locked_mapping_entry() once it has locked the entry and
 * later wants to unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					    &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with the radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

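/*
 * Illustrative sketch of the lookup protocol built from the helpers above
 * (this mirrors how the callers below use them; it is not a helper of its
 * own).  A caller that only wants to inspect an entry does:
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	... examine entry ...
 *	put_unlocked_mapping_entry(mapping, index, entry);
 *	spin_unlock_irq(&mapping->tree_lock);
 *
 * whereas a caller that wants exclusive ownership instead locks the slot
 * (entry = lock_slot(mapping, slot);), may then drop tree_lock, and finally
 * releases the entry with put_locked_mapping_entry().
 */
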
/*
 * Find radix tree entry at given index. If it points to a page, return with
 * the page locked. If it points to the exceptional entry, return with the
 * radix tree entry locked. If the radix tree doesn't contain given index,
 * create empty exceptional entry for the index and return with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries (either zero pages or DAX entries)
 * within the 2MiB range that we are requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * observe it.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (!radix_tree_exceptional_entry(entry) ||
			    dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (radix_tree_exceptional_entry(entry) &&
			    dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index, entry);
			return ERR_PTR(err);
		}

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		spin_lock_irq(&mapping->tree_lock);

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Someone already created the entry?  This is a
			 * normal failure when inserting PMDs in a range
			 * that already contains PTEs.  In that case we want
			 * to return -EEXIST immediately.
			 */
			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
				goto restart;
			/*
			 * Our insertion of a DAX PMD entry failed, most
			 * likely because it collided with a PTE sized entry
			 * at a different index in the PMD range.  We haven't
			 * inserted anything into the radix tree and have no
			 * waiters to wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	void *entry;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
		spin_unlock_irq(&mapping->tree_lock);
		return 0;
	}
	radix_tree_delete(&mapping->page_tree, index);
	mapping->nrexceptional--;
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, true);

	return 1;
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct page *page;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(entry)) {
		vmf->page = entry;
		return VM_FAULT_LOCKED;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page) {
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
		return VM_FAULT_OOM;
	}
	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
			 struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
		/* replacing huge zero page with PMD block mapping */
		unmap_mapping_range(mapping,
			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = __radix_tree_insert(page_tree, index,
				dax_radix_order(new_entry), new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
		WARN_ON_ONCE(ret != entry);
		radix_tree_replace_slot(slot, new_entry);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto unlock;
	}

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	dax.sector = dax_radix_sector(entry);
	dax.size = PAGE_SIZE << dax_radix_order(entry);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

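/*
 * Illustrative sketch (not part of this file): filesystems typically call
 * dax_writeback_mapping_range() from their ->writepages method when the
 * mapping is DAX.  example_writepages and example_find_bdev are hypothetical
 * names standing in for the filesystem's own helpers.
 *
 *	static int example_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		if (dax_mapping(mapping))
 *			return dax_writeback_mapping_range(mapping,
 *					example_find_bdev(mapping->host), wbc);
 *		return generic_writepages(mapping, wbc);
 *	}
 */
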
static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	void *entry;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_entry;

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
					bh.b_size, new_page, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_entry;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			return VM_FAULT_LOCKED;
		}
		vmf->entry = entry;
		return VM_FAULT_DAX_LOCKED;
	}

	if (!buffer_mapped(&bh)) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_entry;
		} else {
			return dax_load_hole(mapping, entry, vmf);
		}
	}

	/* Filesystem should not return unwritten buffers to us! */
	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
			bh.b_size, &entry, vma, vmf);
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_fault);

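/*
 * Illustrative sketch (not part of this file): a block-based filesystem
 * normally calls dax_fault() from the ->fault handler in its
 * vm_operations_struct, holding whatever lock it uses to serialize faults
 * against truncate.  example_dax_fault, example_get_block and EXAMPLE_I are
 * hypothetical names.
 *
 *	static int example_dax_fault(struct vm_area_struct *vma,
 *				     struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		down_read(&EXAMPLE_I(inode)->dax_sem);
 *		ret = dax_fault(vma, vmf, example_get_block);
 *		up_read(&EXAMPLE_I(inode)->dax_sem);
 *		return ret;
 *	}
 */
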
/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	put_unlocked_mapping_entry(mapping, index, entry);
 out:
	spin_unlock_irq(&mapping->tree_lock);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

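/*
 * Illustrative sketch (not part of this file): dax_pfn_mkwrite() is normally
 * called from the ->pfn_mkwrite handler of the same vm_operations_struct that
 * uses the DAX fault helpers, typically wrapped with the filesystem's own
 * pagefault accounting (hypothetical names below):
 *
 *	static int example_dax_pfn_mkwrite(struct vm_area_struct *vma,
 *					   struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		sb_start_pagefault(inode->i_sb);
 *		file_update_time(vma->vm_file);
 *		ret = dax_pfn_mkwrite(vma, vmf);
 *		sb_end_pagefault(inode->i_sb);
 *		return ret;
 *	}
 */
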
static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector		= sector,
		.size		= PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
		return -EINVAL;

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0 || !buffer_written(&bh))
		return err;

	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
			offset, length);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

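/*
 * Illustrative sketch (not part of this file): a filesystem truncate path
 * typically zeroes the partial tail page of a DAX file with
 * dax_truncate_page() before shrinking i_size (hypothetical names):
 *
 *	static int example_setsize(struct inode *inode, loff_t newsize)
 *	{
 *		int error = 0;
 *
 *		if (IS_DAX(inode))
 *			error = dax_truncate_page(inode, newsize,
 *					example_get_block);
 *		if (error)
 *			return error;
 *		truncate_setsize(inode, newsize);
 *		return 0;
 *	}
 */
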
#ifdef CONFIG_FS_IOMAP
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}

static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
		ssize_t map_len;

		dax.sector = dax_iomap_sector(iomap, pos);
		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
		map_len = dax_map_atomic(iomap->bdev, &dax);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		dax.addr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
		else
			map_len = copy_to_iter(dax.addr, map_len, iter);
		dax_unmap_atomic(iomap->bdev, &dax);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE)
		flags |= IOMAP_WRITE;

	/*
	 * Yes, even DAX files can have page cache attached to them:  A zeroed
	 * page is inserted into the pagecache when we have to serve a write
	 * fault on a hole.  It should never be dirtied and can simply be
	 * dropped from the pagecache once we get real data for the page.
	 *
	 * XXX: This is racy against mmap, and there's nothing we can do about
	 * it. We'll eventually need to shift this down even further so that
	 * we can check if we allocated blocks over a hole first.
	 */
	if (mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				pos >> PAGE_SHIFT,
				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

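/*
 * Illustrative sketch (not part of this file): an iomap-based filesystem
 * calls dax_iomap_rw() from its ->read_iter/->write_iter methods while
 * holding the inode lock that provides the read/write exclusion mentioned
 * above.  example_iomap_ops is a hypothetical iomap_ops instance.
 *
 *	static ssize_t example_dax_read_iter(struct kiocb *iocb,
 *					     struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 */
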
/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in their fault
 * or mkwrite handler for DAX files. Assumes the caller has done all the
 * necessary locking for the page fault to proceed successfully.
 */
int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = 0;
	int error, major = 0;
	int locked_status = 0;
	void *entry;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		error = PTR_ERR(entry);
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	/*
	 * Note that we don't bother to use iomap_apply here: DAX requires
	 * the filesystem block size to be equal to the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		goto unlock_entry;
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		error = -EIO;		/* fs corruption? */
		goto finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
					vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto finish_iomap;
		if (!radix_tree_exceptional_entry(entry)) {
			vmf->page = entry;
			locked_status = VM_FAULT_LOCKED;
		} else {
			vmf->entry = entry;
			locked_status = VM_FAULT_DAX_LOCKED;
		}
		goto finish_iomap;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vma, vmf);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			locked_status = dax_load_hole(mapping, entry, vmf);
			break;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (error) {
			/* keep previous error */
			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PAGE_SIZE,
					PAGE_SIZE, flags, &iomap);
		}
	}
 unlock_entry:
	if (!locked_status || error)
		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if (error < 0 && error != -EBUSY)
		return VM_FAULT_SIGBUS | major;
	if (locked_status) {
		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
		return locked_status;
	}
	return VM_FAULT_NOPAGE | major;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);

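/*
 * Illustrative sketch (not part of this file): the iomap-based fault handler
 * mirrors the dax_fault() example above, but passes the filesystem's
 * iomap_ops instead of a get_block callback (hypothetical names):
 *
 *	static int example_iomap_dax_fault(struct vm_area_struct *vma,
 *					   struct vm_fault *vmf)
 *	{
 *		struct inode *inode = file_inode(vma->vm_file);
 *		int ret;
 *
 *		down_read(&EXAMPLE_I(inode)->dax_sem);
 *		ret = dax_iomap_fault(vma, vmf, &example_iomap_ops);
 *		up_read(&EXAMPLE_I(inode)->dax_sem);
 *		return ret;
 *	}
 */
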
#ifdef CONFIG_FS_DAX_PMD
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below functions.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, loff_t pos, bool write, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret;

	if (length < 0) /* dax_map_atomic() failed */
		return VM_FAULT_FALLBACK;
	if (length < PMD_SIZE)
		goto unmap_fallback;
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);

 unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
	return VM_FAULT_FALLBACK;
}

static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
		struct vm_fault *vmf, unsigned long address,
		struct iomap *iomap, void **entryp)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	struct page *zero_page;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	void *ret;

	zero_page = mm_get_huge_zero_page(vma->vm_mm);

	if (unlikely(!zero_page))
		return VM_FAULT_FALLBACK;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		return VM_FAULT_FALLBACK;
	*entryp = ret;

	ptl = pmd_lock(vma->vm_mm, pmd);
	if (!pmd_none(*pmd)) {
		spin_unlock(ptl);
		return VM_FAULT_FALLBACK;
	}

	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
	spin_unlock(ptl);
	return VM_FAULT_NOPAGE;
}

int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = write ? IOMAP_WRITE : 0;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	struct vm_fault vmf;
	void *entry;
	loff_t pos;
	int error;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	if (pgoff > max_pgoff)
		return VM_FAULT_SIGBUS;

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;
	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	vmf.pgoff = pgoff;
	vmf.flags = flags;
	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
				&iomap, pos, write, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto finish_iomap;
		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
				&entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 finish_iomap:
	if (ops->iomap_end) {
		if (result == VM_FAULT_FALLBACK) {
			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
					&iomap);
		} else {
			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
					iomap_flags, &iomap);
			if (error)
				result = VM_FAULT_FALLBACK;
		}
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, pmd, address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
	return result;
}
EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
#endif /* CONFIG_FS_DAX_PMD */
#endif /* CONFIG_FS_IOMAP */