fs/dax.c

   1 /*
   2  * fs/dax.c - Direct Access filesystem code
   3  * Copyright (c) 2013-2014 Intel Corporation
   4  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
   5  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify it
   8  * under the terms and conditions of the GNU General Public License,
   9  * version 2, as published by the Free Software Foundation.
  10  *
  11  * This program is distributed in the hope it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14  * more details.
  15  */
  16
  17 #include <linux/atomic.h>
  18 #include <linux/blkdev.h>
  19 #include <linux/buffer_head.h>
  20 #include <linux/dax.h>
  21 #include <linux/fs.h>
  22 #include <linux/genhd.h>
  23 #include <linux/highmem.h>
  24 #include <linux/memcontrol.h>
  25 #include <linux/mm.h>
  26 #include <linux/mutex.h>
  27 #include <linux/pagevec.h>
  28 #include <linux/pmem.h>
  29 #include <linux/sched.h>
  30 #include <linux/uio.h>
  31 #include <linux/vmstat.h>
  32 #include <linux/pfn_t.h>
  33 #include <linux/sizes.h>
  34
  35 /*
  36  * We use lowest available bit in exceptional entry for locking, other two
  37  * bits to determine entry type. In total 3 special bits.
  38  */
  39 #define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)
  40 #define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
  41 #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
  42 #define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD)
  43 #define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK)
  44 #define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
  45 #define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
  46                 RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
  47                 RADIX_TREE_EXCEPTIONAL_ENTRY))
  48
  49 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
  50 {
  51         struct request_queue *q = bdev->bd_queue;
  52         long rc = -EIO;
  53
  54         dax->addr = (void __pmem *) ERR_PTR(-EIO);
  55         if (blk_queue_enter(q, true) != 0)
  56                 return rc;
  57
  58         rc = bdev_direct_access(bdev, dax);
  59         if (rc < 0) {
  60                 dax->addr = (void __pmem *) ERR_PTR(rc);
  61                 blk_queue_exit(q);
  62                 return rc;
  63         }
  64         return rc;
  65 }
  66
  67 static void dax_unmap_atomic(struct block_device *bdev,
  68                 const struct blk_dax_ctl *dax)
  69 {
  70         if (IS_ERR(dax->addr))
  71                 return;
  72         blk_queue_exit(bdev->bd_queue);
  73 }
  74
  75 struct page *read_dax_sector(struct block_device *bdev, sector_t n)
  76 {
  77         struct page *page = alloc_pages(GFP_KERNEL, 0);
  78         struct blk_dax_ctl dax = {
  79                 .size = PAGE_SIZE,
  80                 .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
  81         };
  82         long rc;
  83
  84         if (!page)
  85                 return ERR_PTR(-ENOMEM);
  86
  87         rc = dax_map_atomic(bdev, &dax);
  88         if (rc < 0)
  89                 return ERR_PTR(rc);
  90         memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
  91         dax_unmap_atomic(bdev, &dax);
  92         return page;
  93 }
  94
  95 static bool buffer_written(struct buffer_head *bh)
  96 {
  97         return buffer_mapped(bh) && !buffer_unwritten(bh);
  98 }
  99
 100 /*
 101  * When ext4 encounters a hole, it returns without modifying the buffer_head
 102  * which means that we can't trust b_size.  To cope with this, we set b_state
 103  * to 0 before calling get_block and, if any bit is set, we know we can trust
 104  * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 105  * and would save us time calling get_block repeatedly.
 106  */
 107 static bool buffer_size_valid(struct buffer_head *bh)
 108 {
 109         return bh->b_state != 0;
 110 }
 111
 112
 113 static sector_t to_sector(const struct buffer_head *bh,
 114                 const struct inode *inode)
 115 {
 116         sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 117
 118         return sector;
 119 }
 120
 121 static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 122                       loff_t start, loff_t end, get_block_t get_block,
 123                       struct buffer_head *bh)
 124 {
 125         loff_t pos = start, max = start, bh_max = start;
 126         bool hole = false, need_wmb = false;
 127         struct block_device *bdev = NULL;
 128         int rw = iov_iter_rw(iter), rc;
 129         long map_len = 0;
 130         struct blk_dax_ctl dax = {
 131                 .addr = (void __pmem *) ERR_PTR(-EIO),
 132         };
 133         unsigned blkbits = inode->i_blkbits;
 134         sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
 135                                                                 >> blkbits;
 136
 137         if (rw == READ)
 138                 end = min(end, i_size_read(inode));
 139
 140         while (pos < end) {
 141                 size_t len;
 142                 if (pos == max) {
 143                         long page = pos >> PAGE_SHIFT;
 144                         sector_t block = page << (PAGE_SHIFT - blkbits);
 145                         unsigned first = pos - (block << blkbits);
 146                         long size;
 147
 148                         if (pos == bh_max) {
 149                                 bh->b_size = PAGE_ALIGN(end - pos);
 150                                 bh->b_state = 0;
 151                                 rc = get_block(inode, block, bh, rw == WRITE);
 152                                 if (rc)
 153                                         break;
 154                                 if (!buffer_size_valid(bh))
 155                                         bh->b_size = 1 << blkbits;
 156                                 bh_max = pos - first + bh->b_size;
 157                                 bdev = bh->b_bdev;
 158                                 /*
 159                                  * We allow uninitialized buffers for writes
 160                                  * beyond EOF as those cannot race with faults
 161                                  */
 162                                 WARN_ON_ONCE(
 163                                         (buffer_new(bh) && block < file_blks) ||
 164                                         (rw == WRITE && buffer_unwritten(bh)));
 165                         } else {
 166                                 unsigned done = bh->b_size -
 167                                                 (bh_max - (pos - first));
 168                                 bh->b_blocknr += done >> blkbits;
 169                                 bh->b_size -= done;
 170                         }
 171
 172                         hole = rw == READ && !buffer_written(bh);
 173                         if (hole) {
 174                                 size = bh->b_size - first;
 175                         } else {
 176                                 dax_unmap_atomic(bdev, &dax);
 177                                 dax.sector = to_sector(bh, inode);
 178                                 dax.size = bh->b_size;
 179                                 map_len = dax_map_atomic(bdev, &dax);
 180                                 if (map_len < 0) {
 181                                         rc = map_len;
 182                                         break;
 183                                 }
 184                                 dax.addr += first;
 185                                 size = map_len - first;
 186                         }
 187                         max = min(pos + size, end);
 188                 }
 189
 190                 if (iov_iter_rw(iter) == WRITE) {
 191                         len = copy_from_iter_pmem(dax.addr, max - pos, iter);
 192                         need_wmb = true;
 193                 } else if (!hole)
 194                         len = copy_to_iter((void __force *) dax.addr, max - pos,
 195                                         iter);
 196                 else
 197                         len = iov_iter_zero(max - pos, iter);
 198
 199                 if (!len) {
 200                         rc = -EFAULT;
 201                         break;
 202                 }
 203
 204                 pos += len;
 205                 if (!IS_ERR(dax.addr))
 206                         dax.addr += len;
 207         }
 208
 209         if (need_wmb)
 210                 wmb_pmem();
 211         dax_unmap_atomic(bdev, &dax);
 212
 213         return (pos == start) ? rc : pos - start;
 214 }
 215
 216 /**
 217  * dax_do_io - Perform I/O to a DAX file
 218  * @iocb: The control block for this I/O
 219  * @inode: The file which the I/O is directed at
 220  * @iter: The addresses to do I/O from or to
 221  * @pos: The file offset where the I/O starts
 222  * @get_block: The filesystem method used to translate file offsets to blocks
 223  * @end_io: A filesystem callback for I/O completion
 224  * @flags: See below
 225  *
 226  * This function uses the same locking scheme as do_blockdev_direct_IO:
 227  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 228  * caller for writes.  For reads, we take and release the i_mutex ourselves.
 229  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 230  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 231  * is in progress.
 232  */
 233 ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 234                   struct iov_iter *iter, loff_t pos, get_block_t get_block,
 235                   dio_iodone_t end_io, int flags)
 236 {
 237         struct buffer_head bh;
 238         ssize_t retval = -EINVAL;
 239         loff_t end = pos + iov_iter_count(iter);
 240
 241         memset(&bh, 0, sizeof(bh));
 242         bh.b_bdev = inode->i_sb->s_bdev;
 243
 244         if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
 245                 inode_lock(inode);
 246
 247         /* Protects against truncate */
 248         if (!(flags & DIO_SKIP_DIO_COUNT))
 249                 inode_dio_begin(inode);
 250
 251         retval = dax_io(inode, iter, pos, end, get_block, &bh);
 252
 253         if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
 254                 inode_unlock(inode);
 255
 256         if (end_io) {
 257                 int err;
 258
 259                 err = end_io(iocb, pos, retval, bh.b_private);
 260                 if (err)
 261                         retval = err;
 262         }
 263
 264         if (!(flags & DIO_SKIP_DIO_COUNT))
 265                 inode_dio_end(inode);
 266         return retval;
 267 }
 268 EXPORT_SYMBOL_GPL(dax_do_io);
 269
 270 /*
 271  * The user has performed a load from a hole in the file.  Allocating
 272  * a new page in the file would cause excessive storage usage for
 273  * workloads with sparse files.  We allocate a page cache page instead.
 274  * We'll kick it out of the page cache if it's ever written to,
 275  * otherwise it will simply fall out of the page cache under memory
 276  * pressure without ever having been dirtied.
 277  */
 278 static int dax_load_hole(struct address_space *mapping, struct page *page,
 279                                                         struct vm_fault *vmf)
 280 {
 281         if (!page)
 282                 page = find_or_create_page(mapping, vmf->pgoff,
 283                                                 GFP_KERNEL | __GFP_ZERO);
 284         if (!page)
 285                 return VM_FAULT_OOM;
 286
 287         vmf->page = page;
 288         return VM_FAULT_LOCKED;
 289 }
 290
 291 static int copy_user_bh(struct page *to, struct inode *inode,
 292                 struct buffer_head *bh, unsigned long vaddr)
 293 {
 294         struct blk_dax_ctl dax = {
 295                 .sector = to_sector(bh, inode),
 296                 .size = bh->b_size,
 297         };
 298         struct block_device *bdev = bh->b_bdev;
 299         void *vto;
 300
 301         if (dax_map_atomic(bdev, &dax) < 0)
 302                 return PTR_ERR(dax.addr);
 303         vto = kmap_atomic(to);
 304         copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
 305         kunmap_atomic(vto);
 306         dax_unmap_atomic(bdev, &dax);
 307         return 0;
 308 }
 309
 310 #define NO_SECTOR -1
 311 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 312
 313 static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
 314                 sector_t sector, bool pmd_entry, bool dirty)
 315 {
 316         struct radix_tree_root *page_tree = &mapping->page_tree;
 317         pgoff_t pmd_index = DAX_PMD_INDEX(index);
 318         int type, error = 0;
 319         void *entry;
 320
 321         WARN_ON_ONCE(pmd_entry && !dirty);
 322         if (dirty)
 323                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 324
 325         spin_lock_irq(&mapping->tree_lock);
 326
 327         entry = radix_tree_lookup(page_tree, pmd_index);
 328         if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
 329                 index = pmd_index;
 330                 goto dirty;
 331         }
 332
 333         entry = radix_tree_lookup(page_tree, index);
 334         if (entry) {
 335                 type = RADIX_DAX_TYPE(entry);
 336                 if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
 337                                         type != RADIX_DAX_PMD)) {
 338                         error = -EIO;
 339                         goto unlock;
 340                 }
 341
 342                 if (!pmd_entry || type == RADIX_DAX_PMD)
 343                         goto dirty;
 344
 345                 /*
 346                  * We only insert dirty PMD entries into the radix tree.  This
 347                  * means we don't need to worry about removing a dirty PTE
 348                  * entry and inserting a clean PMD entry, thus reducing the
 349                  * range we would flush with a follow-up fsync/msync call.
 350                  */
 351                 radix_tree_delete(&mapping->page_tree, index);
 352                 mapping->nrexceptional--;
 353         }
 354
 355         if (sector == NO_SECTOR) {
 356                 /*
 357                  * This can happen during correct operation if our pfn_mkwrite
 358                  * fault raced against a hole punch operation.  If this
 359                  * happens the pte that was hole punched will have been
 360                  * unmapped and the radix tree entry will have been removed by
 361                  * the time we are called, but the call will still happen.  We
 362                  * will return all the way up to wp_pfn_shared(), where the
 363                  * pte_same() check will fail, eventually causing page fault
 364                  * to be retried by the CPU.
 365                  */
 366                 goto unlock;
 367         }
 368
 369         error = radix_tree_insert(page_tree, index,
 370                         RADIX_DAX_ENTRY(sector, pmd_entry));
 371         if (error)
 372                 goto unlock;
 373
 374         mapping->nrexceptional++;
 375  dirty:
 376         if (dirty)
 377                 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 378  unlock:
 379         spin_unlock_irq(&mapping->tree_lock);
 380         return error;
 381 }
 382
 383 static int dax_writeback_one(struct block_device *bdev,
 384                 struct address_space *mapping, pgoff_t index, void *entry)
 385 {
 386         struct radix_tree_root *page_tree = &mapping->page_tree;
 387         int type = RADIX_DAX_TYPE(entry);
 388         struct radix_tree_node *node;
 389         struct blk_dax_ctl dax;
 390         void **slot;
 391         int ret = 0;
 392
 393         spin_lock_irq(&mapping->tree_lock);
 394         /*
 395          * Regular page slots are stabilized by the page lock even
 396          * without the tree itself locked.  These unlocked entries
 397          * need verification under the tree lock.
 398          */
 399         if (!__radix_tree_lookup(page_tree, index, &node, &slot))
 400                 goto unlock;
 401         if (*slot != entry)
 402                 goto unlock;
 403
 404         /* another fsync thread may have already written back this entry */
 405         if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
 406                 goto unlock;
 407
 408         if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
 409                 ret = -EIO;
 410                 goto unlock;
 411         }
 412
 413         dax.sector = RADIX_DAX_SECTOR(entry);
 414         dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
 415         spin_unlock_irq(&mapping->tree_lock);
 416
 417         /*
 418          * We cannot hold tree_lock while calling dax_map_atomic() because it
 419          * eventually calls cond_resched().
 420          */
 421         ret = dax_map_atomic(bdev, &dax);
 422         if (ret < 0)
 423                 return ret;
 424
 425         if (WARN_ON_ONCE(ret < dax.size)) {
 426                 ret = -EIO;
 427                 goto unmap;
 428         }
 429
 430         wb_cache_pmem(dax.addr, dax.size);
 431
 432         spin_lock_irq(&mapping->tree_lock);
 433         radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
 434         spin_unlock_irq(&mapping->tree_lock);
 435  unmap:
 436         dax_unmap_atomic(bdev, &dax);
 437         return ret;
 438
 439  unlock:
 440         spin_unlock_irq(&mapping->tree_lock);
 441         return ret;
 442 }
 443
 444 /*
 445  * Flush the mapping to the persistent domain within the byte range of [start,
 446  * end]. This is required by data integrity operations to ensure file data is
 447  * on persistent storage prior to completion of the operation.
 448  */
 449 int dax_writeback_mapping_range(struct address_space *mapping,
 450                 struct block_device *bdev, struct writeback_control *wbc)
 451 {
 452         struct inode *inode = mapping->host;
 453         pgoff_t start_index, end_index, pmd_index;
 454         pgoff_t indices[PAGEVEC_SIZE];
 455         struct pagevec pvec;
 456         bool done = false;
 457         int i, ret = 0;
 458         void *entry;
 459
 460         if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
 461                 return -EIO;
 462
 463         if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 464                 return 0;
 465
 466         start_index = wbc->range_start >> PAGE_SHIFT;
 467         end_index = wbc->range_end >> PAGE_SHIFT;
 468         pmd_index = DAX_PMD_INDEX(start_index);
 469
 470         rcu_read_lock();
 471         entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
 472         rcu_read_unlock();
 473
 474         /* see if the start of our range is covered by a PMD entry */
 475         if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
 476                 start_index = pmd_index;
 477
 478         tag_pages_for_writeback(mapping, start_index, end_index);
 479
 480         pagevec_init(&pvec, 0);
 481         while (!done) {
 482                 pvec.nr = find_get_entries_tag(mapping, start_index,
 483                                 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
 484                                 pvec.pages, indices);
 485
 486                 if (pvec.nr == 0)
 487                         break;
 488
 489                 for (i = 0; i < pvec.nr; i++) {
 490                         if (indices[i] > end_index) {
 491                                 done = true;
 492                                 break;
 493                         }
 494
 495                         ret = dax_writeback_one(bdev, mapping, indices[i],
 496                                         pvec.pages[i]);
 497                         if (ret < 0)
 498                                 return ret;
 499                 }
 500         }
 501         wmb_pmem();
 502         return 0;
 503 }
 504 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 505
 506 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 507                         struct vm_area_struct *vma, struct vm_fault *vmf)
 508 {
 509         unsigned long vaddr = (unsigned long)vmf->virtual_address;
 510         struct address_space *mapping = inode->i_mapping;
 511         struct block_device *bdev = bh->b_bdev;
 512         struct blk_dax_ctl dax = {
 513                 .sector = to_sector(bh, inode),
 514                 .size = bh->b_size,
 515         };
 516         int error;
 517
 518         i_mmap_lock_read(mapping);
 519
 520         if (dax_map_atomic(bdev, &dax) < 0) {
 521                 error = PTR_ERR(dax.addr);
 522                 goto out;
 523         }
 524         dax_unmap_atomic(bdev, &dax);
 525
 526         error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
 527                         vmf->flags & FAULT_FLAG_WRITE);
 528         if (error)
 529                 goto out;
 530
 531         error = vm_insert_mixed(vma, vaddr, dax.pfn);
 532
 533  out:
 534         i_mmap_unlock_read(mapping);
 535
 536         return error;
 537 }
 538
 539 /**
 540  * __dax_fault - handle a page fault on a DAX file
 541  * @vma: The virtual memory area where the fault occurred
 542  * @vmf: The description of the fault
 543  * @get_block: The filesystem method used to translate file offsets to blocks
 544  *
 545  * When a page fault occurs, filesystems may call this helper in their
 546  * fault handler for DAX files. __dax_fault() assumes the caller has done all
 547  * the necessary locking for the page fault to proceed successfully.
 548  */
 549 int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 550                         get_block_t get_block)
 551 {
 552         struct file *file = vma->vm_file;
 553         struct address_space *mapping = file->f_mapping;
 554         struct inode *inode = mapping->host;
 555         struct page *page;
 556         struct buffer_head bh;
 557         unsigned long vaddr = (unsigned long)vmf->virtual_address;
 558         unsigned blkbits = inode->i_blkbits;
 559         sector_t block;
 560         pgoff_t size;
 561         int error;
 562         int major = 0;
 563
 564         size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 565         if (vmf->pgoff >= size)
 566                 return VM_FAULT_SIGBUS;
 567
 568         memset(&bh, 0, sizeof(bh));
 569         block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
 570         bh.b_bdev = inode->i_sb->s_bdev;
 571         bh.b_size = PAGE_SIZE;
 572
 573  repeat:
 574         page = find_get_page(mapping, vmf->pgoff);
 575         if (page) {
 576                 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
 577                         put_page(page);
 578                         return VM_FAULT_RETRY;
 579                 }
 580                 if (unlikely(page->mapping != mapping)) {
 581                         unlock_page(page);
 582                         put_page(page);
 583                         goto repeat;
 584                 }
 585         }
 586
 587         error = get_block(inode, block, &bh, 0);
 588         if (!error && (bh.b_size < PAGE_SIZE))
 589                 error = -EIO;           /* fs corruption? */
 590         if (error)
 591                 goto unlock_page;
 592
 593         if (!buffer_mapped(&bh) && !vmf->cow_page) {
 594                 if (vmf->flags & FAULT_FLAG_WRITE) {
 595                         error = get_block(inode, block, &bh, 1);
 596                         count_vm_event(PGMAJFAULT);
 597                         mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 598                         major = VM_FAULT_MAJOR;
 599                         if (!error && (bh.b_size < PAGE_SIZE))
 600                                 error = -EIO;
 601                         if (error)
 602                                 goto unlock_page;
 603                 } else {
 604                         return dax_load_hole(mapping, page, vmf);
 605                 }
 606         }
 607
 608         if (vmf->cow_page) {
 609                 struct page *new_page = vmf->cow_page;
 610                 if (buffer_written(&bh))
 611                         error = copy_user_bh(new_page, inode, &bh, vaddr);
 612                 else
 613                         clear_user_highpage(new_page, vaddr);
 614                 if (error)
 615                         goto unlock_page;
 616                 vmf->page = page;
 617                 if (!page)
 618                         i_mmap_lock_read(mapping);
 619                 return VM_FAULT_LOCKED;
 620         }
 621
 622         /* Check we didn't race with a read fault installing a new page */
 623         if (!page && major)
 624                 page = find_lock_page(mapping, vmf->pgoff);
 625
 626         if (page) {
 627                 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
 628                                                         PAGE_SIZE, 0);
 629                 delete_from_page_cache(page);
 630                 unlock_page(page);
 631                 put_page(page);
 632                 page = NULL;
 633         }
 634
 635         /* Filesystem should not return unwritten buffers to us! */
 636         WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
 637         error = dax_insert_mapping(inode, &bh, vma, vmf);
 638
 639  out:
 640         if (error == -ENOMEM)
 641                 return VM_FAULT_OOM | major;
 642         /* -EBUSY is fine, somebody else faulted on the same PTE */
 643         if ((error < 0) && (error != -EBUSY))
 644                 return VM_FAULT_SIGBUS | major;
 645         return VM_FAULT_NOPAGE | major;
 646
 647  unlock_page:
 648         if (page) {
 649                 unlock_page(page);
 650                 put_page(page);
 651         }
 652         goto out;
 653 }
 654 EXPORT_SYMBOL(__dax_fault);
 655
 656 /**
 657  * dax_fault - handle a page fault on a DAX file
 658  * @vma: The virtual memory area where the fault occurred
 659  * @vmf: The description of the fault
 660  * @get_block: The filesystem method used to translate file offsets to blocks
 661  *
 662  * When a page fault occurs, filesystems may call this helper in their
 663  * fault handler for DAX files.
 664  */
 665 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 666               get_block_t get_block)
 667 {
 668         int result;
 669         struct super_block *sb = file_inode(vma->vm_file)->i_sb;
 670
 671         if (vmf->flags & FAULT_FLAG_WRITE) {
 672                 sb_start_pagefault(sb);
 673                 file_update_time(vma->vm_file);
 674         }
 675         result = __dax_fault(vma, vmf, get_block);
 676         if (vmf->flags & FAULT_FLAG_WRITE)
 677                 sb_end_pagefault(sb);
 678
 679         return result;
 680 }
 681 EXPORT_SYMBOL_GPL(dax_fault);
 682
 683 #if defined(CONFIG_TRANSPARENT_HUGEPAGE)
 684 /*
 685  * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 686  * more often than one might expect in the below function.
 687  */
 688 #define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)
 689
 690 static void __dax_dbg(struct buffer_head *bh, unsigned long address,
 691                 const char *reason, const char *fn)
 692 {
 693         if (bh) {
 694                 char bname[BDEVNAME_SIZE];
 695                 bdevname(bh->b_bdev, bname);
 696                 pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
 697                         "length %zd fallback: %s\n", fn, current->comm,
 698                         address, bname, bh->b_state, (u64)bh->b_blocknr,
 699                         bh->b_size, reason);
 700         } else {
 701                 pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
 702                         current->comm, address, reason);
 703         }
 704 }
 705
 706 #define dax_pmd_dbg(bh, address, reason)        __dax_dbg(bh, address, reason, "dax_pmd")
 707
 708 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 709                 pmd_t *pmd, unsigned int flags, get_block_t get_block)
 710 {
 711         struct file *file = vma->vm_file;
 712         struct address_space *mapping = file->f_mapping;
 713         struct inode *inode = mapping->host;
 714         struct buffer_head bh;
 715         unsigned blkbits = inode->i_blkbits;
 716         unsigned long pmd_addr = address & PMD_MASK;
 717         bool write = flags & FAULT_FLAG_WRITE;
 718         struct block_device *bdev;
 719         pgoff_t size, pgoff;
 720         sector_t block;
 721         int error, result = 0;
 722         bool alloc = false;
 723
 724         /* dax pmd mappings require pfn_t_devmap() */
 725         if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
 726                 return VM_FAULT_FALLBACK;
 727
 728         /* Fall back to PTEs if we're going to COW */
 729         if (write && !(vma->vm_flags & VM_SHARED)) {
 730                 split_huge_pmd(vma, pmd, address);
 731                 dax_pmd_dbg(NULL, address, "cow write");
 732                 return VM_FAULT_FALLBACK;
 733         }
 734         /* If the PMD would extend outside the VMA */
 735         if (pmd_addr < vma->vm_start) {
 736                 dax_pmd_dbg(NULL, address, "vma start unaligned");
 737                 return VM_FAULT_FALLBACK;
 738         }
 739         if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
 740                 dax_pmd_dbg(NULL, address, "vma end unaligned");
 741                 return VM_FAULT_FALLBACK;
 742         }
 743
 744         pgoff = linear_page_index(vma, pmd_addr);
 745         size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 746         if (pgoff >= size)
 747                 return VM_FAULT_SIGBUS;
 748         /* If the PMD would cover blocks out of the file */
 749         if ((pgoff | PG_PMD_COLOUR) >= size) {
 750                 dax_pmd_dbg(NULL, address,
 751                                 "offset + huge page size > file size");
 752                 return VM_FAULT_FALLBACK;
 753         }
 754
 755         memset(&bh, 0, sizeof(bh));
 756         bh.b_bdev = inode->i_sb->s_bdev;
 757         block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 758
 759         bh.b_size = PMD_SIZE;
 760
 761         if (get_block(inode, block, &bh, 0) != 0)
 762                 return VM_FAULT_SIGBUS;
 763
 764         if (!buffer_mapped(&bh) && write) {
 765                 if (get_block(inode, block, &bh, 1) != 0)
 766                         return VM_FAULT_SIGBUS;
 767                 alloc = true;
 768                 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
 769         }
 770
 771         bdev = bh.b_bdev;
 772
 773         /*
 774          * If the filesystem isn't willing to tell us the length of a hole,
 775          * just fall back to PTEs.  Calling get_block 512 times in a loop
 776          * would be silly.
 777          */
 778         if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
 779                 dax_pmd_dbg(&bh, address, "allocated block too small");
 780                 return VM_FAULT_FALLBACK;
 781         }
 782
 783         /*
 784          * If we allocated new storage, make sure no process has any
 785          * zero pages covering this hole
 786          */
 787         if (alloc) {
 788                 loff_t lstart = pgoff << PAGE_SHIFT;
 789                 loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
 790
 791                 truncate_pagecache_range(inode, lstart, lend);
 792         }
 793
 794         i_mmap_lock_read(mapping);
 795
 796         if (!write && !buffer_mapped(&bh)) {
 797                 spinlock_t *ptl;
 798                 pmd_t entry;
 799                 struct page *zero_page = get_huge_zero_page();
 800
 801                 if (unlikely(!zero_page)) {
 802                         dax_pmd_dbg(&bh, address, "no zero page");
 803                         goto fallback;
 804                 }
 805
 806                 ptl = pmd_lock(vma->vm_mm, pmd);
 807                 if (!pmd_none(*pmd)) {
 808                         spin_unlock(ptl);
 809                         dax_pmd_dbg(&bh, address, "pmd already present");
 810                         goto fallback;
 811                 }
 812
 813                 dev_dbg(part_to_dev(bdev->bd_part),
 814                                 "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
 815                                 __func__, current->comm, address,
 816                                 (unsigned long long) to_sector(&bh, inode));
 817
 818                 entry = mk_pmd(zero_page, vma->vm_page_prot);
 819                 entry = pmd_mkhuge(entry);
 820                 set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
 821                 result = VM_FAULT_NOPAGE;
 822                 spin_unlock(ptl);
 823         } else {
 824                 struct blk_dax_ctl dax = {
 825                         .sector = to_sector(&bh, inode),
 826                         .size = PMD_SIZE,
 827                 };
 828                 long length = dax_map_atomic(bdev, &dax);
 829
 830                 if (length < 0) {
 831                         dax_pmd_dbg(&bh, address, "dax-error fallback");
 832                         goto fallback;
 833                 }
 834                 if (length < PMD_SIZE) {
 835                         dax_pmd_dbg(&bh, address, "dax-length too small");
 836                         dax_unmap_atomic(bdev, &dax);
 837                         goto fallback;
 838                 }
 839                 if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
 840                         dax_pmd_dbg(&bh, address, "pfn unaligned");
 841                         dax_unmap_atomic(bdev, &dax);
 842                         goto fallback;
 843                 }
 844
 845                 if (!pfn_t_devmap(dax.pfn)) {
 846                         dax_unmap_atomic(bdev, &dax);
 847                         dax_pmd_dbg(&bh, address, "pfn not in memmap");
 848                         goto fallback;
 849                 }
 850                 dax_unmap_atomic(bdev, &dax);
 851
 852                 /*
 853                  * For PTE faults we insert a radix tree entry for reads, and
 854                  * leave it clean.  Then on the first write we dirty the radix
 855                  * tree entry via the dax_pfn_mkwrite() path.  This sequence
 856                  * allows the dax_pfn_mkwrite() call to be simpler and avoid a
 857                  * call into get_block() to translate the pgoff to a sector in
 858                  * order to be able to create a new radix tree entry.
 859                  *
 860                  * The PMD path doesn't have an equivalent to
 861                  * dax_pfn_mkwrite(), though, so for a read followed by a
 862                  * write we traverse all the way through __dax_pmd_fault()
 863                  * twice.  This means we can just skip inserting a radix tree
 864                  * entry completely on the initial read and just wait until
 865                  * the write to insert a dirty entry.
 866                  */
 867                 if (write) {
 868                         error = dax_radix_entry(mapping, pgoff, dax.sector,
 869                                         true, true);
 870                         if (error) {
 871                                 dax_pmd_dbg(&bh, address,
 872                                                 "PMD radix insertion failed");
 873                                 goto fallback;
 874                         }
 875                 }
 876
 877                 dev_dbg(part_to_dev(bdev->bd_part),
 878                                 "%s: %s addr: %lx pfn: %lx sect: %llx\n",
 879                                 __func__, current->comm, address,
 880                                 pfn_t_to_pfn(dax.pfn),
 881                                 (unsigned long long) dax.sector);
 882                 result |= vmf_insert_pfn_pmd(vma, address, pmd,
 883                                 dax.pfn, write);
 884         }
 885
 886  out:
 887         i_mmap_unlock_read(mapping);
 888
 889         return result;
 890
 891  fallback:
 892         count_vm_event(THP_FAULT_FALLBACK);
 893         result = VM_FAULT_FALLBACK;
 894         goto out;
 895 }
 896 EXPORT_SYMBOL_GPL(__dax_pmd_fault);
 897
 898 /**
 899  * dax_pmd_fault - handle a PMD fault on a DAX file
 900  * @vma: The virtual memory area where the fault occurred
 901  * @vmf: The description of the fault
 902  * @get_block: The filesystem method used to translate file offsets to blocks
 903  *
 904  * When a page fault occurs, filesystems may call this helper in their
 905  * pmd_fault handler for DAX files.
 906  */
 907 int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 908                         pmd_t *pmd, unsigned int flags, get_block_t get_block)
 909 {
 910         int result;
 911         struct super_block *sb = file_inode(vma->vm_file)->i_sb;
 912
 913         if (flags & FAULT_FLAG_WRITE) {
 914                 sb_start_pagefault(sb);
 915                 file_update_time(vma->vm_file);
 916         }
 917         result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
 918         if (flags & FAULT_FLAG_WRITE)
 919                 sb_end_pagefault(sb);
 920
 921         return result;
 922 }
 923 EXPORT_SYMBOL_GPL(dax_pmd_fault);
 924 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 925
 926 /**
 927  * dax_pfn_mkwrite - handle first write to DAX page
 928  * @vma: The virtual memory area where the fault occurred
 929  * @vmf: The description of the fault
 930  */
 931 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 932 {
 933         struct file *file = vma->vm_file;
 934         int error;
 935
 936         /*
 937          * We pass NO_SECTOR to dax_radix_entry() because we expect that a
 938          * RADIX_DAX_PTE entry already exists in the radix tree from a
 939          * previous call to __dax_fault().  We just want to look up that PTE
 940          * entry using vmf->pgoff and make sure the dirty tag is set.  This
 941          * saves us from having to make a call to get_block() here to look
 942          * up the sector.
 943          */
 944         error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
 945                         true);
 946
 947         if (error == -ENOMEM)
 948                 return VM_FAULT_OOM;
 949         if (error)
 950                 return VM_FAULT_SIGBUS;
 951         return VM_FAULT_NOPAGE;
 952 }
 953 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
 954
 955 static bool dax_range_is_aligned(struct block_device *bdev,
 956                                  unsigned int offset, unsigned int length)
 957 {
 958         unsigned short sector_size = bdev_logical_block_size(bdev);
 959
 960         if (!IS_ALIGNED(offset, sector_size))
 961                 return false;
 962         if (!IS_ALIGNED(length, sector_size))
 963                 return false;
 964
 965         return true;
 966 }
 967
 968 int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 969                 unsigned int offset, unsigned int length)
 970 {
 971         struct blk_dax_ctl dax = {
 972                 .sector         = sector,
 973                 .size           = PAGE_SIZE,
 974         };
 975
 976         if (dax_range_is_aligned(bdev, offset, length)) {
 977                 sector_t start_sector = dax.sector + (offset >> 9);
 978
 979                 return blkdev_issue_zeroout(bdev, start_sector,
 980                                 length >> 9, GFP_NOFS, true);
 981         } else {
 982                 if (dax_map_atomic(bdev, &dax) < 0)
 983                         return PTR_ERR(dax.addr);
 984                 clear_pmem(dax.addr + offset, length);
 985                 wmb_pmem();
 986                 dax_unmap_atomic(bdev, &dax);
 987         }
 988         return 0;
 989 }
 990 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 991
 992 /**
 993  * dax_zero_page_range - zero a range within a page of a DAX file
 994  * @inode: The file being truncated
 995  * @from: The file offset that is being truncated to
 996  * @length: The number of bytes to zero
 997  * @get_block: The filesystem method used to translate file offsets to blocks
 998  *
 999  * This function can be called by a filesystem when it is zeroing part of a
1000  * page in a DAX file.  This is intended for hole-punch operations.  If
1001  * you are truncating a file, the helper function dax_truncate_page() may be
1002  * more convenient.
1003  */
1004 int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1005                                                         get_block_t get_block)
1006 {
1007         struct buffer_head bh;
1008         pgoff_t index = from >> PAGE_SHIFT;
1009         unsigned offset = from & (PAGE_SIZE-1);
1010         int err;
1011
1012         /* Block boundary? Nothing to do */
1013         if (!length)
1014                 return 0;
1015         BUG_ON((offset + length) > PAGE_SIZE);
1016
1017         memset(&bh, 0, sizeof(bh));
1018         bh.b_bdev = inode->i_sb->s_bdev;
1019         bh.b_size = PAGE_SIZE;
1020         err = get_block(inode, index, &bh, 0);
1021         if (err < 0 || !buffer_written(&bh))
1022                 return err;
1023
1024         return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1025                         offset, length);
1026 }
1027 EXPORT_SYMBOL_GPL(dax_zero_page_range);
1028
1029 /**
1030  * dax_truncate_page - handle a partial page being truncated in a DAX file
1031  * @inode: The file being truncated
1032  * @from: The file offset that is being truncated to
1033  * @get_block: The filesystem method used to translate file offsets to blocks
1034  *
1035  * Similar to block_truncate_page(), this function can be called by a
1036  * filesystem when it is truncating a DAX file to handle the partial page.
1037  */
1038 int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
1039 {
1040         unsigned length = PAGE_ALIGN(from) - from;
1041         return dax_zero_page_range(inode, from, length, get_block);
1042 }
1043 EXPORT_SYMBOL_GPL(dax_truncate_page);