/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

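/*
 * Illustrative sketch (not used anywhere in this file): the expected
 * dax_map_atomic()/dax_unmap_atomic() pairing.  The caller describes the
 * range with a struct blk_dax_ctl; on success dax.addr and dax.pfn are
 * valid for the returned number of bytes, and the queue reference taken
 * by dax_map_atomic() must be dropped with dax_unmap_atomic().  On failure
 * dax.addr holds the ERR_PTR() and no unmap is required.
 *
 *	struct blk_dax_ctl dax = { .sector = sector, .size = PAGE_SIZE };
 *	long len = dax_map_atomic(bdev, &dax);
 *
 *	if (len < 0)
 *		return len;
 *	... access up to len bytes at dax.addr / dax.pfn ...
 *	dax_unmap_atomic(bdev, &dax);
 */
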
static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_HZP;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}

/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

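/*
 * Worked example of the alignment above (illustrative only): with 4k pages
 * and a 2MiB PMD, PMD_SHIFT - PAGE_SHIFT is 9, so a PMD entry covering file
 * offsets 0x200-0x3ff has every index in that range masked down to 0x200.
 * All 512 offsets therefore hash to the same waitqueue and contend on the
 * same entry lock, while a PTE entry hashes with its own index.
 */
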
static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
				       int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * Check whether the given slot is locked. The function must be called with
 * mapping->tree_lock held
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot is locked. The function must be called with
 * mapping->tree_lock held
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot is unlocked. The function must be called with
 * mapping->tree_lock held
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Lookup entry in radix tree, wait for it to become unlocked if it is
 * exceptional entry and return it. The caller must call
 * put_unlocked_mapping_entry() when he decided not to lock the entry or
 * put_locked_mapping_entry() when he locked the entry and now wants to
 * unlock it.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					    &slot);
		if (!entry || !radix_tree_exceptional_entry(entry) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}

static void dax_unlock_mapping_entry(struct address_space *mapping,
				     pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
				     pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry)) {
		unlock_page(entry);
		put_page(entry);
	} else {
		dax_unlock_mapping_entry(mapping, index);
	}
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!radix_tree_exceptional_entry(entry))
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

/*
 * Find radix tree entry at given index. If it points to a page, return with
 * the page locked. If it points to the exceptional entry, return with the
 * radix tree entry locked. If the radix tree doesn't contain given index,
 * create empty exceptional entry for the index and return with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries (either zero pages or DAX entries)
 * within the 2MiB range that we are requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (!radix_tree_exceptional_entry(entry) ||
			    dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (radix_tree_exceptional_entry(entry) &&
			    dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index, entry);
			return ERR_PTR(err);
		}
		spin_lock_irq(&mapping->tree_lock);

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Someone already created the entry?  This is a
			 * normal failure when inserting PMDs in a range
			 * that already contains PTEs.  In that case we want
			 * to return -EEXIST immediately.
			 */
			if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD))
				goto restart;
			/*
			 * Our insertion of a DAX PMD entry failed, most
			 * likely because it collided with a PTE sized entry
			 * at a different index in the PMD range.  We haven't
			 * inserted anything into the radix tree and have no
			 * waiters to wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	/* Normal page in radix tree? */
	if (!radix_tree_exceptional_entry(entry)) {
		struct page *page = entry;

		get_page(page);
		spin_unlock_irq(&mapping->tree_lock);
		lock_page(page);
		/* Page got truncated? Retry... */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto restart;
		}
		return page;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}

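/*
 * Typical caller pattern for grab_mapping_entry() (a sketch; the real fault
 * handlers below wrap it with iomap handling).  The entry comes back locked,
 * so it must always be released with put_locked_mapping_entry():
 *
 *	entry = grab_mapping_entry(mapping, index, 0);
 *	if (IS_ERR(entry))
 *		return dax_fault_return(PTR_ERR(entry));
 *	... install the PTE / copy data while the entry is locked ...
 *	put_locked_mapping_entry(mapping, index, entry);
 */
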
/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}

static int __dax_invalidate_mapping_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	int ret = 0;
	void *entry;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || !radix_tree_exceptional_entry(entry))
		goto out;
	if (!trunc &&
	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}
/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen exceptional entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate exceptional DAX entry if easily possible. This handles DAX
 * entries for invalidate_inode_pages() so we evict the entry only if we can
 * do so without blocking.
 */
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = 0;
	void *entry, **slot;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry) ||
	    slot_locked(mapping, slot))
		goto out;
	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	spin_unlock_irq(&mapping->tree_lock);
	if (ret)
		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
	return ret;
}

/*
 * Invalidate exceptional DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_mapping_entry(mapping, index, false);
}

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, void **entry,
			 struct vm_fault *vmf)
{
	struct page *page;
	int ret;

	/* Hole page already exists? Return it... */
	if (!radix_tree_exceptional_entry(*entry)) {
		page = *entry;
		goto out;
	}

	/* This will replace locked radix tree entry with a hole page */
	page = find_or_create_page(mapping, vmf->pgoff,
				   vmf->gfp_mask | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
 out:
	vmf->page = page;
	ret = finish_fault(vmf);
	vmf->page = NULL;
	*entry = page;
	if (!ret) {
		/* Grab reference for PTE that is now referencing the page */
		get_page(page);
		return VM_FAULT_NOPAGE;
	}
	return ret;
}

static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
			 struct page *to, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int error = 0;
	bool hole_fill = false;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	/* Replacing hole page with block mapping? */
	if (!radix_tree_exceptional_entry(entry)) {
		hole_fill = true;
		/*
		 * Unmap the page now before we remove it from page cache below.
		 * The page is locked so it cannot be faulted in again.
		 */
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
				    PAGE_SIZE, 0);
		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
		if (error)
			return ERR_PTR(error);
	} else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
		/* replacing huge zero page with PMD block mapping */
		unmap_mapping_range(mapping,
			(vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (hole_fill) {
		__delete_from_page_cache(entry, NULL);
		/* Drop pagecache reference */
		put_page(entry);
		error = __radix_tree_insert(page_tree, index,
				dax_radix_order(new_entry), new_entry);
		if (error) {
			new_entry = ERR_PTR(error);
			goto unlock;
		}
		mapping->nrexceptional++;
	} else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		struct radix_tree_node *node;
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
		WARN_ON_ONCE(ret != entry);
		__radix_tree_replace(page_tree, node, slot,
				     new_entry, NULL, NULL);
	}
	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	if (hole_fill) {
		radix_tree_preload_end();
		/*
		 * We don't need hole page anymore, it has been replaced with
		 * locked radix tree entry now.
		 */
		if (mapping->a_ops->freepage)
			mapping->a_ops->freepage(entry);
		unlock_page(entry);
		put_page(entry);
	}
	return new_entry;
}

static inline unsigned long
pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	return address;
}

/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_mapping_entry_mkclean(struct address_space *mapping,
				      pgoff_t index, unsigned long pfn)
{
	struct vm_area_struct *vma;
	pte_t pte, *ptep = NULL;
	pmd_t *pmdp = NULL;
	spinlock_t *ptl;
	bool changed;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
		unsigned long address;

		cond_resched();

		if (!(vma->vm_flags & VM_SHARED))
			continue;

		address = pgoff_address(index, vma);
		changed = false;
		if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
			continue;

		if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
			pmd_t pmd;

			if (pfn != pmd_pfn(*pmdp))
				goto unlock_pmd;
			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
				goto unlock_pmd;

			flush_cache_page(vma, address, pfn);
			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
			pmd = pmd_wrprotect(pmd);
			pmd = pmd_mkclean(pmd);
			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
			changed = true;
unlock_pmd:
			spin_unlock(ptl);
#endif
		} else {
			if (pfn != pte_pfn(*ptep))
				goto unlock_pte;
			if (!pte_dirty(*ptep) && !pte_write(*ptep))
				goto unlock_pte;

			flush_cache_page(vma, address, pfn);
			pte = ptep_clear_flush(vma, address, ptep);
			pte = pte_wrprotect(pte);
			pte = pte_mkclean(pte);
			set_pte_at(vma->vm_mm, address, ptep, pte);
			changed = true;
unlock_pte:
			pte_unmap_unlock(ptep, ptl);
		}

		if (changed)
			mmu_notifier_invalidate_page(vma->vm_mm, address);
	}
	i_mmap_unlock_read(mapping);
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	struct blk_dax_ctl dax;
	void *entry2, **slot;
	int ret = 0;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
		return -EIO;

	spin_lock_irq(&mapping->tree_lock);
	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
	/* Entry got punched out / reallocated? */
	if (!entry2 || !radix_tree_exceptional_entry(entry2))
		goto put_unlocked;
	/*
	 * Entry got reallocated elsewhere? No need to writeback. We have to
	 * compare sectors as we must not bail out due to difference in lockbit
	 * or entry type.
	 */
	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
		goto put_unlocked;
	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto put_unlocked;
	}

	/* Another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto put_unlocked;
	/* Lock the entry to serialize with page faults */
	entry = lock_slot(mapping, slot);
	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under tree_lock and once they do that they will
	 * see the entry locked and wait for it to unlock.
	 */
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	dax.sector = dax_radix_sector(entry);
	dax.size = PAGE_SIZE << dax_radix_order(entry);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0) {
		put_locked_mapping_entry(mapping, index, entry);
		return ret;
	}

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
	wb_cache_pmem(dax.addr, dax.size);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	put_locked_mapping_entry(mapping, index, entry);
	return ret;

 put_unlocked:
	put_unlocked_mapping_entry(mapping, index, entry2);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

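/*
 * Example usage (a sketch, not taken verbatim from any filesystem): a DAX
 * aware filesystem typically calls dax_writeback_mapping_range() from its
 * ->writepages() method, passing the block device the mapping lives on:
 *
 *	static int example_writepages(struct address_space *mapping,
 *				      struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 *
 * 'example_writepages' is a hypothetical name; ext2, ext4 and xfs each wire
 * this up in their own ->writepages() when the inode is IS_DAX().
 */
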
static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, sector_t sector, size_t size,
		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = vmf->address;
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = size,
	};
	void *ret;
	void *entry = *entryp;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	*entryp = ret;

	return vm_insert_mixed(vma, vaddr, dax.pfn);
}

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	void *entry, **slot;
	pgoff_t index = vmf->pgoff;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);
	if (!entry || !radix_tree_exceptional_entry(entry)) {
		if (entry)
			put_unlocked_mapping_entry(mapping, index, entry);
		spin_unlock_irq(&mapping->tree_lock);
		return VM_FAULT_NOPAGE;
	}
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	entry = lock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	/*
	 * If we race with somebody updating the PTE and finish_mkwrite_fault()
	 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
	 * the fault in either case.
	 */
	finish_mkwrite_fault(vmf);
	put_locked_mapping_entry(mapping, index, entry);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

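/*
 * Example usage (a sketch with hypothetical names): filesystems hook
 * dax_pfn_mkwrite() up as the ->pfn_mkwrite handler of their DAX
 * vm_operations_struct so the radix tree entry is dirtied on the first
 * write to an already-mapped pfn:
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_fault,
 *		.huge_fault	= example_dax_huge_fault,
 *		.page_mkwrite	= example_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */
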
static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
		unsigned int offset, unsigned int length)
{
	struct blk_dax_ctl dax = {
		.sector = sector,
		.size = PAGE_SIZE,
	};

	if (dax_range_is_aligned(bdev, offset, length)) {
		sector_t start_sector = dax.sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				length >> 9, GFP_NOFS, true);
	} else {
		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		dax_unmap_atomic(bdev, &dax);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);

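/*
 * Example (a sketch): zeroing the tail of a page during truncate.  The
 * caller converts the file position into a sector plus an in-page offset;
 * if offset and length are logical-block aligned the work is pushed down to
 * blkdev_issue_zeroout(), otherwise it is done through the DAX mapping:
 *
 *	unsigned offset = pos & (PAGE_SIZE - 1);
 *	int ret = __dax_zero_page_range(bdev, sector, offset,
 *					PAGE_SIZE - offset);
 */
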
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}

static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
		invalidate_inode_pages2_range(inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		struct blk_dax_ctl dax = { 0 };
		ssize_t map_len;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		dax.sector = dax_iomap_sector(iomap, pos);
		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
		map_len = dax_map_atomic(iomap->bdev, &dax);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		dax.addr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (iov_iter_rw(iter) == WRITE)
			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
		else
			map_len = copy_to_iter(dax.addr, map_len, iter);
		dax_unmap_atomic(iomap->bdev, &dax);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @iter: The addresses to do I/O from or to
 * @ops: iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The callers needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_exclusive(&inode->i_rwsem);
		flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&inode->i_rwsem);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

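/*
 * Example usage (a sketch with hypothetical names): a filesystem read path
 * built on dax_iomap_rw().  The caller supplies its own iomap_ops and is
 * responsible for the inode locking that the lockdep assertions above
 * expect:
 *
 *	static ssize_t example_dax_read_iter(struct kiocb *iocb,
 *					     struct iov_iter *to)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
 *		inode_unlock_shared(inode);
 *		return ret;
 *	}
 *
 * 'example_iomap_ops' stands in for the filesystem's own iomap_ops (e.g.
 * ext2_iomap_ops or xfs_iomap_ops).
 */
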
static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}

static int dax_iomap_pte_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = IOMAP_FAULT;
	int error, major = 0;
	int vmf_ret = 0;
	void *entry;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (pos >= i_size_read(inode))
		return VM_FAULT_SIGBUS;

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	/*
	 * Note that we don't bother to use iomap_apply here: DAX required
	 * the file system block size to be equal the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error)
		return dax_fault_return(error);
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
		goto finish_iomap;
	}

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		vmf_ret = dax_fault_return(PTR_ERR(entry));
		goto finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
					vmf->cow_page, vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto error_unlock_entry;

		__SetPageUptodate(vmf->cow_page);
		vmf_ret = finish_fault(vmf);
		if (!vmf_ret)
			vmf_ret = VM_FAULT_DONE_COW;
		goto unlock_entry;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, sector,
				PAGE_SIZE, &entry, vmf->vma, vmf);
		/* -EBUSY is fine, somebody else faulted on the same PTE */
		if (error == -EBUSY)
			error = 0;
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			vmf_ret = dax_load_hole(mapping, &entry, vmf);
			goto unlock_entry;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 error_unlock_entry:
	vmf_ret = dax_fault_return(error) | major;
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 finish_iomap:
	if (ops->iomap_end) {
		int copied = PAGE_SIZE;

		if (vmf_ret & VM_FAULT_ERROR)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PTE we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
	}
	return vmf_ret;
}

#ifdef CONFIG_FS_DAX_PMD
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below functions.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
		loff_t pos, void **entryp)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct block_device *bdev = iomap->bdev;
	struct inode *inode = mapping->host;
	struct blk_dax_ctl dax = {
		.sector = dax_iomap_sector(iomap, pos),
		.size = PMD_SIZE,
	};
	long length = dax_map_atomic(bdev, &dax);
	void *ret = NULL;

	if (length < 0) /* dax_map_atomic() failed */
		goto fallback;
	if (length < PMD_SIZE)
		goto unmap_fallback;
	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
		goto unmap_fallback;
	if (!pfn_t_devmap(dax.pfn))
		goto unmap_fallback;

	dax_unmap_atomic(bdev, &dax);

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		goto fallback;
	*entryp = ret;

	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);

 unmap_fallback:
	dax_unmap_atomic(bdev, &dax);
fallback:
	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
			dax.pfn, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
		void **entryp)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct inode *inode = mapping->host;
	struct page *zero_page;
	void *ret = NULL;
	spinlock_t *ptl;
	pmd_t pmd_entry;

	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);

	if (unlikely(!zero_page))
		goto fallback;

	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
			RADIX_DAX_PMD | RADIX_DAX_HZP);
	if (IS_ERR(ret))
		goto fallback;
	*entryp = ret;

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
	return VM_FAULT_NOPAGE;

fallback:
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
	return VM_FAULT_FALLBACK;
}

static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	void *entry;
	loff_t pos;
	int error;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	if (pgoff > max_pgoff) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
		goto fallback;

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto fallback;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;

	/*
	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
	 * the tree, for instance), it will return -EEXIST and we just fall
	 * back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto finish_iomap;

	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			goto unlock_entry;
		result = dax_pmd_load_hole(vmf, &iomap, &entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

 unlock_entry:
	put_locked_mapping_entry(mapping, pgoff, entry);
 finish_iomap:
	if (ops->iomap_end) {
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PMD we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
 fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	return result;
}
#else
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
		    const struct iomap_ops *ops)
{
	switch (pe_size) {
	case PE_SIZE_PTE:
		return dax_iomap_pte_fault(vmf, ops);
	case PE_SIZE_PMD:
		return dax_iomap_pmd_fault(vmf, ops);
	default:
		return VM_FAULT_FALLBACK;
	}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
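
/*
 * Example usage (a sketch with hypothetical names): the fault and huge_fault
 * handlers of a DAX-capable filesystem usually reduce to dax_iomap_fault(),
 * with the filesystem taking whatever locks it needs to serialize against
 * truncate before calling in:
 *
 *	static int example_dax_huge_fault(struct vm_fault *vmf,
 *					  enum page_entry_size pe_size)
 *	{
 *		return dax_iomap_fault(vmf, pe_size, &example_iomap_ops);
 *	}
 *
 *	static int example_dax_fault(struct vm_fault *vmf)
 *	{
 *		return example_dax_huge_fault(vmf, PE_SIZE_PTE);
 *	}
 *
 * Real filesystems (e.g. ext4, xfs) additionally take a shared mmap
 * semaphore around the call and account write faults.
 */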