// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>

#include "test_hmm_uapi.h"
#define DMIRROR_NDEVICES		2
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16
static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;
struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};
#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL
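
/*
 * Editor's sketch (hypothetical helper, not used by the test itself):
 * the device "page table" is an XArray of struct page pointers with the
 * access permission encoded in the low bits via xa_tag_pointer(). A
 * stored entry can be decoded like so:
 */
static inline struct page *dmirror_pt_entry_to_page(void *entry,
						    bool *writable)
{
	/* xa_pointer_tag() recovers the tag OR'ed in by xa_tag_pointer(). */
	*writable = xa_pointer_tag(entry) == DPT_XA_TAG_WRITE;
	return xa_untag_pointer(entry);
}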
/*
 * Data structure to track address ranges and register for mmu interval
 * notifier callbacks.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};
/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device	*mdevice;
	struct xarray		pt;
	struct mmu_interval_notifier	notifier;
	struct mutex		mutex;
};
/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
};
struct dmirror_device {
	struct cdev		cdevice;
	struct hmm_devmem	*devmem;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};
static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];
static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}
static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}
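
/*
 * Editor's sketch of the bounce buffer lifecycle ("addr" and "npages"
 * are illustrative):
 *
 *	struct dmirror_bounce bounce;
 *
 *	if (!dmirror_bounce_init(&bounce, addr, npages << PAGE_SHIFT)) {
 *		... stage data in bounce.ptr, counting bounce.cpages ...
 *		dmirror_bounce_fini(&bounce);
 *	}
 */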
static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}
static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}
static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk,
			    pagemap)->mdevice;
}
static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}
static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}
static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}
static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};
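
/*
 * Editor's note: the read side of this notifier pairs
 * mmu_interval_read_begin() with mmu_interval_read_retry() around
 * hmm_range_fault(), retrying whenever an invalidation raced with the
 * fault, roughly:
 *
 *	range->notifier_seq = mmu_interval_read_begin(range->notifier);
 *	ret = hmm_range_fault(range);
 *	mutex_lock(&dmirror->mutex);
 *	if (mmu_interval_read_retry(range->notifier, range->notifier_seq))
 *		... unlock and retry ...
 *
 * dmirror_range_fault() below implements exactly this loop.
 */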
static int dmirror_range_fault(struct dmirror *dmirror,
				struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}
static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}
static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(ptr, tmp, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}
static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;
		void *tmp;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		tmp = kmap(page);
		memcpy(tmp, ptr, PAGE_SIZE);
		kunmap(page);

		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}
static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return false;

	res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
				      "hmm_dmirror");
	if (IS_ERR(res))
		goto err_devmem;

	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
	devmem->pagemap.range.start = res->start;
	devmem->pagemap.range.end = res->end;
	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}

	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR(ptr))
		goto err_release;

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;

	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
	}
	spin_unlock(&mdevice->lock);

	return true;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return false;
}
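
/*
 * Editor's note: free device pages are kept on a singly linked list
 * threaded through page->zone_device_data, so popping one under
 * mdevice->lock is simply:
 *
 *	dpage = mdevice->free_pages;
 *	mdevice->free_pages = dpage->zone_device_data;
 *
 * as dmirror_devmem_alloc_page() below does.
 */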
static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage;

	/*
	 * This is a fake device so we alloc real system memory to store
	 * our device memory.
	 */
	rpage = alloc_page(GFP_HIGHUSER);
	if (!rpage)
		return NULL;

	spin_lock(&mdevice->lock);

	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (!dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	dpage->zone_device_data = rpage;
	get_page(dpage);
	lock_page(dpage);
	return dpage;

error:
	__free_page(rpage);
	return NULL;
}
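
/*
 * Editor's note: each allocation is really a pair of pages: "dpage" is
 * the ZONE_DEVICE private page that ends up in CPU page tables after
 * migration, and dpage->zone_device_data points at "rpage", an ordinary
 * system page that actually backs the simulated device memory.
 */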
static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = dpage->zone_device_data;
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		*dst = migrate_pfn(page_to_pfn(dpage)) |
			    MIGRATE_PFN_LOCKED;
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}
static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
				unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}
static int dmirror_atomic_map(unsigned long start, unsigned long end,
			      struct page **pages, struct dmirror *dmirror)
{
	unsigned long pfn, mapped = 0;
	int i;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
		void *entry;

		if (!pages[i])
			continue;

		entry = pages[i];
		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}

		mapped++;
	}

	mutex_unlock(&dmirror->mutex);
	return mapped;
}
static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		/*
		 * Store the page that holds the data so the page table
		 * doesn't have to deal with ZONE_DEVICE private pages.
		 */
		entry = dpage->zone_device_data;
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}
static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct page *pages[64];
	struct dmirror_bounce bounce;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		unsigned long mapped;
		int i;

		if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT))
			next = end;
		else
			next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT);

		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
		mapped = dmirror_atomic_map(addr, next, pages, dmirror);
		for (i = 0; i < ret; i++) {
			if (pages[i]) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
		}

		if (addr + (mapped << PAGE_SHIFT) < next) {
			mmap_read_unlock(mm);
			mmput(mm);
			return -EBUSY;
		}
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
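
/*
 * Editor's note: HMM_DMIRROR_EXCLUSIVE above tags the mirror entries
 * with DPT_XA_TAG_ATOMIC after marking the range for device-exclusive
 * access; HMM_DMIRROR_CHECK_EXCLUSIVE then uses dmirror_check_atomic(),
 * which returns -EPERM for any page still tagged for exclusive access.
 */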
static int dmirror_migrate(struct dmirror *dmirror,
			   struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64];
	unsigned long dst_pfns[64];
	struct dmirror_bounce bounce;
	struct migrate_vma args;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}
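
/*
 * Editor's note: the loop above follows the standard migrate_vma
 * protocol: migrate_vma_setup() collects and isolates the source pages,
 * dmirror_migrate_alloc_and_copy() allocates destinations and copies,
 * migrate_vma_pages() replaces the CPU page table entries,
 * dmirror_migrate_finalize_and_map() updates the device's page table,
 * and migrate_vma_finalize() releases the source pages.
 */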
static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}
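
/*
 * Editor's note: the perm byte is a small bitfield built from the
 * HMM_DMIRROR_PROT_* values in test_hmm_uapi.h; for example, a writable
 * PMD-mapped system page snapshots as
 * HMM_DMIRROR_PROT_WRITE | HMM_DMIRROR_PROT_PMD, while a page migrated
 * to this device reports HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL.
 */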
static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}
static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};
static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}
static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}
static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE:
		ret = dmirror_migrate(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}
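
/*
 * Editor's sketch of a userspace caller (not kernel code; assumes the
 * ioctl encodings from test_hmm_uapi.h, and "fd", "buf" and "mirror"
 * are illustrative):
 *
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr	= (uintptr_t)buf,	// page-aligned source
 *		.ptr	= (uintptr_t)mirror,	// destination buffer
 *		.npages	= 1,
 *	};
 *	if (ioctl(fd, HMM_DMIRROR_READ, &cmd) == 0)
 *		// cmd.cpages pages were read through the mirror
 */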
static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		ret = vm_insert_page(vma, addr, page);
		if (ret) {
			__free_page(page);
			return ret;
		}
		put_page(page);
	}

	return 0;
}
static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.mmap		= dmirror_fops_mmap,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};
static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = page->zone_device_data;
	struct dmirror_device *mdevice;

	if (rpage)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);

	spin_lock(&mdevice->lock);
	page->zone_device_data = mdevice->free_pages;
	mdevice->free_pages = page;
	spin_unlock(&mdevice->lock);
}
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;
		spage = spage->zone_device_data;

		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args;
	unsigned long src_pfns;
	unsigned long dst_pfns;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}
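
/*
 * Editor's note: dmirror_devmem_fault() is wired up below as the
 * pagemap's migrate_to_ram handler, so it runs when the CPU touches an
 * address whose page currently lives in the simulated device memory;
 * the single-page migration above copies the data back to a system page
 * and repairs the CPU page table entry.
 */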
static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
};
static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	ret = cdev_add(&mdevice->cdevice, dev, 1);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE private struct pages */
	dmirror_allocate_chunk(mdevice, NULL);

	return 0;
}
static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	unsigned int i;

	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			memunmap_pages(&devmem->pagemap);
			release_mem_region(devmem->pagemap.range.start,
					   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		kfree(mdevice->devmem_chunks);
	}

	cdev_del(&mdevice->cdevice);
}
static int __init hmm_dmirror_init(void)
{
	int ret;
	int id;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	for (id = 0; id < DMIRROR_NDEVICES; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}
static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_LICENSE("GPL");
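
/*
 * Editor's note: this module is exercised by the HMM selftests
 * (tools/testing/selftests/vm/hmm-tests.c at the time of writing),
 * which open the two character devices, conventionally
 * /dev/hmm_dmirror0 and /dev/hmm_dmirror1, and drive the ioctls above.
 */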