1 // SPDX-License-Identifier: GPL-2.0
3 * This is a module to test the HMM (Heterogeneous Memory Management)
4 * mirror and zone device private memory migration APIs of the kernel.
5 * Userspace programs can register with the driver to mirror their own address
6 * space and can use the device to read/write any valid virtual address.
8 #include <linux/init.h>
11 #include <linux/module.h>
12 #include <linux/kernel.h>
13 #include <linux/cdev.h>
14 #include <linux/device.h>
15 #include <linux/mutex.h>
16 #include <linux/rwsem.h>
17 #include <linux/sched.h>
18 #include <linux/slab.h>
19 #include <linux/highmem.h>
20 #include <linux/delay.h>
21 #include <linux/pagemap.h>
22 #include <linux/hmm.h>
23 #include <linux/vmalloc.h>
24 #include <linux/swap.h>
25 #include <linux/swapops.h>
26 #include <linux/sched/mm.h>
27 #include <linux/platform_device.h>
28 #include <linux/rmap.h>
30 #include "test_hmm_uapi.h"
32 #define DMIRROR_NDEVICES 2
33 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000
34 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U)
35 #define DEVMEM_CHUNKS_RESERVE 16
37 static const struct dev_pagemap_ops dmirror_devmem_ops
;
38 static const struct mmu_interval_notifier_ops dmirror_min_ops
;
39 static dev_t dmirror_dev
;
41 struct dmirror_device
;
43 struct dmirror_bounce
{
50 #define DPT_XA_TAG_ATOMIC 1UL
51 #define DPT_XA_TAG_WRITE 3UL
54 * Data structure to track address ranges and register for mmu interval
57 struct dmirror_interval
{
58 struct mmu_interval_notifier notifier
;
59 struct dmirror
*dmirror
;
63 * Data attached to the open device file.
64 * Note that it might be shared after a fork().
67 struct dmirror_device
*mdevice
;
69 struct mmu_interval_notifier notifier
;
74 * ZONE_DEVICE pages for migration and simulating device memory.
76 struct dmirror_chunk
{
77 struct dev_pagemap pagemap
;
78 struct dmirror_device
*mdevice
;
84 struct dmirror_device
{
86 struct hmm_devmem
*devmem
;
88 unsigned int devmem_capacity
;
89 unsigned int devmem_count
;
90 struct dmirror_chunk
**devmem_chunks
;
91 struct mutex devmem_lock
; /* protects the above */
95 struct page
*free_pages
;
96 spinlock_t lock
; /* protects the above */
99 static struct dmirror_device dmirror_devices
[DMIRROR_NDEVICES
];
101 static int dmirror_bounce_init(struct dmirror_bounce
*bounce
,
108 bounce
->ptr
= vmalloc(size
);
114 static void dmirror_bounce_fini(struct dmirror_bounce
*bounce
)
119 static int dmirror_fops_open(struct inode
*inode
, struct file
*filp
)
121 struct cdev
*cdev
= inode
->i_cdev
;
122 struct dmirror
*dmirror
;
125 /* Mirror this process address space */
126 dmirror
= kzalloc(sizeof(*dmirror
), GFP_KERNEL
);
130 dmirror
->mdevice
= container_of(cdev
, struct dmirror_device
, cdevice
);
131 mutex_init(&dmirror
->mutex
);
132 xa_init(&dmirror
->pt
);
134 ret
= mmu_interval_notifier_insert(&dmirror
->notifier
, current
->mm
,
135 0, ULONG_MAX
& PAGE_MASK
, &dmirror_min_ops
);
141 filp
->private_data
= dmirror
;
145 static int dmirror_fops_release(struct inode
*inode
, struct file
*filp
)
147 struct dmirror
*dmirror
= filp
->private_data
;
149 mmu_interval_notifier_remove(&dmirror
->notifier
);
150 xa_destroy(&dmirror
->pt
);
155 static struct dmirror_device
*dmirror_page_to_device(struct page
*page
)
158 return container_of(page
->pgmap
, struct dmirror_chunk
,
162 static int dmirror_do_fault(struct dmirror
*dmirror
, struct hmm_range
*range
)
164 unsigned long *pfns
= range
->hmm_pfns
;
167 for (pfn
= (range
->start
>> PAGE_SHIFT
);
168 pfn
< (range
->end
>> PAGE_SHIFT
);
174 * Since we asked for hmm_range_fault() to populate pages,
175 * it shouldn't return an error entry on success.
177 WARN_ON(*pfns
& HMM_PFN_ERROR
);
178 WARN_ON(!(*pfns
& HMM_PFN_VALID
));
180 page
= hmm_pfn_to_page(*pfns
);
184 if (*pfns
& HMM_PFN_WRITE
)
185 entry
= xa_tag_pointer(entry
, DPT_XA_TAG_WRITE
);
186 else if (WARN_ON(range
->default_flags
& HMM_PFN_WRITE
))
188 entry
= xa_store(&dmirror
->pt
, pfn
, entry
, GFP_ATOMIC
);
189 if (xa_is_err(entry
))
190 return xa_err(entry
);
196 static void dmirror_do_update(struct dmirror
*dmirror
, unsigned long start
,
203 * The XArray doesn't hold references to pages since it relies on
204 * the mmu notifier to clear page pointers when they become stale.
205 * Therefore, it is OK to just clear the entry.
207 xa_for_each_range(&dmirror
->pt
, pfn
, entry
, start
>> PAGE_SHIFT
,
209 xa_erase(&dmirror
->pt
, pfn
);
212 static bool dmirror_interval_invalidate(struct mmu_interval_notifier
*mni
,
213 const struct mmu_notifier_range
*range
,
214 unsigned long cur_seq
)
216 struct dmirror
*dmirror
= container_of(mni
, struct dmirror
, notifier
);
219 * Ignore invalidation callbacks for device private pages since
220 * the invalidation is handled as part of the migration process.
222 if (range
->event
== MMU_NOTIFY_MIGRATE
&&
223 range
->owner
== dmirror
->mdevice
)
226 if (mmu_notifier_range_blockable(range
))
227 mutex_lock(&dmirror
->mutex
);
228 else if (!mutex_trylock(&dmirror
->mutex
))
231 mmu_interval_set_seq(mni
, cur_seq
);
232 dmirror_do_update(dmirror
, range
->start
, range
->end
);
234 mutex_unlock(&dmirror
->mutex
);
238 static const struct mmu_interval_notifier_ops dmirror_min_ops
= {
239 .invalidate
= dmirror_interval_invalidate
,
242 static int dmirror_range_fault(struct dmirror
*dmirror
,
243 struct hmm_range
*range
)
245 struct mm_struct
*mm
= dmirror
->notifier
.mm
;
246 unsigned long timeout
=
247 jiffies
+ msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT
);
251 if (time_after(jiffies
, timeout
)) {
256 range
->notifier_seq
= mmu_interval_read_begin(range
->notifier
);
258 ret
= hmm_range_fault(range
);
259 mmap_read_unlock(mm
);
266 mutex_lock(&dmirror
->mutex
);
267 if (mmu_interval_read_retry(range
->notifier
,
268 range
->notifier_seq
)) {
269 mutex_unlock(&dmirror
->mutex
);
275 ret
= dmirror_do_fault(dmirror
, range
);
277 mutex_unlock(&dmirror
->mutex
);
282 static int dmirror_fault(struct dmirror
*dmirror
, unsigned long start
,
283 unsigned long end
, bool write
)
285 struct mm_struct
*mm
= dmirror
->notifier
.mm
;
287 unsigned long pfns
[64];
288 struct hmm_range range
= {
289 .notifier
= &dmirror
->notifier
,
293 HMM_PFN_REQ_FAULT
| (write
? HMM_PFN_REQ_WRITE
: 0),
294 .dev_private_owner
= dmirror
->mdevice
,
298 /* Since the mm is for the mirrored process, get a reference first. */
299 if (!mmget_not_zero(mm
))
302 for (addr
= start
; addr
< end
; addr
= range
.end
) {
304 range
.end
= min(addr
+ (ARRAY_SIZE(pfns
) << PAGE_SHIFT
), end
);
306 ret
= dmirror_range_fault(dmirror
, &range
);
315 static int dmirror_do_read(struct dmirror
*dmirror
, unsigned long start
,
316 unsigned long end
, struct dmirror_bounce
*bounce
)
321 ptr
= bounce
->ptr
+ ((start
- bounce
->addr
) & PAGE_MASK
);
323 for (pfn
= start
>> PAGE_SHIFT
; pfn
< (end
>> PAGE_SHIFT
); pfn
++) {
328 entry
= xa_load(&dmirror
->pt
, pfn
);
329 page
= xa_untag_pointer(entry
);
334 memcpy(ptr
, tmp
, PAGE_SIZE
);
344 static int dmirror_read(struct dmirror
*dmirror
, struct hmm_dmirror_cmd
*cmd
)
346 struct dmirror_bounce bounce
;
347 unsigned long start
, end
;
348 unsigned long size
= cmd
->npages
<< PAGE_SHIFT
;
356 ret
= dmirror_bounce_init(&bounce
, start
, size
);
361 mutex_lock(&dmirror
->mutex
);
362 ret
= dmirror_do_read(dmirror
, start
, end
, &bounce
);
363 mutex_unlock(&dmirror
->mutex
);
367 start
= cmd
->addr
+ (bounce
.cpages
<< PAGE_SHIFT
);
368 ret
= dmirror_fault(dmirror
, start
, end
, false);
375 if (copy_to_user(u64_to_user_ptr(cmd
->ptr
), bounce
.ptr
,
379 cmd
->cpages
= bounce
.cpages
;
380 dmirror_bounce_fini(&bounce
);
384 static int dmirror_do_write(struct dmirror
*dmirror
, unsigned long start
,
385 unsigned long end
, struct dmirror_bounce
*bounce
)
390 ptr
= bounce
->ptr
+ ((start
- bounce
->addr
) & PAGE_MASK
);
392 for (pfn
= start
>> PAGE_SHIFT
; pfn
< (end
>> PAGE_SHIFT
); pfn
++) {
397 entry
= xa_load(&dmirror
->pt
, pfn
);
398 page
= xa_untag_pointer(entry
);
399 if (!page
|| xa_pointer_tag(entry
) != DPT_XA_TAG_WRITE
)
403 memcpy(tmp
, ptr
, PAGE_SIZE
);
413 static int dmirror_write(struct dmirror
*dmirror
, struct hmm_dmirror_cmd
*cmd
)
415 struct dmirror_bounce bounce
;
416 unsigned long start
, end
;
417 unsigned long size
= cmd
->npages
<< PAGE_SHIFT
;
425 ret
= dmirror_bounce_init(&bounce
, start
, size
);
428 if (copy_from_user(bounce
.ptr
, u64_to_user_ptr(cmd
->ptr
),
435 mutex_lock(&dmirror
->mutex
);
436 ret
= dmirror_do_write(dmirror
, start
, end
, &bounce
);
437 mutex_unlock(&dmirror
->mutex
);
441 start
= cmd
->addr
+ (bounce
.cpages
<< PAGE_SHIFT
);
442 ret
= dmirror_fault(dmirror
, start
, end
, true);
449 cmd
->cpages
= bounce
.cpages
;
450 dmirror_bounce_fini(&bounce
);
454 static bool dmirror_allocate_chunk(struct dmirror_device
*mdevice
,
457 struct dmirror_chunk
*devmem
;
458 struct resource
*res
;
460 unsigned long pfn_first
;
461 unsigned long pfn_last
;
464 devmem
= kzalloc(sizeof(*devmem
), GFP_KERNEL
);
468 res
= request_free_mem_region(&iomem_resource
, DEVMEM_CHUNK_SIZE
,
473 devmem
->pagemap
.type
= MEMORY_DEVICE_PRIVATE
;
474 devmem
->pagemap
.range
.start
= res
->start
;
475 devmem
->pagemap
.range
.end
= res
->end
;
476 devmem
->pagemap
.nr_range
= 1;
477 devmem
->pagemap
.ops
= &dmirror_devmem_ops
;
478 devmem
->pagemap
.owner
= mdevice
;
480 mutex_lock(&mdevice
->devmem_lock
);
482 if (mdevice
->devmem_count
== mdevice
->devmem_capacity
) {
483 struct dmirror_chunk
**new_chunks
;
484 unsigned int new_capacity
;
486 new_capacity
= mdevice
->devmem_capacity
+
487 DEVMEM_CHUNKS_RESERVE
;
488 new_chunks
= krealloc(mdevice
->devmem_chunks
,
489 sizeof(new_chunks
[0]) * new_capacity
,
493 mdevice
->devmem_capacity
= new_capacity
;
494 mdevice
->devmem_chunks
= new_chunks
;
497 ptr
= memremap_pages(&devmem
->pagemap
, numa_node_id());
501 devmem
->mdevice
= mdevice
;
502 pfn_first
= devmem
->pagemap
.range
.start
>> PAGE_SHIFT
;
503 pfn_last
= pfn_first
+ (range_len(&devmem
->pagemap
.range
) >> PAGE_SHIFT
);
504 mdevice
->devmem_chunks
[mdevice
->devmem_count
++] = devmem
;
506 mutex_unlock(&mdevice
->devmem_lock
);
508 pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
509 DEVMEM_CHUNK_SIZE
/ (1024 * 1024),
510 mdevice
->devmem_count
,
511 mdevice
->devmem_count
* (DEVMEM_CHUNK_SIZE
/ (1024 * 1024)),
512 pfn_first
, pfn_last
);
514 spin_lock(&mdevice
->lock
);
515 for (pfn
= pfn_first
; pfn
< pfn_last
; pfn
++) {
516 struct page
*page
= pfn_to_page(pfn
);
518 page
->zone_device_data
= mdevice
->free_pages
;
519 mdevice
->free_pages
= page
;
522 *ppage
= mdevice
->free_pages
;
523 mdevice
->free_pages
= (*ppage
)->zone_device_data
;
526 spin_unlock(&mdevice
->lock
);
531 mutex_unlock(&mdevice
->devmem_lock
);
532 release_mem_region(devmem
->pagemap
.range
.start
, range_len(&devmem
->pagemap
.range
));
539 static struct page
*dmirror_devmem_alloc_page(struct dmirror_device
*mdevice
)
541 struct page
*dpage
= NULL
;
545 * This is a fake device so we alloc real system memory to store
548 rpage
= alloc_page(GFP_HIGHUSER
);
552 spin_lock(&mdevice
->lock
);
554 if (mdevice
->free_pages
) {
555 dpage
= mdevice
->free_pages
;
556 mdevice
->free_pages
= dpage
->zone_device_data
;
558 spin_unlock(&mdevice
->lock
);
560 spin_unlock(&mdevice
->lock
);
561 if (!dmirror_allocate_chunk(mdevice
, &dpage
))
565 dpage
->zone_device_data
= rpage
;
575 static void dmirror_migrate_alloc_and_copy(struct migrate_vma
*args
,
576 struct dmirror
*dmirror
)
578 struct dmirror_device
*mdevice
= dmirror
->mdevice
;
579 const unsigned long *src
= args
->src
;
580 unsigned long *dst
= args
->dst
;
583 for (addr
= args
->start
; addr
< args
->end
; addr
+= PAGE_SIZE
,
589 if (!(*src
& MIGRATE_PFN_MIGRATE
))
593 * Note that spage might be NULL which is OK since it is an
594 * unallocated pte_none() or read-only zero page.
596 spage
= migrate_pfn_to_page(*src
);
598 dpage
= dmirror_devmem_alloc_page(mdevice
);
602 rpage
= dpage
->zone_device_data
;
604 copy_highpage(rpage
, spage
);
606 clear_highpage(rpage
);
609 * Normally, a device would use the page->zone_device_data to
610 * point to the mirror but here we use it to hold the page for
611 * the simulated device memory and that page holds the pointer
614 rpage
->zone_device_data
= dmirror
;
616 *dst
= migrate_pfn(page_to_pfn(dpage
)) |
618 if ((*src
& MIGRATE_PFN_WRITE
) ||
619 (!spage
&& args
->vma
->vm_flags
& VM_WRITE
))
620 *dst
|= MIGRATE_PFN_WRITE
;
624 static int dmirror_check_atomic(struct dmirror
*dmirror
, unsigned long start
,
629 for (pfn
= start
>> PAGE_SHIFT
; pfn
< (end
>> PAGE_SHIFT
); pfn
++) {
632 entry
= xa_load(&dmirror
->pt
, pfn
);
633 if (xa_pointer_tag(entry
) == DPT_XA_TAG_ATOMIC
)
640 static int dmirror_atomic_map(unsigned long start
, unsigned long end
,
641 struct page
**pages
, struct dmirror
*dmirror
)
643 unsigned long pfn
, mapped
= 0;
646 /* Map the migrated pages into the device's page tables. */
647 mutex_lock(&dmirror
->mutex
);
649 for (i
= 0, pfn
= start
>> PAGE_SHIFT
; pfn
< (end
>> PAGE_SHIFT
); pfn
++, i
++) {
656 entry
= xa_tag_pointer(entry
, DPT_XA_TAG_ATOMIC
);
657 entry
= xa_store(&dmirror
->pt
, pfn
, entry
, GFP_ATOMIC
);
658 if (xa_is_err(entry
)) {
659 mutex_unlock(&dmirror
->mutex
);
660 return xa_err(entry
);
666 mutex_unlock(&dmirror
->mutex
);
670 static int dmirror_migrate_finalize_and_map(struct migrate_vma
*args
,
671 struct dmirror
*dmirror
)
673 unsigned long start
= args
->start
;
674 unsigned long end
= args
->end
;
675 const unsigned long *src
= args
->src
;
676 const unsigned long *dst
= args
->dst
;
679 /* Map the migrated pages into the device's page tables. */
680 mutex_lock(&dmirror
->mutex
);
682 for (pfn
= start
>> PAGE_SHIFT
; pfn
< (end
>> PAGE_SHIFT
); pfn
++,
687 if (!(*src
& MIGRATE_PFN_MIGRATE
))
690 dpage
= migrate_pfn_to_page(*dst
);
695 * Store the page that holds the data so the page table
696 * doesn't have to deal with ZONE_DEVICE private pages.
698 entry
= dpage
->zone_device_data
;
699 if (*dst
& MIGRATE_PFN_WRITE
)
700 entry
= xa_tag_pointer(entry
, DPT_XA_TAG_WRITE
);
701 entry
= xa_store(&dmirror
->pt
, pfn
, entry
, GFP_ATOMIC
);
702 if (xa_is_err(entry
)) {
703 mutex_unlock(&dmirror
->mutex
);
704 return xa_err(entry
);
708 mutex_unlock(&dmirror
->mutex
);
712 static int dmirror_exclusive(struct dmirror
*dmirror
,
713 struct hmm_dmirror_cmd
*cmd
)
715 unsigned long start
, end
, addr
;
716 unsigned long size
= cmd
->npages
<< PAGE_SHIFT
;
717 struct mm_struct
*mm
= dmirror
->notifier
.mm
;
718 struct page
*pages
[64];
719 struct dmirror_bounce bounce
;
728 /* Since the mm is for the mirrored process, get a reference first. */
729 if (!mmget_not_zero(mm
))
733 for (addr
= start
; addr
< end
; addr
= next
) {
734 unsigned long mapped
= 0;
737 if (end
< addr
+ (ARRAY_SIZE(pages
) << PAGE_SHIFT
))
740 next
= addr
+ (ARRAY_SIZE(pages
) << PAGE_SHIFT
);
742 ret
= make_device_exclusive_range(mm
, addr
, next
, pages
, NULL
);
744 * Do dmirror_atomic_map() iff all pages are marked for
745 * exclusive access to avoid accessing uninitialized
748 if (ret
== (next
- addr
) >> PAGE_SHIFT
)
749 mapped
= dmirror_atomic_map(addr
, next
, pages
, dmirror
);
750 for (i
= 0; i
< ret
; i
++) {
752 unlock_page(pages
[i
]);
757 if (addr
+ (mapped
<< PAGE_SHIFT
) < next
) {
758 mmap_read_unlock(mm
);
763 mmap_read_unlock(mm
);
766 /* Return the migrated data for verification. */
767 ret
= dmirror_bounce_init(&bounce
, start
, size
);
770 mutex_lock(&dmirror
->mutex
);
771 ret
= dmirror_do_read(dmirror
, start
, end
, &bounce
);
772 mutex_unlock(&dmirror
->mutex
);
774 if (copy_to_user(u64_to_user_ptr(cmd
->ptr
), bounce
.ptr
,
779 cmd
->cpages
= bounce
.cpages
;
780 dmirror_bounce_fini(&bounce
);
784 static int dmirror_migrate(struct dmirror
*dmirror
,
785 struct hmm_dmirror_cmd
*cmd
)
787 unsigned long start
, end
, addr
;
788 unsigned long size
= cmd
->npages
<< PAGE_SHIFT
;
789 struct mm_struct
*mm
= dmirror
->notifier
.mm
;
790 struct vm_area_struct
*vma
;
791 unsigned long src_pfns
[64];
792 unsigned long dst_pfns
[64];
793 struct dmirror_bounce bounce
;
794 struct migrate_vma args
;
803 /* Since the mm is for the mirrored process, get a reference first. */
804 if (!mmget_not_zero(mm
))
808 for (addr
= start
; addr
< end
; addr
= next
) {
809 vma
= vma_lookup(mm
, addr
);
810 if (!vma
|| !(vma
->vm_flags
& VM_READ
)) {
814 next
= min(end
, addr
+ (ARRAY_SIZE(src_pfns
) << PAGE_SHIFT
));
815 if (next
> vma
->vm_end
)
823 args
.pgmap_owner
= dmirror
->mdevice
;
824 args
.flags
= MIGRATE_VMA_SELECT_SYSTEM
;
825 ret
= migrate_vma_setup(&args
);
829 dmirror_migrate_alloc_and_copy(&args
, dmirror
);
830 migrate_vma_pages(&args
);
831 dmirror_migrate_finalize_and_map(&args
, dmirror
);
832 migrate_vma_finalize(&args
);
834 mmap_read_unlock(mm
);
837 /* Return the migrated data for verification. */
838 ret
= dmirror_bounce_init(&bounce
, start
, size
);
841 mutex_lock(&dmirror
->mutex
);
842 ret
= dmirror_do_read(dmirror
, start
, end
, &bounce
);
843 mutex_unlock(&dmirror
->mutex
);
845 if (copy_to_user(u64_to_user_ptr(cmd
->ptr
), bounce
.ptr
,
849 cmd
->cpages
= bounce
.cpages
;
850 dmirror_bounce_fini(&bounce
);
854 mmap_read_unlock(mm
);
859 static void dmirror_mkentry(struct dmirror
*dmirror
, struct hmm_range
*range
,
860 unsigned char *perm
, unsigned long entry
)
864 if (entry
& HMM_PFN_ERROR
) {
865 *perm
= HMM_DMIRROR_PROT_ERROR
;
868 if (!(entry
& HMM_PFN_VALID
)) {
869 *perm
= HMM_DMIRROR_PROT_NONE
;
873 page
= hmm_pfn_to_page(entry
);
874 if (is_device_private_page(page
)) {
875 /* Is the page migrated to this device or some other? */
876 if (dmirror
->mdevice
== dmirror_page_to_device(page
))
877 *perm
= HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL
;
879 *perm
= HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE
;
880 } else if (is_zero_pfn(page_to_pfn(page
)))
881 *perm
= HMM_DMIRROR_PROT_ZERO
;
883 *perm
= HMM_DMIRROR_PROT_NONE
;
884 if (entry
& HMM_PFN_WRITE
)
885 *perm
|= HMM_DMIRROR_PROT_WRITE
;
887 *perm
|= HMM_DMIRROR_PROT_READ
;
888 if (hmm_pfn_to_map_order(entry
) + PAGE_SHIFT
== PMD_SHIFT
)
889 *perm
|= HMM_DMIRROR_PROT_PMD
;
890 else if (hmm_pfn_to_map_order(entry
) + PAGE_SHIFT
== PUD_SHIFT
)
891 *perm
|= HMM_DMIRROR_PROT_PUD
;
894 static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier
*mni
,
895 const struct mmu_notifier_range
*range
,
896 unsigned long cur_seq
)
898 struct dmirror_interval
*dmi
=
899 container_of(mni
, struct dmirror_interval
, notifier
);
900 struct dmirror
*dmirror
= dmi
->dmirror
;
902 if (mmu_notifier_range_blockable(range
))
903 mutex_lock(&dmirror
->mutex
);
904 else if (!mutex_trylock(&dmirror
->mutex
))
908 * Snapshots only need to set the sequence number since any
909 * invalidation in the interval invalidates the whole snapshot.
911 mmu_interval_set_seq(mni
, cur_seq
);
913 mutex_unlock(&dmirror
->mutex
);
917 static const struct mmu_interval_notifier_ops dmirror_mrn_ops
= {
918 .invalidate
= dmirror_snapshot_invalidate
,
921 static int dmirror_range_snapshot(struct dmirror
*dmirror
,
922 struct hmm_range
*range
,
925 struct mm_struct
*mm
= dmirror
->notifier
.mm
;
926 struct dmirror_interval notifier
;
927 unsigned long timeout
=
928 jiffies
+ msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT
);
933 notifier
.dmirror
= dmirror
;
934 range
->notifier
= ¬ifier
.notifier
;
936 ret
= mmu_interval_notifier_insert(range
->notifier
, mm
,
937 range
->start
, range
->end
- range
->start
,
943 if (time_after(jiffies
, timeout
)) {
948 range
->notifier_seq
= mmu_interval_read_begin(range
->notifier
);
951 ret
= hmm_range_fault(range
);
952 mmap_read_unlock(mm
);
959 mutex_lock(&dmirror
->mutex
);
960 if (mmu_interval_read_retry(range
->notifier
,
961 range
->notifier_seq
)) {
962 mutex_unlock(&dmirror
->mutex
);
968 n
= (range
->end
- range
->start
) >> PAGE_SHIFT
;
969 for (i
= 0; i
< n
; i
++)
970 dmirror_mkentry(dmirror
, range
, perm
+ i
, range
->hmm_pfns
[i
]);
972 mutex_unlock(&dmirror
->mutex
);
974 mmu_interval_notifier_remove(range
->notifier
);
978 static int dmirror_snapshot(struct dmirror
*dmirror
,
979 struct hmm_dmirror_cmd
*cmd
)
981 struct mm_struct
*mm
= dmirror
->notifier
.mm
;
982 unsigned long start
, end
;
983 unsigned long size
= cmd
->npages
<< PAGE_SHIFT
;
986 unsigned long pfns
[64];
987 unsigned char perm
[64];
989 struct hmm_range range
= {
991 .dev_private_owner
= dmirror
->mdevice
,
1000 /* Since the mm is for the mirrored process, get a reference first. */
1001 if (!mmget_not_zero(mm
))
1005 * Register a temporary notifier to detect invalidations even if it
1006 * overlaps with other mmu_interval_notifiers.
1008 uptr
= u64_to_user_ptr(cmd
->ptr
);
1009 for (addr
= start
; addr
< end
; addr
= next
) {
1012 next
= min(addr
+ (ARRAY_SIZE(pfns
) << PAGE_SHIFT
), end
);
1016 ret
= dmirror_range_snapshot(dmirror
, &range
, perm
);
1020 n
= (range
.end
- range
.start
) >> PAGE_SHIFT
;
1021 if (copy_to_user(uptr
, perm
, n
)) {
1034 static long dmirror_fops_unlocked_ioctl(struct file
*filp
,
1035 unsigned int command
,
1038 void __user
*uarg
= (void __user
*)arg
;
1039 struct hmm_dmirror_cmd cmd
;
1040 struct dmirror
*dmirror
;
1043 dmirror
= filp
->private_data
;
1047 if (copy_from_user(&cmd
, uarg
, sizeof(cmd
)))
1050 if (cmd
.addr
& ~PAGE_MASK
)
1052 if (cmd
.addr
>= (cmd
.addr
+ (cmd
.npages
<< PAGE_SHIFT
)))
1059 case HMM_DMIRROR_READ
:
1060 ret
= dmirror_read(dmirror
, &cmd
);
1063 case HMM_DMIRROR_WRITE
:
1064 ret
= dmirror_write(dmirror
, &cmd
);
1067 case HMM_DMIRROR_MIGRATE
:
1068 ret
= dmirror_migrate(dmirror
, &cmd
);
1071 case HMM_DMIRROR_EXCLUSIVE
:
1072 ret
= dmirror_exclusive(dmirror
, &cmd
);
1075 case HMM_DMIRROR_CHECK_EXCLUSIVE
:
1076 ret
= dmirror_check_atomic(dmirror
, cmd
.addr
,
1077 cmd
.addr
+ (cmd
.npages
<< PAGE_SHIFT
));
1080 case HMM_DMIRROR_SNAPSHOT
:
1081 ret
= dmirror_snapshot(dmirror
, &cmd
);
1090 if (copy_to_user(uarg
, &cmd
, sizeof(cmd
)))
1096 static int dmirror_fops_mmap(struct file
*file
, struct vm_area_struct
*vma
)
1100 for (addr
= vma
->vm_start
; addr
< vma
->vm_end
; addr
+= PAGE_SIZE
) {
1104 page
= alloc_page(GFP_KERNEL
| __GFP_ZERO
);
1108 ret
= vm_insert_page(vma
, addr
, page
);
1119 static const struct file_operations dmirror_fops
= {
1120 .open
= dmirror_fops_open
,
1121 .release
= dmirror_fops_release
,
1122 .mmap
= dmirror_fops_mmap
,
1123 .unlocked_ioctl
= dmirror_fops_unlocked_ioctl
,
1124 .llseek
= default_llseek
,
1125 .owner
= THIS_MODULE
,
1128 static void dmirror_devmem_free(struct page
*page
)
1130 struct page
*rpage
= page
->zone_device_data
;
1131 struct dmirror_device
*mdevice
;
1136 mdevice
= dmirror_page_to_device(page
);
1138 spin_lock(&mdevice
->lock
);
1140 page
->zone_device_data
= mdevice
->free_pages
;
1141 mdevice
->free_pages
= page
;
1142 spin_unlock(&mdevice
->lock
);
1145 static vm_fault_t
dmirror_devmem_fault_alloc_and_copy(struct migrate_vma
*args
,
1146 struct dmirror
*dmirror
)
1148 const unsigned long *src
= args
->src
;
1149 unsigned long *dst
= args
->dst
;
1150 unsigned long start
= args
->start
;
1151 unsigned long end
= args
->end
;
1154 for (addr
= start
; addr
< end
; addr
+= PAGE_SIZE
,
1156 struct page
*dpage
, *spage
;
1158 spage
= migrate_pfn_to_page(*src
);
1159 if (!spage
|| !(*src
& MIGRATE_PFN_MIGRATE
))
1161 spage
= spage
->zone_device_data
;
1163 dpage
= alloc_page_vma(GFP_HIGHUSER_MOVABLE
, args
->vma
, addr
);
1168 xa_erase(&dmirror
->pt
, addr
>> PAGE_SHIFT
);
1169 copy_highpage(dpage
, spage
);
1170 *dst
= migrate_pfn(page_to_pfn(dpage
)) | MIGRATE_PFN_LOCKED
;
1171 if (*src
& MIGRATE_PFN_WRITE
)
1172 *dst
|= MIGRATE_PFN_WRITE
;
1177 static vm_fault_t
dmirror_devmem_fault(struct vm_fault
*vmf
)
1179 struct migrate_vma args
;
1180 unsigned long src_pfns
;
1181 unsigned long dst_pfns
;
1183 struct dmirror
*dmirror
;
1187 * Normally, a device would use the page->zone_device_data to point to
1188 * the mirror but here we use it to hold the page for the simulated
1189 * device memory and that page holds the pointer to the mirror.
1191 rpage
= vmf
->page
->zone_device_data
;
1192 dmirror
= rpage
->zone_device_data
;
1194 /* FIXME demonstrate how we can adjust migrate range */
1195 args
.vma
= vmf
->vma
;
1196 args
.start
= vmf
->address
;
1197 args
.end
= args
.start
+ PAGE_SIZE
;
1198 args
.src
= &src_pfns
;
1199 args
.dst
= &dst_pfns
;
1200 args
.pgmap_owner
= dmirror
->mdevice
;
1201 args
.flags
= MIGRATE_VMA_SELECT_DEVICE_PRIVATE
;
1203 if (migrate_vma_setup(&args
))
1204 return VM_FAULT_SIGBUS
;
1206 ret
= dmirror_devmem_fault_alloc_and_copy(&args
, dmirror
);
1209 migrate_vma_pages(&args
);
1211 * No device finalize step is needed since
1212 * dmirror_devmem_fault_alloc_and_copy() will have already
1213 * invalidated the device page table.
1215 migrate_vma_finalize(&args
);
1219 static const struct dev_pagemap_ops dmirror_devmem_ops
= {
1220 .page_free
= dmirror_devmem_free
,
1221 .migrate_to_ram
= dmirror_devmem_fault
,
1224 static int dmirror_device_init(struct dmirror_device
*mdevice
, int id
)
1229 dev
= MKDEV(MAJOR(dmirror_dev
), id
);
1230 mutex_init(&mdevice
->devmem_lock
);
1231 spin_lock_init(&mdevice
->lock
);
1233 cdev_init(&mdevice
->cdevice
, &dmirror_fops
);
1234 mdevice
->cdevice
.owner
= THIS_MODULE
;
1235 ret
= cdev_add(&mdevice
->cdevice
, dev
, 1);
1239 /* Build a list of free ZONE_DEVICE private struct pages */
1240 dmirror_allocate_chunk(mdevice
, NULL
);
1245 static void dmirror_device_remove(struct dmirror_device
*mdevice
)
1249 if (mdevice
->devmem_chunks
) {
1250 for (i
= 0; i
< mdevice
->devmem_count
; i
++) {
1251 struct dmirror_chunk
*devmem
=
1252 mdevice
->devmem_chunks
[i
];
1254 memunmap_pages(&devmem
->pagemap
);
1255 release_mem_region(devmem
->pagemap
.range
.start
,
1256 range_len(&devmem
->pagemap
.range
));
1259 kfree(mdevice
->devmem_chunks
);
1262 cdev_del(&mdevice
->cdevice
);
1265 static int __init
hmm_dmirror_init(void)
1270 ret
= alloc_chrdev_region(&dmirror_dev
, 0, DMIRROR_NDEVICES
,
1275 for (id
= 0; id
< DMIRROR_NDEVICES
; id
++) {
1276 ret
= dmirror_device_init(dmirror_devices
+ id
, id
);
1281 pr_info("HMM test module loaded. This is only for testing HMM.\n");
1286 dmirror_device_remove(dmirror_devices
+ id
);
1287 unregister_chrdev_region(dmirror_dev
, DMIRROR_NDEVICES
);
1292 static void __exit
hmm_dmirror_exit(void)
1296 for (id
= 0; id
< DMIRROR_NDEVICES
; id
++)
1297 dmirror_device_remove(dmirror_devices
+ id
);
1298 unregister_chrdev_region(dmirror_dev
, DMIRROR_NDEVICES
);
1301 module_init(hmm_dmirror_init
);
1302 module_exit(hmm_dmirror_exit
);
1303 MODULE_LICENSE("GPL");