]> git.proxmox.com Git - mirror_ubuntu-kernels.git/commitdiff
selftests/vm: rename selftests/vm to selftests/mm
authorSeongJae Park <sj@kernel.org>
Tue, 3 Jan 2023 18:07:53 +0000 (18:07 +0000)
committerAndrew Morton <akpm@linux-foundation.org>
Thu, 19 Jan 2023 01:12:56 +0000 (17:12 -0800)
Rename selftests/vm to selftests/mm to be more consistent with the
code, documentation, and tools directories, and to avoid confusion with
virtual machines.

[sj@kernel.org: convert missing vm->mm changes]
Link: https://lkml.kernel.org/r/20230107230643.252273-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20230103180754.129637-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
110 files changed:
Documentation/admin-guide/mm/hugetlbpage.rst
Documentation/core-api/pin_user_pages.rst
MAINTAINERS
mm/Kconfig
tools/testing/selftests/Makefile
tools/testing/selftests/kselftest_deps.sh
tools/testing/selftests/mm/.gitignore [new file with mode: 0644]
tools/testing/selftests/mm/Makefile [new file with mode: 0644]
tools/testing/selftests/mm/charge_reserved_hugetlb.sh [new file with mode: 0644]
tools/testing/selftests/mm/check_config.sh [new file with mode: 0644]
tools/testing/selftests/mm/compaction_test.c [new file with mode: 0644]
tools/testing/selftests/mm/config [new file with mode: 0644]
tools/testing/selftests/mm/cow.c [new file with mode: 0644]
tools/testing/selftests/mm/gup_test.c [new file with mode: 0644]
tools/testing/selftests/mm/hmm-tests.c [new file with mode: 0644]
tools/testing/selftests/mm/hugepage-mmap.c [new file with mode: 0644]
tools/testing/selftests/mm/hugepage-mremap.c [new file with mode: 0644]
tools/testing/selftests/mm/hugepage-shm.c [new file with mode: 0644]
tools/testing/selftests/mm/hugepage-vmemmap.c [new file with mode: 0644]
tools/testing/selftests/mm/hugetlb-madvise.c [new file with mode: 0644]
tools/testing/selftests/mm/hugetlb_reparenting_test.sh [new file with mode: 0644]
tools/testing/selftests/mm/khugepaged.c [new file with mode: 0644]
tools/testing/selftests/mm/ksm_functional_tests.c [new file with mode: 0644]
tools/testing/selftests/mm/ksm_tests.c [new file with mode: 0644]
tools/testing/selftests/mm/madv_populate.c [new file with mode: 0644]
tools/testing/selftests/mm/map_fixed_noreplace.c [new file with mode: 0644]
tools/testing/selftests/mm/map_hugetlb.c [new file with mode: 0644]
tools/testing/selftests/mm/map_populate.c [new file with mode: 0644]
tools/testing/selftests/mm/memfd_secret.c [new file with mode: 0644]
tools/testing/selftests/mm/migration.c [new file with mode: 0644]
tools/testing/selftests/mm/mlock-random-test.c [new file with mode: 0644]
tools/testing/selftests/mm/mlock2-tests.c [new file with mode: 0644]
tools/testing/selftests/mm/mlock2.h [new file with mode: 0644]
tools/testing/selftests/mm/mrelease_test.c [new file with mode: 0644]
tools/testing/selftests/mm/mremap_dontunmap.c [new file with mode: 0644]
tools/testing/selftests/mm/mremap_test.c [new file with mode: 0644]
tools/testing/selftests/mm/on-fault-limit.c [new file with mode: 0644]
tools/testing/selftests/mm/pkey-helpers.h [new file with mode: 0644]
tools/testing/selftests/mm/pkey-powerpc.h [new file with mode: 0644]
tools/testing/selftests/mm/pkey-x86.h [new file with mode: 0644]
tools/testing/selftests/mm/protection_keys.c [new file with mode: 0644]
tools/testing/selftests/mm/run_vmtests.sh [new file with mode: 0644]
tools/testing/selftests/mm/settings [new file with mode: 0644]
tools/testing/selftests/mm/soft-dirty.c [new file with mode: 0644]
tools/testing/selftests/mm/split_huge_page_test.c [new file with mode: 0644]
tools/testing/selftests/mm/test_hmm.sh [new file with mode: 0644]
tools/testing/selftests/mm/test_vmalloc.sh [new file with mode: 0644]
tools/testing/selftests/mm/thuge-gen.c [new file with mode: 0644]
tools/testing/selftests/mm/transhuge-stress.c [new file with mode: 0644]
tools/testing/selftests/mm/userfaultfd.c [new file with mode: 0644]
tools/testing/selftests/mm/util.h [new file with mode: 0644]
tools/testing/selftests/mm/va_128TBswitch.c [new file with mode: 0644]
tools/testing/selftests/mm/va_128TBswitch.sh [new file with mode: 0644]
tools/testing/selftests/mm/virtual_address_range.c [new file with mode: 0644]
tools/testing/selftests/mm/vm_util.c [new file with mode: 0644]
tools/testing/selftests/mm/vm_util.h [new file with mode: 0644]
tools/testing/selftests/mm/write_hugetlb_memory.sh [new file with mode: 0644]
tools/testing/selftests/mm/write_to_hugetlbfs.c [new file with mode: 0644]
tools/testing/selftests/vm/.gitignore [deleted file]
tools/testing/selftests/vm/Makefile [deleted file]
tools/testing/selftests/vm/charge_reserved_hugetlb.sh [deleted file]
tools/testing/selftests/vm/check_config.sh [deleted file]
tools/testing/selftests/vm/compaction_test.c [deleted file]
tools/testing/selftests/vm/config [deleted file]
tools/testing/selftests/vm/cow.c [deleted file]
tools/testing/selftests/vm/gup_test.c [deleted file]
tools/testing/selftests/vm/hmm-tests.c [deleted file]
tools/testing/selftests/vm/hugepage-mmap.c [deleted file]
tools/testing/selftests/vm/hugepage-mremap.c [deleted file]
tools/testing/selftests/vm/hugepage-shm.c [deleted file]
tools/testing/selftests/vm/hugepage-vmemmap.c [deleted file]
tools/testing/selftests/vm/hugetlb-madvise.c [deleted file]
tools/testing/selftests/vm/hugetlb_reparenting_test.sh [deleted file]
tools/testing/selftests/vm/khugepaged.c [deleted file]
tools/testing/selftests/vm/ksm_functional_tests.c [deleted file]
tools/testing/selftests/vm/ksm_tests.c [deleted file]
tools/testing/selftests/vm/madv_populate.c [deleted file]
tools/testing/selftests/vm/map_fixed_noreplace.c [deleted file]
tools/testing/selftests/vm/map_hugetlb.c [deleted file]
tools/testing/selftests/vm/map_populate.c [deleted file]
tools/testing/selftests/vm/memfd_secret.c [deleted file]
tools/testing/selftests/vm/migration.c [deleted file]
tools/testing/selftests/vm/mlock-random-test.c [deleted file]
tools/testing/selftests/vm/mlock2-tests.c [deleted file]
tools/testing/selftests/vm/mlock2.h [deleted file]
tools/testing/selftests/vm/mrelease_test.c [deleted file]
tools/testing/selftests/vm/mremap_dontunmap.c [deleted file]
tools/testing/selftests/vm/mremap_test.c [deleted file]
tools/testing/selftests/vm/on-fault-limit.c [deleted file]
tools/testing/selftests/vm/pkey-helpers.h [deleted file]
tools/testing/selftests/vm/pkey-powerpc.h [deleted file]
tools/testing/selftests/vm/pkey-x86.h [deleted file]
tools/testing/selftests/vm/protection_keys.c [deleted file]
tools/testing/selftests/vm/run_vmtests.sh [deleted file]
tools/testing/selftests/vm/settings [deleted file]
tools/testing/selftests/vm/soft-dirty.c [deleted file]
tools/testing/selftests/vm/split_huge_page_test.c [deleted file]
tools/testing/selftests/vm/test_hmm.sh [deleted file]
tools/testing/selftests/vm/test_vmalloc.sh [deleted file]
tools/testing/selftests/vm/thuge-gen.c [deleted file]
tools/testing/selftests/vm/transhuge-stress.c [deleted file]
tools/testing/selftests/vm/userfaultfd.c [deleted file]
tools/testing/selftests/vm/util.h [deleted file]
tools/testing/selftests/vm/va_128TBswitch.c [deleted file]
tools/testing/selftests/vm/va_128TBswitch.sh [deleted file]
tools/testing/selftests/vm/virtual_address_range.c [deleted file]
tools/testing/selftests/vm/vm_util.c [deleted file]
tools/testing/selftests/vm/vm_util.h [deleted file]
tools/testing/selftests/vm/write_hugetlb_memory.sh [deleted file]
tools/testing/selftests/vm/write_to_hugetlbfs.c [deleted file]

index 19f27c0d92e074a7e829ed0eb1d403fe9bb65709..a969a2c742b212dd5657f024e9776cc792fc0e00 100644 (file)
@@ -461,13 +461,13 @@ Examples
 .. _map_hugetlb:
 
 ``map_hugetlb``
-       see tools/testing/selftests/vm/map_hugetlb.c
+       see tools/testing/selftests/mm/map_hugetlb.c
 
 ``hugepage-shm``
-       see tools/testing/selftests/vm/hugepage-shm.c
+       see tools/testing/selftests/mm/hugepage-shm.c
 
 ``hugepage-mmap``
-       see tools/testing/selftests/vm/hugepage-mmap.c
+       see tools/testing/selftests/mm/hugepage-mmap.c
 
 The `libhugetlbfs`_  library provides a wide range of userspace tools
 to help with huge page usability, environment setup, and control.
index b18416f4500feffb7b39407b0ca8fc4f4e08cb21..facafbdecb952187a3d91b354897f3cc87b84baa 100644 (file)
@@ -221,7 +221,7 @@ Unit testing
 ============
 This file::
 
- tools/testing/selftests/vm/gup_test.c
+ tools/testing/selftests/mm/gup_test.c
 
 has the following new calls to exercise the new pin*() wrapper functions:
 
index c726adfd1f0db967c9e3e917007d66ae8f3082d0..8ac1472bea341c1f653e1ea4d5479d8928b9abb3 100644 (file)
@@ -9466,7 +9466,7 @@ F:        Documentation/mm/hmm.rst
 F:     include/linux/hmm*
 F:     lib/test_hmm*
 F:     mm/hmm*
-F:     tools/testing/selftests/vm/*hmm*
+F:     tools/testing/selftests/mm/*hmm*
 
 HOST AP DRIVER
 M:     Jouni Malinen <j@w1.fi>
@@ -13484,7 +13484,7 @@ F:      include/linux/mmzone.h
 F:     include/linux/pagewalk.h
 F:     mm/
 F:     tools/mm/
-F:     tools/testing/selftests/vm/
+F:     tools/testing/selftests/mm/
 
 VMALLOC
 M:     Andrew Morton <akpm@linux-foundation.org>
index ff7b209dec05574e2af34147139fd678ea207d57..39df30dcabe35f907810cfe6d7350325d5404a97 100644 (file)
@@ -1073,7 +1073,7 @@ config GUP_TEST
          pin_user_pages*(), or pinned via get_user_pages*(), as specified
          by other command line arguments.
 
-         See tools/testing/selftests/vm/gup_test.c
+         See tools/testing/selftests/mm/gup_test.c
 
 comment "GUP_TEST needs to have DEBUG_FS enabled"
        depends on !GUP_TEST && !DEBUG_FS
index 41b649452560c3d91936c4ac8ba46df6e75c9736..56a29f2de8e665c99f8419288fb7563136d8ed0d 100644 (file)
@@ -85,7 +85,7 @@ TARGETS += tmpfs
 TARGETS += tpm2
 TARGETS += user
 TARGETS += vDSO
-TARGETS += vm
+TARGETS += mm
 TARGETS += x86
 TARGETS += zram
 #Please keep the TARGETS list alphabetically sorted
index 7424a1f5babcf846f985a29bf2a722feaba5e498..4bc14d9e8ff1d07a11e77d6af7c636b34ed77bf9 100755 (executable)
@@ -12,9 +12,9 @@ usage()
 
 echo -e "Usage: $0 -[p] <compiler> [test_name]\n"
 echo -e "\tkselftest_deps.sh [-p] gcc"
-echo -e "\tkselftest_deps.sh [-p] gcc vm"
+echo -e "\tkselftest_deps.sh [-p] gcc mm"
 echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc"
-echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n"
+echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc mm\n"
 echo "- Should be run in selftests directory in the kernel repo."
 echo "- Checks if Kselftests can be built/cross-built on a system."
 echo "- Parses all test/sub-test Makefile to find library dependencies."
@@ -120,7 +120,7 @@ l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \
 # Level 2
 # Some tests have multiple valid LDLIBS lines for individual sub-tests
 # that need dependency checks. Find them and append them to the tests
-# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+# e.g: mm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
 # Filter out VAR_LDLIBS to discard the following:
 #      memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
 # Append space at the end of the list to append more tests.
diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
new file mode 100644 (file)
index 0000000..1f8c36a
--- /dev/null
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0-only
+cow
+hugepage-mmap
+hugepage-mremap
+hugepage-shm
+hugepage-vmemmap
+hugetlb-madvise
+khugepaged
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+migration
+mlock2-tests
+mrelease_test
+mremap_dontunmap
+mremap_test
+on-fault-limit
+transhuge-stress
+protection_keys
+protection_keys_32
+protection_keys_64
+madv_populate
+userfaultfd
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_test
+va_128TBswitch
+map_fixed_noreplace
+write_to_hugetlbfs
+hmm-tests
+memfd_secret
+soft-dirty
+split_huge_page_test
+ksm_tests
+local_config.h
+local_config.mk
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
new file mode 100644 (file)
index 0000000..6a4b639
--- /dev/null
@@ -0,0 +1,180 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mm selftests
+
+LOCAL_HDRS += $(selfdir)/mm/local_config.h $(top_srcdir)/mm/gup_test.h
+
+include local_config.mk
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
+
+# Without this, failed build products remain, with up-to-date timestamps,
+# thus tricking Make (and you!) into believing that All Is Well, in subsequent
+# make invocations:
+.DELETE_ON_ERROR:
+
+# Avoid accidental wrong builds, due to built-in rules working just a little
+# bit too well--but not quite as well as required for our situation here.
+#
+# In other words, "make userfaultfd" is supposed to fail to build at all,
+# because this Makefile only supports either "make" (all), or "make /full/path".
+# However,  the built-in rules, if not suppressed, will pick up CFLAGS and the
+# initial LDLIBS (but not the target-specific LDLIBS, because those are only
+# set for the full path target!). This causes it to get pretty far into building
+# things despite using incorrect values such as an *occasionally* incomplete
+# LDLIBS.
+MAKEFLAGS += --no-builtin-rules
+
+CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
+LDLIBS = -lrt -lpthread
+TEST_GEN_FILES = cow
+TEST_GEN_FILES += compaction_test
+TEST_GEN_FILES += gup_test
+TEST_GEN_FILES += hmm-tests
+TEST_GEN_FILES += hugetlb-madvise
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-mremap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += hugepage-vmemmap
+TEST_GEN_FILES += khugepaged
+TEST_GEN_PROGS = madv_populate
+TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += memfd_secret
+TEST_GEN_FILES += migration
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mrelease_test
+TEST_GEN_FILES += mremap_dontunmap
+TEST_GEN_FILES += mremap_test
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += userfaultfd
+TEST_GEN_PROGS += soft-dirty
+TEST_GEN_PROGS += split_huge_page_test
+TEST_GEN_FILES += ksm_tests
+TEST_GEN_PROGS += ksm_functional_tests
+
+ifeq ($(MACHINE),x86_64)
+CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
+
+VMTARGETS := protection_keys
+BINARIES_32 := $(VMTARGETS:%=%_32)
+BINARIES_64 := $(VMTARGETS:%=%_64)
+
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+ifeq ($(CAN_BUILD_I386),1)
+TEST_GEN_FILES += $(BINARIES_32)
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+TEST_GEN_FILES += $(BINARIES_64)
+endif
+else
+
+ifneq (,$(findstring $(MACHINE),ppc64))
+TEST_GEN_FILES += protection_keys
+endif
+
+endif
+
+ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
+TEST_GEN_FILES += va_128TBswitch
+TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += write_to_hugetlbfs
+endif
+
+TEST_PROGS := run_vmtests.sh
+
+TEST_FILES := test_vmalloc.sh
+TEST_FILES += test_hmm.sh
+TEST_FILES += va_128TBswitch.sh
+
+include ../lib.mk
+
+$(OUTPUT)/cow: vm_util.c
+$(OUTPUT)/khugepaged: vm_util.c
+$(OUTPUT)/ksm_functional_tests: vm_util.c
+$(OUTPUT)/madv_populate: vm_util.c
+$(OUTPUT)/soft-dirty: vm_util.c
+$(OUTPUT)/split_huge_page_test: vm_util.c
+$(OUTPUT)/userfaultfd: vm_util.c
+
+ifeq ($(MACHINE),x86_64)
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+$(BINARIES_32): CFLAGS += -m32 -mxsave
+$(BINARIES_32): LDLIBS += -lrt -ldl -lm
+$(BINARIES_32): $(OUTPUT)/%_32: %.c
+       $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+$(BINARIES_64): CFLAGS += -m64 -mxsave
+$(BINARIES_64): LDLIBS += -lrt -ldl
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
+       $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+warn_32bit_failure:
+       @echo "Warning: you seem to have a broken 32-bit build" 2>&1;           \
+       echo  "environment. This will reduce test coverage of 64-bit" 2>&1;     \
+       echo  "kernels. If you are using a Debian-like distribution," 2>&1;     \
+       echo  "try:"; 2>&1;                                                     \
+       echo  "";                                                               \
+       echo  "  apt-get install gcc-multilib libc6-i386 libc6-dev-i386";       \
+       echo  "";                                                               \
+       echo  "If you are using a Fedora-like distribution, try:";              \
+       echo  "";                                                               \
+       echo  "  yum install glibc-devel.*i686";                                \
+       exit 0;
+endif
+endif
+
+# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS)
+
+$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
+
+$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
+
+$(OUTPUT)/migration: LDLIBS += -lnuma
+
+local_config.mk local_config.h: check_config.sh
+       /bin/sh ./check_config.sh $(CC)
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(COW_EXTRA_LIBS),)
+all: warn_missing_liburing
+
+warn_missing_liburing:
+       @echo ; \
+       echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
+       echo
+endif
diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh
new file mode 100644 (file)
index 0000000..a5cb4b0
--- /dev/null
@@ -0,0 +1,584 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+  echo "This test must be run as root. Skipping..."
+  exit $ksft_skip
+fi
+
+fault_limit_file=limit_in_bytes
+reservation_limit_file=rsvd.limit_in_bytes
+fault_usage_file=usage_in_bytes
+reservation_usage_file=rsvd.usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+  cgroup2=1
+  fault_limit_file=max
+  reservation_limit_file=rsvd.max
+  fault_usage_file=current
+  reservation_usage_file=rsvd.current
+fi
+
+if [[ $cgroup2 ]]; then
+  cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+  if [[ -z "$cgroup_path" ]]; then
+    cgroup_path=/dev/cgroup/memory
+    mount -t cgroup2 none $cgroup_path
+    do_umount=1
+  fi
+  echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
+else
+  cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+  if [[ -z "$cgroup_path" ]]; then
+    cgroup_path=/dev/cgroup/memory
+    mount -t cgroup memory,hugetlb $cgroup_path
+    do_umount=1
+  fi
+fi
+export cgroup_path
+
+function cleanup() {
+  if [[ $cgroup2 ]]; then
+    echo $$ >$cgroup_path/cgroup.procs
+  else
+    echo $$ >$cgroup_path/tasks
+  fi
+
+  if [[ -e /mnt/huge ]]; then
+    rm -rf /mnt/huge/*
+    umount /mnt/huge || echo error
+    rmdir /mnt/huge
+  fi
+  if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then
+    rmdir $cgroup_path/hugetlb_cgroup_test
+  fi
+  if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then
+    rmdir $cgroup_path/hugetlb_cgroup_test1
+  fi
+  if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then
+    rmdir $cgroup_path/hugetlb_cgroup_test2
+  fi
+  echo 0 >/proc/sys/vm/nr_hugepages
+  echo CLEANUP DONE
+}
+
+function expect_equal() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$expected" != "$actual" ]]; then
+    echo "expected ($expected) != actual ($actual): $3"
+    cleanup
+    exit 1
+  fi
+}
+
+function get_machine_hugepage_size() {
+  hpz=$(grep -i hugepagesize /proc/meminfo)
+  kb=${hpz:14:-3}
+  mb=$(($kb / 1024))
+  echo $mb
+}
+
+MB=$(get_machine_hugepage_size)
+
+function setup_cgroup() {
+  local name="$1"
+  local cgroup_limit="$2"
+  local reservation_limit="$3"
+
+  mkdir $cgroup_path/$name
+
+  echo writing cgroup limit: "$cgroup_limit"
+  echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
+
+  echo writing reseravation limit: "$reservation_limit"
+  echo "$reservation_limit" > \
+    $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
+
+  if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then
+    echo 0 >$cgroup_path/$name/cpuset.cpus
+  fi
+  if [ -e "$cgroup_path/$name/cpuset.mems" ]; then
+    echo 0 >$cgroup_path/$name/cpuset.mems
+  fi
+}
+
+function wait_for_hugetlb_memory_to_get_depleted() {
+  local cgroup="$1"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  # Wait for hugetlbfs memory to get depleted.
+  while [ $(cat $path) != 0 ]; do
+    echo Waiting for hugetlb memory to get depleted.
+    cat $path
+    sleep 0.5
+  done
+}
+
+function wait_for_hugetlb_memory_to_get_reserved() {
+  local cgroup="$1"
+  local size="$2"
+
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  # Wait for hugetlbfs memory to get written.
+  while [ $(cat $path) != $size ]; do
+    echo Waiting for hugetlb memory reservation to reach size $size.
+    cat $path
+    sleep 0.5
+  done
+}
+
+function wait_for_hugetlb_memory_to_get_written() {
+  local cgroup="$1"
+  local size="$2"
+
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
+  # Wait for hugetlbfs memory to get written.
+  while [ $(cat $path) != $size ]; do
+    echo Waiting for hugetlb memory to reach size $size.
+    cat $path
+    sleep 0.5
+  done
+}
+
+function write_hugetlbfs_and_get_usage() {
+  local cgroup="$1"
+  local size="$2"
+  local populate="$3"
+  local write="$4"
+  local path="$5"
+  local method="$6"
+  local private="$7"
+  local expect_failure="$8"
+  local reserve="$9"
+
+  # Function return values.
+  reservation_failed=0
+  oom_killed=0
+  hugetlb_difference=0
+  reserved_difference=0
+
+  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
+  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
+
+  local hugetlb_before=$(cat $hugetlb_usage)
+  local reserved_before=$(cat $reserved_usage)
+
+  echo
+  echo Starting:
+  echo hugetlb_usage="$hugetlb_before"
+  echo reserved_usage="$reserved_before"
+  echo expect_failure is "$expect_failure"
+
+  output=$(mktemp)
+  set +e
+  if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] ||
+    [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then
+
+    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+      "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output &
+
+    local write_result=$?
+    local write_pid=$!
+
+    until grep -q -i "DONE" $output; do
+      echo waiting for DONE signal.
+      if ! ps $write_pid > /dev/null
+      then
+        echo "FAIL: The write died"
+        cleanup
+        exit 1
+      fi
+      sleep 0.5
+    done
+
+    echo ================= write_hugetlb_memory.sh output is:
+    cat $output
+    echo ================= end output.
+
+    if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
+      wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
+    elif [[ "$reserve" != "-n" ]]; then
+      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+    else
+      # This case doesn't produce visible effects, but we still have
+      # to wait for the async process to start and execute...
+      sleep 0.5
+    fi
+
+    echo write_result is $write_result
+  else
+    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+      "$cgroup" "$path" "$method" "$private" "$reserve"
+    local write_result=$?
+
+    if [[ "$reserve" != "-n" ]]; then
+      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+    fi
+  fi
+  set -e
+
+  if [[ "$write_result" == 1 ]]; then
+    reservation_failed=1
+  fi
+
+  # On linus/master, the above process gets SIGBUS'd on oomkill, with
+  # return code 135. On earlier kernels, it gets actual oomkill, with return
+  # code 137, so just check for both conditions in case we're testing
+  # against an earlier kernel.
+  if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
+    oom_killed=1
+  fi
+
+  local hugetlb_after=$(cat $hugetlb_usage)
+  local reserved_after=$(cat $reserved_usage)
+
+  echo After write:
+  echo hugetlb_usage="$hugetlb_after"
+  echo reserved_usage="$reserved_after"
+
+  hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
+  reserved_difference=$(($reserved_after - $reserved_before))
+}
+
+function cleanup_hugetlb_memory() {
+  set +e
+  local cgroup="$1"
+  if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then
+    echo killing write_to_hugetlbfs
+    killall -2 write_to_hugetlbfs
+    wait_for_hugetlb_memory_to_get_depleted $cgroup
+  fi
+  set -e
+
+  if [[ -e /mnt/huge ]]; then
+    rm -rf /mnt/huge/*
+    umount /mnt/huge
+    rmdir /mnt/huge
+  fi
+}
+
+function run_test() {
+  local size=$(($1 * ${MB} * 1024 * 1024))
+  local populate="$2"
+  local write="$3"
+  local cgroup_limit=$(($4 * ${MB} * 1024 * 1024))
+  local reservation_limit=$(($5 * ${MB} * 1024 * 1024))
+  local nr_hugepages="$6"
+  local method="$7"
+  local private="$8"
+  local expect_failure="$9"
+  local reserve="${10}"
+
+  # Function return values.
+  hugetlb_difference=0
+  reserved_difference=0
+  reservation_failed=0
+  oom_killed=0
+
+  echo nr hugepages = "$nr_hugepages"
+  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+  setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
+
+  mkdir -p /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
+    "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
+    "$reserve"
+
+  cleanup_hugetlb_memory "hugetlb_cgroup_test"
+
+  local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file)
+  local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file)
+
+  echo $hugetlb_difference
+  echo $reserved_difference
+  expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" "final reservation is not zero"
+}
+
+function run_multiple_cgroup_test() {
+  local size1="$1"
+  local populate1="$2"
+  local write1="$3"
+  local cgroup_limit1="$4"
+  local reservation_limit1="$5"
+
+  local size2="$6"
+  local populate2="$7"
+  local write2="$8"
+  local cgroup_limit2="$9"
+  local reservation_limit2="${10}"
+
+  local nr_hugepages="${11}"
+  local method="${12}"
+  local private="${13}"
+  local expect_failure="${14}"
+  local reserve="${15}"
+
+  # Function return values.
+  hugetlb_difference1=0
+  reserved_difference1=0
+  reservation_failed1=0
+  oom_killed1=0
+
+  hugetlb_difference2=0
+  reserved_difference2=0
+  reservation_failed2=0
+  oom_killed2=0
+
+  echo nr hugepages = "$nr_hugepages"
+  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+  setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
+  setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
+
+  mkdir -p /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
+    "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
+    "$expect_failure" "$reserve"
+
+  hugetlb_difference1=$hugetlb_difference
+  reserved_difference1=$reserved_difference
+  reservation_failed1=$reservation_failed
+  oom_killed1=$oom_killed
+
+  local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
+  local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
+  local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
+  local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
+
+  local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
+  local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
+    "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
+    "$expect_failure" "$reserve"
+
+  hugetlb_difference2=$hugetlb_difference
+  reserved_difference2=$reserved_difference
+  reservation_failed2=$reservation_failed
+  oom_killed2=$oom_killed
+
+  expect_equal "$usage_before_second_write" \
+    "$(cat $cgroup1_hugetlb_usage)" "Usage changed."
+  expect_equal "$reservation_usage_before_second_write" \
+    "$(cat $cgroup1_reservation_usage)" "Reservation usage changed."
+
+  cleanup_hugetlb_memory
+
+  local final_hugetlb=$(cat $cgroup1_hugetlb_usage)
+  local final_reservation=$(cat $cgroup1_reservation_usage)
+
+  expect_equal "0" "$final_hugetlb" \
+    "hugetlbt_cgroup_test1 final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" \
+    "hugetlbt_cgroup_test1 final reservation is not zero"
+
+  local final_hugetlb=$(cat $cgroup2_hugetlb_usage)
+  local final_reservation=$(cat $cgroup2_reservation_usage)
+
+  expect_equal "0" "$final_hugetlb" \
+    "hugetlb_cgroup_test2 final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" \
+    "hugetlb_cgroup_test2 final reservation is not zero"
+}
+
+cleanup
+
+for populate in "" "-o"; do
+  for method in 0 1 2; do
+    for private in "" "-r"; do
+      for reserve in "" "-n"; do
+
+        # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
+        if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
+          continue
+        fi
+
+        # Skip populated shmem tests. Doesn't seem to be supported.
+        if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
+          continue
+        fi
+
+        if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
+          continue
+        fi
+
+        cleanup
+        echo
+        echo
+        echo
+        echo Test normal case.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb=$hugetlb_difference
+        echo Memory charged to reservation=$reserved_difference
+
+        if [[ "$populate" == "-o" ]]; then
+          expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+            "Reserved memory charged to hugetlb cgroup."
+        else
+          expect_equal "0" "$hugetlb_difference" \
+            "Reserved memory charged to hugetlb cgroup."
+        fi
+
+        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+          expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+            "Reserved memory not charged to reservation usage."
+        else
+          expect_equal "0" "$reserved_difference" \
+            "Reserved memory not charged to reservation usage."
+        fi
+
+        echo 'PASS'
+
+        cleanup
+        echo
+        echo
+        echo
+        echo Test normal case with write.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb=$hugetlb_difference
+        echo Memory charged to reservation=$reserved_difference
+
+        expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+          "Reserved memory charged to hugetlb cgroup."
+
+        expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+          "Reserved memory not charged to reservation usage."
+
+        echo 'PASS'
+
+        cleanup
+        continue
+        echo
+        echo
+        echo
+        echo Test more than reservation case.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+        if [ "$reserve" != "-n" ]; then
+          run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
+            "$reserve"
+
+          expect_equal "1" "$reservation_failed" "Reservation succeeded."
+        fi
+
+        echo 'PASS'
+
+        cleanup
+
+        echo
+        echo
+        echo
+        echo Test more than cgroup limit case.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+        # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
+        if [[ "$method" != 2 ]]; then
+          run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"
+
+          expect_equal "1" "$oom_killed" "Not oom killed."
+        fi
+        echo 'PASS'
+
+        cleanup
+
+        echo
+        echo
+        echo
+        echo Test normal case, multiple cgroups.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
+          "$populate" "" "10" "10" "10" \
+          "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb1=$hugetlb_difference1
+        echo Memory charged to reservation1=$reserved_difference1
+        echo Memory charged to hugtlb2=$hugetlb_difference2
+        echo Memory charged to reservation2=$reserved_difference2
+
+        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+          expect_equal "3" "$reserved_difference1" \
+            "Incorrect reservations charged to cgroup 1."
+
+          expect_equal "5" "$reserved_difference2" \
+            "Incorrect reservation charged to cgroup 2."
+
+        else
+          expect_equal "0" "$reserved_difference1" \
+            "Incorrect reservations charged to cgroup 1."
+
+          expect_equal "0" "$reserved_difference2" \
+            "Incorrect reservation charged to cgroup 2."
+        fi
+
+        if [[ "$populate" == "-o" ]]; then
+          expect_equal "3" "$hugetlb_difference1" \
+            "Incorrect hugetlb charged to cgroup 1."
+
+          expect_equal "5" "$hugetlb_difference2" \
+            "Incorrect hugetlb charged to cgroup 2."
+
+        else
+          expect_equal "0" "$hugetlb_difference1" \
+            "Incorrect hugetlb charged to cgroup 1."
+
+          expect_equal "0" "$hugetlb_difference2" \
+            "Incorrect hugetlb charged to cgroup 2."
+        fi
+        echo 'PASS'
+
+        cleanup
+        echo
+        echo
+        echo
+        echo Test normal case with write, multiple cgroups.
+        echo private=$private, populate=$populate, method=$method, reserve=$reserve
+        run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
+          "$populate" "-w" "10" "10" "10" \
+          "$method" "$private" "0" "$reserve"
+
+        echo Memory charged to hugtlb1=$hugetlb_difference1
+        echo Memory charged to reservation1=$reserved_difference1
+        echo Memory charged to hugtlb2=$hugetlb_difference2
+        echo Memory charged to reservation2=$reserved_difference2
+
+        expect_equal "3" "$hugetlb_difference1" \
+          "Incorrect hugetlb charged to cgroup 1."
+
+        expect_equal "3" "$reserved_difference1" \
+          "Incorrect reservation charged to cgroup 1."
+
+        expect_equal "5" "$hugetlb_difference2" \
+          "Incorrect hugetlb charged to cgroup 2."
+
+        expect_equal "5" "$reserved_difference2" \
+          "Incorrected reservation charged to cgroup 2."
+        echo 'PASS'
+
+        cleanup
+
+      done # reserve
+    done   # private
+  done     # populate
+done       # method
+
+if [[ $do_umount ]]; then
+  umount $cgroup_path
+  rmdir $cgroup_path
+fi
diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh
new file mode 100644 (file)
index 0000000..bcba3af
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+# liburing
+echo "#include <sys/types.h>"        > $tmpfile_c
+echo "#include <liburing.h>"        >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+    echo "#define LOCAL_CONFIG_HAVE_LIBURING 1"  > $OUTPUT_H_FILE
+    echo "COW_EXTRA_LIBS = -luring"              > $OUTPUT_MKFILE
+else
+    echo "// No liburing support found"          > $OUTPUT_H_FILE
+    echo "# No liburing support found, so:"      > $OUTPUT_MKFILE
+    echo "COW_EXTRA_LIBS = "                    >> $OUTPUT_MKFILE
+fi
+
+rm ${tmpname}.*
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
new file mode 100644 (file)
index 0000000..9b42014
--- /dev/null
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define MAP_SIZE_MB    100
+#define MAP_SIZE       (MAP_SIZE_MB * 1024 * 1024)
+
+/* Singly-linked list node tracking one MAP_SIZE mmap()ed region. */
+struct map_list {
+       void *map;
+       struct map_list *next;
+};
+
+/*
+ * Read MemFree and Hugepagesize (both in kB, as printed by /proc/meminfo)
+ * into *memfree and *hugepagesize via a cat|grep pipeline.
+ * Returns 0 on success, -1 on failure.
+ */
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+       char  buffer[256] = {0};
+       char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
+       FILE *cmdfile = popen(cmd, "r");
+
+       /* popen() returns NULL when the shell cannot be spawned. */
+       if (!cmdfile) {
+               perror("Failed to run meminfo command\n");
+               return -1;
+       }
+
+       if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+               perror("Failed to read meminfo\n");
+               pclose(cmdfile);
+               return -1;
+       }
+
+       pclose(cmdfile);
+
+       *memfree = atoll(buffer);
+       cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
+       cmdfile = popen(cmd, "r");
+
+       if (!cmdfile) {
+               perror("Failed to run meminfo command\n");
+               return -1;
+       }
+
+       if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+               perror("Failed to read meminfo\n");
+               pclose(cmdfile);
+               return -1;
+       }
+
+       pclose(cmdfile);
+       *hugepagesize = atoll(buffer);
+
+       return 0;
+}
+
+/*
+ * Check that /proc/sys/vm/compact_unevictable_allowed can be read and
+ * contains '1'.  Returns 0 when compaction of unevictable pages is
+ * allowed, -1 otherwise.
+ */
+int prereq(void)
+{
+       char allowed = '0';
+       int fd = open("/proc/sys/vm/compact_unevictable_allowed",
+                     O_RDONLY | O_NONBLOCK);
+
+       if (fd < 0) {
+               perror("Failed to open\n"
+                      "/proc/sys/vm/compact_unevictable_allowed\n");
+               return -1;
+       }
+
+       if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
+               perror("Failed to read from\n"
+                      "/proc/sys/vm/compact_unevictable_allowed\n");
+               close(fd);
+               return -1;
+       }
+
+       close(fd);
+
+       return allowed == '1' ? 0 : -1;
+}
+
+/*
+ * Ask for a huge number of huge pages and verify that at least a third of
+ * the (80%-scaled) free memory could be obtained as huge pages.  The
+ * previous nr_hugepages value is restored before a successful return.
+ * Returns 0 on success, -1 on failure.
+ */
+int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
+{
+       int fd;
+       int allocated;
+       int compaction_index = 0;
+       char initial_nr_hugepages[10] = {0};
+       char nr_hugepages[10] = {0};
+
+       /* We want to test with 80% of available memory. Else, OOM killer comes
+          in to play */
+       mem_free = mem_free * 0.8;
+
+       fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+       if (fd < 0) {
+               perror("Failed to open /proc/sys/vm/nr_hugepages");
+               return -1;
+       }
+
+       /* Remember the old value so it can be restored before returning. */
+       if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
+               perror("Failed to read from /proc/sys/vm/nr_hugepages");
+               goto close_fd;
+       }
+
+       /* Start with the initial condition of 0 huge pages*/
+       if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+               perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
+               goto close_fd;
+       }
+
+       lseek(fd, 0, SEEK_SET);
+
+       /* Request a large number of huge pages. The Kernel will allocate
+          as much as it can */
+       if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+               perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
+               goto close_fd;
+       }
+
+       lseek(fd, 0, SEEK_SET);
+
+       if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
+               perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
+               goto close_fd;
+       }
+
+       /* We should have been able to request at least 1/3 rd of the memory in
+          huge pages */
+       allocated = atoi(nr_hugepages);
+       if (allocated == 0 || hugepage_size == 0) {
+               /* Guard the division below against a zero denominator. */
+               fprintf(stderr, "ERROR: No huge pages were allocated\n");
+               goto close_fd;
+       }
+       compaction_index = mem_free/(allocated * hugepage_size);
+
+       if (compaction_index > 3) {
+               printf("No of huge pages allocated = %d\n", allocated);
+               fprintf(stderr, "ERROR: Less than 1/%d of memory is available\n"
+                       "as huge pages\n", compaction_index);
+               goto close_fd;
+       }
+
+       printf("No of huge pages allocated = %d\n", allocated);
+
+       lseek(fd, 0, SEEK_SET);
+
+       if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
+           != strlen(initial_nr_hugepages)) {
+               perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
+               goto close_fd;
+       }
+
+       close(fd);
+       return 0;
+
+ close_fd:
+       close(fd);
+       printf("Not OK. Compaction test failed.");
+       return -1;
+}
+
+
+/*
+ * Fragment up to 80% of free memory with dirtied, mlocked mappings, then
+ * check that compaction can still produce huge pages.  Skips when the
+ * compact_unevictable_allowed sysctl is not readable or not set to 1.
+ */
+int main(int argc, char **argv)
+{
+       struct rlimit lim;
+       struct map_list *list, *entry, *next;
+       size_t page_size, i;
+       void *map = NULL;
+       unsigned long mem_free = 0;
+       unsigned long hugepage_size = 0;
+       long mem_fragmentable_MB = 0;
+
+       if (prereq() != 0) {
+               printf("Either the sysctl compact_unevictable_allowed is not\n"
+                      "set to 1 or couldn't read the proc file.\n"
+                      "Skipping the test\n");
+               return KSFT_SKIP;
+       }
+
+       /* The MAP_LOCKED mappings below may exceed the default MEMLOCK limit. */
+       lim.rlim_cur = RLIM_INFINITY;
+       lim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
+               perror("Failed to set rlimit:\n");
+               return -1;
+       }
+
+       page_size = getpagesize();
+
+       list = NULL;
+
+       if (read_memory_info(&mem_free, &hugepage_size) != 0) {
+               printf("ERROR: Cannot read meminfo\n");
+               return -1;
+       }
+
+       mem_fragmentable_MB = mem_free * 0.8 / 1024;
+
+       /* Pin and dirty MAP_SIZE chunks until the budget is exhausted. */
+       while (mem_fragmentable_MB > 0) {
+               map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+                          MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+               if (map == MAP_FAILED)
+                       break;
+
+               entry = malloc(sizeof(struct map_list));
+               if (!entry) {
+                       munmap(map, MAP_SIZE);
+                       break;
+               }
+               entry->map = map;
+               entry->next = list;
+               list = entry;
+
+               /* Write something (in this case the address of the map) to
+                * ensure that KSM can't merge the mapped pages
+                */
+               for (i = 0; i < MAP_SIZE; i += page_size)
+                       *(unsigned long *)(map + i) = (unsigned long)map + i;
+
+               mem_fragmentable_MB -= MAP_SIZE_MB;
+       }
+
+       /*
+        * Unmap and free every entry.  The previous loop advanced the
+        * cursor twice per iteration, skipping (and leaking) every other
+        * mapping and all of the list nodes.
+        */
+       for (entry = list; entry != NULL; entry = next) {
+               next = entry->next;
+               munmap(entry->map, MAP_SIZE);
+               free(entry);
+       }
+
+       if (check_compaction(mem_free, hugepage_size) == 0)
+               return 0;
+
+       return -1;
+}
diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config
new file mode 100644 (file)
index 0000000..be087c4
--- /dev/null
@@ -0,0 +1,8 @@
+CONFIG_SYSVIPC=y
+CONFIG_USERFAULTFD=y
+CONFIG_TEST_VMALLOC=m
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_TEST_HMM=m
+CONFIG_GUP_TEST=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_MEM_SOFT_DIRTY=y
diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c
new file mode 100644 (file)
index 0000000..16216d8
--- /dev/null
@@ -0,0 +1,1764 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * COW (Copy On Write) tests.
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <linux/memfd.h>
+
+#include "local_config.h"
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+#include <liburing.h>
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+#include "../../../../mm/gup_test.h"
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
+static size_t pagesize;
+static int pagemap_fd;
+static size_t thpsize;
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+static int gup_fd;
+static bool has_huge_zeropage;
+
+/*
+ * Read the PMD THP size from sysfs into the global thpsize; leaves
+ * thpsize untouched (0) when the knob is absent or the value is bogus.
+ */
+static void detect_thpsize(void)
+{
+       char buf[15];
+       ssize_t nread;
+       int fd;
+
+       fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
+                 O_RDONLY);
+       if (fd < 0)
+               return;
+
+       nread = pread(fd, buf, sizeof(buf), 0);
+       if (nread > 0 && nread < (ssize_t)sizeof(buf)) {
+               size_t size;
+
+               buf[nread] = 0;
+               size = strtoul(buf, NULL, 10);
+               /* Ignore values smaller than the base page size. */
+               if (size >= pagesize && size > 0) {
+                       thpsize = size;
+                       ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
+                                      thpsize / 1024);
+               }
+       }
+
+       close(fd);
+}
+
+/* Set has_huge_zeropage when THP's use_zero_page knob reads as 1. */
+static void detect_huge_zeropage(void)
+{
+       char buf[15];
+       ssize_t nread;
+       int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
+                     O_RDONLY);
+
+       if (fd < 0)
+               return;
+
+       nread = pread(fd, buf, sizeof(buf), 0);
+       if (nread > 0 && nread < (ssize_t)sizeof(buf)) {
+               buf[nread] = 0;
+               if (strtoul(buf, NULL, 10) == 1) {
+                       has_huge_zeropage = true;
+                       ksft_print_msg("[INFO] huge zeropage is enabled\n");
+               }
+       }
+
+       close(fd);
+}
+
+/*
+ * Scan /sys/kernel/mm/hugepages/ for hugepages-<N>kB directories and record
+ * each size in bytes into hugetlbsizes[], bounded by the array capacity.
+ * Leaves nr_hugetlbsizes at 0 when the sysfs directory is absent.
+ */
+static void detect_hugetlbsizes(void)
+{
+       DIR *dir = opendir("/sys/kernel/mm/hugepages/");
+
+       if (!dir)
+               return;
+
+       while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
+               struct dirent *entry = readdir(dir);
+               size_t kb;
+
+               if (!entry)
+                       break;
+               /* Only "hugepages-<size>kB" subdirectories are of interest. */
+               if (entry->d_type != DT_DIR)
+                       continue;
+               if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
+                       continue;
+               hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
+               nr_hugetlbsizes++;
+               ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
+                              kb);
+       }
+       closedir(dir);
+}
+
+/* Return true when every page in [addr, addr + size) is currently swapped. */
+static bool range_is_swapped(void *addr, size_t size)
+{
+       while (size) {
+               if (!pagemap_is_swapped(pagemap_fd, addr))
+                       return false;
+               addr += pagesize;
+               size -= pagesize;
+       }
+       return true;
+}
+
+/* Pair of pipes used to synchronize a forked child with its parent. */
+struct comm_pipes {
+       int child_ready[2];
+       int parent_ready[2];
+};
+
+/*
+ * Create both synchronization pipes.  Returns 0 on success or -errno on
+ * failure; on failure no descriptors are left open.
+ */
+static int setup_comm_pipes(struct comm_pipes *comm_pipes)
+{
+       if (pipe(comm_pipes->child_ready) < 0)
+               return -errno;
+       if (pipe(comm_pipes->parent_ready) < 0) {
+               /* Undo the first pipe so the caller has nothing to clean up. */
+               close(comm_pipes->child_ready[0]);
+               close(comm_pipes->child_ready[1]);
+               return -errno;
+       }
+
+       return 0;
+}
+
+/* Close all four descriptors created by setup_comm_pipes(). */
+static void close_comm_pipes(struct comm_pipes *comm_pipes)
+{
+       int i;
+
+       for (i = 0; i < 2; i++) {
+               close(comm_pipes->child_ready[i]);
+               close(comm_pipes->parent_ready[i]);
+       }
+}
+
+/*
+ * Child side: snapshot the buffer, wait for the parent to modify its copy,
+ * then report whether our view still holds the old content (0 == unchanged).
+ */
+static int child_memcmp_fn(char *mem, size_t size,
+                          struct comm_pipes *comm_pipes)
+{
+       char *old = malloc(size);
+       char buf;
+
+       if (!old) {
+               /* Still perform the handshake so the parent doesn't block. */
+               write(comm_pipes->child_ready[1], "0", 1);
+               while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+                       ;
+               return -ENOMEM;
+       }
+
+       /* Backup the original content. */
+       memcpy(old, mem, size);
+
+       /* Wait until the parent modified the page. */
+       write(comm_pipes->child_ready[1], "0", 1);
+       while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+               ;
+
+       /* See if we still read the old values. */
+       return memcmp(old, mem, size);
+}
+
+/*
+ * Child side: take a R/O vmsplice() pin on the buffer, unmap it, wait for
+ * the parent to modify its copy, then verify the pipe still yields the old
+ * content (0 == unchanged).  Setup failures return negative errno values,
+ * matching the function's existing early-return style.
+ */
+static int child_vmsplice_memcmp_fn(char *mem, size_t size,
+                                   struct comm_pipes *comm_pipes)
+{
+       struct iovec iov = {
+               .iov_base = mem,
+               .iov_len = size,
+       };
+       ssize_t cur, total, transferred;
+       char *old, *new;
+       int fds[2];
+       char buf;
+
+       old = malloc(size);
+       new = malloc(size);
+       if (!old || !new)
+               return -ENOMEM;
+
+       /* Backup the original content. */
+       memcpy(old, mem, size);
+
+       if (pipe(fds) < 0)
+               return -errno;
+
+       /* Trigger a read-only pin. */
+       transferred = vmsplice(fds[1], &iov, 1, 0);
+       if (transferred < 0)
+               return -errno;
+       if (transferred == 0)
+               return -EINVAL;
+
+       /* Unmap it from our page tables. */
+       if (munmap(mem, size) < 0)
+               return -errno;
+
+       /* Wait until the parent modified it. */
+       write(comm_pipes->child_ready[1], "0", 1);
+       while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
+               ;
+
+       /* See if we still read the old values via the pipe. */
+       for (total = 0; total < transferred; total += cur) {
+               cur = read(fds[0], new + total, transferred - total);
+               if (cur < 0)
+                       return -errno;
+       }
+
+       return memcmp(old, new, transferred);
+}
+
+typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
+
+/*
+ * Fork a child running fn(), then modify the parent's copy of the pages and
+ * check (via the child's exit status) that the modification did not leak
+ * into the child.  With do_mprotect, cycle the range through
+ * PROT_READ -> PROT_READ|PROT_WRITE first to poke mprotect() optimizations.
+ */
+static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
+                                 child_fn fn)
+{
+       struct comm_pipes comm_pipes;
+       char buf;
+       int ret;
+
+       ret = setup_comm_pipes(&comm_pipes);
+       if (ret) {
+               ksft_test_result_fail("pipe() failed\n");
+               return;
+       }
+
+       ret = fork();
+       if (ret < 0) {
+               ksft_test_result_fail("fork() failed\n");
+               goto close_comm_pipes;
+       } else if (!ret) {
+               /* Child: run the supplied check, report via exit status. */
+               exit(fn(mem, size, &comm_pipes));
+       }
+
+       /* Wait until the child has taken its snapshot / pin. */
+       while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+               ;
+
+       if (do_mprotect) {
+               /*
+                * mprotect() optimizations might try avoiding
+                * write-faults by directly mapping pages writable.
+                */
+               ret = mprotect(mem, size, PROT_READ);
+               ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
+               if (ret) {
+                       ksft_test_result_fail("mprotect() failed\n");
+                       /* Unblock the child before reaping it. */
+                       write(comm_pipes.parent_ready[1], "0", 1);
+                       wait(&ret);
+                       goto close_comm_pipes;
+               }
+       }
+
+       /* Modify the page. */
+       memset(mem, 0xff, size);
+       write(comm_pipes.parent_ready[1], "0", 1);
+
+       wait(&ret);
+       if (WIFEXITED(ret))
+               ret = WEXITSTATUS(ret);
+       else
+               ret = -EINVAL;
+
+       /* fn() returns 0 when the child still observed the old content. */
+       ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+       close_comm_pipes(&comm_pipes);
+}
+
+/* COW test: plain fork, child compares its view against a snapshot. */
+static void test_cow_in_parent(char *mem, size_t size)
+{
+       do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
+}
+
+/* Same as above, but cycle protections via mprotect() first. */
+static void test_cow_in_parent_mprotect(char *mem, size_t size)
+{
+       do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
+}
+
+/* COW test: child holds a vmsplice() R/O pin instead of a mapping. */
+static void test_vmsplice_in_child(char *mem, size_t size)
+{
+       do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
+}
+
+/* Same as above, but cycle protections via mprotect() first. */
+static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
+{
+       do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
+}
+
+/*
+ * The parent takes a vmsplice() R/O pin (before or after fork()), the child
+ * modifies the pages, and the parent verifies the pipe still yields the
+ * original content — i.e. the pin did not leak the child's modification.
+ */
+static void do_test_vmsplice_in_parent(char *mem, size_t size,
+                                      bool before_fork)
+{
+       struct iovec iov = {
+               .iov_base = mem,
+               .iov_len = size,
+       };
+       ssize_t cur, total, transferred;
+       struct comm_pipes comm_pipes;
+       char *old, *new;
+       int ret, fds[2];
+       char buf;
+
+       old = malloc(size);
+       new = malloc(size);
+       if (!old || !new) {
+               /* free(NULL) is a no-op, so the common exit path is safe. */
+               ksft_test_result_fail("malloc() failed\n");
+               goto free;
+       }
+
+       /* Snapshot the original content for the final comparison. */
+       memcpy(old, mem, size);
+
+       ret = setup_comm_pipes(&comm_pipes);
+       if (ret) {
+               ksft_test_result_fail("pipe() failed\n");
+               goto free;
+       }
+
+       if (pipe(fds) < 0) {
+               ksft_test_result_fail("pipe() failed\n");
+               goto close_comm_pipes;
+       }
+
+       if (before_fork) {
+               transferred = vmsplice(fds[1], &iov, 1, 0);
+               if (transferred <= 0) {
+                       ksft_test_result_fail("vmsplice() failed\n");
+                       goto close_pipe;
+               }
+       }
+
+       ret = fork();
+       if (ret < 0) {
+               ksft_test_result_fail("fork() failed\n");
+               goto close_pipe;
+       } else if (!ret) {
+               write(comm_pipes.child_ready[1], "0", 1);
+               while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+                       ;
+               /* Modify page content in the child. */
+               memset(mem, 0xff, size);
+               exit(0);
+       }
+
+       if (!before_fork) {
+               transferred = vmsplice(fds[1], &iov, 1, 0);
+               if (transferred <= 0) {
+                       ksft_test_result_fail("vmsplice() failed\n");
+                       wait(&ret);
+                       goto close_pipe;
+               }
+       }
+
+       while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+               ;
+       /* Drop our mapping; the pin must keep the old content alive. */
+       if (munmap(mem, size) < 0) {
+               ksft_test_result_fail("munmap() failed\n");
+               goto close_pipe;
+       }
+       write(comm_pipes.parent_ready[1], "0", 1);
+
+       /* Wait until the child is done writing. */
+       wait(&ret);
+       if (!WIFEXITED(ret)) {
+               ksft_test_result_fail("wait() failed\n");
+               goto close_pipe;
+       }
+
+       /* See if we still read the old values. */
+       for (total = 0; total < transferred; total += cur) {
+               cur = read(fds[0], new + total, transferred - total);
+               if (cur < 0) {
+                       ksft_test_result_fail("read() failed\n");
+                       goto close_pipe;
+               }
+       }
+
+       ksft_test_result(!memcmp(old, new, transferred),
+                        "No leak from child into parent\n");
+close_pipe:
+       close(fds[0]);
+       close(fds[1]);
+close_comm_pipes:
+       close_comm_pipes(&comm_pipes);
+free:
+       free(old);
+       free(new);
+}
+
+/* Pin with vmsplice() before fork(): the pinned pages must stay exclusive. */
+static void test_vmsplice_before_fork(char *mem, size_t size)
+{
+       do_test_vmsplice_in_parent(mem, size, true);
+}
+
+/* Pin with vmsplice() after fork(), while the pages are shared COW. */
+static void test_vmsplice_after_fork(char *mem, size_t size)
+{
+       do_test_vmsplice_in_parent(mem, size, false);
+}
+
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+/*
+ * Take a FOLL_WRITE|FOLL_PIN|FOLL_LONGTERM pin on the range via
+ * io_uring_register_buffers(), make the pages "interesting" (either share
+ * them with a child via fork(), or map them R/O+softdirty), modify them,
+ * then write them through the fixed-buffer pin into a temp file and verify
+ * the file matches the memory — i.e. the longterm pin stayed coherent.
+ * Skips (not fails) when io_uring setup lacks kernel support/permissions.
+ */
+static void do_test_iouring(char *mem, size_t size, bool use_fork)
+{
+       struct comm_pipes comm_pipes;
+       struct io_uring_cqe *cqe;
+       struct io_uring_sqe *sqe;
+       struct io_uring ring;
+       ssize_t cur, total;
+       struct iovec iov;
+       /* NOTE(review): buf is a char * but is only ever used as a 1-byte
+        * handshake target via &buf — a plain char would suffice. */
+       char *buf, *tmp;
+       int ret, fd;
+       FILE *file;
+
+       ret = setup_comm_pipes(&comm_pipes);
+       if (ret) {
+               ksft_test_result_fail("pipe() failed\n");
+               return;
+       }
+
+       file = tmpfile();
+       if (!file) {
+               ksft_test_result_fail("tmpfile() failed\n");
+               goto close_comm_pipes;
+       }
+       fd = fileno(file);
+       assert(fd);
+
+       tmp = malloc(size);
+       if (!tmp) {
+               ksft_test_result_fail("malloc() failed\n");
+               goto close_file;
+       }
+
+       /* Skip on errors, as we might just lack kernel support. */
+       ret = io_uring_queue_init(1, &ring, 0);
+       if (ret < 0) {
+               ksft_test_result_skip("io_uring_queue_init() failed\n");
+               goto free_tmp;
+       }
+
+       /*
+        * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
+        * | FOLL_LONGTERM the range.
+        *
+        * Skip on errors, as we might just lack kernel support or might not
+        * have sufficient MEMLOCK permissions.
+        */
+       iov.iov_base = mem;
+       iov.iov_len = size;
+       ret = io_uring_register_buffers(&ring, &iov, 1);
+       if (ret) {
+               ksft_test_result_skip("io_uring_register_buffers() failed\n");
+               goto queue_exit;
+       }
+
+       if (use_fork) {
+               /*
+                * fork() and keep the child alive until we're done. Note that
+                * we expect the pinned page to not get shared with the child.
+                */
+               ret = fork();
+               if (ret < 0) {
+                       ksft_test_result_fail("fork() failed\n");
+                       goto unregister_buffers;
+               } else if (!ret) {
+                       write(comm_pipes.child_ready[1], "0", 1);
+                       while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+                               ;
+                       exit(0);
+               }
+
+               while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+                       ;
+       } else {
+               /*
+                * Map the page R/O into the page table. Enable softdirty
+                * tracking to stop the page from getting mapped R/W immediately
+                * again by mprotect() optimizations. Note that we don't have an
+                * easy way to test if that worked (the pagemap does not export
+                * if the page is mapped R/O vs. R/W).
+                */
+               ret = mprotect(mem, size, PROT_READ);
+               clear_softdirty();
+               ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+               if (ret) {
+                       ksft_test_result_fail("mprotect() failed\n");
+                       goto unregister_buffers;
+               }
+       }
+
+       /*
+        * Modify the page and write page content as observed by the fixed
+        * buffer pin to the file so we can verify it.
+        */
+       memset(mem, 0xff, size);
+       sqe = io_uring_get_sqe(&ring);
+       if (!sqe) {
+               ksft_test_result_fail("io_uring_get_sqe() failed\n");
+               goto quit_child;
+       }
+       io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
+
+       ret = io_uring_submit(&ring);
+       if (ret < 0) {
+               ksft_test_result_fail("io_uring_submit() failed\n");
+               goto quit_child;
+       }
+
+       ret = io_uring_wait_cqe(&ring, &cqe);
+       if (ret < 0) {
+               ksft_test_result_fail("io_uring_wait_cqe() failed\n");
+               goto quit_child;
+       }
+
+       if (cqe->res != size) {
+               ksft_test_result_fail("write_fixed failed\n");
+               goto quit_child;
+       }
+       io_uring_cqe_seen(&ring, cqe);
+
+       /* Read back the file content to the temporary buffer. */
+       total = 0;
+       while (total < size) {
+               cur = pread(fd, tmp + total, size - total, total);
+               if (cur < 0) {
+                       ksft_test_result_fail("pread() failed\n");
+                       goto quit_child;
+               }
+               total += cur;
+       }
+
+       /* Finally, check if we read what we expected. */
+       ksft_test_result(!memcmp(mem, tmp, size),
+                        "Longterm R/W pin is reliable\n");
+
+quit_child:
+       if (use_fork) {
+               write(comm_pipes.parent_ready[1], "0", 1);
+               wait(&ret);
+       }
+unregister_buffers:
+       io_uring_unregister_buffers(&ring);
+queue_exit:
+       io_uring_queue_exit(&ring);
+free_tmp:
+       free(tmp);
+close_file:
+       fclose(file);
+close_comm_pipes:
+       close_comm_pipes(&comm_pipes);
+}
+
+/* Longterm pin test: exclusive pages first mapped R/O (softdirty trick). */
+static void test_iouring_ro(char *mem, size_t size)
+{
+       do_test_iouring(mem, size, false);
+}
+
+/* Longterm pin test: pages shared COW with a live child via fork(). */
+static void test_iouring_fork(char *mem, size_t size)
+{
+       do_test_iouring(mem, size, true);
+}
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+
+/* Variants of the R/O pin test, differing in how the pages are prepared. */
+enum ro_pin_test {
+       RO_PIN_TEST,                    /* no special preparation */
+       RO_PIN_TEST_SHARED,             /* pages shared with a live child */
+       RO_PIN_TEST_PREVIOUSLY_SHARED,  /* shared with a child that exited */
+       RO_PIN_TEST_RO_EXCLUSIVE,       /* mapped R/O via mprotect+softdirty */
+};
+
+/*
+ * Take a longterm R/O pin on @mem via the gup_test PIN_LONGTERM_TEST_*
+ * ioctls, modify @mem through the page table, and verify the modification
+ * is visible when reading back through the pin: a reliable R/O pin must
+ * have triggered unsharing before pinning.
+ *
+ * @mem:  start of the already-populated memory area to pin
+ * @size: size of that area in bytes
+ * @test: how to prepare the page mapping before pinning (enum ro_pin_test)
+ * @fast: use GUP-fast (PIN_LONGTERM_TEST_FLAG_USE_FAST) instead of slow GUP
+ */
+static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
+                          bool fast)
+{
+       struct pin_longterm_test args;
+       struct comm_pipes comm_pipes;
+       char *tmp, buf;
+       __u64 tmp_val;
+       int ret;
+
+       /* gup_fd is opened elsewhere; without it we cannot pin at all. */
+       if (gup_fd < 0) {
+               ksft_test_result_skip("gup_test not available\n");
+               return;
+       }
+
+       /* Buffer the pinned pages will be read back into for comparison. */
+       tmp = malloc(size);
+       if (!tmp) {
+               ksft_test_result_fail("malloc() failed\n");
+               return;
+       }
+
+       ret = setup_comm_pipes(&comm_pipes);
+       if (ret) {
+               ksft_test_result_fail("pipe() failed\n");
+               goto free_tmp;
+       }
+
+       switch (test) {
+       case RO_PIN_TEST:
+               break;
+       case RO_PIN_TEST_SHARED:
+       case RO_PIN_TEST_PREVIOUSLY_SHARED:
+               /*
+                * Share the pages with our child. As the pages are not pinned,
+                * this should just work.
+                */
+               ret = fork();
+               if (ret < 0) {
+                       ksft_test_result_fail("fork() failed\n");
+                       goto close_comm_pipes;
+               } else if (!ret) {
+                       /* Child: signal readiness, then block until told to quit. */
+                       write(comm_pipes.child_ready[1], "0", 1);
+                       while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
+                               ;
+                       exit(0);
+               }
+
+               /* Wait until our child is ready. */
+               while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+                       ;
+
+               if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
+                       /*
+                        * Tell the child to quit now and wait until it quit.
+                        * The pages should now be mapped R/O into our page
+                        * tables, but they are no longer shared.
+                        */
+                       write(comm_pipes.parent_ready[1], "0", 1);
+                       wait(&ret);
+                       if (!WIFEXITED(ret))
+                               ksft_print_msg("[INFO] wait() failed\n");
+               }
+               break;
+       case RO_PIN_TEST_RO_EXCLUSIVE:
+               /*
+                * Map the page R/O into the page table. Enable softdirty
+                * tracking to stop the page from getting mapped R/W immediately
+                * again by mprotect() optimizations. Note that we don't have an
+                * easy way to test if that worked (the pagemap does not export
+                * if the page is mapped R/O vs. R/W).
+                */
+               ret = mprotect(mem, size, PROT_READ);
+               clear_softdirty();
+               ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
+               if (ret) {
+                       ksft_test_result_fail("mprotect() failed\n");
+                       goto close_comm_pipes;
+               }
+               break;
+       default:
+               assert(false);
+       }
+
+       /* Take a R/O pin. This should trigger unsharing. */
+       args.addr = (__u64)(uintptr_t)mem;
+       args.size = size;
+       args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
+       ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
+       if (ret) {
+               /* EINVAL: the kernel cannot pin this configuration -> skip. */
+               if (errno == EINVAL)
+                       ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
+               else
+                       ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
+               goto wait;
+       }
+
+       /* Modify the page. */
+       memset(mem, 0xff, size);
+
+       /*
+        * Read back the content via the pin to the temporary buffer and
+        * test if we observed the modification.
+        */
+       tmp_val = (__u64)(uintptr_t)tmp;
+       ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
+       if (ret)
+               ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
+       else
+               ksft_test_result(!memcmp(mem, tmp, size),
+                                "Longterm R/O pin is reliable\n");
+
+       ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
+       if (ret)
+               ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
+wait:
+       /* If we forked a child that is still alive, tell it to quit and reap it. */
+       switch (test) {
+       case RO_PIN_TEST_SHARED:
+               write(comm_pipes.parent_ready[1], "0", 1);
+               wait(&ret);
+               if (!WIFEXITED(ret))
+                       ksft_print_msg("[INFO] wait() failed\n");
+               break;
+       default:
+               break;
+       }
+close_comm_pipes:
+       close_comm_pipes(&comm_pipes);
+free_tmp:
+       free(tmp);
+}
+
+/*
+ * Thin wrappers binding do_test_ro_pin() to one scenario (shared,
+ * previously-shared, exclusive) and one GUP flavor (slow GUP vs. GUP-fast),
+ * matching the test_fn signature used by the test case tables below.
+ */
+static void test_ro_pin_on_shared(char *mem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_shared(char *mem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
+}
+
+static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
+}
+
+static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
+}
+
+static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
+}
+
+/* Signature shared by all anonymous-memory test functions. */
+typedef void (*test_fn)(char *mem, size_t size);
+
+/*
+ * Run @fn on a single freshly-populated anonymous base page; when @swapout
+ * is set, the page is swapped out via MADV_PAGEOUT first (skipping the test
+ * if swapout did not actually happen).
+ */
+static void do_run_with_base_page(test_fn fn, bool swapout)
+{
+       char *mem;
+       int ret;
+
+       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (mem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               return;
+       }
+
+       /* Make sure we really get a base page and never a THP. */
+       ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
+       /* Ignore if not around on a kernel. */
+       if (ret && errno != EINVAL) {
+               ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+               goto munmap;
+       }
+
+       /* Populate a base page. */
+       memset(mem, 0, pagesize);
+
+       if (swapout) {
+               madvise(mem, pagesize, MADV_PAGEOUT);
+               if (!pagemap_is_swapped(pagemap_fd, mem)) {
+                       ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+                       goto munmap;
+               }
+       }
+
+       fn(mem, pagesize);
+munmap:
+       munmap(mem, pagesize);
+}
+
+/* Log the scenario, then run @fn on a plain anonymous base page. */
+static void run_with_base_page(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with base page\n", desc);
+       do_run_with_base_page(fn, false);
+}
+
+/* Log the scenario, then run @fn on a swapped-out anonymous base page. */
+static void run_with_base_page_swap(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
+       do_run_with_base_page(fn, true);
+}
+
+/* Mapping transformations do_run_with_thp() applies before running a test. */
+enum thp_run {
+       THP_RUN_PMD,
+       THP_RUN_PMD_SWAPOUT,
+       THP_RUN_PTE,
+       THP_RUN_PTE_SWAPOUT,
+       THP_RUN_SINGLE_PTE,
+       THP_RUN_SINGLE_PTE_SWAPOUT,
+       THP_RUN_PARTIAL_MREMAP,
+       THP_RUN_PARTIAL_SHARED,
+};
+
+/*
+ * Populate a THP-aligned, THP-backed anonymous area and run @fn on it after
+ * transforming the mapping according to @thp_run: keep it PMD-mapped,
+ * PTE-map it, reduce it to a single PTE, mremap() half of it, partially
+ * COW-share it with a child, and/or swap it out. Skips (rather than fails)
+ * when the kernel does not populate a THP or swapout does not work.
+ */
+static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
+{
+       char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
+       size_t size, mmap_size, mremap_size;
+       int ret;
+
+       /* For alignment purposes, we need twice the thp size. */
+       mmap_size = 2 * thpsize;
+       mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (mmap_mem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               return;
+       }
+
+       /* We need a THP-aligned memory area. */
+       mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+
+       ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+       if (ret) {
+               ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+               goto munmap;
+       }
+
+       /*
+        * Try to populate a THP. Touch the first sub-page and test if we get
+        * another sub-page populated automatically.
+        */
+       mem[0] = 0;
+       if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
+               ksft_test_result_skip("Did not get a THP populated\n");
+               goto munmap;
+       }
+       memset(mem, 0, thpsize);
+
+       size = thpsize;
+       switch (thp_run) {
+       case THP_RUN_PMD:
+       case THP_RUN_PMD_SWAPOUT:
+               break;
+       case THP_RUN_PTE:
+       case THP_RUN_PTE_SWAPOUT:
+               /*
+                * Trigger PTE-mapping the THP by temporarily mapping a single
+                * subpage R/O.
+                */
+               ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+               if (ret) {
+                       ksft_test_result_fail("mprotect() failed\n");
+                       goto munmap;
+               }
+               ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+               if (ret) {
+                       ksft_test_result_fail("mprotect() failed\n");
+                       goto munmap;
+               }
+               break;
+       case THP_RUN_SINGLE_PTE:
+       case THP_RUN_SINGLE_PTE_SWAPOUT:
+               /*
+                * Discard all but a single subpage of that PTE-mapped THP. What
+                * remains is a single PTE mapping a single subpage.
+                */
+               ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
+               if (ret) {
+                       ksft_test_result_fail("MADV_DONTNEED failed\n");
+                       goto munmap;
+               }
+               size = pagesize;
+               break;
+       case THP_RUN_PARTIAL_MREMAP:
+               /*
+                * Remap half of the THP. We need some new memory location
+                * for that.
+                */
+               mremap_size = thpsize / 2;
+               mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
+                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+               /* Check the mapping we just created, not "mem". */
+               if (mremap_mem == MAP_FAILED) {
+                       ksft_test_result_fail("mmap() failed\n");
+                       goto munmap;
+               }
+               tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
+                            MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
+               if (tmp != mremap_mem) {
+                       ksft_test_result_fail("mremap() failed\n");
+                       goto munmap;
+               }
+               size = mremap_size;
+               break;
+       case THP_RUN_PARTIAL_SHARED:
+               /*
+                * Share the first page of the THP with a child and quit the
+                * child. This will result in some parts of the THP never
+                * have been shared.
+                */
+               ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
+               if (ret) {
+                       ksft_test_result_fail("MADV_DONTFORK failed\n");
+                       goto munmap;
+               }
+               ret = fork();
+               if (ret < 0) {
+                       ksft_test_result_fail("fork() failed\n");
+                       goto munmap;
+               } else if (!ret) {
+                       exit(0);
+               }
+               wait(&ret);
+               /* Allow for sharing all pages again. */
+               ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
+               if (ret) {
+                       ksft_test_result_fail("MADV_DOFORK failed\n");
+                       goto munmap;
+               }
+               break;
+       default:
+               assert(false);
+       }
+
+       switch (thp_run) {
+       case THP_RUN_PMD_SWAPOUT:
+       case THP_RUN_PTE_SWAPOUT:
+       case THP_RUN_SINGLE_PTE_SWAPOUT:
+               madvise(mem, size, MADV_PAGEOUT);
+               if (!range_is_swapped(mem, size)) {
+                       ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+                       goto munmap;
+               }
+               break;
+       default:
+               break;
+       }
+
+       fn(mem, size);
+munmap:
+       munmap(mmap_mem, mmap_size);
+       if (mremap_mem != MAP_FAILED)
+               munmap(mremap_mem, mremap_size);
+}
+
+/*
+ * Thin wrappers logging the scenario and binding do_run_with_thp() to one
+ * specific THP mapping transformation (see enum thp_run).
+ */
+static void run_with_thp(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_PMD);
+}
+
+static void run_with_thp_swap(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
+}
+
+static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_PTE);
+}
+
+static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
+}
+
+static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
+}
+
+static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
+}
+
+static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
+}
+
+static void run_with_partial_shared_thp(test_fn fn, const char *desc)
+{
+       ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
+       do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
+}
+
+/*
+ * Run @fn on a freshly-populated hugetlb page of the given size, skipping
+ * when not enough free huge pages of that size are available.
+ */
+static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
+{
+       int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
+       char *mem, *dummy;
+
+       ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
+                      hugetlbsize / 1024);
+
+       /* Encode the (power-of-two) huge page size via MAP_HUGE_SHIFT. */
+       flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
+
+       mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+       if (mem == MAP_FAILED) {
+               ksft_test_result_skip("need more free huge pages\n");
+               return;
+       }
+
+       /* Populate an huge page. */
+       memset(mem, 0, hugetlbsize);
+
+       /*
+        * We need a total of two hugetlb pages to handle COW/unsharing
+        * properly, otherwise we might get zapped by a SIGBUS.
+        */
+       dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
+       if (dummy == MAP_FAILED) {
+               ksft_test_result_skip("need more free huge pages\n");
+               goto munmap;
+       }
+       munmap(dummy, hugetlbsize);
+
+       fn(mem, hugetlbsize);
+munmap:
+       munmap(mem, hugetlbsize);
+}
+
+/* One entry of the test case tables below. */
+struct test_case {
+       const char *desc;       /* Human-readable description for logging. */
+       test_fn fn;             /* Test to run in each memory configuration. */
+};
+
+/*
+ * Test cases that are specific to anonymous pages: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+/* Iterated by run_anon_test_cases(); each entry runs in every memory type. */
+static const struct test_case anon_test_cases[] = {
+       /*
+        * Basic COW tests for fork() without any GUP. If we miss to break COW,
+        * either the child can observe modifications by the parent or the
+        * other way around.
+        */
+       {
+               "Basic COW after fork()",
+               test_cow_in_parent,
+       },
+       /*
+        * Basic test, but do an additional mprotect(PROT_READ)+
+        * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+        */
+       {
+               "Basic COW after fork() with mprotect() optimization",
+               test_cow_in_parent_mprotect,
+       },
+       /*
+        * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
+        * we miss to break COW, the child observes modifications by the parent.
+        * This is CVE-2020-29374 reported by Jann Horn.
+        */
+       {
+               "vmsplice() + unmap in child",
+               test_vmsplice_in_child
+       },
+       /*
+        * vmsplice() test, but do an additional mprotect(PROT_READ)+
+        * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+        */
+       {
+               "vmsplice() + unmap in child with mprotect() optimization",
+               test_vmsplice_in_child_mprotect
+       },
+       /*
+        * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
+        * fork(); modify in the child. If we miss to break COW, the parent
+        * observes modifications by the child.
+        */
+       {
+               "vmsplice() before fork(), unmap in parent after fork()",
+               test_vmsplice_before_fork,
+       },
+       /*
+        * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
+        * child. If we miss to break COW, the parent observes modifications by
+        * the child.
+        */
+       {
+               "vmsplice() + unmap in parent after fork()",
+               test_vmsplice_after_fork,
+       },
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+       /*
+        * Take a R/W longterm pin and then map the page R/O into the page
+        * table to trigger a write fault on next access. When modifying the
+        * page, the page content must be visible via the pin.
+        */
+       {
+               "R/O-mapping a page registered as iouring fixed buffer",
+               test_iouring_ro,
+       },
+       /*
+        * Take a R/W longterm pin and then fork() a child. When modifying the
+        * page, the page content must be visible via the pin. We expect the
+        * pinned page to not get shared with the child.
+        */
+       {
+               "fork() with an iouring fixed buffer",
+               test_iouring_fork,
+       },
+
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+       /*
+        * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
+        * When modifying the page via the page table, the page content change
+        * must be visible via the pin.
+        */
+       {
+               "R/O GUP pin on R/O-mapped shared page",
+               test_ro_pin_on_shared,
+       },
+       /* Same as above, but using GUP-fast. */
+       {
+               "R/O GUP-fast pin on R/O-mapped shared page",
+               test_ro_fast_pin_on_shared,
+       },
+       /*
+        * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
+        * was previously shared. When modifying the page via the page table,
+        * the page content change must be visible via the pin.
+        */
+       {
+               "R/O GUP pin on R/O-mapped previously-shared page",
+               test_ro_pin_on_ro_previously_shared,
+       },
+       /* Same as above, but using GUP-fast. */
+       {
+               "R/O GUP-fast pin on R/O-mapped previously-shared page",
+               test_ro_fast_pin_on_ro_previously_shared,
+       },
+       /*
+        * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
+        * When modifying the page via the page table, the page content change
+        * must be visible via the pin.
+        */
+       {
+               "R/O GUP pin on R/O-mapped exclusive page",
+               test_ro_pin_on_ro_exclusive,
+       },
+       /* Same as above, but using GUP-fast. */
+       {
+               "R/O GUP-fast pin on R/O-mapped exclusive page",
+               test_ro_fast_pin_on_ro_exclusive,
+       },
+};
+
+/*
+ * Run one test case in every supported memory configuration: base page
+ * (plain and swapped out), the eight THP variants (when THPs are available)
+ * and each supported hugetlb size. The number of invocations must stay in
+ * sync with tests_per_anon_test_case().
+ */
+static void run_anon_test_case(struct test_case const *test_case)
+{
+       int i;
+
+       run_with_base_page(test_case->fn, test_case->desc);
+       run_with_base_page_swap(test_case->fn, test_case->desc);
+       if (thpsize) {
+               run_with_thp(test_case->fn, test_case->desc);
+               run_with_thp_swap(test_case->fn, test_case->desc);
+               run_with_pte_mapped_thp(test_case->fn, test_case->desc);
+               run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
+               run_with_single_pte_of_thp(test_case->fn, test_case->desc);
+               run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
+               run_with_partial_mremap_thp(test_case->fn, test_case->desc);
+               run_with_partial_shared_thp(test_case->fn, test_case->desc);
+       }
+       for (i = 0; i < nr_hugetlbsizes; i++)
+               run_with_hugetlb(test_case->fn, test_case->desc,
+                                hugetlbsizes[i]);
+}
+
+/* Run all test cases from anon_test_cases[]. */
+static void run_anon_test_cases(void)
+{
+       int i;
+
+       ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
+
+       for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
+               run_anon_test_case(&anon_test_cases[i]);
+}
+
+/*
+ * Number of ksft results each anon test case produces; must match the
+ * invocations in run_anon_test_case(): 2 base-page runs, 8 THP runs (when
+ * THPs are available) and one run per hugetlb size.
+ */
+static int tests_per_anon_test_case(void)
+{
+       int tests = 2 + nr_hugetlbsizes;
+
+       if (thpsize)
+               tests += 8;
+       return tests;
+}
+
+/* When/what to collapse and which half is COW-shared with the child. */
+enum anon_thp_collapse_test {
+       ANON_THP_COLLAPSE_UNSHARED,
+       ANON_THP_COLLAPSE_FULLY_SHARED,
+       ANON_THP_COLLAPSE_LOWER_SHARED,
+       ANON_THP_COLLAPSE_UPPER_SHARED,
+};
+
+/*
+ * Verify that MADV_COLLAPSE of a PTE-mapped anonymous THP preserves COW
+ * semantics: after collapsing (before or after COW-sharing all or half of
+ * the pages with a child), modifications by the parent must not leak into
+ * the child's copy.
+ *
+ * @mem:  THP-backed, populated area
+ * @size: size of the area (the THP size)
+ * @test: collapse scenario (see enum anon_thp_collapse_test)
+ */
+static void do_test_anon_thp_collapse(char *mem, size_t size,
+                                     enum anon_thp_collapse_test test)
+{
+       struct comm_pipes comm_pipes;
+       char buf;
+       int ret;
+
+       ret = setup_comm_pipes(&comm_pipes);
+       if (ret) {
+               ksft_test_result_fail("pipe() failed\n");
+               return;
+       }
+
+       /*
+        * Trigger PTE-mapping the THP by temporarily mapping a single subpage
+        * R/O, such that we can try collapsing it later.
+        */
+       ret = mprotect(mem + pagesize, pagesize, PROT_READ);
+       if (ret) {
+               ksft_test_result_fail("mprotect() failed\n");
+               goto close_comm_pipes;
+       }
+       ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
+       if (ret) {
+               ksft_test_result_fail("mprotect() failed\n");
+               goto close_comm_pipes;
+       }
+
+       switch (test) {
+       case ANON_THP_COLLAPSE_UNSHARED:
+               /* Collapse before actually COW-sharing the page. */
+               ret = madvise(mem, size, MADV_COLLAPSE);
+               if (ret) {
+                       ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+                                             strerror(errno));
+                       goto close_comm_pipes;
+               }
+               break;
+       case ANON_THP_COLLAPSE_FULLY_SHARED:
+               /* COW-share the full PTE-mapped THP. */
+               break;
+       case ANON_THP_COLLAPSE_LOWER_SHARED:
+               /* Don't COW-share the upper part of the THP. */
+               ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
+               if (ret) {
+                       ksft_test_result_fail("MADV_DONTFORK failed\n");
+                       goto close_comm_pipes;
+               }
+               break;
+       case ANON_THP_COLLAPSE_UPPER_SHARED:
+               /* Don't COW-share the lower part of the THP. */
+               ret = madvise(mem, size / 2, MADV_DONTFORK);
+               if (ret) {
+                       ksft_test_result_fail("MADV_DONTFORK failed\n");
+                       goto close_comm_pipes;
+               }
+               break;
+       default:
+               assert(false);
+       }
+
+       ret = fork();
+       if (ret < 0) {
+               ksft_test_result_fail("fork() failed\n");
+               goto close_comm_pipes;
+       } else if (!ret) {
+               /* Child: compare only the range that was actually COW-shared. */
+               switch (test) {
+               case ANON_THP_COLLAPSE_UNSHARED:
+               case ANON_THP_COLLAPSE_FULLY_SHARED:
+                       exit(child_memcmp_fn(mem, size, &comm_pipes));
+                       break;
+               case ANON_THP_COLLAPSE_LOWER_SHARED:
+                       exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
+                       break;
+               case ANON_THP_COLLAPSE_UPPER_SHARED:
+                       exit(child_memcmp_fn(mem + size / 2, size / 2,
+                                            &comm_pipes));
+                       break;
+               default:
+                       assert(false);
+               }
+       }
+
+       /* Wait until our child took its snapshot of the memory. */
+       while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
+               ;
+
+       switch (test) {
+       case ANON_THP_COLLAPSE_UNSHARED:
+               break;
+       case ANON_THP_COLLAPSE_UPPER_SHARED:
+       case ANON_THP_COLLAPSE_LOWER_SHARED:
+               /*
+                * Revert MADV_DONTFORK such that we merge the VMAs and are
+                * able to actually collapse.
+                */
+               ret = madvise(mem, size, MADV_DOFORK);
+               if (ret) {
+                       ksft_test_result_fail("MADV_DOFORK failed\n");
+                       write(comm_pipes.parent_ready[1], "0", 1);
+                       wait(&ret);
+                       goto close_comm_pipes;
+               }
+               /* FALLTHROUGH */
+       case ANON_THP_COLLAPSE_FULLY_SHARED:
+               /* Collapse before anyone modified the COW-shared page. */
+               ret = madvise(mem, size, MADV_COLLAPSE);
+               if (ret) {
+                       ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
+                                             strerror(errno));
+                       write(comm_pipes.parent_ready[1], "0", 1);
+                       wait(&ret);
+                       goto close_comm_pipes;
+               }
+               break;
+       default:
+               assert(false);
+       }
+
+       /* Modify the page. */
+       memset(mem, 0xff, size);
+       write(comm_pipes.parent_ready[1], "0", 1);
+
+       wait(&ret);
+       if (WIFEXITED(ret))
+               ret = WEXITSTATUS(ret);
+       else
+               ret = -EINVAL;
+
+       ksft_test_result(!ret, "No leak from parent into child\n");
+close_comm_pipes:
+       close_comm_pipes(&comm_pipes);
+}
+
+/* Thin wrappers binding do_test_anon_thp_collapse() to one scenario. */
+static void test_anon_thp_collapse_unshared(char *mem, size_t size)
+{
+       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
+}
+
+static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
+{
+       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
+}
+
+static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
+{
+       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
+}
+
+static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
+{
+       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
+}
+
+/*
+ * Test cases that are specific to anonymous THP: pages in private mappings
+ * that may get shared via COW during fork().
+ */
+/* Iterated by run_anon_thp_test_cases(); THP-only scenarios. */
+static const struct test_case anon_thp_test_cases[] = {
+       /*
+        * Basic COW test for fork() without any GUP when collapsing a THP
+        * before fork().
+        *
+        * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
+        * collapse") might easily get COW handling wrong when not collapsing
+        * exclusivity information properly.
+        */
+       {
+               "Basic COW after fork() when collapsing before fork()",
+               test_anon_thp_collapse_unshared,
+       },
+       /* Basic COW test, but collapse after COW-sharing a full THP. */
+       {
+               "Basic COW after fork() when collapsing after fork() (fully shared)",
+               test_anon_thp_collapse_fully_shared,
+       },
+       /*
+        * Basic COW test, but collapse after COW-sharing the lower half of a
+        * THP.
+        */
+       {
+               "Basic COW after fork() when collapsing after fork() (lower shared)",
+               test_anon_thp_collapse_lower_shared,
+       },
+       /*
+        * Basic COW test, but collapse after COW-sharing the upper half of a
+        * THP.
+        */
+       {
+               "Basic COW after fork() when collapsing after fork() (upper shared)",
+               test_anon_thp_collapse_upper_shared,
+       },
+};
+
+/* Run all anon-THP test cases (PMD-mapped THP only); no-op without THPs. */
+static void run_anon_thp_test_cases(void)
+{
+       int i;
+
+       if (!thpsize)
+               return;
+
+       ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+       for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+               struct test_case const *test_case = &anon_thp_test_cases[i];
+
+               ksft_print_msg("[RUN] %s\n", test_case->desc);
+               do_run_with_thp(test_case->fn, THP_RUN_PMD);
+       }
+}
+
+/* Each anon-THP case produces one result; zero when THPs are unavailable. */
+static int tests_per_anon_thp_test_case(void)
+{
+       return thpsize ? 1 : 0;
+}
+
+/* Signature of tests that get a writable mapping plus a R/O alias mapping. */
+typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
+
+/*
+ * Modify @mem and verify that the R/O alias mapping @smem still reads the
+ * original content, i.e., the modification broke COW instead of leaking
+ * into the other mapping.
+ */
+static void test_cow(char *mem, const char *smem, size_t size)
+{
+       char *old = malloc(size);
+
+       /* Match the other malloc() users in this file and fail gracefully. */
+       if (!old) {
+               ksft_test_result_fail("malloc() failed\n");
+               return;
+       }
+
+       /* Backup the original content. */
+       memcpy(old, smem, size);
+
+       /* Modify the page. */
+       memset(mem, 0xff, size);
+
+       /* See if we still read the old values via the other mapping. */
+       ksft_test_result(!memcmp(smem, old, size),
+                        "Other mapping not modified\n");
+       free(old);
+}
+
+/* non_anon_test_fn adapters: pin @mem directly; @smem is unused here. */
+static void test_ro_pin(char *mem, const char *smem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST, false);
+}
+
+static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
+{
+       do_test_ro_pin(mem, size, RO_PIN_TEST, true);
+}
+
+/*
+ * Run @fn on a base page backed by the shared zeropage: a writable private
+ * mapping and a R/O alias mapping are both populated by reading, so both
+ * initially map the shared zeropage.
+ */
+static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
+{
+       char *mem, *smem, tmp;
+
+       ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
+
+       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANON, -1, 0);
+       if (mem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               return;
+       }
+
+       smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
+       /* Check the second mapping, not "mem" again. */
+       if (smem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto munmap;
+       }
+
+       /* Read from the page to populate the shared zeropage. */
+       tmp = *mem + *smem;
+       asm volatile("" : "+r" (tmp));
+
+       fn(mem, smem, pagesize);
+munmap:
+       munmap(mem, pagesize);
+       if (smem != MAP_FAILED)
+               munmap(smem, pagesize);
+}
+
+/*
+ * Run @fn on a THP backed by the huge shared zeropage: both a writable
+ * private mapping and a R/O alias mapping are THP-aligned, marked
+ * MADV_HUGEPAGE and populated by reading. Skips when the huge zeropage is
+ * disabled or no THP gets populated.
+ */
+static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
+{
+       char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
+       size_t mmap_size;
+       int ret;
+
+       ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
+
+       if (!has_huge_zeropage) {
+               ksft_test_result_skip("Huge zeropage not enabled\n");
+               return;
+       }
+
+       /* For alignment purposes, we need twice the thp size. */
+       mmap_size = 2 * thpsize;
+       mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (mmap_mem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               return;
+       }
+       mmap_smem = mmap(NULL, mmap_size, PROT_READ,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (mmap_smem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto munmap;
+       }
+
+       /* We need a THP-aligned memory area. */
+       mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+       smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
+
+       ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+       ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
+       if (ret) {
+               ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+               goto munmap;
+       }
+
+       /*
+        * Read from the memory to populate the huge shared zeropage. Read from
+        * the first sub-page and test if we get another sub-page populated
+        * automatically.
+        */
+       tmp = *mem + *smem;
+       asm volatile("" : "+r" (tmp));
+       if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
+           !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
+               ksft_test_result_skip("Did not get THPs populated\n");
+               goto munmap;
+       }
+
+       fn(mem, smem, thpsize);
+munmap:
+       munmap(mmap_mem, mmap_size);
+       if (mmap_smem != MAP_FAILED)
+               munmap(mmap_smem, mmap_size);
+}
+
+/*
+ * Run @fn on a page-cache page of a memfd: a writable MAP_PRIVATE mapping
+ * plus a R/O MAP_SHARED alias mapping of the same (zero-filled) page.
+ */
+static void run_with_memfd(non_anon_test_fn fn, const char *desc)
+{
+       char *mem, *smem, tmp;
+       int fd;
+
+       ksft_print_msg("[RUN] %s ... with memfd\n", desc);
+
+       fd = memfd_create("test", 0);
+       if (fd < 0) {
+               ksft_test_result_fail("memfd_create() failed\n");
+               return;
+       }
+
+       /* File consists of a single page filled with zeroes. */
+       if (fallocate(fd, 0, 0, pagesize)) {
+               ksft_test_result_fail("fallocate() failed\n");
+               goto close;
+       }
+
+       /* Create a private mapping of the memfd. */
+       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+       if (mem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto close;
+       }
+       smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+       /* Check the second mapping, not "mem" again. */
+       if (smem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto munmap;
+       }
+
+       /* Fault the page in. */
+       tmp = *mem + *smem;
+       asm volatile("" : "+r" (tmp));
+
+       fn(mem, smem, pagesize);
+munmap:
+       munmap(mem, pagesize);
+       if (smem != MAP_FAILED)
+               munmap(smem, pagesize);
+close:
+       close(fd);
+}
+
+/*
+ * Run one non-anon test case against an ordinary tmpfile: a private R/W
+ * mapping (mem) paired with a read-only shared mapping (smem) of the same
+ * single page.
+ */
+static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
+{
+       char *mem, *smem, tmp;
+       FILE *file;
+       int fd;
+
+       ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
+
+       file = tmpfile();
+       if (!file) {
+               ksft_test_result_fail("tmpfile() failed\n");
+               return;
+       }
+
+       fd = fileno(file);
+       if (fd < 0) {
+               ksft_test_result_skip("fileno() failed\n");
+               return;
+       }
+
+       /* File consists of a single page filled with zeroes. */
+       if (fallocate(fd, 0, 0, pagesize)) {
+               ksft_test_result_fail("fallocate() failed\n");
+               goto close;
+       }
+
+       /* Create a private mapping of the tmpfile. */
+       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+       if (mem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto close;
+       }
+       smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+       /* Fix: test the mapping we just created (smem), not mem again. */
+       if (smem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto munmap;
+       }
+
+       /* Fault the page in. */
+       tmp = *mem + *smem;
+       asm volatile("" : "+r" (tmp));
+
+       fn(mem, smem, pagesize);
+munmap:
+       munmap(mem, pagesize);
+       if (smem != MAP_FAILED)
+               munmap(smem, pagesize);
+close:
+       fclose(file);
+}
+
+/*
+ * Run one non-anon test case against a hugetlb memfd of the given huge page
+ * size: a private R/W mapping (mem) paired with a read-only shared mapping
+ * (smem) of the same huge page. Skips (rather than fails) when huge pages
+ * are unavailable.
+ */
+static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
+                                  size_t hugetlbsize)
+{
+       int flags = MFD_HUGETLB;
+       char *mem, *smem, tmp;
+       int fd;
+
+       ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
+                      hugetlbsize / 1024);
+
+       /* Encode log2(hugetlbsize) into the MFD_HUGE_* flag bits. */
+       flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
+
+       fd = memfd_create("test", flags);
+       if (fd < 0) {
+               ksft_test_result_skip("memfd_create() failed\n");
+               return;
+       }
+
+       /* File consists of a single page filled with zeroes. */
+       if (fallocate(fd, 0, 0, hugetlbsize)) {
+               ksft_test_result_skip("need more free huge pages\n");
+               goto close;
+       }
+
+       /* Create a private mapping of the memfd. */
+       mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
+                  0);
+       if (mem == MAP_FAILED) {
+               ksft_test_result_skip("need more free huge pages\n");
+               goto close;
+       }
+       smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
+       /* Fix: test the mapping we just created (smem), not mem again. */
+       if (smem == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               goto munmap;
+       }
+
+       /* Fault the page in. */
+       tmp = *mem + *smem;
+       asm volatile("" : "+r" (tmp));
+
+       fn(mem, smem, hugetlbsize);
+munmap:
+       munmap(mem, hugetlbsize);
+       /* Fix: guard on smem (the mapping being unmapped), not mem. */
+       if (smem != MAP_FAILED)
+               munmap(smem, hugetlbsize);
+close:
+       close(fd);
+}
+
+/* One COW test over non-anonymous memory; see non_anon_test_cases[]. */
+struct non_anon_test_case {
+       const char *desc;       /* human-readable test name for logging */
+       non_anon_test_fn fn;    /* test body: fn(mem, smem, size) */
+};
+
+/*
+ * Test cases that target any pages in private mappings that are not anonymous:
+ * pages that may get shared via COW independent of fork(). This includes
+ * the shared zeropage(s), pagecache pages, ...
+ */
+static const struct non_anon_test_case non_anon_test_cases[] = {
+       /*
+        * Basic COW test without any GUP. If we fail to break COW, changes are
+        * visible via other private/shared mappings.
+        */
+       {
+               "Basic COW",
+               test_cow,
+       },
+       /*
+        * Take a R/O longterm pin. When modifying the page via the page table,
+        * the page content change must be visible via the pin.
+        */
+       {
+               "R/O longterm GUP pin",
+               test_ro_pin,
+       },
+       /* Same as above, but using GUP-fast. */
+       {
+               "R/O longterm GUP-fast pin",
+               test_ro_fast_pin,
+       },
+};
+
+/*
+ * Run one non-anon test case against every supported backing type:
+ * shared zeropage, memfd, tmpfile, huge shared zeropage (if THP is
+ * available) and each detected hugetlb size.
+ */
+static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
+{
+       const non_anon_test_fn fn = test_case->fn;
+       const char *desc = test_case->desc;
+       int idx;
+
+       run_with_zeropage(fn, desc);
+       run_with_memfd(fn, desc);
+       run_with_tmpfile(fn, desc);
+       if (thpsize)
+               run_with_huge_zeropage(fn, desc);
+       for (idx = 0; idx < nr_hugetlbsizes; idx++)
+               run_with_memfd_hugetlb(fn, desc, hugetlbsizes[idx]);
+}
+
+/* Run every entry of non_anon_test_cases[], in declaration order. */
+static void run_non_anon_test_cases(void)
+{
+       size_t idx;
+
+       ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
+
+       for (idx = 0; idx < ARRAY_SIZE(non_anon_test_cases); idx++)
+               run_non_anon_test_case(&non_anon_test_cases[idx]);
+}
+
+/*
+ * Number of ksft results each non-anon test case produces: zeropage +
+ * memfd + tmpfile, one per hugetlb size, plus the huge zeropage run when
+ * THP is available.
+ */
+static int tests_per_non_anon_test_case(void)
+{
+       return 3 + nr_hugetlbsizes + (thpsize ? 1 : 0);
+}
+
+/*
+ * Entry point: detect page/THP/hugetlb sizes and the huge zeropage, declare
+ * the kselftest plan, run all test-case groups, and report the fail count.
+ */
+int main(int argc, char **argv)
+{
+       int err;
+
+       pagesize = getpagesize();
+       detect_thpsize();
+       detect_hugetlbsizes();
+       detect_huge_zeropage();
+
+       ksft_print_header();
+       ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+                     ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
+                     ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
+
+       /*
+        * gup_fd is deliberately not checked here: it stays -1 when
+        * CONFIG_GUP_TEST is unavailable — presumably tests that need it
+        * check for that and skip (callers not visible in this chunk).
+        * The pagemap, however, is mandatory for all tests.
+        */
+       gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+       if (pagemap_fd < 0)
+               ksft_exit_fail_msg("opening pagemap failed\n");
+
+       run_anon_test_cases();
+       run_anon_thp_test_cases();
+       run_non_anon_test_cases();
+
+       err = ksft_get_fail_cnt();
+       if (err)
+               ksft_exit_fail_msg("%d out of %d tests failed\n",
+                                  err, ksft_test_num());
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c
new file mode 100644 (file)
index 0000000..e438792
--- /dev/null
@@ -0,0 +1,271 @@
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <assert.h>
+#include <mm/gup_test.h>
+#include "../kselftest.h"
+
+#include "util.h"
+
+#define MB (1UL << 20)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE     0x01    /* check pte is writable */
+#define FOLL_TOUCH     0x02    /* mark page accessed */
+
+#define GUP_TEST_FILE "/sys/kernel/debug/gup_test"
+
+/* ioctl command to issue; selected via command-line options in main(). */
+static unsigned long cmd = GUP_FAST_BENCHMARK;
+/* gup_fd: debugfs interface fd; repeats: iterations for benchmark commands. */
+static int gup_fd, repeats = 1;
+/* Size of the mapping under test, overridable via -m (in MB). */
+static unsigned long size = 128 * MB;
+/* Serialize prints */
+static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Map a gup_test ioctl command to its printable name. */
+static char *cmd_to_str(unsigned long cmd)
+{
+       if (cmd == GUP_FAST_BENCHMARK)
+               return "GUP_FAST_BENCHMARK";
+       if (cmd == PIN_FAST_BENCHMARK)
+               return "PIN_FAST_BENCHMARK";
+       if (cmd == PIN_LONGTERM_BENCHMARK)
+               return "PIN_LONGTERM_BENCHMARK";
+       if (cmd == GUP_BASIC_TEST)
+               return "GUP_BASIC_TEST";
+       if (cmd == PIN_BASIC_TEST)
+               return "PIN_BASIC_TEST";
+       if (cmd == DUMP_USER_PAGES_TEST)
+               return "DUMP_USER_PAGES_TEST";
+       return "Unknown command";
+}
+
+/*
+ * Worker thread: drives the selected gup_test ioctl on the shared mapping.
+ * For the *_BENCHMARK commands it runs `repeats` iterations and prints
+ * per-iteration get/put timings; other commands run once. Each thread works
+ * on its own copy of the gup_test descriptor, so result fields (gup.size,
+ * the delta fields) are per-thread.
+ */
+void *gup_thread(void *data)
+{
+       struct gup_test gup = *(struct gup_test *)data;
+       int i;
+
+       /* Only report timing information on the *_BENCHMARK commands: */
+       if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
+            (cmd == PIN_LONGTERM_BENCHMARK)) {
+               for (i = 0; i < repeats; i++) {
+                       gup.size = size;
+                       /*
+                        * Use a braced block (not the comma operator) for the
+                        * error path, consistent with the branch below.
+                        */
+                       if (ioctl(gup_fd, cmd, &gup)) {
+                               perror("ioctl");
+                               exit(1);
+                       }
+
+                       pthread_mutex_lock(&print_mutex);
+                       printf("%s: Time: get:%lld put:%lld us",
+                              cmd_to_str(cmd), gup.get_delta_usec,
+                              gup.put_delta_usec);
+                       if (gup.size != size)
+                               printf(", truncated (size: %lld)", gup.size);
+                       printf("\n");
+                       pthread_mutex_unlock(&print_mutex);
+               }
+       } else {
+               gup.size = size;
+               if (ioctl(gup_fd, cmd, &gup)) {
+                       perror("ioctl");
+                       exit(1);
+               }
+
+               pthread_mutex_lock(&print_mutex);
+               printf("%s: done\n", cmd_to_str(cmd));
+               if (gup.size != size)
+                       printf("Truncated (size: %lld)\n", gup.size);
+               pthread_mutex_unlock(&print_mutex);
+       }
+
+       return NULL;
+}
+
+/*
+ * Parse options, map the region under test (file-backed, shared, or
+ * anonymous hugetlb), optionally fault it in from user space, then spawn
+ * nthreads gup_thread workers against the gup_test debugfs interface.
+ */
+int main(int argc, char **argv)
+{
+       struct gup_test gup = { 0 };
+       int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret;
+       int flags = MAP_PRIVATE, touch = 0;
+       char *file = "/dev/zero";
+       pthread_t *tid;
+       char *p;
+
+       while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) {
+               switch (opt) {
+               case 'a':
+                       cmd = PIN_FAST_BENCHMARK;
+                       break;
+               case 'b':
+                       cmd = PIN_BASIC_TEST;
+                       break;
+               case 'L':
+                       cmd = PIN_LONGTERM_BENCHMARK;
+                       break;
+               case 'c':
+                       cmd = DUMP_USER_PAGES_TEST;
+                       /*
+                        * Dump page 0 (index 1). May be overridden later, by
+                        * user's non-option arguments.
+                        *
+                        * .which_pages is zero-based, so that zero can mean "do
+                        * nothing".
+                        */
+                       gup.which_pages[0] = 1;
+                       break;
+               case 'p':
+                       /* works only with DUMP_USER_PAGES_TEST */
+                       gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
+                       break;
+               case 'F':
+                       /* strtol, so you can pass flags in hex form */
+                       gup.gup_flags = strtol(optarg, 0, 0);
+                       break;
+               case 'j':
+                       nthreads = atoi(optarg);
+                       break;
+               case 'm':
+                       size = atoi(optarg) * MB;
+                       break;
+               case 'r':
+                       repeats = atoi(optarg);
+                       break;
+               case 'n':
+                       nr_pages = atoi(optarg);
+                       break;
+               case 't':
+                       thp = 1;
+                       break;
+               case 'T':
+                       thp = 0;
+                       break;
+               case 'U':
+                       cmd = GUP_BASIC_TEST;
+                       break;
+               case 'u':
+                       cmd = GUP_FAST_BENCHMARK;
+                       break;
+               case 'w':
+                       write = 1;
+                       break;
+               case 'W':
+                       write = 0;
+                       break;
+               case 'f':
+                       file = optarg;
+                       break;
+               case 'S':
+                       flags &= ~MAP_PRIVATE;
+                       flags |= MAP_SHARED;
+                       break;
+               case 'H':
+                       flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
+                       break;
+               case 'z':
+                       /* fault pages in gup, do not fault in userland */
+                       touch = 1;
+                       break;
+               default:
+                       return -1;
+               }
+       }
+
+       if (optind < argc) {
+               int extra_arg_count = 0;
+               /*
+                * For example:
+                *
+                *   ./gup_test -c 0 1 0x1001
+                *
+                * ...to dump pages 0, 1, and 4097
+                */
+
+               while ((optind < argc) &&
+                      (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
+                       /*
+                        * Do the 1-based indexing here, so that the user can
+                        * use normal 0-based indexing on the command line.
+                        */
+                       long page_index = strtol(argv[optind], 0, 0) + 1;
+
+                       gup.which_pages[extra_arg_count] = page_index;
+                       extra_arg_count++;
+                       optind++;
+               }
+       }
+
+       /*
+        * O_CREAT requires an explicit mode argument; omitting it passes
+        * indeterminate stack contents as the new file's permissions.
+        */
+       filed = open(file, O_RDWR|O_CREAT, 0664);
+       if (filed < 0) {
+               perror("open");
+               exit(filed);
+       }
+
+       gup.nr_pages_per_call = nr_pages;
+       if (write)
+               gup.gup_flags |= FOLL_WRITE;
+
+       gup_fd = open(GUP_TEST_FILE, O_RDWR);
+       if (gup_fd == -1) {
+               /* Give a targeted hint for the common failure modes. */
+               switch (errno) {
+               case EACCES:
+                       if (getuid())
+                               printf("Please run this test as root\n");
+                       break;
+               case ENOENT:
+                       if (opendir("/sys/kernel/debug") == NULL) {
+                               printf("mount debugfs at /sys/kernel/debug\n");
+                               break;
+                       }
+                       printf("check if CONFIG_GUP_TEST is enabled in kernel config\n");
+                       break;
+               default:
+                       perror("failed to open " GUP_TEST_FILE);
+                       break;
+               }
+               exit(KSFT_SKIP);
+       }
+
+       p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
+       if (p == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       gup.addr = (unsigned long)p;
+
+       if (thp == 1)
+               madvise(p, size, MADV_HUGEPAGE);
+       else if (thp == 0)
+               madvise(p, size, MADV_NOHUGEPAGE);
+
+       /*
+        * FOLL_TOUCH, in gup_test, is used as an either/or case: either
+        * fault pages in from the kernel via FOLL_TOUCH, or fault them
+        * in here, from user space. This allows comparison of performance
+        * between those two cases.
+        */
+       if (touch) {
+               gup.gup_flags |= FOLL_TOUCH;
+       } else {
+               for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+                       p[0] = 0;
+       }
+
+       tid = malloc(sizeof(pthread_t) * nthreads);
+       assert(tid);
+       for (i = 0; i < nthreads; i++) {
+               ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
+               assert(ret == 0);
+       }
+       for (i = 0; i < nthreads; i++) {
+               ret = pthread_join(tid[i], NULL);
+               assert(ret == 0);
+       }
+       free(tid);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c
new file mode 100644 (file)
index 0000000..4adaad1
--- /dev/null
@@ -0,0 +1,2054 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
+ * the linux kernel to help device drivers mirror a process address space in
+ * the device. This allows the device to use the same address space which
+ * makes communication and data exchange a lot easier.
+ *
+ * This framework's sole purpose is to exercise various code paths inside
+ * the kernel to make sure that HMM performs as expected and to flush out any
+ * bugs.
+ */
+
+#include "../kselftest_harness.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <strings.h>
+#include <time.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+
+/*
+ * This is a private UAPI to the kernel test module so it isn't exported
+ * in the usual include/uapi/... directory.
+ */
+#include <lib/test_hmm_uapi.h>
+#include <mm/gup_test.h>
+
+/*
+ * A chunk of memory under test (ptr) paired with a same-sized staging
+ * buffer (mirror) used to exchange data with the simulated device.
+ */
+struct hmm_buffer {
+       void            *ptr;           /* CPU mapping under test */
+       void            *mirror;        /* device-side staging copy */
+       unsigned long   size;           /* size of both buffers, in bytes */
+       int             fd;             /* backing-file fd, or -1 for anon */
+       uint64_t        cpages;         /* pages handled by last command */
+       uint64_t        faults;         /* faults taken by last command */
+};
+
+/* Minor numbers of the /dev/hmm_dmirror<N> test devices; see hmm_open(). */
+enum {
+       HMM_PRIVATE_DEVICE_ONE,
+       HMM_PRIVATE_DEVICE_TWO,
+       HMM_COHERENCE_DEVICE_ONE,
+       HMM_COHERENCE_DEVICE_TWO,
+};
+
+#define TWOMEG         (1 << 21)
+#define HMM_BUFFER_SIZE (1024 << 12)
+#define HMM_PATH_MAX    64
+#define NTIMES         10
+
+/* Round x up to a multiple of a (a must be a power of two). */
+#define ALIGN(x, a) (((x) + ((a) - 1)) & (~((a) - 1)))
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE     0x01    /* check pte is writable */
+#define FOLL_LONGTERM   0x10000 /* mapping lifetime is indefinite */
+
+/* Per-test state for single-device tests; filled in by FIXTURE_SETUP(hmm). */
+FIXTURE(hmm)
+{
+       int             fd;             /* fd of the dmirror device node */
+       unsigned int    page_size;      /* system page size */
+       unsigned int    page_shift;     /* log2(page_size) */
+};
+
+/* Each variant selects which dmirror device the fixture opens. */
+FIXTURE_VARIANT(hmm)
+{
+       int     device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+       .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+       .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
+/* Per-test state for two-device tests; filled in by FIXTURE_SETUP(hmm2). */
+FIXTURE(hmm2)
+{
+       int             fd0;            /* fd of the first dmirror device */
+       int             fd1;            /* fd of the second dmirror device */
+       unsigned int    page_size;      /* system page size */
+       unsigned int    page_shift;     /* log2(page_size) */
+};
+
+/* Each variant selects the pair of dmirror devices to open. */
+FIXTURE_VARIANT(hmm2)
+{
+       int     device_number0;
+       int     device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+       .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+       .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+       .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+       .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
+/*
+ * Open the dmirror test device for the given unit number.
+ * Returns the fd, or a negative value (with a message on stderr) on failure.
+ */
+static int hmm_open(int unit)
+{
+       char pathname[HMM_PATH_MAX];
+       int fd;
+
+       snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
+       fd = open(pathname, O_RDWR, 0);
+       if (fd >= 0)
+               return fd;
+
+       fprintf(stderr, "could not open hmm dmirror driver (%s)\n", pathname);
+       return fd;
+}
+
+/* Coherent devices occupy the numbers from HMM_COHERENCE_DEVICE_ONE up. */
+static bool hmm_is_coherent_type(int dev_num)
+{
+       bool coherent = dev_num >= HMM_COHERENCE_DEVICE_ONE;
+
+       return coherent;
+}
+
+/* Open the variant's device; skip (not fail) when coherent HW is absent. */
+FIXTURE_SETUP(hmm)
+{
+       self->page_size = sysconf(_SC_PAGE_SIZE);
+       self->page_shift = ffs(self->page_size) - 1;
+
+       self->fd = hmm_open(variant->device_number);
+       /* A missing coherent device is an environment limitation, not a bug. */
+       if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+               SKIP(exit(0), "DEVICE_COHERENT not available");
+       ASSERT_GE(self->fd, 0);
+}
+
+/*
+ * Open both of the variant's devices. Only the first open gets the
+ * coherent-unavailable SKIP check; both variants pair devices of the same
+ * type, so the first open failing covers the second as well.
+ */
+FIXTURE_SETUP(hmm2)
+{
+       self->page_size = sysconf(_SC_PAGE_SIZE);
+       self->page_shift = ffs(self->page_size) - 1;
+
+       self->fd0 = hmm_open(variant->device_number0);
+       if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+               SKIP(exit(0), "DEVICE_COHERENT not available");
+       ASSERT_GE(self->fd0, 0);
+       self->fd1 = hmm_open(variant->device_number1);
+       ASSERT_GE(self->fd1, 0);
+}
+
+/* Close the device fd; assert so a failed close is reported. */
+FIXTURE_TEARDOWN(hmm)
+{
+       int ret = close(self->fd);
+
+       ASSERT_EQ(ret, 0);
+       self->fd = -1;
+}
+
+/* Close both device fds; assert so a failed close is reported. */
+FIXTURE_TEARDOWN(hmm2)
+{
+       int ret = close(self->fd0);
+
+       ASSERT_EQ(ret, 0);
+       self->fd0 = -1;
+
+       ret = close(self->fd1);
+       ASSERT_EQ(ret, 0);
+       self->fd1 = -1;
+}
+
+/*
+ * Issue one dmirror driver command for the given buffer, retrying on
+ * signal interruption. On success, the cpages/faults results are copied
+ * back into the buffer; on failure, returns -errno.
+ */
+static int hmm_dmirror_cmd(int fd,
+                          unsigned long request,
+                          struct hmm_buffer *buffer,
+                          unsigned long npages)
+{
+       struct hmm_dmirror_cmd cmd;
+       int ret;
+
+       /* Simulate a device reading system memory. */
+       cmd.addr = (__u64)buffer->ptr;
+       cmd.ptr = (__u64)buffer->mirror;
+       cmd.npages = npages;
+
+       /* Retry the ioctl as long as it is interrupted by a signal. */
+       do {
+               ret = ioctl(fd, request, &cmd);
+       } while (ret != 0 && errno == EINTR);
+       if (ret != 0)
+               return -errno;
+
+       buffer->cpages = cmd.cpages;
+       buffer->faults = cmd.faults;
+
+       return 0;
+}
+
+/* Release a buffer's mapping and heap memory; accepts NULL like free(). */
+static void hmm_buffer_free(struct hmm_buffer *buffer)
+{
+       if (!buffer)
+               return;
+
+       /* Only unmap if the test ever mapped buffer->ptr. */
+       if (buffer->ptr)
+               munmap(buffer->ptr, buffer->size);
+       free(buffer->mirror);
+       free(buffer);
+}
+
+/*
+ * Create a temporary file that will be deleted on close.
+ */
+static int hmm_create_file(unsigned long size)
+{
+       char path[HMM_PATH_MAX];
+       int r, fd;
+
+       /* O_TMPFILE gives an unnamed file that vanishes on close. */
+       strcpy(path, "/tmp");
+       fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600);
+       if (fd < 0)
+               return -1;
+
+       /* Retry ftruncate() if interrupted by a signal. */
+       do {
+               r = ftruncate(fd, size);
+       } while (r == -1 && errno == EINTR);
+       if (r == 0)
+               return fd;
+
+       close(fd);
+       return -1;
+}
+
+/*
+ * Return a random unsigned number.
+ */
+/*
+ * Return a random unsigned number read from /dev/urandom, or ~0U when the
+ * device cannot be opened or read.
+ */
+static unsigned int hmm_random(void)
+{
+       static int fd = -1;
+       unsigned int r;
+
+       if (fd < 0) {
+               fd = open("/dev/urandom", O_RDONLY);
+               if (fd < 0) {
+                       fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
+                                       __FILE__, __LINE__);
+                       return ~0U;
+               }
+       }
+       /* A short or failed read would leave r (partially) uninitialized. */
+       if (read(fd, &r, sizeof(r)) != sizeof(r))
+               return ~0U;
+       return r;
+}
+
+/* Sleep for n nanoseconds; interruption and remaining time are ignored. */
+static void hmm_nanosleep(unsigned int n)
+{
+       struct timespec t = {
+               .tv_sec = 0,
+               .tv_nsec = n,
+       };
+
+       nanosleep(&t, NULL);
+}
+
+/* Migrate the buffer's pages from system memory to device memory. */
+static int hmm_migrate_sys_to_dev(int fd,
+                                  struct hmm_buffer *buffer,
+                                  unsigned long npages)
+{
+       return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+/* Migrate the buffer's pages from device memory back to system memory. */
+static int hmm_migrate_dev_to_sys(int fd,
+                                  struct hmm_buffer *buffer,
+                                  unsigned long npages)
+{
+       return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
+/*
+ * Simple NULL test of device open/close.
+ */
+TEST_F(hmm, open_close)
+{
+       /* Intentionally empty: FIXTURE_SETUP/TEARDOWN do the open and close. */
+}
+
+/*
+ * Read private anonymous memory.
+ */
+TEST_F(hmm, anon_read)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+       int val;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /*
+        * Initialize buffer in system memory but leave the first two pages
+        * zero (pte_none and pfn_zero).
+        */
+       i = 2 * self->page_size / sizeof(*ptr);
+       for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Set buffer permission to read-only. */
+       ret = mprotect(buffer->ptr, size, PROT_READ);
+       ASSERT_EQ(ret, 0);
+
+       /* Populate the CPU page table with a special zero page. */
+       val = *(int *)(buffer->ptr + self->page_size);
+       ASSERT_EQ(val, 0);
+
+       /* Simulate a device reading system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       /* All pages must be seen, with a single reported fault. */
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device read: zeroes first, then the pattern. */
+       ptr = buffer->mirror;
+       for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], 0);
+       for (; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Read private anonymous memory which has been protected with
+ * mprotect() PROT_NONE.
+ */
+TEST_F(hmm, anon_read_prot)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Initialize mirror buffer so we can verify it isn't written. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = -i;
+
+       /* Protect buffer from reading. */
+       ret = mprotect(buffer->ptr, size, PROT_NONE);
+       ASSERT_EQ(ret, 0);
+
+       /* Simulate a device reading system memory: must fail on PROT_NONE. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+       ASSERT_EQ(ret, -EFAULT);
+
+       /* Allow CPU to read the buffer so we can check it. */
+       ret = mprotect(buffer->ptr, size, PROT_READ);
+       ASSERT_EQ(ret, 0);
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* The mirror must be untouched: the failed read wrote nothing. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], -i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory.
+ */
+TEST_F(hmm, anon_write)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize data that the device will write to buffer->ptr. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Simulate a device writing system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       /* All pages must be written, with a single reported fault. */
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device wrote. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Write private anonymous memory which has been protected with
+ * mprotect() PROT_READ.
+ */
+TEST_F(hmm, anon_write_prot)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       /* Note: the mapping is read-only from the start. */
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Simulate a device reading a zero page of memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, 1);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Initialize data that the device will write to buffer->ptr. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* A device write to a read-only mapping must fail with EPERM. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, -EPERM);
+
+       /* The buffer must still read as zeroes: nothing was written. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], 0);
+
+       /* Now allow writing and see that the zero page is replaced. */
+       ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
+       ASSERT_EQ(ret, 0);
+
+       /* Simulate a device writing system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device wrote. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that a device writing an anonymous private mapping
+ * will copy-on-write if a child process inherits the mapping.
+ */
+TEST_F(hmm, anon_write_child)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       pid_t pid;
+       int child_fd;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer->ptr so we can tell if it is written. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Initialize data that the device will write to buffer->ptr. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = -i;
+
+       pid = fork();
+       if (pid == -1)
+               ASSERT_EQ(pid, 0);
+       if (pid != 0) {
+               /* Parent: wait for the child, then verify COW protected us. */
+               waitpid(pid, &ret, 0);
+               ASSERT_EQ(WIFEXITED(ret), 1);
+
+               /* Check that the parent's buffer did not change. */
+               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+                       ASSERT_EQ(ptr[i], i);
+               return;
+       }
+
+       /* Child from here on. Check that we see the parent's values. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], -i);
+
+       /* The child process needs its own mirror to its own mm. */
+       child_fd = hmm_open(0);
+       ASSERT_GE(child_fd, 0);
+
+       /* Simulate a device writing system memory. */
+       ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* The device write must land in the child's private COW copy. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], -i);
+
+       close(child_fd);
+       exit(0);
+}
+
/*
 * Check that a device writing an anonymous shared mapping
 * will not copy-on-write if a child process inherits the mapping.
 */
TEST_F(hmm, anon_write_child_shared)
{
	struct hmm_buffer *buffer;
	unsigned long npages;
	unsigned long size;
	unsigned long i;
	int *ptr;
	pid_t pid;
	int child_fd;
	int ret;

	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
	ASSERT_NE(npages, 0);
	size = npages << self->page_shift;

	buffer = malloc(sizeof(*buffer));
	ASSERT_NE(buffer, NULL);

	buffer->fd = -1;
	buffer->size = size;
	buffer->mirror = malloc(size);
	ASSERT_NE(buffer->mirror, NULL);

	/* MAP_SHARED: child writes must be visible to the parent. */
	buffer->ptr = mmap(NULL, size,
			   PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_ANONYMOUS,
			   buffer->fd, 0);
	ASSERT_NE(buffer->ptr, MAP_FAILED);

	/* Initialize buffer->ptr so we can tell if it is written. */
	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
		ptr[i] = i;

	/* Initialize data that the device will write to buffer->ptr. */
	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
		ptr[i] = -i;

	pid = fork();
	if (pid == -1)
		/* fork() failed: pid is -1, so this ASSERT forces a failure. */
		ASSERT_EQ(pid, 0);
	if (pid != 0) {
		/* Parent: wait for the child, which exercises the device. */
		waitpid(pid, &ret, 0);
		ASSERT_EQ(WIFEXITED(ret), 1);

		/*
		 * Check that the parent's buffer DID change: the shared
		 * mapping means the child's device write is visible here.
		 */
		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
			ASSERT_EQ(ptr[i], -i);
		return;
	}

	/* Child from here on. Check that we see the parent's values. */
	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
		ASSERT_EQ(ptr[i], i);
	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
		ASSERT_EQ(ptr[i], -i);

	/* The child process needs its own mirror to its own mm. */
	child_fd = hmm_open(0);
	ASSERT_GE(child_fd, 0);

	/* Simulate a device writing system memory. */
	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
	ASSERT_EQ(ret, 0);
	ASSERT_EQ(buffer->cpages, npages);
	ASSERT_EQ(buffer->faults, 1);

	/* Check what the device wrote. */
	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
		ASSERT_EQ(ptr[i], -i);

	close(child_fd);
	exit(0);
}
+
+/*
+ * Write private anonymous huge page.
+ */
+TEST_F(hmm, anon_write_huge)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       void *old_ptr;
+       void *map;
+       int *ptr;
+       int ret;
+
+       size = 2 * TWOMEG;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       size = TWOMEG;
+       npages = size >> self->page_shift;
+       map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
+       ret = madvise(map, size, MADV_HUGEPAGE);
+       ASSERT_EQ(ret, 0);
+       old_ptr = buffer->ptr;
+       buffer->ptr = map;
+
+       /* Initialize data that the device will write to buffer->ptr. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Simulate a device writing system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device wrote. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       buffer->ptr = old_ptr;
+       hmm_buffer_free(buffer);
+}
+
/*
 * Read numeric data from raw and tagged kernel status files.  Used to read
 * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
 */
static long file_read_ulong(char *file, const char *tag)
{
	char contents[2048];
	char *cursor;
	char *after_num;
	long result;
	int nread;
	int fd;

	fd = open(file, O_RDONLY);
	if (fd < 0)
		return -1;	/* could not open the file */

	nread = read(fd, contents, sizeof(contents));
	close(fd);
	if (nread < 0)
		return -1;	/* read error */
	if (nread == sizeof(contents))
		return -1;	/* file too large to buffer */
	contents[nread] = '\0';

	/* With a tag, parse just after its first occurrence. */
	cursor = contents;
	if (tag) {
		cursor = strstr(contents, tag);
		if (!cursor)
			return -1; /* looks like the line we want isn't there */
		cursor += strlen(tag);
	}

	result = strtol(cursor, &after_num, 0);
	/* NOTE: a space must follow the number (as in meminfo's "NNN kB"). */
	if (*after_num != ' ')
		return -1;

	return result;
}
+
+/*
+ * Write huge TLBFS page.
+ */
+TEST_F(hmm, anon_write_hugetlbfs)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long default_hsize;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+       if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+               SKIP(return, "Huge page size could not be determined");
+       default_hsize = default_hsize*1024; /* KB to B */
+
+       size = ALIGN(TWOMEG, default_hsize);
+       npages = size >> self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                                  PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                                  -1, 0);
+       if (buffer->ptr == MAP_FAILED) {
+               free(buffer);
+               SKIP(return, "Huge page could not be allocated");
+       }
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       /* Initialize data that the device will write to buffer->ptr. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Simulate a device writing system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device wrote. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       munmap(buffer->ptr, buffer->size);
+       buffer->ptr = NULL;
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Read mmap'ed file memory.
+ */
+TEST_F(hmm, file_read)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+       int fd;
+       ssize_t len;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       fd = hmm_create_file(size);
+       ASSERT_GE(fd, 0);
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = fd;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       /* Write initial contents of the file. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+       len = pwrite(fd, buffer->mirror, size, 0);
+       ASSERT_EQ(len, size);
+       memset(buffer->mirror, 0, size);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ,
+                          MAP_SHARED,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Simulate a device reading system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Write mmap'ed file memory.
+ */
+TEST_F(hmm, file_write)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+       int fd;
+       ssize_t len;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       fd = hmm_create_file(size);
+       ASSERT_GE(fd, 0);
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = fd;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_SHARED,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize data that the device will write to buffer->ptr. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Simulate a device writing system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device wrote. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Check that the device also wrote the file. */
+       len = pread(fd, buffer->mirror, size, 0);
+       ASSERT_EQ(len, size);
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory.
+ */
+TEST_F(hmm, migrate)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Migrate memory to device. */
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device private memory and fault some of it back
+ * to system memory, then try migrating the resulting mix of system and device
+ * private memory to the device.
+ */
+TEST_F(hmm, migrate_fault)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Migrate memory to device. */
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Fault half the pages back to system memory and check them. */
+       for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Migrate memory to the device again. */
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, migrate_release)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Migrate memory to device. */
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Release device memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+
+       /* Fault pages back to system memory and check them. */
+       for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous shared memory to device private memory.
+ */
+TEST_F(hmm, migrate_shared)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_SHARED | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Migrate memory to device. */
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, -ENOENT);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Try to migrate various memory types to device private memory.
+ */
+TEST_F(hmm2, migrate_mixed)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       int *ptr;
+       unsigned char *p;
+       int ret;
+       int val;
+
+       npages = 6;
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       /* Reserve a range of addresses. */
+       buffer->ptr = mmap(NULL, size,
+                          PROT_NONE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+       p = buffer->ptr;
+
+       /* Migrating a protected area should be an error. */
+       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+       ASSERT_EQ(ret, -EINVAL);
+
+       /* Punch a hole after the first page address. */
+       ret = munmap(buffer->ptr + self->page_size, self->page_size);
+       ASSERT_EQ(ret, 0);
+
+       /* We expect an error if the vma doesn't cover the range. */
+       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
+       ASSERT_EQ(ret, -EINVAL);
+
+       /* Page 2 will be a read-only zero page. */
+       ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+                               PROT_READ);
+       ASSERT_EQ(ret, 0);
+       ptr = (int *)(buffer->ptr + 2 * self->page_size);
+       val = *ptr + 3;
+       ASSERT_EQ(val, 3);
+
+       /* Page 3 will be read-only. */
+       ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+                               PROT_READ | PROT_WRITE);
+       ASSERT_EQ(ret, 0);
+       ptr = (int *)(buffer->ptr + 3 * self->page_size);
+       *ptr = val;
+       ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+                               PROT_READ);
+       ASSERT_EQ(ret, 0);
+
+       /* Page 4-5 will be read-write. */
+       ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
+                               PROT_READ | PROT_WRITE);
+       ASSERT_EQ(ret, 0);
+       ptr = (int *)(buffer->ptr + 4 * self->page_size);
+       *ptr = val;
+       ptr = (int *)(buffer->ptr + 5 * self->page_size);
+       *ptr = val;
+
+       /* Now try to migrate pages 2-5 to device 1. */
+       buffer->ptr = p + 2 * self->page_size;
+       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, 4);
+
+       /* Page 5 won't be migrated to device 0 because it's on device 1. */
+       buffer->ptr = p + 5 * self->page_size;
+       ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
+       ASSERT_EQ(ret, -ENOENT);
+       buffer->ptr = p;
+
+       buffer->ptr = p;
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device memory and back to system memory
+ * multiple times. In case of private zone configuration, this is done
+ * through fault pages accessed by CPU. In case of coherent zone configuration,
+ * the pages from the device should be explicitly migrated back to system memory.
+ * The reason is Coherent device zone has coherent access by CPU, therefore
+ * it will not generate any page fault.
+ */
+TEST_F(hmm, migrate_multiple)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       unsigned long c;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       for (c = 0; c < NTIMES; c++) {
+               buffer = malloc(sizeof(*buffer));
+               ASSERT_NE(buffer, NULL);
+
+               buffer->fd = -1;
+               buffer->size = size;
+               buffer->mirror = malloc(size);
+               ASSERT_NE(buffer->mirror, NULL);
+
+               buffer->ptr = mmap(NULL, size,
+                                  PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANONYMOUS,
+                                  buffer->fd, 0);
+               ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+               /* Initialize buffer in system memory. */
+               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+                       ptr[i] = i;
+
+               /* Migrate memory to device. */
+               ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+               ASSERT_EQ(ret, 0);
+               ASSERT_EQ(buffer->cpages, npages);
+
+               /* Check what the device read. */
+               for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+                       ASSERT_EQ(ptr[i], i);
+
+               /* Migrate back to system memory and check them. */
+               if (hmm_is_coherent_type(variant->device_number)) {
+                       ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages);
+                       ASSERT_EQ(ret, 0);
+                       ASSERT_EQ(buffer->cpages, npages);
+               }
+
+               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+                       ASSERT_EQ(ptr[i], i);
+
+               hmm_buffer_free(buffer);
+       }
+}
+
+/*
+ * Read anonymous memory multiple times.
+ */
+TEST_F(hmm, anon_read_multiple)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       unsigned long c;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       for (c = 0; c < NTIMES; c++) {
+               buffer = malloc(sizeof(*buffer));
+               ASSERT_NE(buffer, NULL);
+
+               buffer->fd = -1;
+               buffer->size = size;
+               buffer->mirror = malloc(size);
+               ASSERT_NE(buffer->mirror, NULL);
+
+               buffer->ptr = mmap(NULL, size,
+                                  PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANONYMOUS,
+                                  buffer->fd, 0);
+               ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+               /* Initialize buffer in system memory. */
+               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+                       ptr[i] = i + c;
+
+               /* Simulate a device reading system memory. */
+               ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+                                     npages);
+               ASSERT_EQ(ret, 0);
+               ASSERT_EQ(buffer->cpages, npages);
+               ASSERT_EQ(buffer->faults, 1);
+
+               /* Check what the device read. */
+               for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+                       ASSERT_EQ(ptr[i], i + c);
+
+               hmm_buffer_free(buffer);
+       }
+}
+
+void *unmap_buffer(void *p)
+{
+       struct hmm_buffer *buffer = p;
+
+       /* Delay for a bit and then unmap buffer while it is being read. */
+       hmm_nanosleep(hmm_random() % 32000);
+       munmap(buffer->ptr + buffer->size / 2, buffer->size / 2);
+       buffer->ptr = NULL;
+
+       return NULL;
+}
+
/*
 * Try reading anonymous memory while it is being unmapped.
 */
TEST_F(hmm, anon_teardown)
{
	unsigned long npages;
	unsigned long size;
	unsigned long c;
	void *ret;

	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
	ASSERT_NE(npages, 0);
	size = npages << self->page_shift;

	for (c = 0; c < NTIMES; ++c) {
		pthread_t thread;
		struct hmm_buffer *buffer;
		unsigned long i;
		int *ptr;
		int rc;

		buffer = malloc(sizeof(*buffer));
		ASSERT_NE(buffer, NULL);

		buffer->fd = -1;
		buffer->size = size;
		buffer->mirror = malloc(size);
		ASSERT_NE(buffer->mirror, NULL);

		buffer->ptr = mmap(NULL, size,
				   PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS,
				   buffer->fd, 0);
		ASSERT_NE(buffer->ptr, MAP_FAILED);

		/* Initialize buffer in system memory (per-round pattern). */
		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
			ptr[i] = i + c;

		/* Racing thread: unmap_buffer() unmaps the upper half after a delay. */
		rc = pthread_create(&thread, NULL, unmap_buffer, buffer);
		ASSERT_EQ(rc, 0);

		/* Simulate a device reading system memory. */
		rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
				     npages);
		/*
		 * The read legitimately races with the unmap: a nonzero rc
		 * just means the unmap won, so only validate on success.
		 */
		if (rc == 0) {
			ASSERT_EQ(buffer->cpages, npages);
			ASSERT_EQ(buffer->faults, 1);

			/* Check what the device read. */
			for (i = 0, ptr = buffer->mirror;
			     i < size / sizeof(*ptr);
			     ++i)
				ASSERT_EQ(ptr[i], i + c);
		}

		pthread_join(thread, &ret);
		hmm_buffer_free(buffer);
	}
}
+
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ */
+TEST_F(hmm, mixedmap)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned char *m;
+       int ret;
+
+       npages = 1;
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(npages);
+       ASSERT_NE(buffer->mirror, NULL);
+
+
+       /* Reserve a range of addresses. */
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE,
+                          self->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Simulate a device snapshotting CPU pagetables. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device saw. */
+       m = buffer->mirror;
+       ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
+
+       hmm_buffer_free(buffer);
+}
+
/*
 * Test memory snapshot without faulting in pages accessed by the device.
 */
TEST_F(hmm2, snapshot)
{
	struct hmm_buffer *buffer;
	unsigned long npages;
	unsigned long size;
	int *ptr;
	unsigned char *p;
	unsigned char *m;
	int ret;
	int val;

	/*
	 * Page layout built below (7 pages), and the snapshot result each
	 * page is asserted to produce at the end:
	 *   0: PROT_NONE            -> ERROR
	 *   1: unmapped hole        -> ERROR
	 *   2: read-only zero page  -> ZERO | READ
	 *   3: written, then RO     -> READ
	 *   4: read-write, written  -> WRITE
	 *   5: migrated to device 0 -> DEV_PRIVATE_LOCAL|WRITE (private) or
	 *                              DEV_COHERENT_LOCAL|WRITE (coherent)
	 *   6: migrated to device 1 -> NONE (private) or
	 *                              DEV_COHERENT_REMOTE|WRITE (coherent)
	 */
	npages = 7;
	size = npages << self->page_shift;

	buffer = malloc(sizeof(*buffer));
	ASSERT_NE(buffer, NULL);

	buffer->fd = -1;
	buffer->size = size;
	/* The snapshot mirror holds one protection byte per page. */
	buffer->mirror = malloc(npages);
	ASSERT_NE(buffer->mirror, NULL);

	/* Reserve a range of addresses. */
	buffer->ptr = mmap(NULL, size,
			   PROT_NONE,
			   MAP_PRIVATE | MAP_ANONYMOUS,
			   buffer->fd, 0);
	ASSERT_NE(buffer->ptr, MAP_FAILED);
	p = buffer->ptr;

	/* Punch a hole after the first page address. */
	ret = munmap(buffer->ptr + self->page_size, self->page_size);
	ASSERT_EQ(ret, 0);

	/* Page 2 will be read-only zero page. */
	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
				PROT_READ);
	ASSERT_EQ(ret, 0);
	ptr = (int *)(buffer->ptr + 2 * self->page_size);
	/* Reading the untouched page must observe the shared zero page. */
	val = *ptr + 3;
	ASSERT_EQ(val, 3);

	/* Page 3 will be read-only. */
	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
				PROT_READ | PROT_WRITE);
	ASSERT_EQ(ret, 0);
	ptr = (int *)(buffer->ptr + 3 * self->page_size);
	*ptr = val;
	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
				PROT_READ);
	ASSERT_EQ(ret, 0);

	/* Page 4-6 will be read-write. */
	ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size,
				PROT_READ | PROT_WRITE);
	ASSERT_EQ(ret, 0);
	ptr = (int *)(buffer->ptr + 4 * self->page_size);
	*ptr = val;

	/* Page 5 will be migrated to device 0. */
	buffer->ptr = p + 5 * self->page_size;
	ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
	ASSERT_EQ(ret, 0);
	ASSERT_EQ(buffer->cpages, 1);

	/* Page 6 will be migrated to device 1. */
	buffer->ptr = p + 6 * self->page_size;
	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1);
	ASSERT_EQ(ret, 0);
	ASSERT_EQ(buffer->cpages, 1);

	/* Simulate a device snapshotting CPU pagetables. */
	buffer->ptr = p;
	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages);
	ASSERT_EQ(ret, 0);
	ASSERT_EQ(buffer->cpages, npages);

	/* Check what the device saw (see the layout table above). */
	m = buffer->mirror;
	ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
	ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
	ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
	ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
	ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
	if (!hmm_is_coherent_type(variant->device_number0)) {
		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
				HMM_DMIRROR_PROT_WRITE);
		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
	} else {
		ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
				HMM_DMIRROR_PROT_WRITE);
		ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
				HMM_DMIRROR_PROT_WRITE);
	}

	hmm_buffer_free(buffer);
}
+
+/*
+ * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
+ * should be mapped by a large page table entry.
+ */
+TEST_F(hmm, compound)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long default_hsize;
+       int *ptr;
+       unsigned char *m;
+       int ret;
+       unsigned long i;
+
+       /* Skip test if we can't allocate a hugetlbfs page. */
+
+       default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
+       /*
+        * NOTE(review): default_hsize is unsigned long, so the "< 0" test
+        * can never be true; only the KB->B overflow check is effective.
+        */
+       if (default_hsize < 0 || default_hsize*1024 < default_hsize)
+               SKIP(return, "Huge page size could not be determined");
+       default_hsize = default_hsize*1024; /* KB to B */
+
+       size = ALIGN(TWOMEG, default_hsize);
+       npages = size >> self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       /* MAP_HUGETLB so the range is backed by a compound (PMD-mapped) page. */
+       buffer->ptr = mmap(NULL, size,
+                                  PROT_READ | PROT_WRITE,
+                                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                                  -1, 0);
+       /* No free huge pages: bail out quietly instead of failing the test. */
+       if (buffer->ptr == MAP_FAILED) {
+               free(buffer);
+               return;
+       }
+
+       buffer->size = size;
+       /* Snapshot output is one byte of flags per page. */
+       buffer->mirror = malloc(npages);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       /* Initialize the pages the device will snapshot in buffer->ptr. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Simulate a device snapshotting CPU pagetables. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device saw: every page writable and PMD-mapped. */
+       m = buffer->mirror;
+       for (i = 0; i < npages; ++i)
+               ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
+                               HMM_DMIRROR_PROT_PMD);
+
+       /* Make the region read-only. */
+       ret = mprotect(buffer->ptr, size, PROT_READ);
+       ASSERT_EQ(ret, 0);
+
+       /* Simulate a device snapshotting CPU pagetables. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device saw: now read-only, still PMD-mapped. */
+       m = buffer->mirror;
+       for (i = 0; i < npages; ++i)
+               ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
+                               HMM_DMIRROR_PROT_PMD);
+
+       /*
+        * Unmap here and clear ptr, presumably so hmm_buffer_free() does not
+        * munmap the hugetlb range again — TODO confirm against its definition.
+        */
+       munmap(buffer->ptr, buffer->size);
+       buffer->ptr = NULL;
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Test two devices reading the same memory (double mapped).
+ */
+TEST_F(hmm2, double_map)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = 6;
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       /*
+        * NOTE(review): only npages bytes are allocated here, but the READ
+        * loops below scan buffer->mirror as size bytes of ints; other tests
+        * in this file use malloc(size) for READ mirrors — verify this is
+        * not an undersized allocation.
+        */
+       buffer->mirror = malloc(npages);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       /* Reserve a range of addresses. */
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Make region read-only. */
+       ret = mprotect(buffer->ptr, size, PROT_READ);
+       ASSERT_EQ(ret, 0);
+
+       /* Simulate device 0 reading system memory. */
+       ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Simulate device 1 reading system memory. */
+       ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Migrate pages to device 1 and try to read from device 0. */
+       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Device 0 read faults the pages held by device 1. */
+       ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       ASSERT_EQ(buffer->faults, 1);
+
+       /* Check what device 0 read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Basic check of exclusive faulting.
+ */
+TEST_F(hmm, exclusive)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Map memory exclusively for device access. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /*
+        * Fault pages back to system memory and check them: each CPU write
+        * below must break the device-exclusive entry for that page.
+        */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i]++, i);
+
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i+1);
+
+       /* Check atomic access revoked */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that a device write fails with -EPERM after the CPU has
+ * mprotect()ed an exclusively-mapped range to read-only.
+ */
+TEST_F(hmm, exclusive_mprotect)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Map memory exclusively for device access. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       ret = mprotect(buffer->ptr, size, PROT_READ);
+       ASSERT_EQ(ret, 0);
+
+       /*
+        * Simulate a device writing system memory; hmm_dmirror_cmd() is
+        * expected to return the negative errno directly.
+        */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+       ASSERT_EQ(ret, -EPERM);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Check copy-on-write works.
+ */
+TEST_F(hmm, exclusive_cow)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+
+       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+       ASSERT_NE(npages, 0);
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Map memory exclusively for device access. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       /*
+        * NOTE(review): fork() return value is ignored, so both parent and
+        * child (and a failed fork) fall through and run the checks below;
+        * the child is never reaped — confirm this is intentional.
+        */
+       fork();
+
+       /* Fault pages back to system memory and check them. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i]++, i);
+
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i+1);
+
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Issue one gup_test ioctl @cmd over @npages starting at @addr.
+ * @flags is OR'ed into FOLL_WRITE to form gup_flags.
+ * Returns 0 on success, or the (positive) errno from a failed ioctl.
+ */
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+                        int npages, int size, int flags)
+{
+       struct gup_test gup = {
+               .nr_pages_per_call      = npages,
+               .addr                   = addr,
+               .gup_flags              = FOLL_WRITE | flags,
+               .size                   = size,
+       };
+
+       if (ioctl(gup_fd, cmd, &gup)) {
+               /* perror() appends ": <errno text>", so the "\n" here splits the message. */
+               perror("ioctl on error\n");
+               return errno;
+       }
+
+       return 0;
+}
+
+/*
+ * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
+ * This should trigger a migration back to system memory for both, private
+ * and coherent type pages.
+ * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
+ * to your configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+       struct hmm_buffer *buffer;
+       int gup_fd;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+       unsigned char *m;
+
+       gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+       if (gup_fd == -1)
+               SKIP(return, "Skipping test, could not find gup_test driver");
+
+       npages = 4;
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Migrate memory to device. */
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       /* Check what the device read. */
+       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       /* Pages 0-1: plain GUP; pages 2-3: longterm pins. */
+       ASSERT_EQ(gup_test_exec(gup_fd,
+                               (unsigned long)buffer->ptr,
+                               GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+       ASSERT_EQ(gup_test_exec(gup_fd,
+                               (unsigned long)buffer->ptr + 1 * self->page_size,
+                               GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+       ASSERT_EQ(gup_test_exec(gup_fd,
+                               (unsigned long)buffer->ptr + 2 * self->page_size,
+                               PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
+       ASSERT_EQ(gup_test_exec(gup_fd,
+                               (unsigned long)buffer->ptr + 3 * self->page_size,
+                               PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
+
+       /* Take snapshot to CPU pagetables */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       m = buffer->mirror;
+       /* Non-longterm GUP leaves coherent pages on the device... */
+       if (hmm_is_coherent_type(variant->device_number)) {
+               ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
+               ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
+       } else {
+               ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+               ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+       }
+       /* ...but longterm-pinned pages must be back in system memory. */
+       ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
+       ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
+       /*
+        * Check again the content on the pages. Make sure there's no
+        * corrupted data.
+        */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ASSERT_EQ(ptr[i], i);
+
+       close(gup_fd);
+       hmm_buffer_free(buffer);
+}
+
+/*
+ * Test copy-on-write in device pages.
+ * In case of writing to COW private page(s), a page fault will migrate pages
+ * back to system memory first. Then, these pages will be duplicated. In case
+ * of COW device coherent type, pages are duplicated directly from device
+ * memory.
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+       struct hmm_buffer *buffer;
+       unsigned long npages;
+       unsigned long size;
+       unsigned long i;
+       int *ptr;
+       int ret;
+       unsigned char *m;
+       pid_t pid;
+       int status;
+
+       npages = 4;
+       size = npages << self->page_shift;
+
+       buffer = malloc(sizeof(*buffer));
+       ASSERT_NE(buffer, NULL);
+
+       buffer->fd = -1;
+       buffer->size = size;
+       buffer->mirror = malloc(size);
+       ASSERT_NE(buffer->mirror, NULL);
+
+       buffer->ptr = mmap(NULL, size,
+                          PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS,
+                          buffer->fd, 0);
+       ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+       /* Initialize buffer in system memory. */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Migrate memory to device. */
+
+       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+
+       pid = fork();
+       /* A failed fork() returns -1; the assert below then fails the test. */
+       if (pid == -1)
+               ASSERT_EQ(pid, 0);
+       if (!pid) {
+               /*
+                * Child process busy-waits for SIGTERM from the parent; it
+                * exists only to keep the pages shared (COW) while the
+                * parent writes.
+                */
+               while (1) {
+               }
+               perror("Should not reach this\n");
+               exit(0);
+       }
+       /* Parent process writes to COW pages(s) and gets a
+        * new copy in system. In case of device private pages,
+        * this write causes a migration to system mem first.
+        */
+       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+               ptr[i] = i;
+
+       /* Terminate child and wait */
+       EXPECT_EQ(0, kill(pid, SIGTERM));
+       EXPECT_EQ(pid, waitpid(pid, &status, 0));
+       EXPECT_NE(0, WIFSIGNALED(status));
+       EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+       /* Take snapshot to CPU pagetables: all pages back in system memory. */
+       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+       ASSERT_EQ(ret, 0);
+       ASSERT_EQ(buffer->cpages, npages);
+       m = buffer->mirror;
+       for (i = 0; i < npages; i++)
+               ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+       hmm_buffer_free(buffer);
+}
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/hugepage-mmap.c b/tools/testing/selftests/mm/hugepage-mmap.c
new file mode 100644 (file)
index 0000000..955ef87
--- /dev/null
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call.  Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+/* Print the first word of the mapping as a quick sanity/debug aid. */
+static void check_bytes(char *addr)
+{
+       printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+/* Fill LENGTH bytes at addr with the repeating pattern (char)i. */
+static void write_bytes(char *addr)
+{
+       unsigned long i;
+
+       for (i = 0; i < LENGTH; i++)
+               *(addr + i) = (char)i;
+}
+
+/*
+ * Verify the pattern written by write_bytes().
+ * Returns 0 on success, 1 at the first mismatching byte.
+ */
+static int read_bytes(char *addr)
+{
+       unsigned long i;
+
+       check_bytes(addr);
+       for (i = 0; i < LENGTH; i++)
+               if (*(addr + i) != (char)i) {
+                       printf("Mismatch at %lu\n", i);
+                       return 1;
+               }
+       return 0;
+}
+
+/*
+ * Map LENGTH bytes of hugetlb memory backed by a MFD_HUGETLB memfd,
+ * write a byte pattern and read it back; the read result is the exit
+ * status (0 = pass).
+ */
+int main(void)
+{
+       void *addr;
+       int fd, ret;
+
+       /* memfd avoids needing a mounted hugetlbfs path. */
+       fd = memfd_create("hugepage-mmap", MFD_HUGETLB);
+       if (fd < 0) {
+               perror("memfd_create() failed");
+               exit(1);
+       }
+
+       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               close(fd);
+               exit(1);
+       }
+
+       printf("Returned address is %p\n", addr);
+       check_bytes(addr);
+       write_bytes(addr);
+       ret = read_bytes(addr);
+
+       munmap(addr, LENGTH);
+       close(fd);
+
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c
new file mode 100644 (file)
index 0000000..e53b5ea
--- /dev/null
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mremap:
+ *
+ * Example of remapping huge page memory in a user application using the
+ * mremap system call.  The path to a file in a hugetlbfs filesystem must
+ * be passed as the last argument to this test.  The amount of memory used
+ * by this test in MBs can optionally be passed as an argument.  If no memory
+ * amount is passed, the default amount is 10MB.
+ *
+ * To make sure the test triggers pmd sharing and goes through the 'unshare'
+ * path in the mremap code use 1GB (1024) or more.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h> /* Definition of O_* constants */
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#define DEFAULT_LENGTH_MB 10UL
+#define MB_TO_BYTES(x) (x * 1024 * 1024)
+
+#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
+#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+
+/* Print the first word of the mapping as a quick sanity/debug aid. */
+static void check_bytes(char *addr)
+{
+       printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+/* Fill len bytes at addr with the repeating pattern (char)i. */
+static void write_bytes(char *addr, size_t len)
+{
+       unsigned long i;
+
+       for (i = 0; i < len; i++)
+               *(addr + i) = (char)i;
+}
+
+/*
+ * Verify the pattern written by write_bytes().
+ * Returns 0 on success, 1 at the first mismatching byte.
+ */
+static int read_bytes(char *addr, size_t len)
+{
+       unsigned long i;
+
+       check_bytes(addr);
+       for (i = 0; i < len; i++)
+               if (*(addr + i) != (char)i) {
+                       printf("Mismatch at %lu\n", i);
+                       return 1;
+               }
+       return 0;
+}
+
+/*
+ * Create a userfaultfd and register a len-sized region for MISSING faults.
+ *
+ * NOTE(review): the addr parameter is passed by value and is overwritten by
+ * a fresh anonymous mmap() below, so the region registered with uffd is the
+ * NEW mapping, not the caller's haddr region; the uffd descriptor is also
+ * never closed or returned. Confirm this is the intended behavior for the
+ * mremap test.
+ */
+static void register_region_with_uffd(char *addr, size_t len)
+{
+       long uffd; /* userfaultfd file descriptor */
+       struct uffdio_api uffdio_api;
+       struct uffdio_register uffdio_register;
+
+       /* Create and enable userfaultfd object. */
+
+       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+       if (uffd == -1) {
+               perror("userfaultfd");
+               exit(1);
+       }
+
+       uffdio_api.api = UFFD_API;
+       uffdio_api.features = 0;
+       if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+               perror("ioctl-UFFDIO_API");
+               exit(1);
+       }
+
+       /* Create a private anonymous mapping. The memory will be
+        * demand-zero paged--that is, not yet allocated. When we
+        * actually touch the memory, it will be allocated via
+        * the userfaultfd.
+        */
+
+       addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       printf("Address returned by mmap() = %p\n", addr);
+
+       /* Register the memory range of the mapping we just created for
+        * handling by the userfaultfd object. In mode, we request to track
+        * missing pages (i.e., pages that have not yet been faulted in).
+        */
+
+       uffdio_register.range.start = (unsigned long)addr;
+       uffdio_register.range.len = len;
+       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+               perror("ioctl-UFFDIO_REGISTER");
+               exit(1);
+       }
+}
+
+/*
+ * Map a hugetlb memfd at fixed hint addresses (to encourage pmd sharing),
+ * mremap() it onto a reserved destination, verify the data survives the
+ * move, then confirm mremap() of the unmapped region fails.
+ * argv[1] (optional): size in MB; defaults to DEFAULT_LENGTH_MB.
+ */
+int main(int argc, char *argv[])
+{
+       size_t length = 0;
+       int ret = 0, fd;
+
+       if (argc >= 2 && !strcmp(argv[1], "-h")) {
+               printf("Usage: %s [length_in_MB]\n", argv[0]);
+               exit(1);
+       }
+
+       /* Read memory length as the first arg if valid, otherwise fallback to
+        * the default length.
+        * NOTE(review): atoi() does no error checking; a non-numeric arg
+        * yields length 0 — consider strtoul() with validation.
+        */
+       if (argc >= 2)
+               length = (size_t)atoi(argv[1]);
+       else
+               length = DEFAULT_LENGTH_MB;
+
+       length = MB_TO_BYTES(length);
+       fd = memfd_create(argv[0], MFD_HUGETLB);
+       if (fd < 0) {
+               perror("Open failed");
+               exit(1);
+       }
+
+       /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
+       unsigned long suggested_addr = 0x7eaa40000000;
+       void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
+                          MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+       printf("Map haddr: Returned address is %p\n", haddr);
+       if (haddr == MAP_FAILED) {
+               perror("mmap1");
+               exit(1);
+       }
+
+       /* mmap again to a dummy address to hopefully trigger pmd sharing. */
+       suggested_addr = 0x7daa40000000;
+       void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
+                          MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+       printf("Map daddr: Returned address is %p\n", daddr);
+       if (daddr == MAP_FAILED) {
+               perror("mmap3");
+               exit(1);
+       }
+
+       /* Reserve a destination range for the MREMAP_FIXED move below. */
+       suggested_addr = 0x7faa40000000;
+       void *vaddr =
+               mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
+       printf("Map vaddr: Returned address is %p\n", vaddr);
+       if (vaddr == MAP_FAILED) {
+               perror("mmap2");
+               exit(1);
+       }
+
+       register_region_with_uffd(haddr, length);
+
+       void *addr = mremap(haddr, length, length,
+                           MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
+       if (addr == MAP_FAILED) {
+               perror("mremap");
+               exit(1);
+       }
+
+       printf("Mremap: Returned address is %p\n", addr);
+       check_bytes(addr);
+       write_bytes(addr, length);
+       ret = read_bytes(addr, length);
+
+       munmap(addr, length);
+
+       /* mremap() of the just-unmapped region must fail. */
+       addr = mremap(addr, length, length, 0);
+       if (addr != MAP_FAILED) {
+               printf("mremap: Expected failure, but call succeeded\n");
+               exit(1);
+       }
+
+       close(fd);
+
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/hugepage-shm.c b/tools/testing/selftests/mm/hugepage-shm.c
new file mode 100644 (file)
index 0000000..e2527f3
--- /dev/null
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls.  In this example the app is requesting 256MB of
+ * memory that is backed by huge pages.  The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x)  printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+/*
+ * Allocate a LENGTH-byte SysV shared memory segment backed by huge pages
+ * (SHM_HUGETLB), write and verify a byte pattern, then detach and remove
+ * the segment. Exits non-zero on any failure.
+ */
+int main(void)
+{
+       int shmid;
+       unsigned long i;
+       char *shmaddr;
+
+       /* Fixed key 2; IPC_RMID below removes the segment afterwards. */
+       shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+       if (shmid < 0) {
+               perror("shmget");
+               exit(1);
+       }
+       printf("shmid: 0x%x\n", shmid);
+
+       shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+       if (shmaddr == (char *)-1) {
+               perror("Shared memory attach failure");
+               shmctl(shmid, IPC_RMID, NULL);
+               exit(2);
+       }
+       printf("shmaddr: %p\n", shmaddr);
+
+       dprintf("Starting the writes:\n");
+       for (i = 0; i < LENGTH; i++) {
+               shmaddr[i] = (char)(i);
+               /* Progress dot once per MiB. */
+               if (!(i % (1024 * 1024)))
+                       dprintf(".");
+       }
+       dprintf("\n");
+
+       dprintf("Starting the Check...");
+       for (i = 0; i < LENGTH; i++)
+               if (shmaddr[i] != (char)i) {
+                       printf("\nIndex %lu mismatched\n", i);
+                       exit(3);
+               }
+       dprintf("Done.\n");
+
+       if (shmdt((const void *)shmaddr) != 0) {
+               perror("Detach failure");
+               shmctl(shmid, IPC_RMID, NULL);
+               exit(4);
+       }
+
+       shmctl(shmid, IPC_RMID, NULL);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/hugepage-vmemmap.c b/tools/testing/selftests/mm/hugepage-vmemmap.c
new file mode 100644 (file)
index 0000000..557bdbd
--- /dev/null
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test case of using hugepage memory in a user application using the
+ * mmap system call with MAP_HUGETLB flag.  Before running this program
+ * make sure the administrator has allocated enough default sized huge
+ * pages to cover the 2 MB allocation.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define MAP_LENGTH             (2UL * 1024 * 1024)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB            0x40000 /* arch specific */
+#endif
+
+#define PAGE_SIZE              4096
+
+#define PAGE_COMPOUND_HEAD     (1UL << 15)
+#define PAGE_COMPOUND_TAIL     (1UL << 16)
+#define PAGE_HUGE              (1UL << 17)
+
+#define HEAD_PAGE_FLAGS                (PAGE_COMPOUND_HEAD | PAGE_HUGE)
+#define TAIL_PAGE_FLAGS                (PAGE_COMPOUND_TAIL | PAGE_HUGE)
+
+#define PM_PFRAME_BITS         55
+#define PM_PFRAME_MASK         ~((1UL << PM_PFRAME_BITS) - 1)
+
+/*
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#ifdef __ia64__
+#define MAP_ADDR               (void *)(0x8000000000000000UL)
+#define MAP_FLAGS              (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define MAP_ADDR               NULL
+#define MAP_FLAGS              (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+/* Fill length bytes at addr with the repeating pattern (char)i. */
+static void write_bytes(char *addr, size_t length)
+{
+       unsigned long i;
+
+       for (i = 0; i < length; i++)
+               *(addr + i) = (char)i;
+}
+
+/*
+ * Look up the page frame number for a virtual address via
+ * /proc/self/pagemap. Returns -1UL if pagemap cannot be opened.
+ * NOTE(review): the read() result is unchecked; a short read would leave
+ * pagemap uninitialized — acceptable for a selftest, but worth verifying.
+ */
+static unsigned long virt_to_pfn(void *addr)
+{
+       int fd;
+       unsigned long pagemap;
+
+       fd = open("/proc/self/pagemap", O_RDONLY);
+       if (fd < 0)
+               return -1UL;
+
+       /* One 64-bit pagemap entry per page. */
+       lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
+       read(fd, &pagemap, sizeof(pagemap));
+       close(fd);
+
+       /* Low PM_PFRAME_BITS bits hold the PFN. */
+       return pagemap & ~PM_PFRAME_MASK;
+}
+
+/*
+ * Read /proc/kpageflags for the hugepage starting at pfn and verify the
+ * first page is a compound head and all following pages are compound
+ * tails (and not heads). Returns 0 on success, -1 on open failure or on
+ * any unexpected flag value.
+ */
+static int check_page_flags(unsigned long pfn)
+{
+       int fd, i;
+       unsigned long pageflags;
+
+       fd = open("/proc/kpageflags", O_RDONLY);
+       if (fd < 0)
+               return -1;
+
+       /* One 64-bit flags word per pfn; seek once, then read sequentially. */
+       lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+       read(fd, &pageflags, sizeof(pageflags));
+       if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+               close(fd);
+               printf("Head page flags (%lx) is invalid\n", pageflags);
+               return -1;
+       }
+
+       /*
+        * pages other than the first page must be tail and shouldn't be head;
+        * this also verifies kernel has correctly set the fake page_head to tail
+        * while hugetlb_free_vmemmap is enabled.
+        */
+       for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
+               read(fd, &pageflags, sizeof(pageflags));
+               if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+                   (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+                       close(fd);
+                       printf("Tail page flags (%lx) is invalid\n", pageflags);
+                       return -1;
+               }
+       }
+
+       close(fd);
+
+       return 0;
+}
+
+/*
+ * Allocate one MAP_HUGETLB hugepage, fault it in by writing, then verify
+ * its head/tail page flags via /proc/kpageflags (exercising the HugeTLB
+ * vmemmap optimization). Exits non-zero on any failure.
+ */
+int main(int argc, char **argv)
+{
+       void *addr;
+       unsigned long pfn;
+
+       addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* Trigger allocation of HugeTLB page. */
+       write_bytes(addr, MAP_LENGTH);
+
+       pfn = virt_to_pfn(addr);
+       if (pfn == -1UL) {
+               munmap(addr, MAP_LENGTH);
+               perror("virt_to_pfn");
+               exit(1);
+       }
+
+       printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+       if (check_page_flags(pfn) < 0) {
+               munmap(addr, MAP_LENGTH);
+               perror("check_page_flags");
+               exit(1);
+       }
+
+       /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+       if (munmap(addr, MAP_LENGTH)) {
+               perror("munmap");
+               exit(1);
+       }
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c
new file mode 100644 (file)
index 0000000..a634f47
--- /dev/null
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-madvise:
+ *
+ * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
+ * on hugetlb mappings.
+ *
+ * Before running this test, make sure the administrator has pre-allocated
+ * at least MIN_FREE_PAGES hugetlb pages and they are free.  In addition,
+ * the test takes an argument that is the path to a file in a hugetlbfs
+ * filesystem.  Therefore, a hugetlbfs filesystem must be mounted on some
+ * directory.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#define __USE_GNU
+#include <fcntl.h>
+
+#define MIN_FREE_PAGES 20
+#define NR_HUGE_PAGES  10      /* common number of pages to map/allocate */
+
+/*
+ * Abort the test unless the current number of free huge pages equals
+ * exp_free.  A macro (not a function) so that __LINE__ in the failure
+ * message points at the call site.
+ */
+#define validate_free_pages(exp_free)                                  \
+       do {                                                            \
+               int fhp = get_free_hugepages();                         \
+               if (fhp != (exp_free)) {                                \
+                       printf("Unexpected number of free huge "        \
+                               "pages line %d\n", __LINE__);           \
+                       exit(1);                                        \
+               }                                                       \
+       } while (0)
+
+unsigned long huge_page_size;   /* default huge page size, in bytes */
+unsigned long base_page_size;   /* system base page size, in bytes */
+
+/*
+ * default_huge_page_size copied from mlock2-tests.c
+ *
+ * Parse the "Hugepagesize:" line out of /proc/meminfo and return the
+ * value in bytes, or 0 if it cannot be determined.
+ */
+unsigned long default_huge_page_size(void)
+{
+       unsigned long size_kb = 0;
+       size_t buflen = 0;
+       char *buf = NULL;
+       FILE *meminfo;
+
+       meminfo = fopen("/proc/meminfo", "r");
+       if (!meminfo)
+               return 0;
+
+       while (getline(&buf, &buflen, meminfo) > 0) {
+               if (sscanf(buf, "Hugepagesize:       %lu kB", &size_kb) == 1)
+                       break;
+       }
+
+       fclose(meminfo);
+       free(buf);
+       /* size_kb stays 0 when the line was not found, so this returns 0. */
+       return size_kb << 10;
+}
+
+/*
+ * Return the current "HugePages_Free:" count from /proc/meminfo, or 0
+ * if the file cannot be opened or the line is absent.
+ */
+unsigned long get_free_hugepages(void)
+{
+       unsigned long free_pages = 0;
+       size_t buflen = 0;
+       char *buf = NULL;
+       FILE *meminfo;
+
+       meminfo = fopen("/proc/meminfo", "r");
+       if (!meminfo)
+               return free_pages;
+
+       while (getline(&buf, &buflen, meminfo) > 0) {
+               if (sscanf(buf, "HugePages_Free:      %lu", &free_pages) == 1)
+                       break;
+       }
+
+       fclose(meminfo);
+       free(buf);
+       return free_pages;
+}
+
+/*
+ * Store one word into each of nr_pages consecutive huge pages starting
+ * at addr, faulting them all in.
+ */
+void write_fault_pages(void *addr, unsigned long nr_pages)
+{
+       char *page = addr;
+       unsigned long i;
+
+       for (i = 0; i < nr_pages; i++, page += huge_page_size)
+               *(unsigned long *)page = i;
+}
+
+/*
+ * Load one word from each of nr_pages consecutive huge pages starting
+ * at addr, faulting them all in via reads.
+ */
+void read_fault_pages(void *addr, unsigned long nr_pages)
+{
+       unsigned long sink = 0;
+       char *page = addr;
+       unsigned long i;
+
+       for (i = 0; i < nr_pages; i++, page += huge_page_size)
+               sink += *(unsigned long *)page;
+}
+
+/*
+ * Exercise MADV_DONTNEED and MADV_REMOVE on hugetlb mappings (anonymous
+ * private mappings, and private/shared mappings of a hugetlb memfd),
+ * validating the global free huge page count after each operation.
+ */
+int main(int argc, char **argv)
+{
+       unsigned long free_hugepages;
+       void *addr, *addr2;
+       int fd;
+       int ret;
+
+       huge_page_size = default_huge_page_size();
+       if (!huge_page_size) {
+               printf("Unable to determine huge page size, exiting!\n");
+               exit(1);
+       }
+       base_page_size = sysconf(_SC_PAGE_SIZE);
+       /*
+        * Fix: this previously re-tested huge_page_size (copy-paste), so a
+        * failing sysconf() was never detected.
+        */
+       if (!base_page_size) {
+               printf("Unable to determine base page size, exiting!\n");
+               exit(1);
+       }
+
+       free_hugepages = get_free_hugepages();
+       if (free_hugepages < MIN_FREE_PAGES) {
+               printf("Not enough free huge pages to test, exiting!\n");
+               exit(1);
+       }
+
+       fd = memfd_create(argv[0], MFD_HUGETLB);
+       if (fd < 0) {
+               perror("memfd_create() failed");
+               exit(1);
+       }
+
+       /*
+        * Test validity of MADV_DONTNEED addr and length arguments.  mmap
+        * size is NR_HUGE_PAGES + 2.  One page at the beginning and end of
+        * the mapping will be unmapped so we KNOW there is nothing mapped
+        * there.
+        */
+       addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                       -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       if (munmap(addr, huge_page_size) ||
+                       munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
+                               huge_page_size)) {
+               perror("munmap");
+               exit(1);
+       }
+       addr = addr + huge_page_size;
+
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* addr before mapping should fail */
+       ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
+               MADV_DONTNEED);
+       if (!ret) {
+               printf("Unexpected success of madvise call with invalid addr line %d\n",
+                               __LINE__);
+               exit(1);
+       }
+
+       /* addr + length after mapping should fail */
+       ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
+               MADV_DONTNEED);
+       if (!ret) {
+               printf("Unexpected success of madvise call with invalid length line %d\n",
+                               __LINE__);
+               exit(1);
+       }
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test alignment of MADV_DONTNEED addr and length arguments
+        */
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                       -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* addr is not huge page size aligned and should fail */
+       ret = madvise(addr + base_page_size,
+                       NR_HUGE_PAGES * huge_page_size - base_page_size,
+                       MADV_DONTNEED);
+       if (!ret) {
+               printf("Unexpected success of madvise call with unaligned start address %d\n",
+                               __LINE__);
+               exit(1);
+       }
+
+       /* addr + length should be aligned down to huge page size */
+       if (madvise(addr,
+                       ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
+                       MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+
+       /* should free all but last page in mapping */
+       validate_free_pages(free_hugepages - 1);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+       validate_free_pages(free_hugepages);
+
+       /*
+        * Test MADV_DONTNEED on anonymous private mapping
+        */
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+                       -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+
+       /* should free all pages in mapping */
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_DONTNEED on private mapping of hugetlb file
+        */
+       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* read should not consume any pages */
+       read_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* madvise should not free any pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* writes should allocate private pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /* madvise should free private pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* writes should allocate private pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /*
+        * The fallocate below certainly should free the pages associated
+        * with the file.  However, pages in the private mapping are also
+        * freed.  This is not the 'correct' behavior, but is expected
+        * because this is how it has worked since the initial hugetlb
+        * implementation.
+        */
+       if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                                       0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_DONTNEED on shared mapping of hugetlb file
+        */
+       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* write should not consume any pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* madvise should not free any pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /*
+        * Test MADV_REMOVE on shared mapping of hugetlb file
+        *
+        * madvise is same as hole punch and should free all pages.
+        */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages);
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+
+       /*
+        * Test MADV_REMOVE on shared and private mapping of hugetlb file
+        */
+       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
+               perror("fallocate");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* shared write should not consume any additional pages */
+       write_fault_pages(addr, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
+                       PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+       if (addr2 == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       /* private read should not consume any pages */
+       read_fault_pages(addr2, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* private write should consume additional pages */
+       write_fault_pages(addr2, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /* madvise of shared mapping should not free any pages */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /* madvise of private mapping should free private pages */
+       if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
+
+       /* private write should consume additional pages again */
+       write_fault_pages(addr2, NR_HUGE_PAGES);
+       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
+
+       /*
+        * madvise should free both file and private pages although this is
+        * not correct.  private pages should not be freed, but this is
+        * expected.  See comment associated with FALLOC_FL_PUNCH_HOLE call.
+        */
+       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
+               perror("madvise");
+               exit(1);
+       }
+       validate_free_pages(free_hugepages);
+
+       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
+       (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
+
+       close(fd);
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh
new file mode 100644 (file)
index 0000000..bf2d2a6
--- /dev/null
@@ -0,0 +1,252 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+  echo "This test must be run as root. Skipping..."
+  exit $ksft_skip
+fi
+
+usage_file=usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+  cgroup2=1
+  usage_file=current
+fi
+
+
+if [[ $cgroup2 ]]; then
+  CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
+    mount -t cgroup2 none $CGROUP_ROOT
+    do_umount=1
+  fi
+  echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
+else
+  CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
+    mount -t cgroup memory,hugetlb $CGROUP_ROOT
+    do_umount=1
+  fi
+fi
+MNT='/mnt/huge/'
+
+# Print the system default huge page size in MB, parsed from
+# /proc/meminfo's "Hugepagesize:" line.
+function get_machine_hugepage_size() {
+  hpz=$(grep -i hugepagesize /proc/meminfo)
+  # Strip the 14-char "Hugepagesize:" label and the trailing " kB".
+  # NOTE(review): assumes the exact meminfo field layout - verify.
+  kb=${hpz:14:-3}
+  mb=$(($kb / 1024))
+  echo $mb
+}
+
+MB=$(get_machine_hugepage_size)
+
+# Best-effort teardown: remove hugetlbfs files, unmount, delete the test
+# cgroups, and release all reserved huge pages.  Errors are ignored
+# (set +e) so this can run against a partially constructed state.
+function cleanup() {
+  echo cleanup
+  set +e
+  rm -rf "$MNT"/* 2>/dev/null
+  umount "$MNT" 2>/dev/null
+  rmdir "$MNT" 2>/dev/null
+  rmdir "$CGROUP_ROOT"/a/b 2>/dev/null
+  rmdir "$CGROUP_ROOT"/a 2>/dev/null
+  rmdir "$CGROUP_ROOT"/test1 2>/dev/null
+  echo 0 >/proc/sys/vm/nr_hugepages
+  set -e
+}
+
+# assert_state <expected_a> <expected_a_hugetlb> [<expected_b> <expected_b_hugetlb>]
+#
+# Compare cgroup a's (and, when the optional args are given, a/b's)
+# memory and hugetlb usage counters against the expected byte values,
+# allowing a 5MB tolerance.  On any mismatch: print actual vs expected,
+# run cleanup, and fail the test.
+function assert_state() {
+  local expected_a="$1"
+  local expected_a_hugetlb="$2"
+  local expected_b=""
+  local expected_b_hugetlb=""
+
+  if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then
+    expected_b="$3"
+    expected_b_hugetlb="$4"
+  fi
+  local tolerance=$((5 * 1024 * 1024))
+
+  local actual_a
+  actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
+  if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
+    [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
+    echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
+    echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+
+  local actual_a_hugetlb
+  actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
+  if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
+    [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
+    echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
+    echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+
+  # The b checks are skipped entirely when only two args were passed.
+  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
+    return
+  fi
+
+  local actual_b
+  actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
+  if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
+    [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
+    echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
+    echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+
+  local actual_b_hugetlb
+  actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
+  if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
+    [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
+    echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
+    echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+}
+
+# Reserve 100 huge pages, build the a/ and a/b cgroup hierarchy
+# (enabling the hugetlb and memory controllers on cgroup v2, populating
+# cpusets on v1), and mount a fresh hugetlbfs instance on $MNT.
+function setup() {
+  echo 100 >/proc/sys/vm/nr_hugepages
+  mkdir "$CGROUP_ROOT"/a
+  sleep 1
+  if [[ $cgroup2 ]]; then
+    echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control
+  else
+    echo 0 >$CGROUP_ROOT/a/cpuset.mems
+    echo 0 >$CGROUP_ROOT/a/cpuset.cpus
+  fi
+
+  mkdir "$CGROUP_ROOT"/a/b
+
+  if [[ ! $cgroup2 ]]; then
+    echo 0 >$CGROUP_ROOT/a/b/cpuset.mems
+    echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus
+  fi
+
+  mkdir -p "$MNT"
+  mount -t hugetlbfs none "$MNT"
+}
+
+# write_hugetlbfs <cgroup> <path> <size>
+#
+# Temporarily move this shell into <cgroup>, write <size> bytes to the
+# hugetlbfs file <path> via the write_to_hugetlbfs helper (so the pages
+# are charged to that cgroup), then move back to the root cgroup.
+write_hugetlbfs() {
+  local cgroup="$1"
+  local path="$2"
+  local size="$3"
+
+  if [[ $cgroup2 ]]; then
+    echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs
+  else
+    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
+    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
+    echo $$ >"$CGROUP_ROOT/$cgroup/tasks"
+  fi
+  ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
+  if [[ $cgroup2 ]]; then
+    echo $$ >$CGROUP_ROOT/cgroup.procs
+  else
+    echo $$ >"$CGROUP_ROOT/tasks"
+  fi
+  echo
+}
+
+set -e
+
+size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
+
+cleanup
+
+echo
+echo
+echo Test charge, rmdir, uncharge
+setup
+echo mkdir
+mkdir $CGROUP_ROOT/test1
+
+echo write
+write_hugetlbfs test1 "$MNT"/test $size
+
+echo rmdir
+rmdir $CGROUP_ROOT/test1
+mkdir $CGROUP_ROOT/test1
+
+echo uncharge
+rm -rf /mnt/huge/*
+
+cleanup
+
+echo done
+echo
+echo
+if [[ ! $cgroup2 ]]; then
+  echo "Test parent and child hugetlb usage"
+  setup
+
+  echo write
+  write_hugetlbfs a "$MNT"/test $size
+
+  echo Assert memory charged correctly for parent use.
+  assert_state 0 $size 0 0
+
+  write_hugetlbfs a/b "$MNT"/test2 $size
+
+  echo Assert memory charged correctly for child use.
+  assert_state 0 $(($size * 2)) 0 $size
+
+  rmdir "$CGROUP_ROOT"/a/b
+  sleep 5
+  echo Assert memory reparent correctly.
+  assert_state 0 $(($size * 2))
+
+  rm -rf "$MNT"/*
+  umount "$MNT"
+  echo Assert memory uncharged correctly.
+  assert_state 0 0
+
+  cleanup
+fi
+
+echo
+echo
+echo "Test child only hugetlb usage"
+echo setup
+setup
+
+echo write
+write_hugetlbfs a/b "$MNT"/test2 $size
+
+echo Assert memory charged correctly for child only use.
+assert_state 0 $(($size)) 0 $size
+
+rmdir "$CGROUP_ROOT"/a/b
+echo Assert memory reparent correctly.
+assert_state 0 $size
+
+rm -rf "$MNT"/*
+umount "$MNT"
+echo Assert memory uncharged correctly.
+assert_state 0 0
+
+cleanup
+
+echo ALL PASS
+
+umount $CGROUP_ROOT
+rm -rf $CGROUP_ROOT
diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
new file mode 100644 (file)
index 0000000..64126c8
--- /dev/null
@@ -0,0 +1,1558 @@
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <dirent.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+
+#include "linux/magic.h"
+
+#include "vm_util.h"
+
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+#ifndef MADV_POPULATE_READ
+#define MADV_POPULATE_READ 22
+#endif
+#ifndef MADV_COLLAPSE
+#define MADV_COLLAPSE 25
+#endif
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+#define PID_SMAPS "/proc/self/smaps"
+#define TEST_FILE "collapse_test_file"
+
+#define MAX_LINE_LENGTH 500
+
+/* Kind of memory backing a test VMA. */
+enum vma_type {
+       VMA_ANON,
+       VMA_FILE,
+       VMA_SHMEM,
+};
+
+/* Per-backing-type operations: area lifecycle, faulting, THP check. */
+struct mem_ops {
+       void *(*setup_area)(int nr_hpages);
+       void (*cleanup_area)(void *p, unsigned long size);
+       void (*fault)(void *p, unsigned long start, unsigned long end);
+       bool (*check_huge)(void *addr, int nr_hpages);
+       const char *name;
+};
+
+/* Populated elsewhere in this file with the per-backing-type ops. */
+static struct mem_ops *file_ops;
+static struct mem_ops *anon_ops;
+static struct mem_ops *shmem_ops;
+
+/* A way of triggering collapse (khugepaged scan vs. MADV_COLLAPSE). */
+struct collapse_context {
+       void (*collapse)(const char *msg, char *p, int nr_hpages,
+                        struct mem_ops *ops, bool expect);
+       bool enforce_pte_scan_limits;
+       const char *name;
+};
+
+static struct collapse_context *khugepaged_context;
+static struct collapse_context *madvise_context;
+
+/* Test-file state for file-backed runs, including the owning block
+ * device's queue/read_ahead_kb control path (when applicable).
+ */
+struct file_info {
+       const char *dir;
+       char path[PATH_MAX];
+       enum vma_type type;
+       int fd;
+       char dev_queue_read_ahead_path[PATH_MAX];
+};
+
+static struct file_info finfo;
+
+/* Index into thp_enabled_strings[]: THP sysfs "enabled" modes. */
+enum thp_enabled {
+       THP_ALWAYS,
+       THP_MADVISE,
+       THP_NEVER,
+};
+
+static const char *thp_enabled_strings[] = {
+       "always",
+       "madvise",
+       "never",
+       NULL
+};
+
+/* Index into thp_defrag_strings[]: THP sysfs "defrag" modes. */
+enum thp_defrag {
+       THP_DEFRAG_ALWAYS,
+       THP_DEFRAG_DEFER,
+       THP_DEFRAG_DEFER_MADVISE,
+       THP_DEFRAG_MADVISE,
+       THP_DEFRAG_NEVER,
+};
+
+static const char *thp_defrag_strings[] = {
+       "always",
+       "defer",
+       "defer+madvise",
+       "madvise",
+       "never",
+       NULL
+};
+
+/* Index into shmem_enabled_strings[]: shmem THP sysfs modes. */
+enum shmem_enabled {
+       SHMEM_ALWAYS,
+       SHMEM_WITHIN_SIZE,
+       SHMEM_ADVISE,
+       SHMEM_NEVER,
+       SHMEM_DENY,
+       SHMEM_FORCE,
+};
+
+static const char *shmem_enabled_strings[] = {
+       "always",
+       "within_size",
+       "advise",
+       "never",
+       "deny",
+       "force",
+       NULL
+};
+
+/* Mirror of the tunables under THP_SYSFS "khugepaged/". */
+struct khugepaged_settings {
+       bool defrag;
+       unsigned int alloc_sleep_millisecs;
+       unsigned int scan_sleep_millisecs;
+       unsigned int max_ptes_none;
+       unsigned int max_ptes_swap;
+       unsigned int max_ptes_shared;
+       unsigned long pages_to_scan;
+};
+
+/* Full snapshot of THP + khugepaged configuration (plus read_ahead_kb
+ * for file-backed tests).
+ */
+struct settings {
+       enum thp_enabled thp_enabled;
+       enum thp_defrag thp_defrag;
+       enum shmem_enabled shmem_enabled;
+       bool use_zero_page;
+       struct khugepaged_settings khugepaged;
+       unsigned long read_ahead_kb;
+};
+
+/* System settings captured at startup, restored on exit/signal. */
+static struct settings saved_settings;
+static bool skip_settings_restore;
+
+static int exit_status;
+
+/* Print msg in green: test case passed. */
+static void success(const char *msg)
+{
+       printf(" \e[32m%s\e[0m\n", msg);
+}
+
+/* Print msg in red and bump exit_status: test case failed. */
+static void fail(const char *msg)
+{
+       printf(" \e[31m%s\e[0m\n", msg);
+       exit_status++;
+}
+
+/* Print msg in yellow: test case skipped. */
+static void skip(const char *msg)
+{
+       printf(" \e[33m%s\e[0m\n", msg);
+}
+
+/*
+ * Read up to buflen - 1 bytes from path into buf and NUL-terminate.
+ * Returns the byte count read, or 0 on any failure.  NOTE(review): it
+ * never returns a negative value, so callers testing "< 0" cannot
+ * detect errors.
+ */
+static int read_file(const char *path, char *buf, size_t buflen)
+{
+       int fd;
+       ssize_t numread;
+
+       fd = open(path, O_RDONLY);
+       if (fd == -1)
+               return 0;
+
+       numread = read(fd, buf, buflen - 1);
+       if (numread < 1) {
+               close(fd);
+               return 0;
+       }
+
+       buf[numread] = '\0';
+       close(fd);
+
+       return (unsigned int) numread;
+}
+
+/*
+ * Write buflen - 1 bytes of buf to the file at path.  Exits the whole
+ * test on open or short-write failure, so the "return 0" statements
+ * after exit() are unreachable and the function effectively only
+ * returns the positive byte count.
+ */
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+       int fd;
+       ssize_t numwritten;
+
+       fd = open(path, O_WRONLY);
+       if (fd == -1) {
+               printf("open(%s)\n", path);
+               exit(EXIT_FAILURE);
+               return 0;
+       }
+
+       numwritten = write(fd, buf, buflen - 1);
+       close(fd);
+       if (numwritten < 1) {
+               printf("write(%s)\n", buf);
+               exit(EXIT_FAILURE);
+               return 0;
+       }
+
+       return (unsigned int) numwritten;
+}
+
+/*
+ * Read the THP sysfs file `name`, extract the bracketed active choice
+ * (e.g. "always [madvise] never" -> "madvise"), and return its index
+ * in the NULL-terminated strings[] array.  Exits on I/O or parse
+ * failure, or if the choice is not in strings[].
+ */
+static int read_string(const char *name, const char *strings[])
+{
+       char path[PATH_MAX];
+       char buf[256];
+       char *c;
+       int ret;
+
+       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+       if (ret >= PATH_MAX) {
+               printf("%s: Pathname is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       if (!read_file(path, buf, sizeof(buf))) {
+               perror(path);
+               exit(EXIT_FAILURE);
+       }
+
+       c = strchr(buf, '[');
+       if (!c) {
+               printf("%s: Parse failure\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       /* Shift everything after '[' to the front, then cut at ']'. */
+       c++;
+       memmove(buf, c, sizeof(buf) - (c - buf));
+
+       c = strchr(buf, ']');
+       if (!c) {
+               printf("%s: Parse failure\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+       *c = '\0';
+
+       ret = 0;
+       while (strings[ret]) {
+               if (!strcmp(strings[ret], buf))
+                       return ret;
+               ret++;
+       }
+
+       printf("Failed to parse %s\n", name);
+       exit(EXIT_FAILURE);
+}
+
+/*
+ * Write the string val (including its NUL) to the THP sysfs file
+ * `name`; exits on path overflow or write failure.
+ */
+static void write_string(const char *name, const char *val)
+{
+       char path[PATH_MAX];
+       int ret;
+
+       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+       if (ret >= PATH_MAX) {
+               printf("%s: Pathname is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       if (!write_file(path, val, strlen(val) + 1)) {
+               perror(path);
+               exit(EXIT_FAILURE);
+       }
+}
+
+/*
+ * Read an unsigned decimal value from the sysfs/procfs file at path;
+ * exits on read failure.
+ */
+static const unsigned long _read_num(const char *path)
+{
+       char buf[21];
+
+       /*
+        * read_file() returns 0 on failure and a positive byte count on
+        * success - never a negative value - so the previous "< 0" test
+        * could never detect an error.
+        */
+       if (!read_file(path, buf, sizeof(buf))) {
+               perror("read_file(read_num)");
+               exit(EXIT_FAILURE);
+       }
+
+       return strtoul(buf, NULL, 10);
+}
+
+/*
+ * Read an unsigned decimal value from the THP sysfs file `name`;
+ * exits on path overflow or read failure.
+ */
+static const unsigned long read_num(const char *name)
+{
+       char path[PATH_MAX];
+       int ret;
+
+       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+       if (ret >= PATH_MAX) {
+               printf("%s: Pathname is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+       return _read_num(path);
+}
+
+/*
+ * Write num as a decimal string to the file at path; exits on write
+ * failure.
+ */
+static void _write_num(const char *path, unsigned long num)
+{
+       char buf[21];
+
+       /*
+        * num is unsigned long, so use %lu: %ld with an unsigned
+        * argument is a mismatched conversion (undefined formatting)
+        * and would print large values as negatives.
+        */
+       sprintf(buf, "%lu", num);
+       if (!write_file(path, buf, strlen(buf) + 1)) {
+               perror(path);
+               exit(EXIT_FAILURE);
+       }
+}
+
+/*
+ * Write num as decimal to the THP sysfs file `name`; exits on path
+ * overflow or write failure.
+ */
+static void write_num(const char *name, unsigned long num)
+{
+       char path[PATH_MAX];
+       int ret;
+
+       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+       if (ret >= PATH_MAX) {
+               printf("%s: Pathname is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+       _write_num(path, num);
+}
+
+/*
+ * Push the whole settings snapshot into sysfs: THP mode, defrag,
+ * shmem mode, zero-page use, every khugepaged tunable, and - for
+ * file-backed (non-shmem) runs - the block device's read_ahead_kb.
+ */
+static void write_settings(struct settings *settings)
+{
+       struct khugepaged_settings *khugepaged = &settings->khugepaged;
+
+       write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
+       write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
+       write_string("shmem_enabled",
+                       shmem_enabled_strings[settings->shmem_enabled]);
+       write_num("use_zero_page", settings->use_zero_page);
+
+       write_num("khugepaged/defrag", khugepaged->defrag);
+       write_num("khugepaged/alloc_sleep_millisecs",
+                       khugepaged->alloc_sleep_millisecs);
+       write_num("khugepaged/scan_sleep_millisecs",
+                       khugepaged->scan_sleep_millisecs);
+       write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
+       write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
+       write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
+       write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
+
+       if (file_ops && finfo.type == VMA_FILE)
+               _write_num(finfo.dev_queue_read_ahead_path,
+                          settings->read_ahead_kb);
+}
+
+/* Small LIFO of settings so tests can temporarily override and then
+ * restore configuration; the top of the stack is always what sysfs
+ * currently holds.
+ */
+#define MAX_SETTINGS_DEPTH 4
+static struct settings settings_stack[MAX_SETTINGS_DEPTH];
+static int settings_index;
+
+/* Return the active (top-of-stack) settings; exits if none pushed. */
+static struct settings *current_settings(void)
+{
+       if (!settings_index) {
+               printf("Fail: No settings set");
+               exit(EXIT_FAILURE);
+       }
+       return settings_stack + settings_index - 1;
+}
+
+/* Push a settings snapshot and apply it to sysfs; exits on overflow. */
+static void push_settings(struct settings *settings)
+{
+       if (settings_index >= MAX_SETTINGS_DEPTH) {
+               printf("Fail: Settings stack exceeded");
+               exit(EXIT_FAILURE);
+       }
+       settings_stack[settings_index++] = *settings;
+       write_settings(current_settings());
+}
+
+/* Pop the top settings and re-apply the previous snapshot to sysfs. */
+static void pop_settings(void)
+{
+       if (settings_index <= 0) {
+               printf("Fail: Settings stack empty");
+               exit(EXIT_FAILURE);
+       }
+       --settings_index;
+       write_settings(current_settings());
+}
+
+/*
+ * Exit handler / signal handler: restore the settings captured by
+ * save_settings() (unless skip_settings_restore is set) and exit with
+ * EXIT_FAILURE when invoked from a signal, else with exit_status.
+ */
+static void restore_settings(int sig)
+{
+       if (skip_settings_restore)
+               goto out;
+
+       printf("Restore THP and khugepaged settings...");
+       write_settings(&saved_settings);
+       success("OK");
+       if (sig)
+               exit(EXIT_FAILURE);
+out:
+       exit(exit_status);
+}
+
+/*
+ * Snapshot the current THP and khugepaged configuration into
+ * saved_settings, then install restore_settings() as the handler for
+ * the common termination signals so the system is put back even if
+ * the test is interrupted.
+ */
+static void save_settings(void)
+{
+       printf("Save THP and khugepaged settings...");
+       saved_settings = (struct settings) {
+               .thp_enabled = read_string("enabled", thp_enabled_strings),
+               .thp_defrag = read_string("defrag", thp_defrag_strings),
+               .shmem_enabled =
+                       read_string("shmem_enabled", shmem_enabled_strings),
+               .use_zero_page = read_num("use_zero_page"),
+       };
+       saved_settings.khugepaged = (struct khugepaged_settings) {
+               .defrag = read_num("khugepaged/defrag"),
+               .alloc_sleep_millisecs =
+                       read_num("khugepaged/alloc_sleep_millisecs"),
+               .scan_sleep_millisecs =
+                       read_num("khugepaged/scan_sleep_millisecs"),
+               .max_ptes_none = read_num("khugepaged/max_ptes_none"),
+               .max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
+               .max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
+               .pages_to_scan = read_num("khugepaged/pages_to_scan"),
+       };
+       if (file_ops && finfo.type == VMA_FILE)
+               saved_settings.read_ahead_kb =
+                               _read_num(finfo.dev_queue_read_ahead_path);
+
+       success("OK");
+
+       signal(SIGTERM, restore_settings);
+       signal(SIGINT, restore_settings);
+       signal(SIGHUP, restore_settings);
+       signal(SIGQUIT, restore_settings);
+}
+
+/*
+ * Populate the global finfo for file-backed tests rooted at @dir:
+ * build the test file path, classify the filesystem (tmpfs → VMA_SHMEM,
+ * anything else → VMA_FILE), and for real files locate the owning block
+ * device's queue/read_ahead_kb sysfs control via /sys/dev/block uevents.
+ * Exits on any failure.
+ */
+static void get_finfo(const char *dir)
+{
+       struct stat path_stat;
+       struct statfs fs;
+       char buf[1 << 10];
+       char path[PATH_MAX];
+       char *str, *end;
+
+       finfo.dir = dir;
+       stat(finfo.dir, &path_stat);
+       if (!S_ISDIR(path_stat.st_mode)) {
+               printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
+               exit(EXIT_FAILURE);
+       }
+       if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
+                    finfo.dir) >= sizeof(finfo.path)) {
+               printf("%s: Pathname is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+       if (statfs(finfo.dir, &fs)) {
+               perror("statfs()");
+               exit(EXIT_FAILURE);
+       }
+       finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
+       /* tmpfs has no backing disk, so no read_ahead_kb to find */
+       if (finfo.type == VMA_SHMEM)
+               return;
+
+       /* Find owning device's queue/read_ahead_kb control */
+       if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
+                    major(path_stat.st_dev), minor(path_stat.st_dev))
+           >= sizeof(path)) {
+               printf("%s: Pathname is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+       if (read_file(path, buf, sizeof(buf)) < 0) {
+               perror("read_file(read_num)");
+               exit(EXIT_FAILURE);
+       }
+       if (strstr(buf, "DEVTYPE=disk")) {
+               /* Found it */
+               if (snprintf(finfo.dev_queue_read_ahead_path,
+                            sizeof(finfo.dev_queue_read_ahead_path),
+                            "/sys/dev/block/%d:%d/queue/read_ahead_kb",
+                            major(path_stat.st_dev), minor(path_stat.st_dev))
+                   >= sizeof(finfo.dev_queue_read_ahead_path)) {
+                       printf("%s: Pathname is too long\n", __func__);
+                       exit(EXIT_FAILURE);
+               }
+               return;
+       }
+       if (!strstr(buf, "DEVTYPE=partition")) {
+               printf("%s: Unknown device type: %s\n", __func__, path);
+               exit(EXIT_FAILURE);
+       }
+       /*
+        * Partition of block device - need to find actual device.
+        * Using naming convention that devnameN is partition of
+        * device devname.
+        */
+       str = strstr(buf, "DEVNAME=");
+       if (!str) {
+               printf("%s: Could not read: %s", __func__, path);
+               exit(EXIT_FAILURE);
+       }
+       str += 8;       /* skip "DEVNAME=" prefix */
+       end = str;
+       /* Truncate at the first digit: "sda1" -> "sda" */
+       while (*end) {
+               if (isdigit(*end)) {
+                       *end = '\0';
+                       if (snprintf(finfo.dev_queue_read_ahead_path,
+                                    sizeof(finfo.dev_queue_read_ahead_path),
+                                    "/sys/block/%s/queue/read_ahead_kb",
+                                    str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
+                               printf("%s: Pathname is too long\n", __func__);
+                               exit(EXIT_FAILURE);
+                       }
+                       return;
+               }
+               ++end;
+       }
+       printf("%s: Could not read: %s\n", __func__, path);
+       exit(EXIT_FAILURE);
+}
+
+/*
+ * Return true iff /proc/self/smaps reports exactly @size bytes of swap
+ * for the VMA starting at @addr.  Scans smaps for the "addr-" range
+ * line first, then the following "Swap:" line of that entry.
+ */
+static bool check_swap(void *addr, unsigned long size)
+{
+       bool swap = false;
+       int ret;
+       FILE *fp;
+       char buffer[MAX_LINE_LENGTH];
+       char addr_pattern[MAX_LINE_LENGTH];
+
+       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+                      (unsigned long) addr);
+       if (ret >= MAX_LINE_LENGTH) {
+               printf("%s: Pattern is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+
+       fp = fopen(PID_SMAPS, "r");
+       if (!fp) {
+               printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+               exit(EXIT_FAILURE);
+       }
+       if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
+               goto err_out;
+
+       /* smaps right-aligns sizes: "Swap:                  N kB" */
+       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
+                      size >> 10);
+       if (ret >= MAX_LINE_LENGTH) {
+               printf("%s: Pattern is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+       /*
+        * Fetch the Swap: in the same block and check whether it got
+        * the expected number of hugepages next.
+        */
+       if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
+               goto err_out;
+
+       if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+               goto err_out;
+
+       swap = true;
+err_out:
+       fclose(fp);
+       return swap;
+}
+
+/*
+ * Map @nr anonymous huge-page-sized regions at the fixed test address
+ * BASE_ADDR.  NOTE(review): no MAP_FIXED — relies on the kernel
+ * honoring the hint; any other placement (including MAP_FAILED) is
+ * treated as fatal.
+ */
+static void *alloc_mapping(int nr)
+{
+       void *p;
+
+       p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
+                MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       if (p != BASE_ADDR) {
+               printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+               exit(EXIT_FAILURE);
+       }
+
+       return p;
+}
+
+/*
+ * Write a distinct, page-indexed pattern (i + 0xdead0000) into the
+ * first word of every page in [start, end), faulting the pages in.
+ * validate_memory() later checks this exact pattern.
+ */
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+       int i;
+
+       for (i = start / page_size; i < end / page_size; i++)
+               p[i * page_size / sizeof(*p)] = i + 0xdead0000;
+}
+
+/*
+ * MADV_COLLAPSE is a best-effort request: it may fail transiently with
+ * EAGAIN when an internal resource is momentarily unavailable.  In that
+ * specific case, reattempt the request exactly once.
+ */
+static int madvise_collapse_retry(void *p, unsigned long size)
+{
+       int err = madvise(p, size, MADV_COLLAPSE);
+
+       if (err && errno == EAGAIN)
+               err = madvise(p, size, MADV_COLLAPSE);
+
+       return err;
+}
+
+/*
+ * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
+ * validate_memory()'able contents.
+ */
+static void *alloc_hpage(struct mem_ops *ops)
+{
+       void *p = ops->setup_area(1);
+
+       ops->fault(p, 0, hpage_pmd_size);
+
+       /*
+        * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
+        * The latter is ineligible for collapse by MADV_COLLAPSE
+        * while the former might cause MADV_COLLAPSE to race with
+        * khugepaged on low-load system (like a test machine), which
+        * would cause MADV_COLLAPSE to fail with EAGAIN.
+        */
+       printf("Allocate huge page...");
+       if (madvise_collapse_retry(p, hpage_pmd_size)) {
+               perror("madvise(MADV_COLLAPSE)");
+               exit(EXIT_FAILURE);
+       }
+       /*
+        * NOTE(review): check_huge() does not set errno, so this perror
+        * may print a stale message — consider a plain printf here.
+        */
+       if (!ops->check_huge(p, 1)) {
+               perror("madvise(MADV_COLLAPSE)");
+               exit(EXIT_FAILURE);
+       }
+       if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
+               perror("madvise(MADV_HUGEPAGE)");
+               exit(EXIT_FAILURE);
+       }
+       success("OK");
+       return p;
+}
+
+/*
+ * Verify every page in [start, end) still carries the pattern written
+ * by fill_memory(); exits on the first corrupted page.
+ */
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+       int i;
+
+       for (i = start / page_size; i < end / page_size; i++) {
+               if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
+                       printf("Page %d is corrupted: %#x\n",
+                                       i, p[i * page_size / sizeof(*p)]);
+                       exit(EXIT_FAILURE);
+               }
+       }
+}
+
+/* Anonymous memory needs nothing beyond the test VMA itself. */
+static void *anon_setup_area(int nr_hpages)
+{
+       void *area = alloc_mapping(nr_hpages);
+
+       return area;
+}
+
+/* Tear down the anonymous test VMA. */
+static void anon_cleanup_area(void *area, unsigned long len)
+{
+       munmap(area, len);
+}
+
+/* Fault in [start, end) by writing the validation pattern. */
+static void anon_fault(void *area, unsigned long start, unsigned long end)
+{
+       fill_memory(area, start, end);
+}
+
+/* True iff @start is backed by @nr_hpages anonymous huge pages. */
+static bool anon_check_huge(void *start, int nr_hpages)
+{
+       return check_huge_anon(start, nr_hpages, hpage_pmd_size);
+}
+
+/*
+ * Create TEST_FILE under finfo.dir, fill it with the validation
+ * pattern, then map it read-only (PROT_READ|PROT_EXEC, MAP_PRIVATE)
+ * at BASE_ADDR for collapse testing.  The page cache is dropped at the
+ * end so pages only appear when mem_ops->fault()ed in.
+ */
+static void *file_setup_area(int nr_hpages)
+{
+       int fd;
+       void *p;
+       unsigned long size;
+
+       unlink(finfo.path);  /* Cleanup from previous failed tests */
+       printf("Creating %s for collapse%s...", finfo.path,
+              finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
+       /* 0600 (octal), not decimal 777: only this test reads it back */
+       fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
+                 0600);
+       if (fd < 0) {
+               perror("open()");
+               exit(EXIT_FAILURE);
+       }
+
+       size = nr_hpages * hpage_pmd_size;
+       p = alloc_mapping(nr_hpages);
+       fill_memory(p, 0, size);
+       /* A short write would leave later validate_memory() failing */
+       if (write(fd, p, size) != (ssize_t)size) {
+               perror("write()");
+               exit(EXIT_FAILURE);
+       }
+       close(fd);
+       munmap(p, size);
+       success("OK");
+
+       printf("Opening %s read only for collapse...", finfo.path);
+       finfo.fd = open(finfo.path, O_RDONLY);
+       if (finfo.fd < 0) {
+               perror("open()");
+               exit(EXIT_FAILURE);
+       }
+       p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
+                MAP_PRIVATE, finfo.fd, 0);
+       if (p == MAP_FAILED || p != BASE_ADDR) {
+               perror("mmap()");
+               exit(EXIT_FAILURE);
+       }
+
+       /* Drop page cache so faults must come from the backing file */
+       write_file("/proc/sys/vm/drop_caches", "3", 2);
+       success("OK");
+       return p;
+}
+
+/* Unmap the file-backed area, close it, and delete the test file. */
+static void file_cleanup_area(void *area, unsigned long len)
+{
+       munmap(area, len);
+       close(finfo.fd);
+       unlink(finfo.path);
+}
+
+/*
+ * Fault in file pages read-only via MADV_POPULATE_READ — the mapping
+ * is PROT_READ|PROT_EXEC, so writing the pattern is not an option.
+ */
+static void file_fault(void *p, unsigned long start, unsigned long end)
+{
+       if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
+               /* Fix: message was missing its closing parenthesis */
+               perror("madvise(MADV_POPULATE_READ)");
+               exit(EXIT_FAILURE);
+       }
+}
+
+/*
+ * Dispatch the huge-page accounting check on whether [dir] was tmpfs
+ * (shmem counters) or a real filesystem (file counters).
+ */
+static bool file_check_huge(void *addr, int nr_hpages)
+{
+       if (finfo.type == VMA_FILE)
+               return check_huge_file(addr, nr_hpages, hpage_pmd_size);
+       if (finfo.type == VMA_SHMEM)
+               return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
+
+       /* Unreachable for a properly initialized finfo */
+       exit(EXIT_FAILURE);
+       return false;
+}
+
+/*
+ * Create a memfd of @nr_hpages huge-page sizes and map it shared at
+ * BASE_ADDR.  The p != BASE_ADDR check also catches MAP_FAILED.
+ */
+static void *shmem_setup_area(int nr_hpages)
+{
+       void *p;
+       unsigned long size = nr_hpages * hpage_pmd_size;
+
+       finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
+       if (finfo.fd < 0)  {
+               perror("memfd_create()");
+               exit(EXIT_FAILURE);
+       }
+       if (ftruncate(finfo.fd, size)) {
+               perror("ftruncate()");
+               exit(EXIT_FAILURE);
+       }
+       p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
+                0);
+       if (p != BASE_ADDR) {
+               perror("mmap()");
+               exit(EXIT_FAILURE);
+       }
+       return p;
+}
+
+/* Unmap the shmem area and drop the backing memfd. */
+static void shmem_cleanup_area(void *area, unsigned long len)
+{
+       munmap(area, len);
+       close(finfo.fd);
+}
+
+/* True iff @start is backed by @nr_hpages shmem huge pages. */
+static bool shmem_check_huge(void *start, int nr_hpages)
+{
+       return check_huge_shmem(start, nr_hpages, hpage_pmd_size);
+}
+
+/* mem_ops for private anonymous memory. */
+static struct mem_ops __anon_ops = {
+       .setup_area = &anon_setup_area,
+       .cleanup_area = &anon_cleanup_area,
+       .fault = &anon_fault,
+       .check_huge = &anon_check_huge,
+       .name = "anon",
+};
+
+/* mem_ops for a read-only file mapping under the user-supplied [dir]. */
+static struct mem_ops __file_ops = {
+       .setup_area = &file_setup_area,
+       .cleanup_area = &file_cleanup_area,
+       .fault = &file_fault,
+       .check_huge = &file_check_huge,
+       .name = "file",
+};
+
+/*
+ * mem_ops for memfd-backed shmem.  .fault reuses anon_fault: the shmem
+ * mapping is MAP_SHARED with PROT_WRITE, so writing the pattern works.
+ */
+static struct mem_ops __shmem_ops = {
+       .setup_area = &shmem_setup_area,
+       .cleanup_area = &shmem_cleanup_area,
+       .fault = &anon_fault,
+       .check_huge = &shmem_check_huge,
+       .name = "shmem",
+};
+
+/*
+ * Run MADV_COLLAPSE over @nr_hpages huge-page sizes at @p with THP
+ * disabled in sysfs (so khugepaged cannot interfere, and so we verify
+ * MADV_COLLAPSE ignores the "enabled" knobs), then check the result
+ * against @expect.
+ */
+static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
+                              struct mem_ops *ops, bool expect)
+{
+       int ret;
+       struct settings settings = *current_settings();
+
+       printf("%s...", msg);
+
+       /*
+        * Prevent khugepaged interference and tests that MADV_COLLAPSE
+        * ignores /sys/kernel/mm/transparent_hugepage/enabled
+        */
+       settings.thp_enabled = THP_NEVER;
+       settings.shmem_enabled = SHMEM_NEVER;
+       push_settings(&settings);
+
+       /* Clear VM_NOHUGEPAGE */
+       madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+       ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
+       /* ret != 0 means failure, so success must equal @expect */
+       if (((bool)ret) == expect)
+               fail("Fail: Bad return value");
+       else if (!ops->check_huge(p, expect ? nr_hpages : 0))
+               fail("Fail: check_huge()");
+       else
+               success("OK");
+
+       pop_settings();
+}
+
+/*
+ * collapse_context entry point for the madvise path: sanity-check that
+ * no huge page is present yet, then run __madvise_collapse().
+ */
+static void madvise_collapse(const char *msg, char *p, int nr_hpages,
+                            struct mem_ops *ops, bool expect)
+{
+       /* Sanity check */
+       if (!ops->check_huge(p, 0)) {
+               printf("Unexpected huge page\n");
+               exit(EXIT_FAILURE);
+       }
+       __madvise_collapse(msg, p, nr_hpages, ops, expect);
+}
+
+#define TICK 500000
+/*
+ * Mark the range VM_HUGEPAGE and poll (up to ~3s) until either
+ * khugepaged collapses it or two more full scans complete.  Returns
+ * true on timeout.  Clears VM_HUGEPAGE again before returning.
+ */
+static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
+                         struct mem_ops *ops)
+{
+       int full_scans;
+       int timeout = 6; /* 3 seconds */
+
+       /* Sanity check */
+       if (!ops->check_huge(p, 0)) {
+               printf("Unexpected huge page\n");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
+
+       /* Wait until the second full_scan completed */
+       full_scans = read_num("khugepaged/full_scans") + 2;
+
+       printf("%s...", msg);
+       while (timeout--) {
+               if (ops->check_huge(p, nr_hpages))
+                       break;
+               if (read_num("khugepaged/full_scans") >= full_scans)
+                       break;
+               printf(".");
+               usleep(TICK);
+       }
+
+       madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
+
+       /* timeout == -1 only when the loop ran out without a break */
+       return timeout == -1;
+}
+
+/*
+ * collapse_context entry point for the khugepaged path: wait for the
+ * daemon to (maybe) collapse, refault non-anon memory to install the
+ * PMD, and compare the outcome against @expect.
+ */
+static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
+                               struct mem_ops *ops, bool expect)
+{
+       if (wait_for_scan(msg, p, nr_hpages, ops)) {
+               if (expect)
+                       fail("Timeout");
+               else
+                       success("OK");
+               return;
+       }
+
+       /*
+        * For file and shmem memory, khugepaged only retracts pte entries after
+        * putting the new hugepage in the page cache. The hugepage must be
+        * subsequently refaulted to install the pmd mapping for the mm.
+        */
+       if (ops != &__anon_ops)
+               ops->fault(p, 0, nr_hpages * hpage_pmd_size);
+
+       if (ops->check_huge(p, expect ? nr_hpages : 0))
+               success("OK");
+       else
+               fail("Fail");
+}
+
+/* khugepaged honors the max_ptes_* scan limits. */
+static struct collapse_context __khugepaged_context = {
+       .collapse = &khugepaged_collapse,
+       .enforce_pte_scan_limits = true,
+       .name = "khugepaged",
+};
+
+/* MADV_COLLAPSE ignores the max_ptes_* scan limits. */
+static struct collapse_context __madvise_context = {
+       .collapse = &madvise_collapse,
+       .enforce_pte_scan_limits = false,
+       .name = "madvise",
+};
+
+/* "file" tests pointed at a tmpfs-backed [dir] behave like shmem. */
+static bool is_tmpfs(struct mem_ops *ops)
+{
+       bool file_backed = (ops == &__file_ops);
+
+       return file_backed && finfo.type == VMA_SHMEM;
+}
+
+/*
+ * With THP "always", a single write fault should allocate a whole huge
+ * page; MADV_DONTNEED on one page should then split the huge PMD.
+ */
+static void alloc_at_fault(void)
+{
+       struct settings settings = *current_settings();
+       char *p;
+
+       settings.thp_enabled = THP_ALWAYS;
+       push_settings(&settings);
+
+       p = alloc_mapping(1);
+       *p = 1;         /* single fault should map the full huge page */
+       printf("Allocate huge page on fault...");
+       if (check_huge_anon(p, 1, hpage_pmd_size))
+               success("OK");
+       else
+               fail("Fail");
+
+       pop_settings();
+
+       madvise(p, page_size, MADV_DONTNEED);
+       printf("Split huge PMD on MADV_DONTNEED...");
+       if (check_huge_anon(p, 0, hpage_pmd_size))
+               success("OK");
+       else
+               fail("Fail");
+       munmap(p, hpage_pmd_size);
+}
+
+/* Fully populated PTE tables across several PMDs should all collapse. */
+static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+       int nr_hpages = 4;
+       unsigned long size = nr_hpages * hpage_pmd_size;
+
+       p = ops->setup_area(nr_hpages);
+       ops->fault(p, 0, size);
+       c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
+                   ops, true);
+       validate_memory(p, 0, size);
+       ops->cleanup_area(p, size);
+}
+
+/* A PTE table with no pages present must not collapse. */
+static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+
+       p = ops->setup_area(1);
+       c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/* A single present PTE is enough for collapse when limits allow it. */
+static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+
+       p = ops->setup_area(1);
+       ops->fault(p, 0, page_size);
+       c->collapse("Collapse PTE table with single PTE entry present", p,
+                   1, ops, true);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Exercise the khugepaged/max_ptes_none limit: one more empty PTE than
+ * allowed must block collapse (khugepaged only), exactly at the limit
+ * must succeed.  Skipped on tmpfs where pages are always in the cache.
+ */
+static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
+{
+       int max_ptes_none = hpage_pmd_nr / 2;
+       struct settings settings = *current_settings();
+       void *p;
+
+       settings.khugepaged.max_ptes_none = max_ptes_none;
+       push_settings(&settings);
+
+       p = ops->setup_area(1);
+
+       if (is_tmpfs(ops)) {
+               /* shmem pages always in the page cache */
+               printf("tmpfs...");
+               skip("Skip");
+               goto skip;
+       }
+
+       /* One page short of the limit: expect failure when enforced */
+       ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+       c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
+                   ops, !c->enforce_pte_scan_limits);
+       validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
+
+       if (c->enforce_pte_scan_limits) {
+               ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
+               c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
+                           true);
+               validate_memory(p, 0,
+                               (hpage_pmd_nr - max_ptes_none) * page_size);
+       }
+skip:
+       ops->cleanup_area(p, hpage_pmd_size);
+       pop_settings();
+}
+
+/* Collapse must be able to swap a single page back in first. */
+static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+
+       p = ops->setup_area(1);
+       ops->fault(p, 0, hpage_pmd_size);
+
+       printf("Swapout one page...");
+       if (madvise(p, page_size, MADV_PAGEOUT)) {
+               perror("madvise(MADV_PAGEOUT)");
+               exit(EXIT_FAILURE);
+       }
+       if (check_swap(p, page_size)) {
+               success("OK");
+       } else {
+               fail("Fail");
+               goto out;
+       }
+
+       c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
+                   true);
+       validate_memory(p, 0, hpage_pmd_size);
+out:
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Exercise khugepaged/max_ptes_swap: one swapped page over the limit
+ * must block collapse (khugepaged only); exactly at the limit must
+ * succeed.
+ */
+static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
+{
+       int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+       void *p;
+
+       p = ops->setup_area(1);
+       ops->fault(p, 0, hpage_pmd_size);
+
+       printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+       if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
+               perror("madvise(MADV_PAGEOUT)");
+               exit(EXIT_FAILURE);
+       }
+       if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
+               success("OK");
+       } else {
+               fail("Fail");
+               goto out;
+       }
+
+       c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
+                   !c->enforce_pte_scan_limits);
+       validate_memory(p, 0, hpage_pmd_size);
+
+       if (c->enforce_pte_scan_limits) {
+               /* Refault everything, then swap out exactly the limit */
+               ops->fault(p, 0, hpage_pmd_size);
+               printf("Swapout %d of %d pages...", max_ptes_swap,
+                      hpage_pmd_nr);
+               if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
+                       perror("madvise(MADV_PAGEOUT)");
+                       exit(EXIT_FAILURE);
+               }
+               if (check_swap(p, max_ptes_swap * page_size)) {
+                       success("OK");
+               } else {
+                       fail("Fail");
+                       goto out;
+               }
+
+               c->collapse("Collapse with max_ptes_swap pages swapped out", p,
+                           1, ops, true);
+               validate_memory(p, 0, hpage_pmd_size);
+       }
+out:
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Split a huge page so only one PTE maps the compound page, then
+ * verify the PTE table can still be collapsed.  Skipped on tmpfs
+ * (MADV_DONTNEED cannot evict tmpfs pages).
+ */
+static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+
+       p = alloc_hpage(ops);
+
+       if (is_tmpfs(ops)) {
+               /* MADV_DONTNEED won't evict tmpfs pages */
+               printf("tmpfs...");
+               skip("Skip");
+               goto skip;
+       }
+
+       madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+       printf("Split huge page leaving single PTE mapping compound page...");
+       madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+       if (ops->check_huge(p, 0))
+               success("OK");
+       else
+               fail("Fail");
+
+       c->collapse("Collapse PTE table with single PTE mapping compound page",
+                   p, 1, ops, true);
+       validate_memory(p, 0, page_size);
+skip:
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Split a huge PMD while keeping every subpage mapped, then collapse
+ * the resulting PTE table full of compound-page PTEs.
+ */
+static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+
+       p = alloc_hpage(ops);
+       printf("Split huge page leaving single PTE page table full of compound pages...");
+       /*
+        * NOTE(review): first madvise covers only one page, presumably to
+        * force a VMA split that splits the PMD before the whole-range
+        * MADV_NOHUGEPAGE — confirm against kernel madvise semantics.
+        */
+       madvise(p, page_size, MADV_NOHUGEPAGE);
+       madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+       if (ops->check_huge(p, 0))
+               success("OK");
+       else
+               fail("Fail");
+
+       c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
+                   true);
+       validate_memory(p, 0, hpage_pmd_size);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Build a PTE table where each of the hpage_pmd_nr entries maps a page
+ * from a *different* compound page: repeatedly allocate a huge page at
+ * BASE_ADDR and use a two-step mremap shuffle to carve one page off it
+ * into a growing run below BASE_ADDR.  Then collapse that table.
+ */
+static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
+{
+       void *p;
+       int i;
+
+       p = ops->setup_area(1);
+       for (i = 0; i < hpage_pmd_nr; i++) {
+               printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+                               i + 1, hpage_pmd_nr);
+
+               madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+               ops->fault(BASE_ADDR, 0, hpage_pmd_size);
+               if (!ops->check_huge(BASE_ADDR, 1)) {
+                       printf("Failed to allocate huge page\n");
+                       exit(EXIT_FAILURE);
+               }
+               madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+               /* Shrinking mremap: unmaps all but one page per compound */
+               p = mremap(BASE_ADDR - i * page_size,
+                               i * page_size + hpage_pmd_size,
+                               (i + 1) * page_size,
+                               MREMAP_MAYMOVE | MREMAP_FIXED,
+                               BASE_ADDR + 2 * hpage_pmd_size);
+               if (p == MAP_FAILED) {
+                       perror("mremap+unmap");
+                       exit(EXIT_FAILURE);
+               }
+
+               /* Growing mremap back below BASE_ADDR for the next round */
+               p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
+                               (i + 1) * page_size,
+                               (i + 1) * page_size + hpage_pmd_size,
+                               MREMAP_MAYMOVE | MREMAP_FIXED,
+                               BASE_ADDR - (i + 1) * page_size);
+               if (p == MAP_FAILED) {
+                       perror("mremap+alloc");
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
+       ops->fault(p, 0, hpage_pmd_size);
+       /* The table of distinct compound pages must NOT be huge yet */
+       if (!ops->check_huge(p, 1))
+               success("OK");
+       else
+               fail("Fail");
+
+       c->collapse("Collapse PTE table full of different compound pages", p, 1,
+                   ops, true);
+
+       validate_memory(p, 0, hpage_pmd_size);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Share a small page with a child over fork(); the child's collapse
+ * must succeed, and the parent must keep its small page afterwards.
+ * Child exit status is folded into exit_status via WEXITSTATUS.
+ */
+static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
+{
+       int wstatus;
+       void *p;
+
+       p = ops->setup_area(1);
+
+       printf("Allocate small page...");
+       ops->fault(p, 0, page_size);
+       if (ops->check_huge(p, 0))
+               success("OK");
+       else
+               fail("Fail");
+
+       printf("Share small page over fork()...");
+       if (!fork()) {
+               /* Do not touch settings on child exit */
+               skip_settings_restore = true;
+               exit_status = 0;
+
+               if (ops->check_huge(p, 0))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               ops->fault(p, page_size, 2 * page_size);
+               c->collapse("Collapse PTE table with single page shared with parent process",
+                           p, 1, ops, true);
+
+               validate_memory(p, 0, page_size);
+               ops->cleanup_area(p, hpage_pmd_size);
+               exit(exit_status);
+       }
+
+       wait(&wstatus);
+       exit_status += WEXITSTATUS(wstatus);
+
+       printf("Check if parent still has small page...");
+       if (ops->check_huge(p, 0))
+               success("OK");
+       else
+               fail("Fail");
+       validate_memory(p, 0, page_size);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Share a huge page over fork(), split it in the child, and verify the
+ * child can re-collapse the PTE table full of shared compound pages
+ * (with max_ptes_shared temporarily raised), while the parent's huge
+ * page stays intact.
+ */
+static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
+{
+       int wstatus;
+       void *p;
+
+       p = alloc_hpage(ops);
+       printf("Share huge page over fork()...");
+       if (!fork()) {
+               /* Do not touch settings on child exit */
+               skip_settings_restore = true;
+               exit_status = 0;
+
+               if (ops->check_huge(p, 1))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               printf("Split huge page PMD in child process...");
+               madvise(p, page_size, MADV_NOHUGEPAGE);
+               madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+               if (ops->check_huge(p, 0))
+                       success("OK");
+               else
+                       fail("Fail");
+               ops->fault(p, 0, page_size);
+
+               /* All-but-one shared PTEs must still be collapsible */
+               write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
+               c->collapse("Collapse PTE table full of compound pages in child",
+                           p, 1, ops, true);
+               write_num("khugepaged/max_ptes_shared",
+                         current_settings()->khugepaged.max_ptes_shared);
+
+               validate_memory(p, 0, hpage_pmd_size);
+               ops->cleanup_area(p, hpage_pmd_size);
+               exit(exit_status);
+       }
+
+       wait(&wstatus);
+       exit_status += WEXITSTATUS(wstatus);
+
+       printf("Check if parent still has huge page...");
+       if (ops->check_huge(p, 1))
+               success("OK");
+       else
+               fail("Fail");
+       validate_memory(p, 0, hpage_pmd_size);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Exercise khugepaged/max_ptes_shared in a forked child: CoW-break
+ * pages so one more than the limit remains shared (collapse must fail
+ * when limits are enforced), then CoW one more page so exactly the
+ * limit is shared (collapse must succeed).
+ */
+static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
+{
+       int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+       int wstatus;
+       void *p;
+
+       p = alloc_hpage(ops);
+       printf("Share huge page over fork()...");
+       if (!fork()) {
+               /* Do not touch settings on child exit */
+               skip_settings_restore = true;
+               exit_status = 0;
+
+               if (ops->check_huge(p, 1))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               printf("Trigger CoW on page %d of %d...",
+                               hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+               ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+               if (ops->check_huge(p, 0))
+                       success("OK");
+               else
+                       fail("Fail");
+
+               c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
+                           1, ops, !c->enforce_pte_scan_limits);
+
+               if (c->enforce_pte_scan_limits) {
+                       printf("Trigger CoW on page %d of %d...",
+                              hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+                       ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
+                                   page_size);
+                       if (ops->check_huge(p, 0))
+                               success("OK");
+                       else
+                               fail("Fail");
+
+                       c->collapse("Collapse with max_ptes_shared PTEs shared",
+                                   p, 1, ops, true);
+               }
+
+               validate_memory(p, 0, hpage_pmd_size);
+               ops->cleanup_area(p, hpage_pmd_size);
+               exit(exit_status);
+       }
+
+       wait(&wstatus);
+       exit_status += WEXITSTATUS(wstatus);
+
+       printf("Check if parent still has huge page...");
+       if (ops->check_huge(p, 1))
+               success("OK");
+       else
+               fail("Fail");
+       validate_memory(p, 0, hpage_pmd_size);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * MADV_COLLAPSE on an already-collapsed range must succeed (idempotent).
+ * Calls __madvise_collapse() directly since the c->collapse() wrapper's
+ * sanity check would object to the existing huge page.
+ */
+static void madvise_collapse_existing_thps(struct collapse_context *c,
+                                          struct mem_ops *ops)
+{
+       void *p;
+
+       p = ops->setup_area(1);
+       ops->fault(p, 0, hpage_pmd_size);
+       c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
+       validate_memory(p, 0, hpage_pmd_size);
+
+       /* c->collapse() will find a hugepage and complain - call directly. */
+       __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
+       validate_memory(p, 0, hpage_pmd_size);
+       ops->cleanup_area(p, hpage_pmd_size);
+}
+
+/*
+ * Test race with khugepaged where page tables have been retracted and
+ * pmd cleared.
+ */
+static void madvise_retracted_page_tables(struct collapse_context *c,
+                                         struct mem_ops *ops)
+{
+       void *p;
+       int nr_hpages = 1;
+       unsigned long size = nr_hpages * hpage_pmd_size;
+
+       p = ops->setup_area(nr_hpages);
+       ops->fault(p, 0, size);
+
+       /* Let khugepaged collapse and leave pmd cleared */
+       if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
+                         ops)) {
+               fail("Timeout");
+               return;
+       }
+       success("OK");
+       /* MADV_COLLAPSE should install the huge PMD from the page cache */
+       c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
+                   true);
+       validate_memory(p, 0, size);
+       ops->cleanup_area(p, size);
+}
+
+/* Print command-line usage to stderr and exit with failure. */
+static void usage(void)
+{
+       fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n");
+       fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
+       fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
+       fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
+       fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
+       fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
+       fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
+       fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
+       fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n");
+       exit(1);
+}
+
+/*
+ * Parse argv[1] as "<context>:<mem_type>" and select the global
+ * collapse contexts / mem_ops to run.  With no argument, defaults to
+ * all contexts on anon memory (backwards compatibility).  File-backed
+ * runs additionally require the [dir] argument (argc == 3).
+ */
+static void parse_test_type(int argc, const char **argv)
+{
+       char *buf;
+       const char *token;
+
+       if (argc == 1) {
+               /* Backwards compatibility */
+               khugepaged_context =  &__khugepaged_context;
+               madvise_context =  &__madvise_context;
+               anon_ops = &__anon_ops;
+               return;
+       }
+
+       /* Check strdup(): strsep/strcmp below would deref NULL on OOM */
+       buf = strdup(argv[1]);
+       if (!buf) {
+               perror("strdup()");
+               exit(EXIT_FAILURE);
+       }
+       token = strsep(&buf, ":");
+
+       if (!strcmp(token, "all")) {
+               khugepaged_context =  &__khugepaged_context;
+               madvise_context =  &__madvise_context;
+       } else if (!strcmp(token, "khugepaged")) {
+               khugepaged_context =  &__khugepaged_context;
+       } else if (!strcmp(token, "madvise")) {
+               madvise_context =  &__madvise_context;
+       } else {
+               usage();
+       }
+
+       /* strsep() leaves buf NULL when no ":" separator was present */
+       if (!buf)
+               usage();
+
+       if (!strcmp(buf, "all")) {
+               file_ops =  &__file_ops;
+               anon_ops = &__anon_ops;
+               shmem_ops = &__shmem_ops;
+       } else if (!strcmp(buf, "anon")) {
+               anon_ops = &__anon_ops;
+       } else if (!strcmp(buf, "file")) {
+               file_ops =  &__file_ops;
+       } else if (!strcmp(buf, "shmem")) {
+               shmem_ops = &__shmem_ops;
+       } else {
+               usage();
+       }
+
+       if (!file_ops)
+               return;
+
+       /* File-backed tests need the [dir] argument */
+       if (argc != 3)
+               usage();
+}
+
/*
 * Entry point for the khugepaged collapse selftests: parse the requested
 * <context>:<mem_type> selection, install khugepaged settings tuned for
 * fast scanning, then run every test whose context and memory backend
 * were selected on the command line.
 */
int main(int argc, const char **argv)
{
	/* Baseline THP/khugepaged configuration applied for the whole run. */
	struct settings default_settings = {
		.thp_enabled = THP_MADVISE,
		.thp_defrag = THP_DEFRAG_ALWAYS,
		.shmem_enabled = SHMEM_ADVISE,
		.use_zero_page = 0,
		.khugepaged = {
			.defrag = 1,
			.alloc_sleep_millisecs = 10,
			.scan_sleep_millisecs = 10,
		},
		/*
		 * When testing file-backed memory, the collapse path
		 * looks at how many pages are found in the page cache, not
		 * what pages are mapped. Disable read ahead optimization so
		 * pages don't find their way into the page cache unless
		 * we mem_ops->fault() them in.
		 */
		.read_ahead_kb = 0,
	};

	parse_test_type(argc, argv);

	/* File-backed tests need argv[2] (the [dir] argument). */
	if (file_ops)
		get_finfo(argv[2]);

	/* Unbuffered stdout: progress stays visible even if a test hangs. */
	setbuf(stdout, NULL);

	page_size = getpagesize();
	hpage_pmd_size = read_pmd_pagesize();
	hpage_pmd_nr = hpage_pmd_size / page_size;

	/* Scale khugepaged limits to the discovered huge-page geometry. */
	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;

	/* Snapshot system settings; restore_settings() puts them back. */
	save_settings();
	push_settings(&default_settings);

	alloc_at_fault();

/* Run test t only when both its context c and mem ops o were selected. */
#define TEST(t, c, o) do { \
	if (c && o) { \
		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
		t(c, o); \
	} \
	} while (0)

	TEST(collapse_full, khugepaged_context, anon_ops);
	TEST(collapse_full, khugepaged_context, file_ops);
	TEST(collapse_full, khugepaged_context, shmem_ops);
	TEST(collapse_full, madvise_context, anon_ops);
	TEST(collapse_full, madvise_context, file_ops);
	TEST(collapse_full, madvise_context, shmem_ops);

	TEST(collapse_empty, khugepaged_context, anon_ops);
	TEST(collapse_empty, madvise_context, anon_ops);

	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry, madvise_context, file_ops);
	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);

	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
	TEST(collapse_max_ptes_none, madvise_context, file_ops);

	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);

	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
	TEST(collapse_full_of_compound, madvise_context, anon_ops);
	TEST(collapse_full_of_compound, madvise_context, file_ops);
	TEST(collapse_full_of_compound, madvise_context, shmem_ops);

	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
	TEST(collapse_compound_extreme, madvise_context, anon_ops);

	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);

	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);

	TEST(collapse_fork, khugepaged_context, anon_ops);
	TEST(collapse_fork, madvise_context, anon_ops);

	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
	TEST(collapse_fork_compound, madvise_context, anon_ops);

	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);

	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);

	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);

	restore_settings(0);
}
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
new file mode 100644 (file)
index 0000000..d8b5b49
--- /dev/null
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KSM functional tests
+ *
+ * Copyright 2022, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define KiB 1024u
+#define MiB (1024 * KiB)
+
/* File descriptors and page size shared by all tests; set up in main(). */
static int ksm_fd;		/* /sys/kernel/mm/ksm/run */
static int ksm_full_scans_fd;	/* /sys/kernel/mm/ksm/full_scans */
static int pagemap_fd;		/* /proc/self/pagemap */
static size_t pagesize;
+
+static bool range_maps_duplicates(char *addr, unsigned long size)
+{
+       unsigned long offs_a, offs_b, pfn_a, pfn_b;
+
+       /*
+        * There is no easy way to check if there are KSM pages mapped into
+        * this range. We only check that the range does not map the same PFN
+        * twice by comparing each pair of mapped pages.
+        */
+       for (offs_a = 0; offs_a < size; offs_a += pagesize) {
+               pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
+               /* Page not present or PFN not exposed by the kernel. */
+               if (pfn_a == -1ul || !pfn_a)
+                       continue;
+
+               for (offs_b = offs_a + pagesize; offs_b < size;
+                    offs_b += pagesize) {
+                       pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
+                       if (pfn_b == -1ul || !pfn_b)
+                               continue;
+                       if (pfn_a == pfn_b)
+                               return true;
+               }
+       }
+       return false;
+}
+
+static long ksm_get_full_scans(void)
+{
+       char buf[10];
+       ssize_t ret;
+
+       ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
+       if (ret <= 0)
+               return -errno;
+       buf[ret] = 0;
+
+       return strtol(buf, NULL, 10);
+}
+
+static int ksm_merge(void)
+{
+       long start_scans, end_scans;
+
+       /* Wait for two full scans such that any possible merging happened. */
+       start_scans = ksm_get_full_scans();
+       if (start_scans < 0)
+               return start_scans;
+       if (write(ksm_fd, "1", 1) != 1)
+               return -errno;
+       do {
+               end_scans = ksm_get_full_scans();
+               if (end_scans < 0)
+                       return end_scans;
+       } while (end_scans < start_scans + 2);
+
+       return 0;
+}
+
+static char *mmap_and_merge_range(char val, unsigned long size)
+{
+       char *map;
+
+       map = mmap(NULL, size, PROT_READ|PROT_WRITE,
+                  MAP_PRIVATE|MAP_ANON, -1, 0);
+       if (map == MAP_FAILED) {
+               ksft_test_result_fail("mmap() failed\n");
+               return MAP_FAILED;
+       }
+
+       /* Don't use THP. Ignore if THP are not around on a kernel. */
+       if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
+               ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
+               goto unmap;
+       }
+
+       /* Make sure each page contains the same values to merge them. */
+       memset(map, val, size);
+       if (madvise(map, size, MADV_MERGEABLE)) {
+               ksft_test_result_fail("MADV_MERGEABLE failed\n");
+               goto unmap;
+       }
+
+       /* Run KSM to trigger merging and wait. */
+       if (ksm_merge()) {
+               ksft_test_result_fail("Running KSM failed\n");
+               goto unmap;
+       }
+       return map;
+unmap:
+       munmap(map, size);
+       return MAP_FAILED;
+}
+
+static void test_unmerge(void)
+{
+       const unsigned int size = 2 * MiB;
+       char *map;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       map = mmap_and_merge_range(0xcf, size);
+       if (map == MAP_FAILED)
+               return;
+
+       if (madvise(map, size, MADV_UNMERGEABLE)) {
+               ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+               goto unmap;
+       }
+
+       ksft_test_result(!range_maps_duplicates(map, size),
+                        "Pages were unmerged\n");
+unmap:
+       munmap(map, size);
+}
+
+static void test_unmerge_discarded(void)
+{
+       const unsigned int size = 2 * MiB;
+       char *map;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       map = mmap_and_merge_range(0xcf, size);
+       if (map == MAP_FAILED)
+               return;
+
+       /* Discard half of all mapped pages so we have pte_none() entries. */
+       if (madvise(map, size / 2, MADV_DONTNEED)) {
+               ksft_test_result_fail("MADV_DONTNEED failed\n");
+               goto unmap;
+       }
+
+       if (madvise(map, size, MADV_UNMERGEABLE)) {
+               ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
+               goto unmap;
+       }
+
+       ksft_test_result(!range_maps_duplicates(map, size),
+                        "Pages were unmerged\n");
+unmap:
+       munmap(map, size);
+}
+
+#ifdef __NR_userfaultfd
/*
 * Merge a range, apply userfaultfd write-protection (uffd-wp) to it, then
 * MADV_UNMERGEABLE the range while the protection is in place and verify
 * the pages were actually unmerged.  Skips when userfaultfd or
 * UFFD_FEATURE_PAGEFAULT_FLAG_WP is unavailable on this kernel.
 */
static void test_unmerge_uffd_wp(void)
{
	struct uffdio_writeprotect uffd_writeprotect;
	struct uffdio_register uffdio_register;
	const unsigned int size = 2 * MiB;
	struct uffdio_api uffdio_api;
	char *map;
	int uffd;

	ksft_print_msg("[RUN] %s\n", __func__);

	map = mmap_and_merge_range(0xcf, size);
	if (map == MAP_FAILED)
		return;

	/* See if UFFD is around. */
	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		ksft_test_result_skip("__NR_userfaultfd failed\n");
		goto unmap;
	}

	/* See if UFFD-WP is around. */
	uffdio_api.api = UFFD_API;
	uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
		ksft_test_result_fail("UFFDIO_API failed\n");
		goto close_uffd;
	}
	if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
		ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
		goto close_uffd;
	}

	/* Register UFFD-WP, no need for an actual handler. */
	uffdio_register.range.start = (unsigned long) map;
	uffdio_register.range.len = size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
		ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
		goto close_uffd;
	}

	/* Write-protect the range using UFFD-WP. */
	uffd_writeprotect.range.start = (unsigned long) map;
	uffd_writeprotect.range.len = size;
	uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
		ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
		goto close_uffd;
	}

	/* Unmerge while the uffd-wp protection is still applied. */
	if (madvise(map, size, MADV_UNMERGEABLE)) {
		ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
		goto close_uffd;
	}

	ksft_test_result(!range_maps_duplicates(map, size),
			 "Pages were unmerged\n");
close_uffd:
	close(uffd);
unmap:
	munmap(map, size);
}
+#endif
+
/*
 * KSM functional test entry point: open the KSM control/status files and
 * the pagemap, run each unmerge test, and report results via the
 * kselftest TAP helpers.  Exits with "skip" when KSM or pagemap access is
 * unavailable.
 */
int main(int argc, char **argv)
{
	unsigned int tests = 2;
	int err;

	/* test_unmerge_uffd_wp() is only built when userfaultfd exists. */
#ifdef __NR_userfaultfd
	tests++;
#endif

	ksft_print_header();
	ksft_set_plan(tests);

	pagesize = getpagesize();

	/* Cache the fds used by every test; failure to open any => skip. */
	ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
	if (ksm_fd < 0)
		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
	ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
	if (ksm_full_scans_fd < 0)
		ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");

	test_unmerge();
	test_unmerge_discarded();
#ifdef __NR_userfaultfd
	test_unmerge_uffd_wp();
#endif

	err = ksft_get_fail_cnt();
	if (err)
		ksft_exit_fail_msg("%d out of %d tests failed\n",
				   err, ksft_test_num());
	return ksft_exit_pass();
}
diff --git a/tools/testing/selftests/mm/ksm_tests.c b/tools/testing/selftests/mm/ksm_tests.c
new file mode 100644 (file)
index 0000000..f9eb4d6
--- /dev/null
@@ -0,0 +1,849 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/mman.h>
+#include <stdbool.h>
+#include <time.h>
+#include <string.h>
+#include <numa.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <err.h>
+
+#include "../kselftest.h"
+#include <include/vdso/time64.h>
+#include "util.h"
+
/* Sysfs directory holding the KSM tunables. */
#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
/* Build the full sysfs path for tunable file s (string concatenation). */
#define KSM_FP(s) (KSM_SYSFS_PATH s)
/* Defaults for the command-line-tunable parameters (see print_help()). */
#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
#define KSM_PAGE_COUNT_DEFAULT 10l
#define KSM_PROT_STR_DEFAULT "rw"
#define KSM_USE_ZERO_PAGES_DEFAULT false
#define KSM_MERGE_ACROSS_NODES_DEFAULT true
#define MB (1ul << 20)
+
/*
 * Snapshot of the KSM sysfs tunables (/sys/kernel/mm/ksm/*), filled by
 * ksm_save_def() so the system state can be restored after a test run.
 */
struct ksm_sysfs {
	unsigned long max_page_sharing;
	unsigned long merge_across_nodes;
	unsigned long pages_to_scan;
	unsigned long run;
	unsigned long sleep_millisecs;
	unsigned long stable_node_chains_prune_millisecs;
	unsigned long use_zero_pages;
};
+
/*
 * Which KSM scenario was requested on the command line; see print_help()
 * for the flags that select each one.
 */
enum ksm_test_name {
	CHECK_KSM_MERGE,		/* plain merging of identical pages */
	CHECK_KSM_UNMERGE,		/* automatic unmerge after modification */
	CHECK_KSM_ZERO_PAGE_MERGE,	/* merging of zero-filled pages */
	CHECK_KSM_NUMA_MERGE,		/* merging across NUMA nodes */
	KSM_MERGE_TIME,			/* merge time/speed benchmark */
	KSM_MERGE_TIME_HUGE_PAGES,	/* merge benchmark on huge pages */
	KSM_UNMERGE_TIME,		/* unmerge time benchmark */
	KSM_COW_TIME			/* COW-break time benchmark */
};
+
/*
 * Write @val to the (sysfs) file at @file_path.
 * Returns 0 on success, 1 on failure; diagnostics go to stderr.
 */
static int ksm_write_sysfs(const char *file_path, unsigned long val)
{
	FILE *f = fopen(file_path, "w");

	if (!f) {
		fprintf(stderr, "f %s\n", file_path);
		perror("fopen");
		return 1;
	}
	if (fprintf(f, "%lu", val) < 0) {
		perror("fprintf");
		fclose(f);
		return 1;
	}
	/*
	 * stdio buffers the output, so the actual write(2) to the sysfs
	 * file may only happen (and fail) at fclose(); check it so a
	 * rejected tunable value is not silently reported as success.
	 */
	if (fclose(f)) {
		perror("fclose");
		return 1;
	}

	return 0;
}
+
/*
 * Read one unsigned long from the (sysfs) file at @file_path into @val.
 * Returns 0 on success, 1 on failure; diagnostics go to stderr.
 */
static int ksm_read_sysfs(const char *file_path, unsigned long *val)
{
	int ret = 1;
	FILE *f = fopen(file_path, "r");

	if (!f) {
		fprintf(stderr, "f %s\n", file_path);
		perror("fopen");
		return ret;
	}

	if (fscanf(f, "%lu", val) == 1)
		ret = 0;
	else
		perror("fscanf");

	fclose(f);
	return ret;
}
+
/* Translate a protection string such as "rwx" into PROT_* mmap flags. */
static int str_to_prot(char *prot_str)
{
	static const struct {
		char ch;
		int flag;
	} prot_table[] = {
		{ 'r', PROT_READ },
		{ 'w', PROT_WRITE },
		{ 'x', PROT_EXEC },
	};
	int prot = 0;
	size_t i;

	for (i = 0; i < sizeof(prot_table) / sizeof(prot_table[0]); i++)
		if (strchr(prot_str, prot_table[i].ch))
			prot |= prot_table[i].flag;

	return prot;
}
+
+static void print_help(void)
+{
+       printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
+              "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
+
+       printf("Supported <test type>:\n"
+              " -M (page merging)\n"
+              " -Z (zero pages merging)\n"
+              " -N (merging of pages in different NUMA nodes)\n"
+              " -U (page unmerging)\n"
+              " -P evaluate merging time and speed.\n"
+              "    For this test, the size of duplicated memory area (in MiB)\n"
+              "    must be provided using -s option\n"
+              " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
+              "    For this test, the size of duplicated memory area (in MiB)\n"
+              "    must be provided using -s option\n"
+              " -D evaluate unmerging time and speed when disabling KSM.\n"
+              "    For this test, the size of duplicated memory area (in MiB)\n"
+              "    must be provided using -s option\n"
+              " -C evaluate the time required to break COW of merged pages.\n\n");
+
+       printf(" -a: specify the access protections of pages.\n"
+              "     <prot> must be of the form [rwx].\n"
+              "     Default: %s\n", KSM_PROT_STR_DEFAULT);
+       printf(" -p: specify the number of pages to test.\n"
+              "     Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
+       printf(" -l: limit the maximum running time (in seconds) for a test.\n"
+              "     Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
+       printf(" -z: change use_zero_pages tunable\n"
+              "     Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
+       printf(" -m: change merge_across_nodes tunable\n"
+              "     Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
+       printf(" -s: the size of duplicated memory area (in MiB)\n");
+
+       exit(0);
+}
+
/*
 * mmap an anonymous region of @map_size bytes (hint @ptr, flags
 * @mapping), fill it with @data, then apply the final protection @prot.
 * The region is mapped PROT_WRITE first so it can be initialized even
 * when @prot is read-only.  Returns the mapping, or NULL on failure.
 */
static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
{
	void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);

	/* mmap() signals failure with MAP_FAILED ((void *)-1), not NULL. */
	if (map_ptr == MAP_FAILED) {
		perror("mmap");
		return NULL;
	}
	memset(map_ptr, data, map_size);
	if (mprotect(map_ptr, map_size, prot)) {
		perror("mprotect");
		munmap(map_ptr, map_size);
		return NULL;
	}

	return map_ptr;
}
+
+static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
+{
+       struct timespec cur_time;
+       unsigned long cur_scan, init_scan;
+
+       if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
+               return 1;
+       cur_scan = init_scan;
+
+       while (cur_scan < init_scan + scan_count) {
+               if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
+                       return 1;
+               if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
+                       perror("clock_gettime");
+                       return 1;
+               }
+               if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
+                       printf("Scan time limit exceeded\n");
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
+{
+       if (madvise(addr, size, MADV_MERGEABLE)) {
+               perror("madvise");
+               return 1;
+       }
+       if (ksm_write_sysfs(KSM_FP("run"), 1))
+               return 1;
+
+       /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
+       if (ksm_do_scan(2, start_time, timeout))
+               return 1;
+
+       return 0;
+}
+
+static int ksm_unmerge_pages(void *addr, size_t size,
+                            struct timespec start_time, int timeout)
+{
+       if (madvise(addr, size, MADV_UNMERGEABLE)) {
+               perror("madvise");
+               return 1;
+       }
+       return 0;
+}
+
/*
 * Compare the KSM sysfs counters against what merging @dupl_page_count
 * identical pages should have produced.  Returns true on a match, false
 * on a mismatch or when reading the counters fails.
 */
static bool assert_ksm_pages_count(long dupl_page_count)
{
	unsigned long max_page_sharing, pages_sharing, pages_shared;

	if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
	    ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
	    ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
		return false;

	/*
	 * Since there must be at least 2 pages for merging and 1 page can be
	 * shared with the limited number of pages (max_page_sharing), sometimes
	 * there are 'leftover' pages that cannot be merged. For example, if there
	 * are 11 pages and max_page_sharing = 10, then only 10 pages will be
	 * merged and the 11th page won't be affected. As a result, when the number
	 * of duplicate pages is divided by max_page_sharing and the remainder is 1,
	 * pages_shared and pages_sharing values will be equal between dupl_page_count
	 * and dupl_page_count - 1.
	 */
	if (dupl_page_count % max_page_sharing <= 1)
		return pages_shared == dupl_page_count / max_page_sharing &&
		       pages_sharing == pages_shared * (max_page_sharing - 1);

	return pages_shared == (dupl_page_count / max_page_sharing + 1) &&
	       pages_sharing == dupl_page_count - pages_shared;
}
+
+static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
+{
+       if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
+           numa_available() ? 0 :
+               ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
+           ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
+           ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
+           ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
+           ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+                          &ksm_sysfs->stable_node_chains_prune_millisecs) ||
+           ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
+               return 1;
+
+       return 0;
+}
+
+static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
+{
+       if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
+           numa_available() ? 0 :
+               ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
+           ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
+           ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
+           ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
+           ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
+                           ksm_sysfs->stable_node_chains_prune_millisecs) ||
+           ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
+               return 1;
+
+       return 0;
+}
+
+static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
+{
+       void *map_ptr;
+       struct timespec start_time;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               return KSFT_FAIL;
+       }
+
+       /* fill pages with the same data and merge them */
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+       if (!map_ptr)
+               return KSFT_FAIL;
+
+       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+               goto err_out;
+
+       /* verify that the right number of pages are merged */
+       if (assert_ksm_pages_count(page_count)) {
+               printf("OK\n");
+               munmap(map_ptr, page_size * page_count);
+               return KSFT_PASS;
+       }
+
+err_out:
+       printf("Not OK\n");
+       munmap(map_ptr, page_size * page_count);
+       return KSFT_FAIL;
+}
+
/*
 * Merge two identical pages, then modify one byte in each and verify
 * that KSM automatically unmerges them (zero merged pages remain after
 * one more scan).  Returns KSFT_PASS or KSFT_FAIL.
 */
static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
{
	void *map_ptr;
	struct timespec start_time;
	int page_count = 2;

	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
		perror("clock_gettime");
		return KSFT_FAIL;
	}

	/* fill pages with the same data and merge them */
	map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
	if (!map_ptr)
		return KSFT_FAIL;

	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
		goto err_out;

	/* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
	memset(map_ptr, '-', 1);
	memset(map_ptr + page_size, '+', 1);

	/* get at least 1 scan, so KSM can detect that the pages were modified */
	if (ksm_do_scan(1, start_time, timeout))
		goto err_out;

	/* check that unmerging was successful and 0 pages are currently merged */
	if (assert_ksm_pages_count(0)) {
		printf("OK\n");
		munmap(map_ptr, page_size * page_count);
		return KSFT_PASS;
	}

err_out:
	printf("Not OK\n");
	munmap(map_ptr, page_size * page_count);
	return KSFT_FAIL;
}
+
/*
 * Merge @page_count zero-filled pages and check the outcome against the
 * use_zero_pages tunable: when enabled they merge with the kernel zero
 * page (0 counted by KSM), otherwise with each other (page_count).
 * Returns KSFT_PASS or KSFT_FAIL.
 */
static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
				     bool use_zero_pages, size_t page_size)
{
	void *map_ptr;
	struct timespec start_time;

	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
		perror("clock_gettime");
		return KSFT_FAIL;
	}

	if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
		return KSFT_FAIL;

	/* fill pages with zero and try to merge them */
	map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
	if (!map_ptr)
		return KSFT_FAIL;

	if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
		goto err_out;

	/*
	 * verify that the right number of pages are merged:
	 * 1) if use_zero_pages is set to 1, empty pages are merged
	 *    with the kernel zero page instead of with each other;
	 * 2) if use_zero_pages is set to 0, empty pages are not treated specially
	 *    and merged as usual.
	 */
	if (use_zero_pages && !assert_ksm_pages_count(0))
		goto err_out;
	else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
		goto err_out;

	printf("OK\n");
	munmap(map_ptr, page_size * page_count);
	return KSFT_PASS;

err_out:
	printf("Not OK\n");
	munmap(map_ptr, page_size * page_count);
	return KSFT_FAIL;
}
+
+static int get_next_mem_node(int node)
+{
+
+       long node_size;
+       int mem_node = 0;
+       int i, max_node = numa_max_node();
+
+       for (i = node + 1; i <= max_node + node; i++) {
+               mem_node = i % (max_node + 1);
+               node_size = numa_node_size(mem_node, NULL);
+               if (node_size > 0)
+                       break;
+       }
+       return mem_node;
+}
+
/* Return the first node with memory, scanning upward from node 0. */
static int get_first_mem_node(void)
{
	const int last_node = numa_max_node();

	/* Wrapping past the last node makes the search start at node 0. */
	return get_next_mem_node(last_node);
}
+
/*
 * Allocate one page on each of two different NUMA nodes, fill them with
 * identical data, and verify they merge only when merge_across_nodes is
 * enabled.  Returns KSFT_PASS, KSFT_FAIL, or KSFT_SKIP when fewer than
 * two NUMA nodes are usable.
 */
static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
				size_t page_size)
{
	void *numa1_map_ptr, *numa2_map_ptr;
	struct timespec start_time;
	int page_count = 2;
	int first_node;

	if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
		perror("clock_gettime");
		return KSFT_FAIL;
	}

	if (numa_available() < 0) {
		perror("NUMA support not enabled");
		return KSFT_SKIP;
	}
	if (numa_num_configured_nodes() <= 1) {
		printf("At least 2 NUMA nodes must be available\n");
		return KSFT_SKIP;
	}
	if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
		return KSFT_FAIL;

	/* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
	first_node = get_first_mem_node();
	numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
	numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
	/* NOTE(review): if only one allocation succeeds, it is leaked here. */
	if (!numa1_map_ptr || !numa2_map_ptr) {
		perror("numa_alloc_onnode");
		return KSFT_FAIL;
	}

	memset(numa1_map_ptr, '*', page_size);
	memset(numa2_map_ptr, '*', page_size);

	/* try to merge the pages */
	if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
	    ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
		goto err_out;

	/*
	 * verify that the right number of pages are merged:
	 * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
	 * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
	 *    only 1 unique page in each node and they can't be shared.
	 */
	if (merge_across_nodes && !assert_ksm_pages_count(page_count))
		goto err_out;
	else if (!merge_across_nodes && !assert_ksm_pages_count(0))
		goto err_out;

	numa_free(numa1_map_ptr, page_size);
	numa_free(numa2_map_ptr, page_size);
	printf("OK\n");
	return KSFT_PASS;

err_out:
	numa_free(numa1_map_ptr, page_size);
	numa_free(numa2_map_ptr, page_size);
	printf("Not OK\n");
	return KSFT_FAIL;
}
+
+/*
+ * Measure how long KSM needs to merge a map_size-MiB anonymous area that is
+ * backed by transparent huge pages, and print size/time/throughput.
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
+{
+       void *map_ptr, *map_ptr_orig;
+       struct timespec start_time, end_time;
+       unsigned long scan_time_ns;
+       int pagemap_fd, n_normal_pages, n_huge_pages;
+
+       map_size *= MB;
+       size_t len = map_size;
+
+       /* Round the working length down to whole huge pages. */
+       len -= len % HPAGE_SIZE;
+       map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+       /* Check for failure before doing pointer arithmetic on the result. */
+       if (map_ptr_orig == MAP_FAILED)
+               err(2, "initial mmap");
+
+       /* Align the working pointer up to the next huge-page boundary. */
+       map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
+
+       /*
+        * NOTE(review): advising len + HPAGE_SIZE from the aligned pointer can
+        * reach up to one huge page past the mapping when map_ptr_orig happens
+        * to be huge-page aligned — confirm whether this should be just "len".
+        */
+       if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
+               err(2, "MADV_HUGEPAGE");
+
+       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+       if (pagemap_fd < 0)
+               err(2, "open pagemap");
+
+       /* Fault in the area huge page by huge page and count what we got. */
+       n_normal_pages = 0;
+       n_huge_pages = 0;
+       for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
+               if (allocate_transhuge(p, pagemap_fd) < 0)
+                       n_normal_pages++;
+               else
+                       n_huge_pages++;
+       }
+       printf("Number of normal pages:    %d\n", n_normal_pages);
+       printf("Number of huge pages:    %d\n", n_huge_pages);
+
+       /* Fill with identical content so KSM can merge everything. */
+       memset(map_ptr, '*', len);
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+               goto err_out;
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+
+       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Total size:    %lu MiB\n", map_size / MB);
+       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+              scan_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+                                              ((double)scan_time_ns / NSEC_PER_SEC));
+
+       close(pagemap_fd);      /* was leaked on both exit paths */
+       munmap(map_ptr_orig, len + HPAGE_SIZE);
+       return KSFT_PASS;
+
+err_out:
+       printf("Not OK\n");
+       close(pagemap_fd);
+       munmap(map_ptr_orig, len + HPAGE_SIZE);
+       return KSFT_FAIL;
+}
+
+/*
+ * Time how long KSM takes to merge a freshly allocated map_size-MiB area
+ * whose pages are all identical, and print size/time/throughput.
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+       void *map_ptr;
+       struct timespec start_time, end_time;
+       unsigned long scan_time_ns;
+
+       map_size *= MB;
+
+       /* allocate_memory() fills the area with '*', so every page is identical. */
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+       if (!map_ptr)
+               return KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+               goto err_out;
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+
+       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Total size:    %lu MiB\n", map_size / MB);
+       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+              scan_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+                                              ((double)scan_time_ns / NSEC_PER_SEC));
+
+       munmap(map_ptr, map_size);
+       return KSFT_PASS;
+
+err_out:
+       printf("Not OK\n");
+       munmap(map_ptr, map_size);
+       return KSFT_FAIL;
+}
+
+/*
+ * Time how long KSM takes to unmerge a map_size-MiB area that was first
+ * merged (the merge phase is not timed: start_time is re-read before the
+ * unmerge).  Prints size/time/throughput; returns KSFT_PASS or KSFT_FAIL.
+ */
+static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
+{
+       void *map_ptr;
+       struct timespec start_time, end_time;
+       unsigned long scan_time_ns;
+
+       map_size *= MB;
+
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
+       if (!map_ptr)
+               return KSFT_FAIL;
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       /* First merge everything; only used as the deadline for the helper. */
+       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
+               goto err_out;
+
+       /* Restart the clock: only the unmerge below is measured. */
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
+               goto err_out;
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+
+       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Total size:    %lu MiB\n", map_size / MB);
+       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
+              scan_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
+                                              ((double)scan_time_ns / NSEC_PER_SEC));
+
+       munmap(map_ptr, map_size);
+       return KSFT_PASS;
+
+err_out:
+       printf("Not OK\n");
+       munmap(map_ptr, map_size);
+       return KSFT_FAIL;
+}
+
+/*
+ * Measure copy-on-write latency by writing one byte into every second page,
+ * first while the pages are unmerged, then again after KSM has merged 2000
+ * pairs of duplicate pages.  Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
+{
+       void *map_ptr;
+       struct timespec start_time, end_time;
+       unsigned long cow_time_ns;
+
+       /* page_count must be less than 2*page_size */
+       size_t page_count = 4000;
+
+       map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
+       if (!map_ptr)
+               return KSFT_FAIL;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;   /* was "return KSFT_FAIL": leaked map_ptr */
+       }
+       /* Touch every second page to trigger a COW fault on unmerged pages. */
+       for (size_t i = 0; i < page_count - 1; i = i + 2)
+               memset(map_ptr + page_size * i, '-', 1);
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;   /* was "return KSFT_FAIL": leaked map_ptr */
+       }
+
+       cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Total size:    %lu MiB\n\n", (page_size * page_count) / MB);
+       printf("Not merged pages:\n");
+       printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+              cow_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
+                                              ((double)cow_time_ns / NSEC_PER_SEC));
+
+       /* Create 2000 pairs of duplicate pages */
+       for (size_t i = 0; i < page_count - 1; i = i + 2) {
+               memset(map_ptr + page_size * i, '+', i / 2 + 1);
+               memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
+       }
+       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
+               goto err_out;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+       /* Same write pattern again, now breaking KSM-merged pages. */
+       for (size_t i = 0; i < page_count - 1; i = i + 2)
+               memset(map_ptr + page_size * i, '-', 1);
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
+               perror("clock_gettime");
+               goto err_out;
+       }
+
+       cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
+                      (end_time.tv_nsec - start_time.tv_nsec);
+
+       printf("Merged pages:\n");
+       printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
+              cow_time_ns % NSEC_PER_SEC);
+       printf("Average speed:  %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
+                                              ((double)cow_time_ns / NSEC_PER_SEC));
+
+       munmap(map_ptr, page_size * page_count);
+       return KSFT_PASS;
+
+err_out:
+       printf("Not OK\n");
+       munmap(map_ptr, page_size * page_count);
+       return KSFT_FAIL;
+}
+
+/*
+ * KSM selftest driver: parse options, save the current KSM sysfs tunables,
+ * configure KSM, run the selected sub-test and restore the tunables.
+ * Returns a KSFT_* exit code.
+ */
+int main(int argc, char *argv[])
+{
+       int ret, opt;
+       int prot = 0;
+       int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
+       long page_count = KSM_PAGE_COUNT_DEFAULT;
+       size_t page_size = sysconf(_SC_PAGESIZE);
+       struct ksm_sysfs ksm_sysfs_old;
+       int test_name = CHECK_KSM_MERGE;
+       bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
+       bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
+       long size_MB = 0;
+
+       while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
+               switch (opt) {
+               case 'a':
+                       prot = str_to_prot(optarg);
+                       break;
+               case 'p':
+                       page_count = atol(optarg);
+                       if (page_count <= 0) {
+                               printf("The number of pages must be greater than 0\n");
+                               return KSFT_FAIL;
+                       }
+                       break;
+               case 'l':
+                       ksm_scan_limit_sec = atoi(optarg);
+                       if (ksm_scan_limit_sec <= 0) {
+                               printf("Timeout value must be greater than 0\n");
+                               return KSFT_FAIL;
+                       }
+                       break;
+               case 'h':
+                       print_help();
+                       break;
+               case 'z':
+                       if (strcmp(optarg, "0") == 0)
+                               use_zero_pages = 0;
+                       else
+                               use_zero_pages = 1;
+                       break;
+               case 'm':
+                       if (strcmp(optarg, "0") == 0)
+                               merge_across_nodes = 0;
+                       else
+                               merge_across_nodes = 1;
+                       break;
+               case 's':
+                       size_MB = atoi(optarg);
+                       if (size_MB <= 0) {
+                               printf("Size must be greater than 0\n");
+                               return KSFT_FAIL;
+                       }
+                       /* fall through */
+               case 'M':
+                       break;
+               case 'U':
+                       test_name = CHECK_KSM_UNMERGE;
+                       break;
+               case 'Z':
+                       test_name = CHECK_KSM_ZERO_PAGE_MERGE;
+                       break;
+               case 'N':
+                       test_name = CHECK_KSM_NUMA_MERGE;
+                       break;
+               case 'P':
+                       test_name = KSM_MERGE_TIME;
+                       break;
+               case 'H':
+                       test_name = KSM_MERGE_TIME_HUGE_PAGES;
+                       break;
+               case 'D':
+                       test_name = KSM_UNMERGE_TIME;
+                       break;
+               case 'C':
+                       test_name = KSM_COW_TIME;
+                       break;
+               default:
+                       return KSFT_FAIL;
+               }
+       }
+
+       if (prot == 0)
+               prot = str_to_prot(KSM_PROT_STR_DEFAULT);
+
+       if (access(KSM_SYSFS_PATH, F_OK)) {
+               printf("Config KSM not enabled\n");
+               return KSFT_SKIP;
+       }
+
+       if (ksm_save_def(&ksm_sysfs_old)) {
+               printf("Cannot save default tunables\n");
+               return KSFT_FAIL;
+       }
+
+       /*
+        * Configure KSM.  The ternary must be parenthesized: without the
+        * parentheses the whole expression parsed as
+        * "(run || sleep || numa_available()) ? 0 : (...)", which silently
+        * skipped the pages_to_scan setup whenever NUMA was unavailable.
+        * numa_available() returns 0 when NUMA is usable, so only then do we
+        * touch merge_across_nodes.
+        */
+       if (ksm_write_sysfs(KSM_FP("run"), 2) ||
+           ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
+           (numa_available() ? 0 :
+               ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1)) ||
+           ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
+               return KSFT_FAIL;
+
+       switch (test_name) {
+       case CHECK_KSM_MERGE:
+               ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+                                     ksm_scan_limit_sec, page_size);
+               break;
+       case CHECK_KSM_UNMERGE:
+               ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                       page_size);
+               break;
+       case CHECK_KSM_ZERO_PAGE_MERGE:
+               ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
+                                               ksm_scan_limit_sec, use_zero_pages, page_size);
+               break;
+       case CHECK_KSM_NUMA_MERGE:
+               ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                          merge_across_nodes, page_size);
+               break;
+       case KSM_MERGE_TIME:
+               if (size_MB == 0) {
+                       printf("Option '-s' is required.\n");
+                       return KSFT_FAIL;
+               }
+               ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                    size_MB);
+               break;
+       case KSM_MERGE_TIME_HUGE_PAGES:
+               if (size_MB == 0) {
+                       printf("Option '-s' is required.\n");
+                       return KSFT_FAIL;
+               }
+               ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+                               ksm_scan_limit_sec, size_MB);
+               break;
+       case KSM_UNMERGE_TIME:
+               if (size_MB == 0) {
+                       printf("Option '-s' is required.\n");
+                       return KSFT_FAIL;
+               }
+               ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
+                                      ksm_scan_limit_sec, size_MB);
+               break;
+       case KSM_COW_TIME:
+               ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
+                                  page_size);
+               break;
+       }
+
+       if (ksm_restore(&ksm_sysfs_old)) {
+               printf("Cannot restore default tunables\n");
+               return KSFT_FAIL;
+       }
+
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c
new file mode 100644 (file)
index 0000000..262eae6
--- /dev/null
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+ *
+ * Copyright 2021, Red Hat, Inc.
+ *
+ * Author(s): David Hildenbrand <david@redhat.com>
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#ifndef MADV_POPULATE_READ
+#define MADV_POPULATE_READ     22
+#endif /* MADV_POPULATE_READ */
+#ifndef MADV_POPULATE_WRITE
+#define MADV_POPULATE_WRITE    23
+#endif /* MADV_POPULATE_WRITE */
+
+/*
+ * For now, we're using 2 MiB of private anonymous memory for all tests.
+ */
+#define SIZE (2 * 1024 * 1024)
+
+static size_t pagesize;
+
+/*
+ * Skip the whole test suite when the running kernel does not support
+ * MADV_POPULATE_READ / MADV_POPULATE_WRITE; exit hard on mmap failure.
+ */
+static void sense_support(void)
+{
+       char *addr;
+       int ret;
+
+       addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       /* mmap() reports failure with MAP_FAILED, never NULL. */
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+
+       ret = madvise(addr, pagesize, MADV_POPULATE_READ);
+       if (ret)
+               ksft_exit_skip("MADV_POPULATE_READ is not available\n");
+
+       ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
+       if (ret)
+               ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
+
+       munmap(addr, pagesize);
+}
+
+/*
+ * On a PROT_READ-only mapping, MADV_POPULATE_READ must succeed and
+ * MADV_POPULATE_WRITE must fail with EINVAL.
+ */
+static void test_prot_read(void)
+{
+       char *addr;
+       int ret;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+
+       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+       ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
+
+       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+       ksft_test_result(ret == -1 && errno == EINVAL,
+                        "MADV_POPULATE_WRITE with PROT_READ\n");
+
+       munmap(addr, SIZE);
+}
+
+/*
+ * On a PROT_WRITE-only mapping, MADV_POPULATE_READ must fail with EINVAL
+ * and MADV_POPULATE_WRITE must succeed.
+ */
+static void test_prot_write(void)
+{
+       char *addr;
+       int ret;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+
+       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+       ksft_test_result(ret == -1 && errno == EINVAL,
+                        "MADV_POPULATE_READ with PROT_WRITE\n");
+
+       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+       ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
+
+       munmap(addr, SIZE);
+}
+
+/*
+ * Both populate variants must fail with ENOMEM when the requested range
+ * contains an unmapped hole — in the middle, at the end, or at the start
+ * of the range.  A one-page hole is punched at offset pagesize.
+ */
+static void test_holes(void)
+{
+       char *addr;
+       int ret;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+       ret = munmap(addr + pagesize, pagesize);
+       if (ret)
+               ksft_exit_fail_msg("munmap failed\n");
+
+       /* Hole in the middle */
+       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+       ksft_test_result(ret == -1 && errno == ENOMEM,
+                        "MADV_POPULATE_READ with holes in the middle\n");
+       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+       ksft_test_result(ret == -1 && errno == ENOMEM,
+                        "MADV_POPULATE_WRITE with holes in the middle\n");
+
+       /* Hole at end */
+       ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
+       ksft_test_result(ret == -1 && errno == ENOMEM,
+                        "MADV_POPULATE_READ with holes at the end\n");
+       ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
+       ksft_test_result(ret == -1 && errno == ENOMEM,
+                        "MADV_POPULATE_WRITE with holes at the end\n");
+
+       /* Hole at beginning */
+       ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
+       ksft_test_result(ret == -1 && errno == ENOMEM,
+                        "MADV_POPULATE_READ with holes at the beginning\n");
+       ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
+       ksft_test_result(ret == -1 && errno == ENOMEM,
+                        "MADV_POPULATE_WRITE with holes at the beginning\n");
+
+       munmap(addr, SIZE);
+}
+
+/*
+ * Return true iff every page in [start, start + size) is populated
+ * according to /proc/self/pagemap; exits the test on open failure.
+ */
+static bool range_is_populated(char *start, ssize_t size)
+{
+       int fd = open("/proc/self/pagemap", O_RDONLY);
+       bool ret = true;
+
+       if (fd < 0)
+               ksft_exit_fail_msg("opening pagemap failed\n");
+       for (; size > 0 && ret; size -= pagesize, start += pagesize)
+               if (!pagemap_is_populated(fd, start))
+                       ret = false;
+       close(fd);
+       return ret;
+}
+
+/*
+ * Return true iff no page in [start, start + size) is populated
+ * according to /proc/self/pagemap; exits the test on open failure.
+ */
+static bool range_is_not_populated(char *start, ssize_t size)
+{
+       int fd = open("/proc/self/pagemap", O_RDONLY);
+       bool ret = true;
+
+       if (fd < 0)
+               ksft_exit_fail_msg("opening pagemap failed\n");
+       for (; size > 0 && ret; size -= pagesize, start += pagesize)
+               if (pagemap_is_populated(fd, start))
+                       ret = false;
+       close(fd);
+       return ret;
+}
+
+/*
+ * A fresh anonymous mapping starts unpopulated; after MADV_POPULATE_READ
+ * the whole range must be populated.
+ */
+static void test_populate_read(void)
+{
+       char *addr;
+       int ret;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+       ksft_test_result(range_is_not_populated(addr, SIZE),
+                        "range initially not populated\n");
+
+       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+       ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+       ksft_test_result(range_is_populated(addr, SIZE),
+                        "range is populated\n");
+
+       munmap(addr, SIZE);
+}
+
+/*
+ * A fresh anonymous mapping starts unpopulated; after MADV_POPULATE_WRITE
+ * the whole range must be populated.
+ */
+static void test_populate_write(void)
+{
+       char *addr;
+       int ret;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+       ksft_test_result(range_is_not_populated(addr, SIZE),
+                        "range initially not populated\n");
+
+       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+       ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+       ksft_test_result(range_is_populated(addr, SIZE),
+                        "range is populated\n");
+
+       munmap(addr, SIZE);
+}
+
+/*
+ * Return true iff every page in [start, start + size) has the soft-dirty
+ * bit set in /proc/self/pagemap; exits the test on open failure.
+ */
+static bool range_is_softdirty(char *start, ssize_t size)
+{
+       int fd = open("/proc/self/pagemap", O_RDONLY);
+       bool ret = true;
+
+       if (fd < 0)
+               ksft_exit_fail_msg("opening pagemap failed\n");
+       for (; size > 0 && ret; size -= pagesize, start += pagesize)
+               if (!pagemap_is_softdirty(fd, start))
+                       ret = false;
+       close(fd);
+       return ret;
+}
+
+/*
+ * Return true iff no page in [start, start + size) has the soft-dirty
+ * bit set in /proc/self/pagemap; exits the test on open failure.
+ */
+static bool range_is_not_softdirty(char *start, ssize_t size)
+{
+       int fd = open("/proc/self/pagemap", O_RDONLY);
+       bool ret = true;
+
+       if (fd < 0)
+               ksft_exit_fail_msg("opening pagemap failed\n");
+       for (; size > 0 && ret; size -= pagesize, start += pagesize)
+               if (pagemap_is_softdirty(fd, start))
+                       ret = false;
+       close(fd);
+       return ret;
+}
+
+/*
+ * Soft-dirty interaction: after clearing soft-dirty bits,
+ * MADV_POPULATE_READ must leave the range clean (it only reads), while
+ * MADV_POPULATE_WRITE must mark the whole range soft-dirty.
+ */
+static void test_softdirty(void)
+{
+       char *addr;
+       int ret;
+
+       ksft_print_msg("[RUN] %s\n", __func__);
+
+       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+       if (addr == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+
+       /* Clear any softdirty bits. */
+       clear_softdirty();
+       ksft_test_result(range_is_not_softdirty(addr, SIZE),
+                        "range is not softdirty\n");
+
+       /* Populating via READ must NOT set softdirty (the check below agrees). */
+       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+       ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+       ksft_test_result(range_is_not_softdirty(addr, SIZE),
+                        "range is not softdirty\n");
+
+       /* Populating WRITE should set softdirty. */
+       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+       ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+       ksft_test_result(range_is_softdirty(addr, SIZE),
+                        "range is softdirty\n");
+
+       munmap(addr, SIZE);
+}
+
+/*
+ * madv_populate test entry point: probe for MADV_POPULATE_* support
+ * (skips the run when absent), then execute the 21 planned sub-checks.
+ */
+int main(int argc, char **argv)
+{
+       int err;
+
+       pagesize = getpagesize();
+
+       ksft_print_header();
+       ksft_set_plan(21);
+
+       sense_support();
+       test_prot_read();
+       test_prot_write();
+       test_holes();
+       test_populate_read();
+       test_populate_write();
+       test_softdirty();
+
+       err = ksft_get_fail_cnt();
+       if (err)
+               ksft_exit_fail_msg("%d out of %d tests failed\n",
+                                  err, ksft_test_num());
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/map_fixed_noreplace.c b/tools/testing/selftests/mm/map_fixed_noreplace.c
new file mode 100644 (file)
index 0000000..eed4432
--- /dev/null
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test that MAP_FIXED_NOREPLACE works.
+ *
+ * Copyright 2018, Jann Horn <jannh@google.com>
+ * Copyright 2018, Michael Ellerman, IBM Corporation.
+ */
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
+
+/* Dump this process's VMA layout to stdout for failure diagnostics. */
+static void dump_maps(void)
+{
+       char cmd[32];
+
+       snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+       system(cmd);
+}
+
+/*
+ * Probe for a free virtual address range of the given size by mapping and
+ * immediately unmapping it.  Returns the address, or 0 on failure (the
+ * range may be taken by someone else before the caller reuses it).
+ */
+static unsigned long find_base_addr(unsigned long size)
+{
+       void *addr;
+       unsigned long flags;
+
+       flags = MAP_PRIVATE | MAP_ANONYMOUS;
+       addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+       if (addr == MAP_FAILED) {
+               printf("Error: couldn't map the space we need for the test\n");
+               return 0;
+       }
+
+       if (munmap(addr, size) != 0) {
+               /* was a copy-pasted "couldn't map" message — this is the unmap */
+               printf("Error: couldn't unmap the space we need for the test\n");
+               return 0;
+       }
+       return (unsigned long)addr;
+}
+
+/*
+ * MAP_FIXED_NOREPLACE test: probe a free 5-page base address, keep pages
+ * +1..+3 mapped, then verify that mmap(MAP_FIXED_NOREPLACE) fails for any
+ * overlapping request and succeeds for the free pages adjacent to it.
+ * Returns 0 on success, 1 on failure.
+ */
+int main(void)
+{
+       unsigned long base_addr;
+       unsigned long flags, addr, size, page_size;
+       char *p;
+
+       page_size = sysconf(_SC_PAGE_SIZE);
+
+       //let's find a base addr that is free before we start the tests
+       size = 5 * page_size;
+       base_addr = find_base_addr(size);
+       if (!base_addr) {
+               printf("Error: couldn't map the space we need for the test\n");
+               return 1;
+       }
+
+       flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+
+       // Check we can map all the areas we need below
+       errno = 0;
+       addr = base_addr;
+       size = 5 * page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+
+       /* %m prints strerror(errno) (glibc extension) */
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p == MAP_FAILED) {
+               dump_maps();
+               printf("Error: couldn't map the space we need for the test\n");
+               return 1;
+       }
+
+       errno = 0;
+       if (munmap((void *)addr, 5 * page_size) != 0) {
+               dump_maps();
+               printf("Error: munmap failed!?\n");
+               return 1;
+       }
+       printf("unmap() successful\n");
+
+       /* Map only the middle three pages; +0 and +4 stay free. */
+       errno = 0;
+       addr = base_addr + page_size;
+       size = 3 * page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p == MAP_FAILED) {
+               dump_maps();
+               printf("Error: first mmap() failed unexpectedly\n");
+               return 1;
+       }
+
+       /*
+        * Exact same mapping again:
+        *   base |  free  | new
+        *     +1 | mapped | new
+        *     +2 | mapped | new
+        *     +3 | mapped | new
+        *     +4 |  free  | new
+        */
+       errno = 0;
+       addr = base_addr;
+       size = 5 * page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p != MAP_FAILED) {
+               dump_maps();
+               printf("Error:1: mmap() succeeded when it shouldn't have\n");
+               return 1;
+       }
+
+       /*
+        * Second mapping contained within first:
+        *
+        *   base |  free  |
+        *     +1 | mapped |
+        *     +2 | mapped | new
+        *     +3 | mapped |
+        *     +4 |  free  |
+        */
+       errno = 0;
+       addr = base_addr + (2 * page_size);
+       size = page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p != MAP_FAILED) {
+               dump_maps();
+               printf("Error:2: mmap() succeeded when it shouldn't have\n");
+               return 1;
+       }
+
+       /*
+        * Overlap end of existing mapping:
+        *   base |  free  |
+        *     +1 | mapped |
+        *     +2 | mapped |
+        *     +3 | mapped | new
+        *     +4 |  free  | new
+        */
+       errno = 0;
+       addr = base_addr + (3 * page_size);
+       size = 2 * page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p != MAP_FAILED) {
+               dump_maps();
+               printf("Error:3: mmap() succeeded when it shouldn't have\n");
+               return 1;
+       }
+
+       /*
+        * Overlap start of existing mapping:
+        *   base |  free  | new
+        *     +1 | mapped | new
+        *     +2 | mapped |
+        *     +3 | mapped |
+        *     +4 |  free  |
+        */
+       errno = 0;
+       addr = base_addr;
+       size = 2 * page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p != MAP_FAILED) {
+               dump_maps();
+               printf("Error:4: mmap() succeeded when it shouldn't have\n");
+               return 1;
+       }
+
+       /*
+        * Adjacent to start of existing mapping:
+        *   base |  free  | new
+        *     +1 | mapped |
+        *     +2 | mapped |
+        *     +3 | mapped |
+        *     +4 |  free  |
+        */
+       errno = 0;
+       addr = base_addr;
+       size = page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p == MAP_FAILED) {
+               dump_maps();
+               printf("Error:5: mmap() failed when it shouldn't have\n");
+               return 1;
+       }
+
+       /*
+        * Adjacent to end of existing mapping:
+        *   base |  free  |
+        *     +1 | mapped |
+        *     +2 | mapped |
+        *     +3 | mapped |
+        *     +4 |  free  |  new
+        */
+       errno = 0;
+       addr = base_addr + (4 * page_size);
+       size = page_size;
+       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+       if (p == MAP_FAILED) {
+               dump_maps();
+               printf("Error:6: mmap() failed when it shouldn't have\n");
+               return 1;
+       }
+
+       /* Tear down everything we mapped above in one go. */
+       addr = base_addr;
+       size = 5 * page_size;
+       if (munmap((void *)addr, size) != 0) {
+               dump_maps();
+               printf("Error: munmap failed!?\n");
+               return 1;
+       }
+       printf("unmap() successful\n");
+
+       printf("OK\n");
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/map_hugetlb.c b/tools/testing/selftests/mm/map_hugetlb.c
new file mode 100644 (file)
index 0000000..312889e
--- /dev/null
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag.  Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified.  Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+#ifndef MAP_HUGE_MASK
+#define MAP_HUGE_MASK 0x3f
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+/* Print the first word of the mapping as a quick sanity probe. */
+static void check_bytes(char *addr)
+{
+       printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+/* Fill the region with a deterministic pattern: byte i gets (char)i. */
+static void write_bytes(char *addr, size_t length)
+{
+       unsigned long i;
+
+       for (i = 0; i < length; i++)
+               *(addr + i) = (char)i;
+}
+
+/*
+ * Verify the pattern written by write_bytes(); returns 0 on success,
+ * 1 (after printing the offset) on the first mismatch.
+ */
+static int read_bytes(char *addr, size_t length)
+{
+       unsigned long i;
+
+       check_bytes(addr);
+       for (i = 0; i < length; i++)
+               if (*(addr + i) != (char)i) {
+                       printf("Mismatch at %lu\n", i);
+                       return 1;
+               }
+       return 0;
+}
+
+/*
+ * Map a hugetlb region (default 256 MB; argv[1] = size in MB, argv[2] =
+ * page-size shift for MAP_HUGE_SHIFT), write a pattern and verify it.
+ * Returns read_bytes()'s result: 0 on success, 1 on mismatch.
+ */
+int main(int argc, char **argv)
+{
+       void *addr;
+       int ret;
+       size_t length = LENGTH;
+       int flags = FLAGS;
+       int shift = 0;
+
+       if (argc > 1)
+               length = atol(argv[1]) << 20;   /* MB -> bytes */
+       if (argc > 2) {
+               shift = atoi(argv[2]);
+               if (shift)
+                       flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+       }
+
+       if (shift)
+               printf("%u kB hugepages\n", 1 << (shift - 10));
+       else
+               printf("Default size hugepages\n");
+       printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
+
+       addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
+       if (addr == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       printf("Returned address is %p\n", addr);
+       check_bytes(addr);
+       write_bytes(addr, length);
+       ret = read_bytes(addr, length);
+
+       /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+       if (munmap(addr, length)) {
+               perror("munmap");
+               exit(1);
+       }
+
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c
new file mode 100644 (file)
index 0000000..6b8aeaa
--- /dev/null
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ                4096
+#endif
+
+#define BUG_ON(condition, description)                                 \
+       do {                                                            \
+               if (condition) {                                        \
+                       fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+                               __LINE__, (description), strerror(errno)); \
+                       exit(1);                                        \
+               }                                                       \
+       } while (0)
+
+/*
+ * Parent side of the COW check: wait for the child to signal over @sock
+ * that its private mapping is populated, overwrite the first word via the
+ * shared mapping @smap (and msync it to the file), then tell the child to
+ * re-verify its view.  Returns the child's exit status.
+ */
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+       int status, ret;
+
+       ret = read(sock, &status, sizeof(int));
+       BUG_ON(ret <= 0, "read(sock)");
+
+       *smap = 0x22222BAD;
+       ret = msync(smap, MMAP_SZ, MS_SYNC);
+       BUG_ON(ret, "msync()");
+
+       ret = write(sock, &status, sizeof(int));
+       BUG_ON(ret <= 0, "write(sock)");
+
+       waitpid(child, &status, 0);
+       BUG_ON(!WIFEXITED(status), "child in unexpected state");
+
+       return WEXITSTATUS(status);
+}
+
+/*
+ * Child side: map the backing file @fd as MAP_PRIVATE | MAP_POPULATE,
+ * confirm the pre-faulted page holds the parent's pattern, then (after
+ * the parent rewrites the file) confirm the private copy was NOT changed,
+ * i.e. MAP_POPULATE faulted in a COW copy rather than a shared page.
+ */
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+       int ret, buf = 0;
+
+       smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_POPULATE, fd, 0);
+       BUG_ON(smap == MAP_FAILED, "mmap()");
+
+       BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+       ret = write(sock, &buf, sizeof(int));
+       BUG_ON(ret <= 0, "write(sock)");
+
+       ret = read(sock, &buf, sizeof(int));
+       BUG_ON(ret <= 0, "read(sock)");
+
+       BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+       BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       int sock[2], child, ret;
+       FILE *ftmp;
+       unsigned long *smap;
+
+       ftmp = tmpfile();
+       BUG_ON(ftmp == 0, "tmpfile()");
+
+       ret = ftruncate(fileno(ftmp), MMAP_SZ);
+       BUG_ON(ret, "ftruncate()");
+
+       smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fileno(ftmp), 0);
+       BUG_ON(smap == MAP_FAILED, "mmap()");
+
+       *smap = 0xdeadbabe;
+       /* Probably unnecessary, but let it be. */
+       ret = msync(smap, MMAP_SZ, MS_SYNC);
+       BUG_ON(ret, "msync()");
+
+       ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+       BUG_ON(ret, "socketpair()");
+
+       child = fork();
+       BUG_ON(child == -1, "fork()");
+
+       if (child) {
+               ret = close(sock[0]);
+               BUG_ON(ret, "close()");
+
+               return parent_f(sock[1], smap, child);
+       }
+
+       ret = close(sock[1]);
+       BUG_ON(ret, "close()");
+
+       return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/mm/memfd_secret.c b/tools/testing/selftests/mm/memfd_secret.c
new file mode 100644 (file)
index 0000000..957b9e1
--- /dev/null
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2021
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#define _GNU_SOURCE
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "../kselftest.h"
+
+#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__)
+#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
+#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
+
+#ifdef __NR_memfd_secret
+
+#define PATTERN        0x55
+
+static const int prot = PROT_READ | PROT_WRITE;
+static const int mode = MAP_SHARED;
+
+static unsigned long page_size;
+static unsigned long mlock_limit_cur;
+static unsigned long mlock_limit_max;
+
+/* Raw syscall wrapper: libc does not expose memfd_secret(2). */
+static int memfd_secret(unsigned int flags)
+{
+       return syscall(__NR_memfd_secret, flags);
+}
+
+/*
+ * Ordinary file I/O (read/write/pread/pwrite) on a secretmem fd must be
+ * rejected by the kernel; any of the four calls succeeding is a failure.
+ */
+static void test_file_apis(int fd)
+{
+       char buf[64];
+
+       if ((read(fd, buf, sizeof(buf)) >= 0) ||
+           (write(fd, buf, sizeof(buf)) >= 0) ||
+           (pread(fd, buf, sizeof(buf), 0) >= 0) ||
+           (pwrite(fd, buf, sizeof(buf), 0) >= 0))
+               fail("unexpected file IO\n");
+       else
+               pass("file IO is blocked as expected\n");
+}
+
+/*
+ * Secretmem pages are unevictable and charged against RLIMIT_MEMLOCK.
+ * Mapping up to the current (soft) limit must succeed; mapping twice the
+ * hard limit must fail, since prepare() dropped CAP_IPC_LOCK.
+ */
+static void test_mlock_limit(int fd)
+{
+       size_t len;
+       char *mem;
+
+       len = mlock_limit_cur;
+       mem = mmap(NULL, len, prot, mode, fd, 0);
+       if (mem == MAP_FAILED) {
+               fail("unable to mmap secret memory\n");
+               return;
+       }
+       munmap(mem, len);
+
+       len = mlock_limit_max * 2;
+       mem = mmap(NULL, len, prot, mode, fd, 0);
+       if (mem != MAP_FAILED) {
+               fail("unexpected mlock limit violation\n");
+               munmap(mem, len);
+               return;
+       }
+
+       pass("mlock limit is respected\n");
+}
+
+/*
+ * Child side of the process_vm_readv() test: receive the parent's
+ * secretmem address over the pipe, then try to read 64 bytes of it
+ * remotely.  A successful process_vm_readv() would mean secret memory
+ * leaked, so the child exits KSFT_FAIL in that case and KSFT_PASS when
+ * the remote read is rejected (KSFT_SKIP if the syscall is missing).
+ */
+static void try_process_vm_read(int fd, int pipefd[2])
+{
+       struct iovec liov, riov;
+       char buf[64];
+       char *mem;
+
+       if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+               /* this end reads; the message previously said "pipe write" */
+               fail("pipe read: %s\n", strerror(errno));
+               exit(KSFT_FAIL);
+       }
+
+       liov.iov_len = riov.iov_len = sizeof(buf);
+       liov.iov_base = buf;
+       riov.iov_base = mem;
+
+       if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) {
+               if (errno == ENOSYS)
+                       exit(KSFT_SKIP);
+               exit(KSFT_PASS);
+       }
+
+       exit(KSFT_FAIL);
+}
+
+/*
+ * Child side of the ptrace test: receive the parent's secretmem address,
+ * attach with ptrace and attempt PTRACE_PEEKDATA on it.  A non-zero peek
+ * result means the access was blocked (pass); a successful peek means
+ * secret memory was readable via ptrace (fail).
+ */
+static void try_ptrace(int fd, int pipefd[2])
+{
+       pid_t ppid = getppid();
+       int status;
+       char *mem;
+       long ret;
+
+       if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
+               /* this end reads; the message previously said "pipe write" */
+               perror("pipe read");
+               exit(KSFT_FAIL);
+       }
+
+       ret = ptrace(PTRACE_ATTACH, ppid, 0, 0);
+       if (ret) {
+               perror("ptrace_attach");
+               exit(KSFT_FAIL);
+       }
+
+       ret = waitpid(ppid, &status, WUNTRACED);
+       if ((ret != ppid) || !(WIFSTOPPED(status))) {
+               fprintf(stderr, "weird waitppid result %ld stat %x\n",
+                       ret, status);
+               exit(KSFT_FAIL);
+       }
+
+       if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0))
+               exit(KSFT_PASS);
+
+       exit(KSFT_FAIL);
+}
+
+/*
+ * Reap the child and translate its exit into a ksft result: KSFT_SKIP
+ * means the attack API is unavailable; KSFT_PASS or death-by-signal
+ * means the access was blocked; anything else means secret memory was
+ * accessible from outside the process, which is the failure case.
+ */
+static void check_child_status(pid_t pid, const char *name)
+{
+       int status;
+
+       waitpid(pid, &status, 0);
+
+       if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) {
+               skip("%s is not supported\n", name);
+               return;
+       }
+
+       if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) ||
+           WIFSIGNALED(status)) {
+               pass("%s is blocked as expected\n", name);
+               return;
+       }
+
+       fail("%s: unexpected memory access\n", name);
+}
+
+/*
+ * Common driver for remote-access tests: fork a child running @func,
+ * map and populate one secretmem page, send its address down a pipe,
+ * and grade the child's attempt to access it via check_child_status().
+ * NOTE(review): the pipe fds and the mapping are deliberately not torn
+ * down here; the test process exits shortly after both runs.
+ */
+static void test_remote_access(int fd, const char *name,
+                              void (*func)(int fd, int pipefd[2]))
+{
+       int pipefd[2];
+       pid_t pid;
+       char *mem;
+
+       if (pipe(pipefd)) {
+               fail("pipe failed: %s\n", strerror(errno));
+               return;
+       }
+
+       pid = fork();
+       if (pid < 0) {
+               fail("fork failed: %s\n", strerror(errno));
+               return;
+       }
+
+       if (pid == 0) {
+               func(fd, pipefd);
+               return;
+       }
+
+       mem = mmap(NULL, page_size, prot, mode, fd, 0);
+       if (mem == MAP_FAILED) {
+               fail("Unable to mmap secret memory\n");
+               return;
+       }
+
+       /* grow the fd so the page can be faulted in; previously unchecked */
+       if (ftruncate(fd, page_size)) {
+               fail("ftruncate failed: %s\n", strerror(errno));
+               return;
+       }
+       memset(mem, PATTERN, page_size);
+
+       if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
+               fail("pipe write: %s\n", strerror(errno));
+               return;
+       }
+
+       check_child_status(pid, name);
+}
+
+static void test_process_vm_read(int fd)
+{
+       test_remote_access(fd, "process_vm_read", try_process_vm_read);
+}
+
+static void test_ptrace(int fd)
+{
+       test_remote_access(fd, "ptrace", try_ptrace);
+}
+
+/*
+ * Lower RLIMIT_MEMLOCK to @max and drop all capabilities (including
+ * CAP_IPC_LOCK) so the mlock limit is actually enforced for this
+ * process.  Returns 0 on success, a negative code on failure.
+ */
+static int set_cap_limits(rlim_t max)
+{
+       struct rlimit new;
+       cap_t cap = cap_init();
+
+       /* cap_init() can fail (ENOMEM); previously unchecked */
+       if (!cap) {
+               perror("cap_init() returns error");
+               return -3;
+       }
+
+       new.rlim_cur = max;
+       new.rlim_max = max;
+       if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+               perror("setrlimit() returns error");
+               cap_free(cap);
+               return -1;
+       }
+
+       /* drop capabilities including CAP_IPC_LOCK */
+       if (cap_set_proc(cap)) {
+               perror("cap_set_proc() returns error");
+               cap_free(cap);
+               return -2;
+       }
+
+       /* release the cap_t; it was previously leaked */
+       cap_free(cap);
+       return 0;
+}
+
+static void prepare(void)
+{
+       struct rlimit rlim;
+
+       page_size = sysconf(_SC_PAGE_SIZE);
+       if (!page_size)
+               ksft_exit_fail_msg("Failed to get page size %s\n",
+                                  strerror(errno));
+
+       if (getrlimit(RLIMIT_MEMLOCK, &rlim))
+               ksft_exit_fail_msg("Unable to detect mlock limit: %s\n",
+                                  strerror(errno));
+
+       mlock_limit_cur = rlim.rlim_cur;
+       mlock_limit_max = rlim.rlim_max;
+
+       printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n",
+              page_size, mlock_limit_cur, mlock_limit_max);
+
+       if (page_size > mlock_limit_cur)
+               mlock_limit_cur = page_size;
+       if (page_size > mlock_limit_max)
+               mlock_limit_max = page_size;
+
+       if (set_cap_limits(mlock_limit_max))
+               ksft_exit_fail_msg("Unable to set mlock limit: %s\n",
+                                  strerror(errno));
+}
+
+#define NUM_TESTS 4
+
+int main(int argc, char *argv[])
+{
+       int fd;
+
+       prepare();
+
+       ksft_print_header();
+       ksft_set_plan(NUM_TESTS);
+
+       fd = memfd_secret(0);
+       if (fd < 0) {
+               if (errno == ENOSYS)
+                       ksft_exit_skip("memfd_secret is not supported\n");
+               else
+                       ksft_exit_fail_msg("memfd_secret failed: %s\n",
+                                          strerror(errno));
+       }
+
+       test_mlock_limit(fd);
+       test_file_apis(fd);
+       test_process_vm_read(fd);
+       test_ptrace(fd);
+
+       close(fd);
+
+       ksft_finished();
+}
+
+#else /* __NR_memfd_secret */
+
+/* Build host lacks __NR_memfd_secret: report the whole test as skipped. */
+int main(int argc, char *argv[])
+{
+       printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
+       return KSFT_SKIP;
+}
+
+#endif /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c
new file mode 100644 (file)
index 0000000..1cec842
--- /dev/null
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The main purpose of the tests here is to exercise the migration entry code
+ * paths in the kernel.
+ */
+
+#include "../kselftest_harness.h"
+#include <strings.h>
+#include <pthread.h>
+#include <numa.h>
+#include <numaif.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <time.h>
+
+#define TWOMEG (2<<20)
+#define RUNTIME (60)
+
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+
+FIXTURE(migration)
+{
+       pthread_t *threads;
+       pid_t *pids;
+       int nthreads;
+       int n1;
+       int n2;
+};
+
+/*
+ * Discover NUMA topology and allocate per-thread bookkeeping.  Picks the
+ * first two populated nodes (n1/n2) to migrate between; both stay -1
+ * when fewer than two nodes exist so the tests can SKIP.
+ * NOTE(review): the scan uses n < numa_max_possible_node(), so the
+ * highest-numbered possible node is never examined — confirm intended.
+ */
+FIXTURE_SETUP(migration)
+{
+       int n;
+
+       ASSERT_EQ(numa_available(), 0);
+       self->nthreads = numa_num_task_cpus() - 1;
+       self->n1 = -1;
+       self->n2 = -1;
+
+       for (n = 0; n < numa_max_possible_node(); n++)
+               if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) {
+                       if (self->n1 == -1) {
+                               self->n1 = n;
+                       } else {
+                               self->n2 = n;
+                               break;
+                       }
+               }
+
+       self->threads = malloc(self->nthreads * sizeof(*self->threads));
+       ASSERT_NE(self->threads, NULL);
+       self->pids = malloc(self->nthreads * sizeof(*self->pids));
+       ASSERT_NE(self->pids, NULL);
+};
+
+/* Release the bookkeeping arrays allocated in FIXTURE_SETUP. */
+FIXTURE_TEARDOWN(migration)
+{
+       free(self->threads);
+       free(self->pids);
+}
+
+/*
+ * Ping-pong the page at @ptr between nodes @n1 and @n2 with move_pages()
+ * for RUNTIME seconds.  Returns 0 after the time budget elapses, -1 on a
+ * clock failure, -2 if a migration call fails or migrates nothing.
+ */
+int migrate(uint64_t *ptr, int n1, int n2)
+{
+       int ret, tmp;
+       int status = 0;
+       struct timespec ts1, ts2;
+
+       if (clock_gettime(CLOCK_MONOTONIC, &ts1))
+               return -1;
+
+       while (1) {
+               if (clock_gettime(CLOCK_MONOTONIC, &ts2))
+                       return -1;
+
+               if (ts2.tv_sec - ts1.tv_sec >= RUNTIME)
+                       return 0;
+
+               ret = move_pages(0, 1, (void **) &ptr, &n2, &status,
+                               MPOL_MF_MOVE_ALL);
+               if (ret) {
+                       if (ret > 0)
+                               printf("Didn't migrate %d pages\n", ret);
+                       else
+                               perror("Couldn't migrate pages");
+                       return -2;
+               }
+
+               /* swap source/target so the next round moves it back */
+               tmp = n2;
+               n2 = n1;
+               n1 = tmp;
+       }
+
+       /* unreachable: the loop only exits via the returns above */
+       return 0;
+}
+
+/*
+ * Thread body: spin reading the target page so migration entries are
+ * constantly hit.  The volatile read keeps the access from being
+ * optimised out; pthread_testcancel() provides a cancellation point
+ * for the pthread_cancel() issued by the tests.
+ */
+void *access_mem(void *ptr)
+{
+       uint64_t y = 0;
+       volatile uint64_t *x = ptr;
+
+       while (1) {
+               pthread_testcancel();
+               y += *x;
+       }
+
+       return NULL;
+}
+
+/*
+ * Basic migration entry testing. One thread will move pages back and forth
+ * between nodes whilst other threads try and access them triggering the
+ * migration entry wait paths in the kernel.
+ */
+TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME)
+{
+       uint64_t *ptr;
+       int i;
+
+       if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+               SKIP(return, "Not enough threads or NUMA nodes available");
+
+       ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(ptr, MAP_FAILED);
+
+       memset(ptr, 0xde, TWOMEG);
+       for (i = 0; i < self->nthreads - 1; i++)
+               if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+                       perror("Couldn't create thread");
+
+       ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+       for (i = 0; i < self->nthreads - 1; i++)
+               ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+/*
+ * Same as the previous test but with shared memory.
+ */
+TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME)
+{
+       pid_t pid;
+       uint64_t *ptr;
+       int i;
+
+       if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+               SKIP(return, "Not enough threads or NUMA nodes available");
+
+       ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
+               MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(ptr, MAP_FAILED);
+
+       memset(ptr, 0xde, TWOMEG);
+       for (i = 0; i < self->nthreads - 1; i++) {
+               pid = fork();
+               if (!pid)
+                       access_mem(ptr);
+               else
+                       self->pids[i] = pid;
+       }
+
+       ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+       for (i = 0; i < self->nthreads - 1; i++)
+               ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
+}
+
+/*
+ * Tests the pmd migration entry paths.
+ */
+TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
+{
+       uint64_t *ptr;
+       int i;
+
+       if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+               SKIP(return, "Not enough threads or NUMA nodes available");
+
+       ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
+               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       ASSERT_NE(ptr, MAP_FAILED);
+
+       ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
+       ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
+       memset(ptr, 0xde, TWOMEG);
+       for (i = 0; i < self->nthreads - 1; i++)
+               if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+                       perror("Couldn't create thread");
+
+       ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+       for (i = 0; i < self->nthreads - 1; i++)
+               ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mm/mlock-random-test.c b/tools/testing/selftests/mm/mlock-random-test.c
new file mode 100644 (file)
index 0000000..782ea94
--- /dev/null
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * It tests the mlock/mlock2() when they are invoked
+ * on randomly memory region.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+/*
+ * Lower RLIMIT_MEMLOCK to @max and drop all capabilities (including
+ * CAP_IPC_LOCK) so mlock limits are enforced for this test process.
+ * Returns 0 on success, a negative code on failure.
+ * (Trailing "\n" removed from the perror() strings: perror() appends
+ * ": <errmsg>\n" itself, so the embedded newline split the output.)
+ */
+int set_cap_limits(rlim_t max)
+{
+       struct rlimit new;
+       cap_t cap = cap_init();
+
+       /* cap_init() can fail (ENOMEM); previously unchecked */
+       if (!cap) {
+               perror("cap_init() returns error");
+               return -3;
+       }
+
+       new.rlim_cur = max;
+       new.rlim_max = max;
+       if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+               perror("setrlimit() returns error");
+               cap_free(cap);
+               return -1;
+       }
+
+       /* drop capabilities including CAP_IPC_LOCK */
+       if (cap_set_proc(cap)) {
+               perror("cap_set_proc() returns error");
+               cap_free(cap);
+               return -2;
+       }
+
+       /* release the cap_t; it was previously leaked */
+       cap_free(cap);
+       return 0;
+}
+
+/*
+ * Parse VmLck from /proc/self/status and return it in bytes, or -1 on
+ * error.  NOTE(review): the value is returned as int, so locked sizes
+ * above ~2GB would overflow; the limits in this test are tiny.
+ * NOTE(review): perror() on the parse-failure path prints a message
+ * based on a stale errno — the text portion is still informative.
+ */
+int get_proc_locked_vm_size(void)
+{
+       FILE *f;
+       int ret = -1;
+       char line[1024] = {0};
+       unsigned long lock_size = 0;
+
+       f = fopen("/proc/self/status", "r");
+       if (!f) {
+               perror("fopen");
+               return -1;
+       }
+
+       while (fgets(line, 1024, f)) {
+               if (strstr(line, "VmLck")) {
+                       ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+                       if (ret <= 0) {
+                               printf("sscanf() on VmLck error: %s: %d\n",
+                                               line, ret);
+                               fclose(f);
+                               return -1;
+                       }
+                       fclose(f);
+                       /* convert kB to bytes */
+                       return (int)(lock_size << 10);
+               }
+       }
+
+       perror("cannot parse VmLck in /proc/self/status\n");
+       fclose(f);
+       return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+/*
+ * Return the MMUPageSize (in bytes) of the smaps entry covering @addr,
+ * or 0 on error.  @line/@size must start as NULL/0: getline() treats
+ * them as a malloc'd buffer descriptor and the previous uninitialized
+ * values were undefined behavior.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+       FILE *smaps;
+       char *line = NULL;
+       unsigned long mmupage_size = 0;
+       size_t size = 0;
+
+       smaps = seek_to_smaps_entry(addr);
+       if (!smaps) {
+               printf("Unable to parse /proc/self/smaps\n");
+               return 0;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, "MMUPageSize")) {
+                       /* not our line: reset the getline buffer */
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               /* found the MMUPageSize of this section */
+               if (sscanf(line, "MMUPageSize:    %8lu kB",
+                                       &mmupage_size) < 1) {
+                       printf("Unable to parse smaps entry for Size:%s\n",
+                                       line);
+                       break;
+               }
+
+       }
+       /* free(NULL) is a no-op, so this is safe on every path */
+       free(line);
+       fclose(smaps);
+       /* convert kB to bytes */
+       return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects the mlock/mlock2() to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly to perform mlock/mlock2
+ * [start, start +  len] memory range. The range is within range
+ * of the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+       int i;
+       int ret = 0;
+       int locked_vm_size = 0;
+       struct rlimit cur;
+       int page_size = 0;
+
+       getrlimit(RLIMIT_MEMLOCK, &cur);
+       if (cur.rlim_cur < alloc_size) {
+               printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
+                               alloc_size, (unsigned int)cur.rlim_cur);
+               return -1;
+       }
+
+       srand(time(NULL));
+       for (i = 0; i < TEST_LOOP; i++) {
+               /*
+                * - choose mlock/mlock2 randomly
+                * - choose lock_size randomly but lock_size < alloc_size
+                * - choose start_offset randomly but p+start_offset+lock_size
+                *   < p+alloc_size
+                */
+               int is_mlock = !!(rand() % 2);
+               int lock_size = rand() % alloc_size;
+               int start_offset = rand() % (alloc_size - lock_size);
+
+               if (is_mlock)
+                       ret = mlock(p + start_offset, lock_size);
+               else
+                       ret = mlock2_(p + start_offset, lock_size,
+                                      MLOCK_ONFAULT);
+
+               if (ret) {
+                       printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+                                       is_mlock ? "mlock" : "mlock2",
+                                       p, alloc_size,
+                                       p + start_offset, lock_size);
+                       return ret;
+               }
+       }
+
+       /*
+        * Check VmLck left by the tests.
+        */
+       locked_vm_size = get_proc_locked_vm_size();
+       page_size = get_proc_page_size((unsigned long)p);
+       if (page_size == 0) {
+               printf("cannot get proc MMUPageSize\n");
+               return -1;
+       }
+
+       if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+               printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+                               locked_vm_size, alloc_size);
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * We expect the mlock/mlock2() to be fail (outof limitation)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on [start, start+len] range.
+ *
+ * The memory region size alloc_size is above the rlimit.
+ * And the len to be locked is higher than rlimit.
+ * So we always expect a failure of mlock/mlock2.
+ * No locked page number should be increased as a side effect.
+ *
+ *    return value: 0 - success
+ *    else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+       int i;
+       int ret = 0;
+       int locked_vm_size = 0, old_locked_vm_size = 0;
+       struct rlimit cur;
+
+       getrlimit(RLIMIT_MEMLOCK, &cur);
+       if (cur.rlim_cur >= alloc_size) {
+               printf("alloc_size[%d] >%u rlimit, violates test condition\n",
+                               alloc_size, (unsigned int)cur.rlim_cur);
+               return -1;
+       }
+
+       old_locked_vm_size = get_proc_locked_vm_size();
+       srand(time(NULL));
+       for (i = 0; i < TEST_LOOP; i++) {
+               int is_mlock = !!(rand() % 2);
+               int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+                       + cur.rlim_cur;
+               int start_offset = rand() % (alloc_size - lock_size);
+
+               if (is_mlock)
+                       ret = mlock(p + start_offset, lock_size);
+               else
+                       ret = mlock2_(p + start_offset, lock_size,
+                                       MLOCK_ONFAULT);
+               if (ret == 0) {
+                       printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+                                       is_mlock ? "mlock" : "mlock2",
+                                       p, alloc_size,
+                                       p + start_offset, lock_size);
+                       return -1;
+               }
+       }
+
+       locked_vm_size = get_proc_locked_vm_size();
+       if (locked_vm_size != old_locked_vm_size) {
+               printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+                               old_locked_vm_size,
+                               locked_vm_size);
+               return -1;
+       }
+
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       char *p = NULL;
+       int ret = 0;
+
+       if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+               return -1;
+
+       p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+       if (p == NULL) {
+               perror("malloc() failure\n");
+               return -1;
+       }
+       ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+       if (ret)
+               return ret;
+       munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+       free(p);
+
+
+       p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+       if (p == NULL) {
+               perror("malloc() failure\n");
+               return -1;
+       }
+       ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+       if (ret)
+               return ret;
+       munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+       free(p);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/mlock2-tests.c b/tools/testing/selftests/mm/mlock2-tests.c
new file mode 100644 (file)
index 0000000..11b2301
--- /dev/null
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "mlock2.h"
+
+#include "../kselftest.h"
+
+struct vm_boundaries {
+       unsigned long start;
+       unsigned long end;
+};
+
+/*
+ * Look up the VMA containing @addr by scanning /proc/self/maps.
+ * On success fill @area with the VMA's [start, end) and return 0;
+ * return 1 on any lookup or parse failure.
+ */
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+       FILE *file;
+       int ret = 1;
+       char line[1024] = {0};
+       char *end_addr;
+       char *stop;
+       unsigned long start;
+       unsigned long end;
+
+       if (!area)
+               return ret;
+
+       file = fopen("/proc/self/maps", "r");
+       if (!file) {
+               perror("fopen");
+               return ret;
+       }
+
+       memset(area, 0, sizeof(struct vm_boundaries));
+
+       while (fgets(line, 1024, file)) {
+               end_addr = strchr(line, '-');
+               if (!end_addr) {
+                       printf("cannot parse /proc/self/maps\n");
+                       goto out;
+               }
+               *end_addr = '\0';
+               end_addr++;
+               stop = strchr(end_addr, ' ');
+               if (!stop) {
+                       printf("cannot parse /proc/self/maps\n");
+                       goto out;
+               }
+               /*
+                * Terminate the end-address token.  This was previously
+                * "stop = '\0';", which nulled the local pointer instead
+                * of terminating the string.
+                */
+               *stop = '\0';
+
+               sscanf(line, "%lx", &start);
+               sscanf(end_addr, "%lx", &end);
+
+               if (start <= addr && end > addr) {
+                       area->start = start;
+                       area->end = end;
+                       ret = 0;
+                       goto out;
+               }
+       }
+out:
+       fclose(file);
+       return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+/*
+ * Return true if @vmflag (a two-letter VmFlags mnemonic, e.g. "lo")
+ * appears on the "VmFlags:" line of the smaps entry covering @addr.
+ */
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+       char *line = NULL;
+       char *flags;
+       size_t size = 0;
+       bool ret = false;
+       FILE *smaps;
+
+       smaps = seek_to_smaps_entry(addr);
+       if (!smaps) {
+               printf("Unable to parse /proc/self/smaps\n");
+               goto out;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, VMFLAGS)) {
+                       /* not the VmFlags line: reset the getline buffer */
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               flags = line + strlen(VMFLAGS);
+               ret = (strstr(flags, vmflag) != NULL);
+               goto out;
+       }
+
+out:
+       free(line);
+       fclose(smaps);
+       return ret;
+}
+
+#define SIZE "Size:"
+#define RSS  "Rss:"
+#define LOCKED "lo"
+
+static unsigned long get_value_for_name(unsigned long addr, const char *name)
+{
+       char *line = NULL;
+       size_t size = 0;
+       char *value_ptr;
+       FILE *smaps = NULL;
+       unsigned long value = -1UL;
+
+       smaps = seek_to_smaps_entry(addr);
+       if (!smaps) {
+               printf("Unable to parse /proc/self/smaps\n");
+               goto out;
+       }
+
+       while (getline(&line, &size, smaps) > 0) {
+               if (!strstr(line, name)) {
+                       free(line);
+                       line = NULL;
+                       size = 0;
+                       continue;
+               }
+
+               value_ptr = line + strlen(name);
+               if (sscanf(value_ptr, "%lu kB", &value) < 1) {
+                       printf("Unable to parse smaps entry for Size\n");
+                       goto out;
+               }
+               break;
+       }
+
+out:
+       if (smaps)
+               fclose(smaps);
+       free(line);
+       return value;
+}
+
+/*
+ * A lock-on-fault VMA is flagged locked but not fully resident: "lo"
+ * set in VmFlags while Rss < Size (only the touched pages faulted in).
+ */
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+       bool locked;
+       unsigned long vma_size, vma_rss;
+
+       locked = is_vmflag_set(addr, LOCKED);
+       if (!locked)
+               return false;
+
+       vma_size = get_value_for_name(addr, SIZE);
+       vma_rss = get_value_for_name(addr, RSS);
+
+       /* only one page is faulted in */
+       return (vma_rss < vma_size);
+}
+
+#define PRESENT_BIT     0x8000000000000000ULL
+#define PFN_MASK        0x007FFFFFFFFFFFFFULL
+#define UNEVICTABLE_BIT (1UL << 18)
+
+/*
+ * Fully-locked check: the VMA has "lo" set in VmFlags and is entirely
+ * resident (Rss == Size), since mlock() pre-faults every page.  Returns
+ * a boolean-valued int (1 locked, 0 not).
+ */
+static int lock_check(unsigned long addr)
+{
+       bool locked;
+       unsigned long vma_size, vma_rss;
+
+       locked = is_vmflag_set(addr, LOCKED);
+       if (!locked)
+               return false;
+
+       vma_size = get_value_for_name(addr, SIZE);
+       vma_rss = get_value_for_name(addr, RSS);
+
+       return (vma_rss == vma_size);
+}
+
+/* After munlock() the "lo" VmFlag must be gone; returns 1 on failure. */
+static int unlock_lock_check(char *map)
+{
+       if (is_vmflag_set((unsigned long)map, LOCKED)) {
+               printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Test plain mlock2(flags = 0): the whole 2-page range must be locked
+ * and fully populated (checked via smaps), and munlock() must clear the
+ * LOCKED vmflag again.  Returns 0 on success, 1 on failure; exits with
+ * KSFT_SKIP when the mlock2 syscall is unavailable.
+ */
+static int test_mlock_lock()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       if (map == MAP_FAILED) {
+               perror("test_mlock_locked mmap");
+               goto out;
+       }
+
+       if (mlock2_(map, 2 * page_size, 0)) {
+               /* ENOSYS: kernel predates mlock2(2); skip, don't fail. */
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(KSFT_SKIP);
+               }
+               perror("mlock2(0)");
+               goto unmap;
+       }
+
+       /* Plain mlock must fault the whole range in (RSS == size). */
+       if (!lock_check((unsigned long)map))
+               goto unmap;
+
+       /* Now unlock and recheck attributes */
+       if (munlock(map, 2 * page_size)) {
+               perror("munlock()");
+               goto unmap;
+       }
+
+       ret = unlock_lock_check(map);
+
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       return ret;
+}
+
+/*
+ * Fault in the first page of @map and verify the VMA is then marked
+ * lock-on-fault in smaps.  Returns 0 on success, 1 on failure.
+ */
+static int onfault_check(char *map)
+{
+       /* Write one byte so exactly one page becomes resident. */
+       *map = 'a';
+       if (!is_vma_lock_on_fault((unsigned long)map)) {
+               printf("VMA is not marked for lock on fault\n");
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * After munlock, neither page of the 2-page mapping at @map may still
+ * be flagged lock-on-fault.  Returns 0 on success, 1 on failure.
+ */
+static int unlock_onfault_check(char *map)
+{
+       unsigned long page_size = getpagesize();
+
+       if (is_vma_lock_on_fault((unsigned long)map) ||
+           is_vma_lock_on_fault((unsigned long)map + page_size)) {
+               printf("VMA is still lock on fault after unlock\n");
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Test mlock2(MLOCK_ONFAULT): pages must only be locked as they are
+ * faulted in, and munlock() must clear the lock-on-fault state again.
+ * Returns 0 on success, 1 on failure; exits with KSFT_SKIP when the
+ * mlock2 syscall is unavailable.
+ */
+static int test_mlock_onfault()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       if (map == MAP_FAILED) {
+               perror("test_mlock_locked mmap");
+               goto out;
+       }
+
+       if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+               /* ENOSYS: kernel predates mlock2(2); skip, don't fail. */
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(KSFT_SKIP);
+               }
+               perror("mlock2(MLOCK_ONFAULT)");
+               goto unmap;
+       }
+
+       /* Fault one page in and confirm lock-on-fault is reported. */
+       if (onfault_check(map))
+               goto unmap;
+
+       /* Now unlock and recheck attributes */
+       if (munlock(map, 2 * page_size)) {
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(KSFT_SKIP);
+               }
+               perror("munlock()");
+               goto unmap;
+       }
+
+       ret = unlock_onfault_check(map);
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       return ret;
+}
+
+/*
+ * Fault one page in *before* calling mlock2(MLOCK_ONFAULT) and verify
+ * that the whole VMA -- already-present and not-yet-present pages alike
+ * -- is marked lock-on-fault.  Returns 0 on success, 1 on failure.
+ */
+static int test_lock_onfault_of_present()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       if (map == MAP_FAILED) {
+               perror("test_mlock_locked mmap");
+               goto out;
+       }
+
+       /* Make page 1 resident before locking. */
+       *map = 'a';
+
+       if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+               /* ENOSYS: kernel predates mlock2(2); skip, don't fail. */
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(KSFT_SKIP);
+               }
+               perror("mlock2(MLOCK_ONFAULT)");
+               goto unmap;
+       }
+
+       if (!is_vma_lock_on_fault((unsigned long)map) ||
+           !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+               printf("VMA with present pages is not marked lock on fault\n");
+               goto unmap;
+       }
+       ret = 0;
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       return ret;
+}
+
+/*
+ * Exercise mlockall()/munlockall() in three rounds over a fresh 2-page
+ * mapping each time:
+ *   1. MCL_CURRENT              -> fully populated lock, then unlock
+ *   2. MCL_CURRENT | MCL_ONFAULT -> lock-on-fault semantics, then unlock
+ *   3. MCL_CURRENT | MCL_FUTURE  -> fully populated lock, then unlock
+ * Returns 0 on success, 1 on failure.
+ * NOTE(review): the `goto out` paths skip munmap() of @map; the process
+ * exits shortly after, so the leak appears deliberate -- confirm.
+ */
+static int test_munlockall()
+{
+       char *map;
+       int ret = 1;
+       unsigned long page_size = getpagesize();
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+       if (map == MAP_FAILED) {
+               perror("test_munlockall mmap");
+               goto out;
+       }
+
+       if (mlockall(MCL_CURRENT)) {
+               perror("mlockall(MCL_CURRENT)");
+               goto out;
+       }
+
+       /* MCL_CURRENT alone must populate the mapping fully. */
+       if (!lock_check((unsigned long)map))
+               goto unmap;
+
+       if (munlockall()) {
+               perror("munlockall()");
+               goto unmap;
+       }
+
+       if (unlock_lock_check(map))
+               goto unmap;
+
+       /* Round 2 uses a fresh mapping so no pages are pre-faulted. */
+       munmap(map, 2 * page_size);
+
+       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+       if (map == MAP_FAILED) {
+               perror("test_munlockall second mmap");
+               goto out;
+       }
+
+       if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+               perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+               goto unmap;
+       }
+
+       if (onfault_check(map))
+               goto unmap;
+
+       if (munlockall()) {
+               perror("munlockall()");
+               goto unmap;
+       }
+
+       if (unlock_onfault_check(map))
+               goto unmap;
+
+       /* Round 3: MCL_FUTURE must still populate existing mappings. */
+       if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+               perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+               goto out;
+       }
+
+       if (!lock_check((unsigned long)map))
+               goto unmap;
+
+       if (munlockall()) {
+               perror("munlockall()");
+               goto unmap;
+       }
+
+       ret = unlock_lock_check(map);
+
+unmap:
+       munmap(map, 2 * page_size);
+out:
+       /* Drop any MCL_FUTURE state so later tests are unaffected. */
+       munlockall();
+       return ret;
+}
+
+/*
+ * Verify VMA split/merge behaviour around munlock(): unlocking the
+ * middle page of a 3-page lock-on-fault region must split the VMA into
+ * three, and unlocking the remainder must merge them back into one.
+ * When @call_mlock is false the region is expected to be locked already
+ * (the caller has mlockall(MCL_FUTURE) in effect).  Returns 0 on
+ * success (or on a benign abort when the pages never merged), 1 on
+ * failure.
+ */
+static int test_vma_management(bool call_mlock)
+{
+       int ret = 1;
+       void *map;
+       unsigned long page_size = getpagesize();
+       struct vm_boundaries page1;
+       struct vm_boundaries page2;
+       struct vm_boundaries page3;
+
+       map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       if (map == MAP_FAILED) {
+               perror("mmap()");
+               return ret;
+       }
+
+       if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+               /* ENOSYS: kernel predates mlock2(2); skip, don't fail. */
+               if (errno == ENOSYS) {
+                       printf("Cannot call new mlock family, skipping test\n");
+                       _exit(KSFT_SKIP);
+               }
+               perror("mlock(ONFAULT)\n");
+               goto out;
+       }
+
+       if (get_vm_area((unsigned long)map, &page1) ||
+           get_vm_area((unsigned long)map + page_size, &page2) ||
+           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+               printf("couldn't find mapping in /proc/self/maps\n");
+               goto out;
+       }
+
+       /*
+        * Before we unlock a portion, we need to check that all three
+        * pages are in the same VMA.  If they are not we abort this test
+        * (Note that this is not a failure)
+        */
+       if (page1.start != page2.start || page2.start != page3.start) {
+               printf("VMAs are not merged to start, aborting test\n");
+               ret = 0;
+               goto out;
+       }
+
+       /* Unlock only the middle page, forcing a three-way split. */
+       if (munlock(map + page_size, page_size)) {
+               perror("munlock()");
+               goto out;
+       }
+
+       if (get_vm_area((unsigned long)map, &page1) ||
+           get_vm_area((unsigned long)map + page_size, &page2) ||
+           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+               printf("couldn't find mapping in /proc/self/maps\n");
+               goto out;
+       }
+
+       /* All three VMAs should be different */
+       if (page1.start == page2.start || page2.start == page3.start) {
+               printf("failed to split VMA for munlock\n");
+               goto out;
+       }
+
+       /* Now unlock the remaining (first and third) pages and recheck */
+       if (munlock(map, page_size * 3)) {
+               perror("munlock()");
+               goto out;
+       }
+
+       if (get_vm_area((unsigned long)map, &page1) ||
+           get_vm_area((unsigned long)map + page_size, &page2) ||
+           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+               printf("couldn't find mapping in /proc/self/maps\n");
+               goto out;
+       }
+
+       /* Now all three VMAs should be the same */
+       if (page1.start != page2.start || page2.start != page3.start) {
+               printf("failed to merge VMAs after munlock\n");
+               goto out;
+       }
+
+       ret = 0;
+out:
+       munmap(map, 3 * page_size);
+       return ret;
+}
+
+/*
+ * Run @test_function with every present and future mapping locked via
+ * mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE); the callee is told
+ * not to call mlock itself (call_mlock == false).  Always munlockall()s
+ * before returning the callee's result.
+ */
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+       int ret = 1;
+
+       if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
+               perror("mlockall");
+               return ret;
+       }
+
+       ret = test_function(false);
+       munlockall();
+       return ret;
+}
+
+/*
+ * Each test returns 0 on success, so the exit status is the number of
+ * failed tests (0 == all passed).
+ */
+int main(int argc, char **argv)
+{
+       int ret = 0;
+       ret += test_mlock_lock();
+       ret += test_mlock_onfault();
+       ret += test_munlockall();
+       ret += test_lock_onfault_of_present();
+       ret += test_vma_management(true);
+       ret += test_mlockall(test_vma_management);
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/mlock2.h b/tools/testing/selftests/mm/mlock2.h
new file mode 100644 (file)
index 0000000..2a6e76c
--- /dev/null
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+/*
+ * Thin wrapper for the mlock2(2) syscall.  When the build headers do
+ * not define __NR_mlock2 the call fails with ENOSYS, mimicking an old
+ * kernel so callers can skip gracefully.
+ */
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+       return syscall(__NR_mlock2, start, len, flags);
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+/*
+ * Open /proc/self/smaps and position the stream just past the header
+ * line of the mapping that contains @addr, so the caller can read that
+ * entry's attribute lines (Size:, Rss:, VmFlags:, ...).
+ *
+ * Returns the positioned FILE * on success, or NULL (stream closed)
+ * when no mapping covers @addr.  Exits on fopen failure.
+ *
+ * Fix: bound every sscanf %s conversion to its buffer size -- the
+ * previous unbounded "%s" into perms[5]/dev[32]/path[] could overflow
+ * the stack on a malformed line.  The optional path field is not used,
+ * so it is simply left unparsed.
+ */
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+       FILE *file;
+       char *line = NULL;
+       size_t size = 0;
+       unsigned long start, end;
+       char perms[5];
+       unsigned long offset;
+       char dev[32];
+       unsigned long inode;
+
+       file = fopen("/proc/self/smaps", "r");
+       if (!file) {
+               perror("fopen smaps");
+               _exit(1);
+       }
+
+       while (getline(&line, &size, file) > 0) {
+               /* Header lines: "start-end perms offset dev inode [path]" */
+               if (sscanf(line, "%lx-%lx %4s %lx %31s %lu",
+                          &start, &end, perms, &offset, dev, &inode) < 6)
+                       goto next;
+
+               if (start <= addr && addr < end)
+                       goto out;
+
+next:
+               free(line);
+               line = NULL;
+               size = 0;
+       }
+
+       /* Not found: close the stream and report NULL. */
+       fclose(file);
+       file = NULL;
+
+out:
+       free(line);
+       return file;
+}
diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c
new file mode 100644 (file)
index 0000000..6c62966
--- /dev/null
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2022 Google LLC
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "util.h"
+
+#include "../kselftest.h"
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open -1
+#endif
+
+#ifndef __NR_process_mrelease
+#define __NR_process_mrelease -1
+#endif
+
+#define MB(x) (x << 20)
+#define MAX_SIZE_MB 1024
+
+/*
+ * Child helper: mmap and fault in @nr_pages anonymous pages, signal the
+ * parent with one byte over @pipefd, then wait (up to 10s) to be killed
+ * / reparented.  Returns KSFT_PASS if killed within the timeout,
+ * KSFT_FAIL otherwise.
+ */
+static int alloc_noexit(unsigned long nr_pages, int pipefd)
+{
+       int ppid = getppid();
+       int timeout = 10; /* 10sec timeout to get killed */
+       unsigned long i;
+       char *buf;
+
+       /* Fix: MAP_ANON mappings should pass fd == -1 for portability. */
+       buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANON, -1, 0);
+       if (buf == MAP_FAILED) {
+               perror("mmap failed, halting the test");
+               return KSFT_FAIL;
+       }
+
+       /* Touch every page so the memory is resident, not just mapped. */
+       for (i = 0; i < nr_pages; i++)
+               *((unsigned long *)(buf + (i * PAGE_SIZE))) = i;
+
+       /* Signal the parent that the child is ready */
+       if (write(pipefd, "", 1) < 0) {
+               perror("write");
+               return KSFT_FAIL;
+       }
+
+       /* Wait to be killed (when reparenting happens) */
+       while (getppid() == ppid && timeout > 0) {
+               sleep(1);
+               timeout--;
+       }
+
+       munmap(buf, nr_pages * PAGE_SIZE);
+
+       return (timeout > 0) ? KSFT_PASS : KSFT_FAIL;
+}
+
+/* The process_mrelease calls in this test are expected to fail */
+static void run_negative_tests(int pidfd)
+{
+       int res;
+       /* Test invalid flags. Expect to fail with EINVAL error code. */
+       if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
+                       errno != EINVAL) {
+               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+               perror("process_mrelease with wrong flags");
+               exit(res);
+       }
+       /*
+        * Test reaping while process is alive with no pending SIGKILL.
+        * Expect to fail with EINVAL error code.
+        */
+       if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) {
+               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+               perror("process_mrelease on a live process");
+               exit(res);
+       }
+}
+
+/*
+ * Child-side entry point: close the unused read end, allocate and
+ * fault in MB(size) of memory via alloc_noexit(), and return its
+ * verdict after closing the write end.
+ */
+static int child_main(int pipefd[], size_t size)
+{
+       int verdict;
+
+       close(pipefd[0]);
+       verdict = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
+       close(pipefd[1]);
+
+       return verdict;
+}
+
+/*
+ * Parent flow: fork a child that allocates and faults in `size` MB,
+ * wait for its ready byte over a pipe, run the negative
+ * process_mrelease tests against the live child, then SIGKILL it and
+ * try to reap its memory with process_mrelease(pidfd).  On an ESRCH
+ * race (child exited first) the allocation is doubled and the whole
+ * sequence retried, up to MAX_SIZE_MB.
+ */
+int main(void)
+{
+       int pipefd[2], pidfd;
+       bool success, retry;
+       size_t size;
+       pid_t pid;
+       char byte;
+       int res;
+
+       /* Test a wrong pidfd */
+       if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
+               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+               perror("process_mrelease with wrong pidfd");
+               exit(res);
+       }
+
+       /* Start the test with 1MB child memory allocation */
+       size = 1;
+retry:
+       /*
+        * Pipe for the child to signal when it's done allocating
+        * memory
+        */
+       if (pipe(pipefd)) {
+               perror("pipe");
+               exit(KSFT_FAIL);
+       }
+       pid = fork();
+       if (pid < 0) {
+               perror("fork");
+               close(pipefd[0]);
+               close(pipefd[1]);
+               exit(KSFT_FAIL);
+       }
+
+       if (pid == 0) {
+               /* Child main routine */
+               res = child_main(pipefd, size);
+               exit(res);
+       }
+
+       /*
+        * Parent main routine:
+        * Wait for the child to finish allocations, then kill and reap
+        */
+       close(pipefd[1]);
+       /* Block until the child is ready */
+       res = read(pipefd[0], &byte, 1);
+       close(pipefd[0]);
+       if (res < 0) {
+               perror("read");
+               if (!kill(pid, SIGKILL))
+                       waitpid(pid, NULL, 0);
+               exit(KSFT_FAIL);
+       }
+
+       pidfd = syscall(__NR_pidfd_open, pid, 0);
+       if (pidfd < 0) {
+               perror("pidfd_open");
+               if (!kill(pid, SIGKILL))
+                       waitpid(pid, NULL, 0);
+               exit(KSFT_FAIL);
+       }
+
+       /* Run negative tests which require a live child */
+       run_negative_tests(pidfd);
+
+       if (kill(pid, SIGKILL)) {
+               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+               perror("kill");
+               exit(res);
+       }
+
+       success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
+       if (!success) {
+               /*
+                * ESRCH means the child exited before we could call
+                * process_mrelease.  Double the child's memory, which
+                * makes it spend more time on exit cleanup and increases
+                * our chances of reaping its memory before it is gone.
+                * Retry until we succeed or exceed MAX_SIZE_MB.
+                */
+               if (errno == ESRCH) {
+                       retry = (size <= MAX_SIZE_MB);
+               } else {
+                       res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+                       perror("process_mrelease");
+                       waitpid(pid, NULL, 0);
+                       exit(res);
+               }
+       }
+
+       /* Cleanup to prevent zombies */
+       if (waitpid(pid, NULL, 0) < 0) {
+               perror("waitpid");
+               exit(KSFT_FAIL);
+       }
+       close(pidfd);
+
+       if (!success) {
+               if (retry) {
+                       size *= 2;
+                       goto retry;
+               }
+               printf("All process_mrelease attempts failed!\n");
+               exit(KSFT_FAIL);
+       }
+
+       printf("Success reaping a child with %zuMB of memory allocations\n",
+              size);
+       return KSFT_PASS;
+}
diff --git a/tools/testing/selftests/mm/mremap_dontunmap.c b/tools/testing/selftests/mm/mremap_dontunmap.c
new file mode 100644 (file)
index 0000000..f01dc4a
--- /dev/null
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <bgeffon@google.com>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+
+unsigned long page_size;
+char *page_buffer;
+
+static void dump_maps(void)
+{
+       char cmd[32];
+
+       snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+       system(cmd);
+}
+
+/*
+ * Abort the test with context when `condition` holds: print the failed
+ * check's function/line, `description` and errno, dump this process's
+ * /proc maps for debugging, then exit(1).
+ */
+#define BUG_ON(condition, description)                                       \
+       do {                                                                  \
+               if (condition) {                                              \
+                       fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
+                               __LINE__, (description), strerror(errno));    \
+                       dump_maps();                                      \
+                       exit(1);                                              \
+               }                                                             \
+       } while (0)
+
+// Probe for kernel support with a trivial MREMAP_DONTUNMAP remap; this
+// lets the suite be skipped (instead of reported failed) on older
+// kernels.  Returns 0 when supported, otherwise the mremap errno.
+static int kernel_support_for_mremap_dontunmap()
+{
+       int ret = 0;
+       unsigned long num_pages = 1;
+       void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+       // This simple remap should only fail if MREMAP_DONTUNMAP isn't
+       // supported.
+       void *dest_mapping =
+           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
+       if (dest_mapping == MAP_FAILED) {
+               ret = errno;
+       } else {
+               BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+                      "unable to unmap destination mapping");
+       }
+
+       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+              "unable to unmap source mapping");
+       return ret;
+}
+
+// This helper will just validate that an entire mapping contains the expected
+// byte.
+// Validate that every byte of the page-aligned region [addr, addr+size)
+// equals `byte`.  Returns 0 when the whole region matches, otherwise
+// the first non-zero memcmp() result.
+static int check_region_contains_byte(void *addr, unsigned long size, char byte)
+{
+       unsigned long page, pages;
+
+       BUG_ON(size & (page_size - 1),
+              "check_region_contains_byte expects page multiples");
+       BUG_ON((unsigned long)addr & (page_size - 1),
+              "check_region_contains_byte expects page alignment");
+
+       // Fill the scratch page once, then diff each page against it.
+       memset(page_buffer, byte, page_size);
+
+       pages = size / page_size;
+       for (page = 0; page < pages; ++page) {
+               int diff = memcmp(addr + (page * page_size),
+                                 page_buffer, page_size);
+               if (diff)
+                       return diff;
+       }
+
+       return 0;
+}
+
+// This test validates that MREMAP_DONTUNMAP moves the page tables while
+// leaving the source mapping mapped (its pages become zero-fill).
+static void mremap_dontunmap_simple()
+{
+       unsigned long num_pages = 5;
+
+       void *source_mapping =
+           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+       memset(source_mapping, 'a', num_pages * page_size);
+
+       // Try to just move the whole mapping anywhere (not fixed).
+       // (new_address is ignored without MREMAP_FIXED.)
+       void *dest_mapping =
+           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+       // Validate that the pages have been moved, we know they were moved if
+       // the dest_mapping contains a's.
+       BUG_ON(check_region_contains_byte
+              (dest_mapping, num_pages * page_size, 'a') != 0,
+              "pages did not migrate");
+       // The anonymous source must now read back as zero-fill pages.
+       BUG_ON(check_region_contains_byte
+              (source_mapping, num_pages * page_size, 0) != 0,
+              "source should have no ptes");
+
+       BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+              "unable to unmap destination mapping");
+       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+              "unable to unmap source mapping");
+}
+
+// This test validates that MREMAP_DONTUNMAP on a shared (memfd-backed)
+// mapping works as expected: after the move, *both* source and dest see
+// the same shmem pages, so the source still reads back the original
+// data -- unlike the anonymous-memory case.
+static void mremap_dontunmap_simple_shmem()
+{
+       unsigned long num_pages = 5;
+
+       int mem_fd = memfd_create("memfd", MFD_CLOEXEC);
+       BUG_ON(mem_fd < 0, "memfd_create");
+
+       BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0,
+                       "ftruncate");
+
+       void *source_mapping =
+           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+                MAP_FILE | MAP_SHARED, mem_fd, 0);
+       BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+       // The mapping keeps the shmem object alive; the fd can go.
+       BUG_ON(close(mem_fd) < 0, "close");
+
+       memset(source_mapping, 'a', num_pages * page_size);
+
+       // Try to just move the whole mapping anywhere (not fixed).
+       void *dest_mapping =
+           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+       if (dest_mapping == MAP_FAILED && errno == EINVAL) {
+               // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem.
+               BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+                       "unable to unmap source mapping");
+               return;
+       }
+
+       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+       // Validate that the pages have been moved, we know they were moved if
+       // the dest_mapping contains a's.
+       BUG_ON(check_region_contains_byte
+              (dest_mapping, num_pages * page_size, 'a') != 0,
+              "pages did not migrate");
+
+       // Because the region is backed by shmem, we will actually see the same
+       // memory at the source location still.
+       // Fix: the failure message used to claim the opposite ("source
+       // should have no ptes"), copy-pasted from the anonymous variant.
+       BUG_ON(check_region_contains_byte
+              (source_mapping, num_pages * page_size, 'a') != 0,
+              "source should still contain the original data");
+
+       BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+              "unable to unmap destination mapping");
+       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+              "unable to unmap source mapping");
+}
+
+// This test validates MREMAP_DONTUNMAP will move page tables to a specific
+// destination using MREMAP_FIXED, also while validating that the source
+// remains intact.
+static void mremap_dontunmap_simple_fixed()
+{
+       unsigned long num_pages = 5;
+
+       // Since we want to guarantee that we can remap to a point, we will
+       // create a mapping up front.
+       void *dest_mapping =
+           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+       memset(dest_mapping, 'X', num_pages * page_size);
+
+       void *source_mapping =
+           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(source_mapping == MAP_FAILED, "mmap");
+       memset(source_mapping, 'a', num_pages * page_size);
+
+       void *remapped_mapping =
+           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+                  MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+                  dest_mapping);
+       BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+       BUG_ON(remapped_mapping != dest_mapping,
+              "mremap should have placed the remapped mapping at dest_mapping");
+
+       // The dest mapping will have been unmapped by mremap so we expect the
+       // Xs to be gone and replaced with a's.
+       BUG_ON(check_region_contains_byte
+              (dest_mapping, num_pages * page_size, 'a') != 0,
+              "pages did not migrate");
+
+       // And the source mapping will have had its ptes dropped.
+       BUG_ON(check_region_contains_byte
+              (source_mapping, num_pages * page_size, 0) != 0,
+              "source should have no ptes");
+
+       BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+              "unable to unmap destination mapping");
+       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+              "unable to unmap source mapping");
+}
+
+// This test validates that we can MREMAP_DONTUNMAP for a portion of an
+// existing mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+       /*
+        *  source mapping:
+        *  --------------
+        *  | aaaaaaaaaa |
+        *  --------------
+        *  to become:
+        *  --------------
+        *  | aaaaa00000 |
+        *  --------------
+        *  With the destination mapping containing 5 pages of As.
+        *  ---------
+        *  | aaaaa |
+        *  ---------
+        */
+       unsigned long num_pages = 10;
+       void *source_mapping =
+           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(source_mapping == MAP_FAILED, "mmap");
+       memset(source_mapping, 'a', num_pages * page_size);
+
+       // We will grab the last 5 pages of the source and move them.
+       void *dest_mapping =
+           mremap(source_mapping + (5 * page_size), 5 * page_size,
+                  5 * page_size,
+                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+       // We expect the first 5 pages of the source to contain a's and the
+       // final 5 pages to contain zeros.
+       BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
+              0, "first 5 pages of source should have original pages");
+       BUG_ON(check_region_contains_byte
+              (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
+              "final 5 pages of source should have no ptes");
+
+       // Finally we expect the destination to have 5 pages worth of a's.
+       BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+              0, "dest mapping should contain ptes from the source");
+
+       BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
+              "unable to unmap destination mapping");
+       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+              "unable to unmap source mapping");
+}
+
+// This test validates that we can remap over only a portion of a mapping.
+static void mremap_dontunmap_partial_mapping_overwrite(void)
+{
+       /*
+        *  source mapping:
+        *  ---------
+        *  |aaaaa|
+        *  ---------
+        *  dest mapping initially:
+        *  -----------
+        *  |XXXXXXXXXX|
+        *  ------------
+        *  Source to become:
+        *  ---------
+        *  |00000|
+        *  ---------
+        *  With the destination mapping containing 5 pages of As.
+        *  ------------
+        *  |aaaaaXXXXX|
+        *  ------------
+        */
+       void *source_mapping =
+           mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(source_mapping == MAP_FAILED, "mmap");
+       memset(source_mapping, 'a', 5 * page_size);
+
+       void *dest_mapping =
+           mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+       memset(dest_mapping, 'X', 10 * page_size);
+
+       // We will grab the last 5 pages of the source and move them.
+       void *remapped_mapping =
+           mremap(source_mapping, 5 * page_size,
+                  5 * page_size,
+                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
+       // Fix: the failure check previously tested dest_mapping (an mmap
+       // result already validated above) instead of the mremap result.
+       BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+       BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
+
+       BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+              0, "first 5 pages of source should have no ptes");
+
+       // Finally we expect the destination to have 5 pages worth of a's.
+       BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
+                       "dest mapping should contain ptes from the source");
+
+       // Finally the last 5 pages shouldn't have been touched.
+       BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+                               5 * page_size, 'X') != 0,
+                       "dest mapping should have retained the last 5 pages");
+
+       BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
+              "unable to unmap destination mapping");
+       BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+              "unable to unmap source mapping");
+}
+
+/*
+ * Probe kernel support (skipping the suite if absent), set up the
+ * shared page-sized scratch buffer, then run every MREMAP_DONTUNMAP
+ * scenario; each one exits(1) via BUG_ON on failure.
+ */
+int main(void)
+{
+       page_size = sysconf(_SC_PAGE_SIZE);
+
+       // test for kernel support for MREMAP_DONTUNMAP skipping the test if
+       // not.
+       if (kernel_support_for_mremap_dontunmap() != 0) {
+               printf("No kernel support for MREMAP_DONTUNMAP\n");
+               return KSFT_SKIP;
+       }
+
+       // Keep a page sized buffer around for when we need it.
+       page_buffer =
+           mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+       BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
+
+       mremap_dontunmap_simple();
+       mremap_dontunmap_simple_shmem();
+       mremap_dontunmap_simple_fixed();
+       mremap_dontunmap_partial_mapping();
+       mremap_dontunmap_partial_mapping_overwrite();
+
+       BUG_ON(munmap(page_buffer, page_size) == -1,
+              "unable to unmap page buffer");
+
+       printf("OK\n");
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c
new file mode 100644 (file)
index 0000000..9496346
--- /dev/null
@@ -0,0 +1,475 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020 Google LLC
+ */
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#define EXPECT_SUCCESS 0
+#define EXPECT_FAILURE 1
+#define NON_OVERLAPPING 0
+#define OVERLAPPING 1
+#define NS_PER_SEC 1000000000ULL
+#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */
+#define VALIDATION_NO_THRESHOLD 0      /* Verify the entire region */
+
+#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+
+/* Geometry of one mremap test: alignments, region size, overlap mode. */
+struct config {
+       unsigned long long src_alignment;       /* required source alignment */
+       unsigned long long dest_alignment;      /* required destination alignment */
+       unsigned long long region_size;         /* bytes to remap */
+       int overlapping;                        /* OVERLAPPING / NON_OVERLAPPING */
+};
+
+/* A named test case plus whether the mremap is expected to fail. */
+struct test {
+       const char *name;
+       struct config config;
+       int expect_failure;
+};
+
+enum {
+       _1KB = 1ULL << 10,      /* 1KB -> not page aligned */
+       _4KB = 4ULL << 10,
+       _8KB = 8ULL << 10,
+       _1MB = 1ULL << 20,
+       _2MB = 2ULL << 20,
+       _4MB = 4ULL << 20,
+       _1GB = 1ULL << 30,
+       _2GB = 2ULL << 30,
+       PMD = _2MB,
+       PUD = _1GB,
+};
+
+/* PTE-level alignment is simply one page. */
+#define PTE page_size
+
+/* Build a struct test compound literal from its five fields. */
+#define MAKE_TEST(source_align, destination_align, size,       \
+                 overlaps, should_fail, test_name)             \
+(struct test){                                                 \
+       .name = test_name,                                      \
+       .config = {                                             \
+               .src_alignment = source_align,                  \
+               .dest_alignment = destination_align,            \
+               .region_size = size,                            \
+               .overlapping = overlaps,                        \
+       },                                                      \
+       .expect_failure = should_fail                           \
+}
+
+/*
+ * Returns false if the requested remap region overlaps with an
+ * existing mapping (e.g text, stack) else returns true.
+ */
+static bool is_remap_region_valid(void *addr, unsigned long long size)
+{
+       void *remap_addr = NULL;
+       bool ret = true;
+
+       /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
+       remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
+                                        MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+                                        -1, 0);
+
+       if (remap_addr == MAP_FAILED) {
+               /* EEXIST: something is already mapped in [addr, addr+size) */
+               if (errno == EEXIST)
+                       ret = false;
+       } else {
+               /* Probe mapping succeeded - remove it again. */
+               munmap(remap_addr, size);
+       }
+
+       return ret;
+}
+
+/* Returns mmap_min_addr sysctl tunable from procfs */
+static unsigned long long get_mmap_min_addr(void)
+{
+       FILE *fp;
+       int n_matched;
+       /* static: procfs is read once and the value is cached afterwards */
+       static unsigned long long addr;
+
+       if (addr)
+               return addr;
+
+       /* Inability to read the tunable skips the whole test run. */
+       fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
+       if (fp == NULL) {
+               ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
+                       strerror(errno));
+               exit(KSFT_SKIP);
+       }
+
+       n_matched = fscanf(fp, "%llu", &addr);
+       if (n_matched != 1) {
+               ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
+                       strerror(errno));
+               fclose(fp);
+               exit(KSFT_SKIP);
+       }
+
+       fclose(fp);
+       return addr;
+}
+
+/*
+ * This test validates that merge is called when expanding a mapping.
+ * Mapping containing three pages is created, middle page is unmapped
+ * and then the mapping containing the first page is expanded so that
+ * it fills the created hole. The two parts should merge creating
+ * single mapping with three pages.
+ */
+static void mremap_expand_merge(unsigned long page_size)
+{
+       char *test_name = "mremap expand merge";
+       FILE *fp;
+       char *line = NULL;
+       size_t len = 0;
+       bool success = false;
+       char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+       /* Fail early if the initial three-page mapping could not be made. */
+       if (start == MAP_FAILED) {
+               ksft_test_result_fail("%s\n", test_name);
+               return;
+       }
+
+       /* Punch a hole in the middle, then grow the first part into it. */
+       munmap(start + page_size, page_size);
+       if (mremap(start, page_size, 2 * page_size, 0) == MAP_FAILED) {
+               ksft_test_result_fail("%s\n", test_name);
+               goto out_unmap;
+       }
+
+       fp = fopen("/proc/self/maps", "r");
+       if (fp == NULL) {
+               ksft_test_result_fail("%s\n", test_name);
+               goto out_unmap;
+       }
+
+       /*
+        * Look for a single VMA spanning all three pages, proving the two
+        * halves were merged.  Addresses are parsed with strtoul():
+        * strtol() clamps values above LONG_MAX and would mis-parse high
+        * mapping addresses.
+        */
+       while (getline(&line, &len, fp) != -1) {
+               char *first = strtok(line, "- ");
+               void *first_val = (void *)strtoul(first, NULL, 16);
+               char *second = strtok(NULL, "- ");
+               void *second_val = (void *)strtoul(second, NULL, 16);
+
+               if (first_val == start && second_val == start + 3 * page_size) {
+                       success = true;
+                       break;
+               }
+       }
+       if (success)
+               ksft_test_result_pass("%s\n", test_name);
+       else
+               ksft_test_result_fail("%s\n", test_name);
+       free(line);             /* getline() allocates the buffer */
+       fclose(fp);
+out_unmap:
+       /* Remove whatever remains mapped so later tests start clean. */
+       munmap(start, 3 * page_size);
+}
+
+/*
+ * Returns the start address of the mapping on success, else returns
+ * NULL on failure.
+ */
+static void *get_source_mapping(struct config c)
+{
+       unsigned long long addr = 0ULL;
+       void *src_addr = NULL;
+       unsigned long long mmap_min_addr;
+
+       mmap_min_addr = get_mmap_min_addr();
+
+retry:
+       /*
+        * Walk candidate addresses in steps of the requested alignment,
+        * starting above mmap_min_addr, until MAP_FIXED_NOREPLACE succeeds.
+        * NOTE(review): the loop has no upper bound - confirm that an
+        * eventual success is acceptable to assume for a selftest.
+        */
+       addr += c.src_alignment;
+       if (addr < mmap_min_addr)
+               goto retry;
+
+       src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
+                                       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+                                       -1, 0);
+       if (src_addr == MAP_FAILED) {
+               if (errno == EPERM || errno == EEXIST)
+                       goto retry;
+               goto error;
+       }
+       /*
+        * Check that the address is aligned to the specified alignment.
+        * Addresses which have alignments that are multiples of that
+        * specified are not considered valid. For instance, 1GB address is
+        * 2MB-aligned, however it will not be considered valid for a
+        * requested alignment of 2MB. This is done to reduce coincidental
+        * alignment in the tests.
+        */
+       if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
+                       !((unsigned long long) src_addr & c.src_alignment)) {
+               munmap(src_addr, c.region_size);
+               goto retry;
+       }
+
+       /*
+        * NOTE(review): src_addr cannot be NULL here (MAP_FAILED was
+        * already handled above), so this guard looks redundant.
+        */
+       if (!src_addr)
+               goto error;
+
+       return src_addr;
+error:
+       ksft_print_msg("Failed to map source region: %s\n",
+                       strerror(errno));
+       return NULL;
+}
+
+/* Returns the time taken for the remap on success else returns -1. */
+static long long remap_region(struct config c, unsigned int threshold_mb,
+                             char pattern_seed)
+{
+       void *addr, *src_addr, *dest_addr;
+       unsigned long long i;
+       struct timespec t_start = {0, 0}, t_end = {0, 0};
+       long long  start_ns, end_ns, align_mask, ret, offset;
+       unsigned long long threshold;
+
+       /* Only the first "threshold" bytes are patterned and verified. */
+       if (threshold_mb == VALIDATION_NO_THRESHOLD)
+               threshold = c.region_size;
+       else
+               threshold = MIN(threshold_mb * _1MB, c.region_size);
+
+       src_addr = get_source_mapping(c);
+       if (!src_addr) {
+               ret = -1;
+               goto out;
+       }
+
+       /* Set byte pattern */
+       srand(pattern_seed);
+       for (i = 0; i < threshold; i++)
+               memset((char *) src_addr + i, (char) rand(), 1);
+
+       /* Mask to zero out lower bits of address for alignment */
+       align_mask = ~(c.dest_alignment - 1);
+       /* Offset of destination address from the end of the source region */
+       offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment;
+       addr = (void *) (((unsigned long long) src_addr + c.region_size
+                         + offset) & align_mask);
+
+       /* See comment in get_source_mapping() */
+       if (!((unsigned long long) addr & c.dest_alignment))
+               addr = (void *) ((unsigned long long) addr | c.dest_alignment);
+
+       /* Don't destroy existing mappings unless expected to overlap */
+       while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+               /* Check for unsigned overflow */
+               if (addr + c.dest_alignment < addr) {
+                       ksft_print_msg("Couldn't find a valid region to remap to\n");
+                       ret = -1;
+                       /* was "goto out", which leaked the source mapping */
+                       goto clean_up_src;
+               }
+               addr += c.dest_alignment;
+       }
+
+       clock_gettime(CLOCK_MONOTONIC, &t_start);
+       dest_addr = mremap(src_addr, c.region_size, c.region_size,
+                                         MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
+       clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+       if (dest_addr == MAP_FAILED) {
+               ksft_print_msg("mremap failed: %s\n", strerror(errno));
+               ret = -1;
+               goto clean_up_src;
+       }
+
+       /* Verify byte pattern after remapping */
+       srand(pattern_seed);
+       for (i = 0; i < threshold; i++) {
+               /* renamed from "c", which shadowed the struct config param */
+               char expected = (char) rand();
+
+               if (((char *) dest_addr)[i] != expected) {
+                       /* i is unsigned long long: print with %llu, not %d */
+                       ksft_print_msg("Data after remap doesn't match at offset %llu\n",
+                                      i);
+                       ksft_print_msg("Expected: %#x\t Got: %#x\n", expected & 0xff,
+                                       ((char *) dest_addr)[i] & 0xff);
+                       ret = -1;
+                       goto clean_up_dest;
+               }
+       }
+
+       start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
+       end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
+       ret = end_ns - start_ns;
+
+/*
+ * Since the destination address is specified using MREMAP_FIXED, subsequent
+ * mremap will unmap any previous mapping at the address range specified by
+ * dest_addr and region_size. This significantly affects the remap time of
+ * subsequent tests. So we clean up mappings after each test.
+ */
+clean_up_dest:
+       munmap(dest_addr, c.region_size);
+clean_up_src:
+       munmap(src_addr, c.region_size);
+out:
+       return ret;
+}
+
+/*
+ * Run one test case and record the ksft result: xfail when a failure
+ * was expected, fail (bumping *failures) otherwise.  The measured
+ * mremap time is only reported when the whole region was validated,
+ * since partial validation makes timings incomparable.
+ */
+static void run_mremap_test_case(struct test test_case, int *failures,
+                                unsigned int threshold_mb,
+                                unsigned int pattern_seed)
+{
+       long long remap_time = remap_region(test_case.config, threshold_mb,
+                                           pattern_seed);
+
+       if (remap_time < 0) {
+               if (test_case.expect_failure)
+                       ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
+                                             test_case.name);
+               else {
+                       ksft_test_result_fail("%s\n", test_case.name);
+                       *failures += 1;
+               }
+       } else {
+               /*
+                * Comparing mremap time is only applicable if entire region
+                * was faulted in.
+                */
+               if (threshold_mb == VALIDATION_NO_THRESHOLD ||
+                   test_case.config.region_size <= threshold_mb * _1MB)
+                       ksft_test_result_pass("%s\n\tmremap time: %12lldns\n",
+                                             test_case.name, remap_time);
+               else
+                       ksft_test_result_pass("%s\n", test_case.name);
+       }
+}
+
+/* Print command-line help for -t/-p to stderr. */
+static void usage(const char *cmd)
+{
+       fprintf(stderr,
+               "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n"
+               "-t\t only validate threshold_mb of the remapped region\n"
+               "  \t if 0 is supplied no threshold is used; all tests\n"
+               "  \t are run and remapped regions validated fully.\n"
+               "  \t The default threshold used is 4MB.\n"
+               "-p\t provide a seed to generate the random pattern for\n"
+               "  \t validating the remapped region.\n", cmd);
+}
+
+/*
+ * Parse -t <threshold_mb> and -p <pattern_seed>.  Returns 0 on
+ * success, or -1 (after printing usage) on an unknown option or any
+ * leftover positional argument.
+ * NOTE(review): atoi() silently yields 0 for non-numeric input;
+ * strtol() would allow error detection - confirm whether that matters.
+ */
+static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
+                     unsigned int *pattern_seed)
+{
+       const char *optstr = "t:p:";
+       int opt;
+
+       while ((opt = getopt(argc, argv, optstr)) != -1) {
+               switch (opt) {
+               case 't':
+                       *threshold_mb = atoi(optarg);
+                       break;
+               case 'p':
+                       *pattern_seed = atoi(optarg);
+                       break;
+               default:
+                       usage(argv[0]);
+                       return -1;
+               }
+       }
+
+       /* Positional arguments are not accepted. */
+       if (optind < argc) {
+               usage(argv[0]);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Keep MAX_TEST / MAX_PERF_TEST in sync with the initializers below. */
+#define MAX_TEST 13
+#define MAX_PERF_TEST 3
+int main(int argc, char **argv)
+{
+       int failures = 0;
+       int i, run_perf_tests;
+       unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
+       unsigned int pattern_seed;
+       int num_expand_tests = 1;
+       struct test test_cases[MAX_TEST];
+       struct test perf_test_cases[MAX_PERF_TEST];
+       int page_size;
+       time_t t;
+
+       /* Default seed is the current time; -p overrides it below. */
+       pattern_seed = (unsigned int) time(&t);
+
+       if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0)
+               exit(EXIT_FAILURE);
+
+       ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
+                      threshold_mb, pattern_seed);
+
+       page_size = sysconf(_SC_PAGESIZE);
+
+       /* Expected mremap failures */
+       test_cases[0] = MAKE_TEST(page_size, page_size, page_size,
+                                 OVERLAPPING, EXPECT_FAILURE,
+                                 "mremap - Source and Destination Regions Overlapping");
+
+       test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
+                                 NON_OVERLAPPING, EXPECT_FAILURE,
+                                 "mremap - Destination Address Misaligned (1KB-aligned)");
+       test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
+                                 NON_OVERLAPPING, EXPECT_FAILURE,
+                                 "mremap - Source Address Misaligned (1KB-aligned)");
+
+       /* Src addr PTE aligned */
+       test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
+                                 NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
+
+       /* Src addr 1MB aligned */
+       test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
+       test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
+
+       /* Src addr PMD aligned */
+       test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
+       test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
+       test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
+
+       /* Src addr PUD aligned */
+       test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                 "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
+       test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                  "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
+       test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                  "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
+       test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                  "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+       perf_test_cases[0] =  MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                       "1GB mremap - Source PTE-aligned, Destination PTE-aligned");
+       /*
+        * mremap 1GB region - Page table level aligned time
+        * comparison.
+        */
+       perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                      "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
+       perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
+                                      "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
+
+       /* Perf comparisons need the full 1GB region validated. */
+       run_perf_tests =  (threshold_mb == VALIDATION_NO_THRESHOLD) ||
+                               (threshold_mb * _1MB >= _1GB);
+
+       ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ?
+                     ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests);
+
+       for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+               run_mremap_test_case(test_cases[i], &failures, threshold_mb,
+                                    pattern_seed);
+
+       mremap_expand_merge(page_size);
+
+       if (run_perf_tests) {
+               ksft_print_msg("\n%s\n",
+                "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:");
+               for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++)
+                       run_mremap_test_case(perf_test_cases[i], &failures,
+                                            threshold_mb, pattern_seed);
+       }
+
+       if (failures > 0)
+               ksft_exit_fail();
+       else
+               ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/on-fault-limit.c b/tools/testing/selftests/mm/on-fault-limit.c
new file mode 100644 (file)
index 0000000..634d87d
--- /dev/null
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+/*
+ * With MCL_ONFAULT | MCL_FUTURE in effect, verify that a populated
+ * mapping of twice the RLIMIT_MEMLOCK hard limit is refused.
+ * Returns 0 on success, 1 on any failure.
+ */
+static int test_limit(void)
+{
+       int ret = 1;
+       struct rlimit lims;
+       void *map;
+
+       if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+               perror("getrlimit");
+               return ret;
+       }
+
+       if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
+               perror("mlockall");
+               return ret;
+       }
+
+       /*
+        * NOTE(review): if rlim_max is RLIM_INFINITY, 2 * rlim_max wraps
+        * around - confirm the test is only meaningful with a finite
+        * memlock limit.
+        */
+       map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+       if (map != MAP_FAILED)
+               printf("mmap should have failed, but didn't\n");
+       else {
+               ret = 0;
+               munmap(map, 2 * lims.rlim_max);
+       }
+
+       munlockall();
+       return ret;
+}
+
+/* argc/argv are unused; exit status is the number of failed checks. */
+int main(int argc, char **argv)
+{
+       int ret = 0;
+
+       ret += test_limit();
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
new file mode 100644 (file)
index 0000000..92f3be3
--- /dev/null
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+#include "../kselftest.h"
+
+/* Define some kernel-like types */
+#define  u8 __u8
+#define u16 __u16
+#define u32 __u32
+#define u64 __u64
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern int test_nr;
+extern int iteration_nr;
+
+/*
+ * printf() that is safe to call from a signal handler: inside a
+ * signal it falls back to write()ing the raw format string, because
+ * no printf-family function is async-signal-safe.
+ */
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static inline void sigsafe_printf(const char *format, ...)
+{
+       va_list ap;
+
+       if (!dprint_in_signal) {
+               va_start(ap, format);
+               vprintf(format, ap);
+               va_end(ap);
+       } else {
+               int ret;
+               /*
+                * No printf() functions are signal-safe.
+                * They deadlock easily. Write the format
+                * string to get some output, even if
+                * incomplete.
+                */
+               ret = write(1, format, strlen(format));
+               if (ret < 0)
+                       exit(1);
+       }
+}
+/* Emit only when "level" is at or below the compile-time DEBUG_LEVEL. */
+#define dprintf_level(level, args...) do {     \
+       if (level <= DEBUG_LEVEL)               \
+               sigsafe_printf(args);           \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern void abort_hooks(void);
+/* On failure: log location and errno, run abort_hooks(), then exit
+ * with the source line number as the status. */
+#define pkey_assert(condition) do {            \
+       if (!(condition)) {                     \
+               dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+                               __FILE__, __LINE__,     \
+                               test_nr, iteration_nr); \
+               dprintf0("errno at assert: %d", errno); \
+               abort_hooks();                  \
+               exit(__LINE__);                 \
+       }                                       \
+} while (0)
+
+__attribute__((noinline)) int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+int sys_pkey_free(unsigned long pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+               unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
+#define PKEY_MASK      (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+
+/* Return "reg" with pkey's two-bit field replaced by "flags". */
+static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+       u32 shift = pkey_bit_position(pkey);
+       /* mask out bits from pkey in old value */
+       reg &= ~((u64)PKEY_MASK << shift);
+       /* OR in new bits for pkey */
+       reg |= (flags & PKEY_MASK) << shift;
+       return reg;
+}
+
+/* Extract pkey's two-bit access/write-disable field from "reg". */
+static inline u64 get_pkey_bits(u64 reg, int pkey)
+{
+       u32 shift = pkey_bit_position(pkey);
+       /*
+        * shift down the relevant bits to the lowest two, then
+        * mask off all the other higher bits
+        */
+       return ((reg >> shift) & PKEY_MASK);
+}
+
+extern u64 shadow_pkey_reg;
+
+/*
+ * Read the hardware pkey register and assert it still matches the
+ * software shadow copy - catches any unexpected modification.
+ */
+static inline u64 _read_pkey_reg(int line)
+{
+       u64 pkey_reg = __read_pkey_reg();
+
+       dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
+                       " shadow: %016llx\n",
+                       line, pkey_reg, shadow_pkey_reg);
+       assert(pkey_reg == shadow_pkey_reg);
+
+       return pkey_reg;
+}
+
+/* Wrapper so the assert reports the caller's line number. */
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
+
+/*
+ * Update the hardware pkey register and keep the software shadow in
+ * sync; the preceding read_pkey_reg() verifies the shadow was still
+ * accurate before it is overwritten.
+ */
+static inline void write_pkey_reg(u64 pkey_reg)
+{
+       dprintf4("%s() changing %016llx to %016llx\n", __func__,
+                       __read_pkey_reg(), pkey_reg);
+       /* will do the shadow check for us: */
+       read_pkey_reg();
+       __write_pkey_reg(pkey_reg);
+       shadow_pkey_reg = pkey_reg;
+       dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
+                       pkey_reg, __read_pkey_reg());
+}
+
+/*
+ * These are technically racy. since something could
+ * change PKEY register between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+       u64 pkey_reg = read_pkey_reg();
+       int bit = pkey * 2;
+
+       /*
+        * Bit (pkey * 2) is the access-disable bit: clear it to allow
+        * access, set it to deny.  The previous "&= (1<<bit)" cleared
+        * every bit in the register *except* the target one.
+        */
+       if (do_allow)
+               pkey_reg &= ~(1ULL << bit);
+       else
+               pkey_reg |= (1ULL << bit);
+
+       write_pkey_reg(pkey_reg);
+       dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+}
+
+static inline void __pkey_write_allow(int pkey, int do_allow_write)
+{
+       u64 pkey_reg = read_pkey_reg();
+       int bit = pkey * 2 + 1;
+
+       /*
+        * Bit (pkey * 2 + 1) is the write-disable bit: clear it to
+        * allow writes, set it to deny.  The previous "&= (1<<bit)"
+        * cleared every bit in the register *except* the target one.
+        */
+       if (do_allow_write)
+               pkey_reg &= ~(1ULL << bit);
+       else
+               pkey_reg |= (1ULL << bit);
+
+       write_pkey_reg(pkey_reg);
+       dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+}
+
+#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to)  \
+       ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to)        \
+       ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...)     #x
+#define __stringify(x...)       __stringify_1(x)
+
+/*
+ * Locate the si_pkey field inside a siginfo_t: use the libc-provided
+ * member when available, otherwise the arch-specific byte offset.
+ */
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+       return &si->si_pkey;
+#else
+       return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
+}
+
+/* Probe kernel pkey support by attempting an actual pkey allocation. */
+static inline int kernel_has_pkeys(void)
+{
+       int pkey = sys_pkey_alloc(0, 0);
+       int supported = (pkey > 0);
+
+       if (supported)
+               sys_pkey_free(pkey);
+       return supported;
+}
+
+/* Protection keys are usable only when both CPU and kernel support them. */
+static inline int is_pkeys_supported(void)
+{
+       int supported = 1;
+
+       if (!cpu_has_pkeys()) {
+               dprintf1("SKIP: %s: no CPU support\n", __func__);
+               supported = 0;
+       } else if (!kernel_has_pkeys()) {
+               dprintf1("SKIP: %s: no kernel support\n", __func__);
+               supported = 0;
+       }
+
+       return supported;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/mm/pkey-powerpc.h b/tools/testing/selftests/mm/pkey-powerpc.h
new file mode 100644 (file)
index 0000000..1ebb586
--- /dev/null
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key      386
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc                384
+# define SYS_pkey_free         385
+#endif
+#define REG_IP_IDX             PT_NIP
+#define REG_TRAPNO             PT_TRAP
+#define gregs                  gp_regs
+#define fpregs                 fp_regs
+#define si_pkey_offset         0x20
+
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS    0x3  /* disable read and write */
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE     0x2
+
+#define NR_PKEYS               32
+#define NR_RESERVED_PKEYS_4K   27 /* pkey-0, pkey-1, exec-only-pkey
+                                     and 24 other keys that cannot be
+                                     represented in the PTE */
+#define NR_RESERVED_PKEYS_64K_3KEYS    3 /* PowerNV and KVM: pkey-0,
+                                            pkey-1 and exec-only key */
+#define NR_RESERVED_PKEYS_64K_4KEYS    4 /* PowerVM: pkey-0, pkey-1,
+                                            pkey-31 and exec-only key */
+#define PKEY_BITS_PER_PKEY     2
+#define HPAGE_SIZE             (1UL << 24)
+#define PAGE_SIZE              sysconf(_SC_PAGESIZE)
+
+/* powerpc numbers pkey fields from the top of the register down:
+ * pkey 0 occupies the highest two bits. */
+static inline u32 pkey_bit_position(int pkey)
+{
+       return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+/* Read SPR 0xd - the AMR (cf. the "amr" local in __write_pkey_reg). */
+static inline u64 __read_pkey_reg(void)
+{
+       u64 pkey_reg;
+
+       asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+       return pkey_reg;
+}
+
+/*
+ * Write the AMR via mtspr 0xd; the surrounding isync instructions
+ * order the update against neighbouring accesses.
+ */
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+       u64 amr = pkey_reg;
+
+       dprintf4("%s() changing %016llx to %016llx\n",
+                        __func__, __read_pkey_reg(), pkey_reg);
+
+       asm volatile("isync; mtspr 0xd, %0; isync"
+                    : : "r" ((unsigned long)(amr)) : "memory");
+
+       dprintf4("%s() pkey register after changing %016llx to %016llx\n",
+                       __func__, __read_pkey_reg(), pkey_reg);
+}
+
+/* Always claims support; powerpc offers no cheap userspace probe. */
+static inline int cpu_has_pkeys(void)
+{
+       /* No simple way to determine this */
+       return 1;
+}
+
+/*
+ * Heuristic PowerVM detection via device-tree nodes exposed in sysfs.
+ * NOTE(review): the chosen nodes presumably distinguish PowerVM from
+ * PowerNV/QEMU guests - verify against platform documentation.
+ * ("(void)" added: an empty C parameter list leaves the function
+ * unprototyped.)
+ */
+static inline bool arch_is_powervm(void)
+{
+       struct stat buf;
+
+       if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
+           (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
+           (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
+               return true;
+
+       return false;
+}
+
+/* Number of reserved pkeys, which depends on page size and platform. */
+static inline int get_arch_reserved_keys(void)
+{
+       if (sysconf(_SC_PAGESIZE) == 4096)
+               return NR_RESERVED_PKEYS_4K;
+       else
+               if (arch_is_powervm())
+                       return NR_RESERVED_PKEYS_64K_4KEYS;
+               else
+                       return NR_RESERVED_PKEYS_64K_3KEYS;
+}
+
+/*
+ * Deliberately a no-op on powerpc; see the comment in the body.
+ * NOTE(review): defined without "static" in a header - including this
+ * from more than one translation unit would produce duplicate symbols;
+ * confirm it is only ever included once.
+ */
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+       /*
+        * powerpc does not allow userspace to change permissions of exec-only
+        * keys since those keys are not allocated by userspace. The signal
+        * handler wont be able to reset the permissions, which means the code
+        * will infinitely continue to segfault here.
+        */
+       return;
+}
+
+/* 4-byte instructions * 16384 = 64K page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+/*
+ * mmap "size" anonymous bytes, clear subpage protections on the region
+ * (subpage_prot with a NULL map), then attach "pkey".  Returns the
+ * mapping, or PTR_ERR_ENOTSUP when the subpage_prot syscall is
+ * unavailable.
+ * NOTE(review): mprotect_pkey() is applied to only the first PAGE_SIZE
+ * bytes rather than the whole "size" - confirm this is intentional.
+ */
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int ret;
+
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       pkey_assert(pkey < NR_PKEYS);
+       ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       /* (void *)-1 is MAP_FAILED */
+       pkey_assert(ptr != (void *)-1);
+
+       ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+       if (ret) {
+               perror("subpage_perm");
+               return PTR_ERR_ENOTSUP;
+       }
+
+       ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+       pkey_assert(!ret);
+       record_pkey_malloc(ptr, size, prot);
+
+       dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+       return ptr;
+}
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h
new file mode 100644 (file)
index 0000000..72c14cd
--- /dev/null
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_X86_H
+#define _PKEYS_X86_H
+
+#ifdef __i386__
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key      380
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc                381
+# define SYS_pkey_free         382
+#endif
+
+#define REG_IP_IDX             REG_EIP
+#define si_pkey_offset         0x14
+
+#else
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key      329
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc                330
+# define SYS_pkey_free         331
+#endif
+
+#define REG_IP_IDX             REG_RIP
+#define si_pkey_offset         0x20
+
+#endif
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS   0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE    0x2
+#endif
+
+#define NR_PKEYS               16
+#define NR_RESERVED_PKEYS      2 /* pkey-0 and exec-only-pkey */
+#define PKEY_BITS_PER_PKEY     2
+#define HPAGE_SIZE             (1UL<<21)
+#define PAGE_SIZE              4096
+#define MB                     (1<<20)
+
+/* Emit one 4K page worth of no-op instructions (x86). */
+static inline void __page_o_noops(void)
+{
+       /* 8-bytes of instruction * 512 bytes = 1 page */
+       asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+/*
+ * Read the hardware PKRU register.  The .byte sequence 0f 01 ee is the
+ * RDPKRU opcode (ECX must be 0); the value is returned in EAX.
+ */
+static inline u64 __read_pkey_reg(void)
+{
+       unsigned int eax, edx;
+       unsigned int ecx = 0;
+       unsigned pkey_reg;
+
+       asm volatile(".byte 0x0f,0x01,0xee\n\t"
+                    : "=a" (eax), "=d" (edx)
+                    : "c" (ecx));
+       pkey_reg = eax;
+       return pkey_reg;
+}
+
+/*
+ * Write the hardware PKRU register.  The .byte sequence 0f 01 ef is the
+ * WRPKRU opcode (value in EAX, ECX and EDX must be 0), then verify the
+ * write by reading the register back.
+ */
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+       unsigned int eax = pkey_reg;
+       unsigned int ecx = 0;
+       unsigned int edx = 0;
+
+       dprintf4("%s() changing %016llx to %016llx\n", __func__,
+                       __read_pkey_reg(), pkey_reg);
+       asm volatile(".byte 0x0f,0x01,0xef\n\t"
+                    : : "a" (eax), "c" (ecx), "d" (edx));
+       assert(pkey_reg == __read_pkey_reg());
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */
+
+/*
+ * Check CPUID leaf 7 subleaf 0, ECX, for both the PKU feature bit and
+ * the OSPKE bit (kernel has enabled pkeys in CR4).  Returns 1 if both
+ * are present, 0 otherwise.
+ */
+static inline int cpu_has_pkeys(void)
+{
+       unsigned int eax;
+       unsigned int ebx;
+       unsigned int ecx;
+       unsigned int edx;
+
+       __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
+
+       if (!(ecx & X86_FEATURE_PKU)) {
+               dprintf2("cpu does not have PKU\n");
+               return 0;
+       }
+       if (!(ecx & X86_FEATURE_OSPKE)) {
+               dprintf2("cpu does not have OSPKE\n");
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * Query CPUID leaf 0xd subleaf 0 and return ECX, the maximum size of
+ * the XSAVE area across all supported state components.
+ */
+static inline int cpu_max_xsave_size(void)
+{
+       unsigned long XSTATE_CPUID = 0xd;
+       unsigned int eax;
+       unsigned int ebx;
+       unsigned int ecx;
+       unsigned int edx;
+
+       __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx);
+       return ecx;
+}
+
+/* Bit offset of a given pkey's 2-bit field within the pkey register. */
+static inline u32 pkey_bit_position(int pkey)
+{
+       return pkey * PKEY_BITS_PER_PKEY;
+}
+
+#define XSTATE_PKEY_BIT        (9)
+#define XSTATE_PKEY    0x200
+#define XSTATE_BV_OFFSET       512
+
+/*
+ * Return the byte offset of the PKRU component within the XSAVE area,
+ * from CPUID leaf 0xd, subleaf XSTATE_PKEY_BIT (EBX = offset,
+ * EAX = size).  Returns 0 if the component reports zero size.
+ */
+int pkey_reg_xstate_offset(void)
+{
+       unsigned int eax;
+       unsigned int ebx;
+       unsigned int ecx;
+       unsigned int edx;
+       int xstate_offset;
+       int xstate_size;
+       unsigned long XSTATE_CPUID = 0xd;
+       int leaf;
+
+       /* assume that XSTATE_PKEY is set in XCR0 */
+       leaf = XSTATE_PKEY_BIT;
+       {
+               __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
+
+               /*
+                * NOTE(review): leaf is fixed to XSTATE_PKEY_BIT just
+                * above, so this condition is always true; xstate_size
+                * and xstate_offset are therefore always initialized.
+                */
+               if (leaf == XSTATE_PKEY_BIT) {
+                       xstate_offset = ebx;
+                       xstate_size = eax;
+               }
+       }
+
+       if (xstate_size == 0) {
+               printf("could not find size/offset of PKEY in xsave state\n");
+               return 0;
+       }
+
+       return xstate_offset;
+}
+
+/* x86: fixed number of reserved pkeys (pkey-0 and the exec-only pkey). */
+static inline int get_arch_reserved_keys(void)
+{
+       return NR_RESERVED_PKEYS;
+}
+
+/*
+ * x86: read through p1 (which should be mapped exec-only) and expect
+ * the access to raise a pkey fault for 'pkey'.
+ */
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+       int ptr_contents;
+
+       ptr_contents = read_ptr(p1);
+       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+       expected_pkey_fault(pkey);
+}
+
+/* x86 has no subpage protection; report the allocator as unsupported. */
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+       return PTR_ERR_ENOTSUP;
+}
+
+#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c
new file mode 100644 (file)
index 0000000..95f403a
--- /dev/null
@@ -0,0 +1,1788 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ *
+ * There are examples in here of:
+ *  * how to set protection keys on memory
+ *  * how to set/clear bits in pkey registers (the rights register)
+ *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
+ *    information from the siginfo
+ *
+ * Things to add:
+ *     make sure KSM and KSM COW breaking works
+ *     prefault pages in at malloc, or not
+ *     protect MPX bounds tables with protection keys?
+ *     make sure VMA splitting/merging is working correctly
+ *     OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ *     look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ *     do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ *     gcc -mxsave      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ *     gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <linux/elf.h>
+#include <linux/futex.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+u64 shadow_pkey_reg;
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+/*
+ * Write the string 'str' into 'file' (like "echo str > file").
+ * Exits the process on any failure — used for ftrace/sysctl knobs.
+ */
+void cat_into_file(char *str, char *file)
+{
+       int fd = open(file, O_RDWR);
+       int ret;
+
+       dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+       /*
+        * these need to be raw because they are called under
+        * pkey_assert()
+        */
+       if (fd < 0) {
+               /* NOTE(review): prints 'str' but the open failure was on 'file' */
+               fprintf(stderr, "error opening '%s'\n", str);
+               perror("error: ");
+               exit(__LINE__);
+       }
+
+       ret = write(fd, str, strlen(str));
+       if (ret != strlen(str)) {
+               perror("write to file failed");
+               fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+               exit(__LINE__);
+       }
+       close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+/*
+ * Tracing control requires root.  Returns 1 when running as root,
+ * otherwise warns once and returns 0.
+ */
+int tracing_root_ok(void)
+{
+       if (geteuid() != 0) {
+               if (!warned_tracing)
+                       fprintf(stderr, "WARNING: not run as root, "
+                                       "can not do tracing control\n");
+               warned_tracing = 1;
+               return 0;
+       }
+       return 1;
+}
+#endif
+
+/*
+ * Enable ftrace function_graph tracing for this PID via debugfs.
+ * Compiled out (no-op) unless CONTROL_TRACING > 0; silently does
+ * nothing when not root.
+ */
+void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/debug/tracing"
+       char pidstr[32];
+
+       if (!tracing_root_ok())
+               return;
+
+       sprintf(pidstr, "%d", getpid());
+       cat_into_file("0", TRACEDIR "/tracing_on");
+       cat_into_file("\n", TRACEDIR "/trace");
+       /* the 'if (1)' keeps the 'nop' tracer branch around for quick toggling */
+       if (1) {
+               cat_into_file("function_graph", TRACEDIR "/current_tracer");
+               cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+       } else {
+               cat_into_file("nop", TRACEDIR "/current_tracer");
+       }
+       cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+       cat_into_file("1", TRACEDIR "/tracing_on");
+       dprintf1("enabled tracing\n");
+#endif
+}
+
+/* Disable ftrace tracing; no-op unless CONTROL_TRACING > 0 and root. */
+void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+       if (!tracing_root_ok())
+               return;
+       cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+#endif
+}
+
+/*
+ * Called on test failure (from pkey_assert()): stop tracing so the
+ * trace buffer keeps the failure, optionally sleep to allow attaching
+ * a debugger.
+ */
+void abort_hooks(void)
+{
+       fprintf(stderr, "running %s()...\n", __func__);
+       tracing_off();
+#ifdef SLEEP_ON_ABORT
+       sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions.  That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
+ */
+#ifdef __powerpc64__
+/* This way, both 4K and 64K alignment are maintained */
+__attribute__((__aligned__(65536)))
+#else
+__attribute__((__aligned__(PAGE_SIZE)))
+#endif
+void lots_o_noops_around_write(int *write_to_me)
+{
+       dprintf3("running %s()\n", __func__);
+       __page_o_noops();
+       /* Assume this happens in the second page of instructions: */
+       *write_to_me = __LINE__;
+       /* pad out by another page: */
+       __page_o_noops();
+       dprintf3("%s() done\n", __func__);
+}
+
+/* Hex-dump 'len_bytes' of memory at 'dumpme', one u64 per line. */
+void dump_mem(void *dumpme, int len_bytes)
+{
+       char *c = (void *)dumpme;
+       int i;
+
+       for (i = 0; i < len_bytes; i += sizeof(u64)) {
+               u64 *ptr = (u64 *)(c + i);
+               dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
+       }
+}
+
+/*
+ * Read the 2 rights bits for 'pkey' straight from the hardware pkey
+ * register.  'flags' is only logged; it is not used.
+ */
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+       u64 pkey_reg = __read_pkey_reg();
+
+       dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+                       __func__, pkey, flags, 0, 0);
+       dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
+
+       return (u32) get_pkey_bits(pkey_reg, pkey);
+}
+
+/*
+ * Set the rights bits for 'pkey' in the hardware pkey register.
+ * 'rights' may only contain PKEY_DISABLE_ACCESS/PKEY_DISABLE_WRITE;
+ * 'flags' is only logged.  Always returns 0.
+ */
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+       u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+       u64 old_pkey_reg = __read_pkey_reg();
+       u64 new_pkey_reg;
+
+       /* make sure that 'rights' only contains the bits we expect: */
+       assert(!(rights & ~mask));
+
+       /* modify bits accordingly in old pkey_reg and assign it */
+       new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
+
+       __write_pkey_reg(new_pkey_reg);
+
+       dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
+               " pkey_reg now: %016llx old_pkey_reg: %016llx\n",
+               __func__, pkey, rights, flags, 0, __read_pkey_reg(),
+               old_pkey_reg);
+       return 0;
+}
+
+/*
+ * OR the given disable flags (PKEY_DISABLE_ACCESS/WRITE) into 'pkey's
+ * rights in the hardware register and mirror the change into
+ * shadow_pkey_reg so later shadow checks still match.
+ */
+void pkey_disable_set(int pkey, int flags)
+{
+       unsigned long syscall_flags = 0;
+       int ret;
+       int pkey_rights;
+       u64 orig_pkey_reg = read_pkey_reg();
+
+       dprintf1("START->%s(%d, 0x%x)\n", __func__,
+               pkey, flags);
+       pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+       pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+
+       pkey_assert(pkey_rights >= 0);
+
+       pkey_rights |= flags;
+
+       ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
+       assert(!ret);
+       /* pkey_reg and flags have the same format */
+       shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+       dprintf1("%s(%d) shadow: 0x%016llx\n",
+               __func__, pkey, shadow_pkey_reg);
+
+       pkey_assert(ret >= 0);
+
+       pkey_rights = hw_pkey_get(pkey, syscall_flags);
+       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+
+       dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
+               __func__, pkey, read_pkey_reg());
+       /* setting disable bits can only raise the register's value */
+       if (flags)
+               pkey_assert(read_pkey_reg() >= orig_pkey_reg);
+       dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+               pkey, flags);
+}
+
+/*
+ * Clear the given disable flags from 'pkey's rights in the hardware
+ * register and mirror the change into shadow_pkey_reg.
+ */
+void pkey_disable_clear(int pkey, int flags)
+{
+       unsigned long syscall_flags = 0;
+       int ret;
+       int pkey_rights = hw_pkey_get(pkey, syscall_flags);
+       u64 orig_pkey_reg = read_pkey_reg();
+
+       pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+       pkey_assert(pkey_rights >= 0);
+
+       pkey_rights &= ~flags;
+
+       ret = hw_pkey_set(pkey, pkey_rights, 0);
+       shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+       pkey_assert(ret >= 0);
+
+       pkey_rights = hw_pkey_get(pkey, syscall_flags);
+       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+                       pkey, pkey, pkey_rights);
+
+       dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
+                       pkey, read_pkey_reg());
+       /* clearing disable bits can only lower the register's value */
+       if (flags)
+               assert(read_pkey_reg() <= orig_pkey_reg);
+}
+
+/* Convenience wrappers over pkey_disable_set()/pkey_disable_clear(). */
+void pkey_write_allow(int pkey)
+{
+       pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_write_deny(int pkey)
+{
+       pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+void pkey_access_allow(int pkey)
+{
+       pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+void pkey_access_deny(int pkey)
+{
+       pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
+}
+
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR           3
+#endif
+
+#ifndef SEGV_PKUERR
+# define SEGV_PKUERR           4
+#endif
+
+/* Map a SIGSEGV si_code value to its symbolic name, for log output. */
+static char *si_code_str(int si_code)
+{
+       if (si_code == SEGV_MAPERR)
+               return "SEGV_MAPERR";
+       if (si_code == SEGV_ACCERR)
+               return "SEGV_ACCERR";
+       if (si_code == SEGV_BNDERR)
+               return "SEGV_BNDERR";
+       if (si_code == SEGV_PKUERR)
+               return "SEGV_PKUERR";
+       return "UNKNOWN";
+}
+
+/* Count of pkey faults seen, and the pkey from the most recent one. */
+int pkey_faults;
+int last_si_pkey = -1;
+/*
+ * SIGSEGV handler: verify the fault is a pkey fault (exit on MAPERR/
+ * ACCERR/BNDERR), extract the pkey from siginfo, then clear the
+ * fault's cause so the faulting instruction can continue — on x86 by
+ * zeroing the PKRU image inside the signal frame's xsave area, on
+ * powerpc by re-allowing access to the pkey.
+ */
+void signal_handler(int signum, siginfo_t *si, void *vucontext)
+{
+       ucontext_t *uctxt = vucontext;
+       int trapno;
+       unsigned long ip;
+       char *fpregs;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+       u32 *pkey_reg_ptr;
+       int pkey_reg_offset;
+#endif /* arch */
+       u64 siginfo_pkey;
+       u32 *si_pkey_ptr;
+
+       dprint_in_signal = 1;
+       dprintf1(">>>>===============SIGSEGV============================\n");
+       dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+                       __func__, __LINE__,
+                       __read_pkey_reg(), shadow_pkey_reg);
+
+       trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
+       ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
+       fpregs = (char *) uctxt->uc_mcontext.fpregs;
+
+       dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+                       __func__, trapno, ip, si_code_str(si->si_code),
+                       si->si_code);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#ifdef __i386__
+       /*
+        * 32-bit has some extra padding so that userspace can tell whether
+        * the XSTATE header is present in addition to the "legacy" FPU
+        * state.  We just assume that it is here.
+        */
+       fpregs += 0x70;
+#endif /* i386 */
+       pkey_reg_offset = pkey_reg_xstate_offset();
+       pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
+
+       /*
+        * If we got a PKEY fault, we *HAVE* to have at least one bit set in
+        * here.
+        */
+       dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
+       if (DEBUG_LEVEL > 4)
+               dump_mem(pkey_reg_ptr - 128, 256);
+       pkey_assert(*pkey_reg_ptr);
+#endif /* arch */
+
+       dprintf1("siginfo: %p\n", si);
+       dprintf1(" fpregs: %p\n", fpregs);
+
+       /* any non-pkey SEGV is a genuine test failure */
+       if ((si->si_code == SEGV_MAPERR) ||
+           (si->si_code == SEGV_ACCERR) ||
+           (si->si_code == SEGV_BNDERR)) {
+               printf("non-PK si_code, exiting...\n");
+               exit(4);
+       }
+
+       si_pkey_ptr = siginfo_get_pkey_ptr(si);
+       dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+       dump_mem((u8 *)si_pkey_ptr - 8, 24);
+       siginfo_pkey = *si_pkey_ptr;
+       pkey_assert(siginfo_pkey < NR_PKEYS);
+       last_si_pkey = siginfo_pkey;
+
+       /*
+        * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+        * checking
+        */
+       dprintf1("signal pkey_reg from  pkey_reg: %016llx\n",
+                       __read_pkey_reg());
+       dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+       dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+       *(u64 *)pkey_reg_ptr = 0x00000000;
+       dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
+#elif defined(__powerpc64__) /* arch */
+       /* restore access and let the faulting instruction continue */
+       pkey_access_allow(siginfo_pkey);
+#endif /* arch */
+       pkey_faults++;
+       dprintf1("<<<<==================================================\n");
+       dprint_in_signal = 0;
+}
+
+/* Reap any one exited child; returns the waitpid() result. */
+int wait_all_children(void)
+{
+       int status;
+       return waitpid(-1, &status, 0);
+}
+
+/* SIGCHLD handler: just log; children are reaped elsewhere. */
+void sig_chld(int x)
+{
+       dprint_in_signal = 1;
+       dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
+       dprint_in_signal = 0;
+}
+
+/*
+ * Install signal_handler() (SA_SIGINFO) for SIGSEGV and SIGALRM,
+ * with the current signal mask blocked during the handler.
+ */
+void setup_sigsegv_handler(void)
+{
+       int r, rs;
+       struct sigaction newact;
+       struct sigaction oldact;
+
+       /* #PF is mapped to sigsegv */
+       int signum  = SIGSEGV;
+
+       newact.sa_handler = 0;
+       newact.sa_sigaction = signal_handler;
+
+       /*sigset_t - signals to block while in the handler */
+       /* get the old signal mask. */
+       rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+       pkey_assert(rs == 0);
+
+       /* call sa_sigaction, not sa_handler*/
+       newact.sa_flags = SA_SIGINFO;
+
+       newact.sa_restorer = 0;  /* void(*)(), obsolete */
+       /*
+        * NOTE(review): the SIGSEGV sigaction()'s return value is
+        * overwritten by the SIGALRM one before being checked.
+        */
+       r = sigaction(signum, &newact, &oldact);
+       r = sigaction(SIGALRM, &newact, &oldact);
+       pkey_assert(r == 0);
+}
+
+/* Install all signal handlers used by the test. */
+void setup_handlers(void)
+{
+       signal(SIGCHLD, &sig_chld);
+       setup_sigsegv_handler();
+}
+
+/*
+ * Fork a child that sleeps forever (used to exercise pkey state across
+ * fork).  Returns the child's pid in the parent; the child never
+ * returns.
+ */
+pid_t fork_lazy_child(void)
+{
+       pid_t forkret;
+
+       forkret = fork();
+       pkey_assert(forkret >= 0);
+       dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+       if (!forkret) {
+               /* in the child */
+               while (1) {
+                       dprintf1("child sleeping...\n");
+                       sleep(30);
+               }
+       }
+       return forkret;
+}
+
+/*
+ * Raw pkey_mprotect(2) wrapper: no asserts, just the syscall plus
+ * debug logging on failure.  Returns the syscall's return value.
+ */
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+               unsigned long pkey)
+{
+       int sret;
+
+       dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
+                       ptr, size, orig_prot, pkey);
+
+       errno = 0;
+       sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
+       if (errno) {
+               dprintf2("SYS_mprotect_key sret: %d\n", sret);
+               dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
+               dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
+               if (DEBUG_LEVEL >= 2)
+                       perror("SYS_mprotect_pkey");
+       }
+       return sret;
+}
+
+/* Raw pkey_alloc(2) wrapper with debug logging. */
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+       int ret = syscall(SYS_pkey_alloc, flags, init_val);
+       dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
+                       __func__, flags, init_val, ret, errno);
+       return ret;
+}
+
+/*
+ * Allocate a pkey via pkey_alloc(0, 0) and keep shadow_pkey_reg in
+ * sync: the kernel initializes the new key's register bits, so mirror
+ * that in the shadow (clear both bits, then apply init_val).
+ * Returns the new pkey, or the negative syscall result on failure.
+ */
+int alloc_pkey(void)
+{
+       int ret;
+       unsigned long init_val = 0x0;
+
+       dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+                       __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
+       ret = sys_pkey_alloc(0, init_val);
+       /*
+        * pkey_alloc() sets PKEY register, so we need to reflect it in
+        * shadow_pkey_reg:
+        */
+       dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                       " shadow: 0x%016llx\n",
+                       __func__, __LINE__, ret, __read_pkey_reg(),
+                       shadow_pkey_reg);
+       if (ret > 0) {
+               /* clear both the bits: */
+               shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+                                               ~PKEY_MASK);
+               dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                               " shadow: 0x%016llx\n",
+                               __func__,
+                               __LINE__, ret, __read_pkey_reg(),
+                               shadow_pkey_reg);
+               /*
+                * move the new state in from init_val
+                * (remember, we cheated and init_val == pkey_reg format)
+                */
+               shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+                                               init_val);
+       }
+       dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                       " shadow: 0x%016llx\n",
+                       __func__, __LINE__, ret, __read_pkey_reg(),
+                       shadow_pkey_reg);
+       dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
+       /* for shadow checking: */
+       read_pkey_reg();
+       dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                " shadow: 0x%016llx\n",
+               __func__, __LINE__, ret, __read_pkey_reg(),
+               shadow_pkey_reg);
+       return ret;
+}
+
+/* Raw pkey_free(2) wrapper with debug logging. */
+int sys_pkey_free(unsigned long pkey)
+{
+       int ret = syscall(SYS_pkey_free, pkey);
+       dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
+       return ret;
+}
+
+/*
+ * I had a bug where pkey bits could be set by mprotect() but
+ * not cleared.  This ensures we get lots of random bit sets
+ * and clears on the vma and pte pkey bits.
+ */
+/*
+ * Allocate every available pkey, pick one at random to keep, and free
+ * the rest.  Returns the kept pkey.
+ */
+int alloc_random_pkey(void)
+{
+       int max_nr_pkey_allocs;
+       int ret;
+       int i;
+       int alloced_pkeys[NR_PKEYS];
+       int nr_alloced = 0;
+       int random_index;
+       memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+
+       /* allocate every possible key and make a note of which ones we got */
+       max_nr_pkey_allocs = NR_PKEYS;
+       for (i = 0; i < max_nr_pkey_allocs; i++) {
+               int new_pkey = alloc_pkey();
+               if (new_pkey < 0)
+                       break;
+               alloced_pkeys[nr_alloced++] = new_pkey;
+       }
+
+       pkey_assert(nr_alloced > 0);
+       /* select a random one out of the allocated ones */
+       random_index = rand() % nr_alloced;
+       ret = alloced_pkeys[random_index];
+       /* now zero it out so we don't free it next */
+       alloced_pkeys[random_index] = 0;
+
+       /* go through the allocated ones that we did not want and free them */
+       for (i = 0; i < nr_alloced; i++) {
+               int free_ret;
+               if (!alloced_pkeys[i])
+                       continue;
+               free_ret = sys_pkey_free(alloced_pkeys[i]);
+               pkey_assert(!free_ret);
+       }
+       dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                        " shadow: 0x%016llx\n", __func__,
+                       __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+       return ret;
+}
+
+/*
+ * Checked pkey_mprotect() wrapper: asserts the pkey is valid and the
+ * syscall succeeds.  Returns the (asserted-zero) syscall result.
+ */
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+               unsigned long pkey)
+{
+       int nr_iterations = random() % 100;
+       int ret;
+
+       /*
+        * NOTE(review): 'while (0)' — this pkey-churn loop is compiled
+        * out; flip to 'while (1)' to re-enable the extra exercise.
+        */
+       while (0) {
+               int rpkey = alloc_random_pkey();
+               ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+               dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+                               ptr, size, orig_prot, pkey, ret);
+               if (nr_iterations-- < 0)
+                       break;
+
+               dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                       " shadow: 0x%016llx\n",
+                       __func__, __LINE__, ret, __read_pkey_reg(),
+                       shadow_pkey_reg);
+               sys_pkey_free(rpkey);
+               dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                       " shadow: 0x%016llx\n",
+                       __func__, __LINE__, ret, __read_pkey_reg(),
+                       shadow_pkey_reg);
+       }
+       pkey_assert(pkey < NR_PKEYS);
+
+       ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+       dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+                       ptr, size, orig_prot, pkey, ret);
+       pkey_assert(!ret);
+       dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+                       " shadow: 0x%016llx\n", __func__,
+                       __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+       return ret;
+}
+
+/* Bookkeeping for every pkey-protected allocation made by the test. */
+struct pkey_malloc_record {
+       void *ptr;      /* mapping start (NULL once freed) */
+       long size;      /* mapping length in bytes */
+       int prot;       /* PROT_* flags it was mapped with */
+};
+struct pkey_malloc_record *pkey_malloc_records;
+struct pkey_malloc_record *pkey_last_malloc_record;
+long nr_pkey_malloc_records;
+/*
+ * Record an allocation in pkey_malloc_records, growing the array
+ * (doubling + 1) when no slot is available.
+ */
+void record_pkey_malloc(void *ptr, long size, int prot)
+{
+       long i;
+       struct pkey_malloc_record *rec = NULL;
+
+       for (i = 0; i < nr_pkey_malloc_records; i++) {
+               rec = &pkey_malloc_records[i];
+               /* find a free record */
+               /*
+                * NOTE(review): 'if (rec)' is always true once the array
+                * is non-empty, so this always picks record[0] rather
+                * than scanning for a slot with rec->ptr == NULL —
+                * looks like it should test rec->ptr; confirm intent.
+                */
+               if (rec)
+                       break;
+       }
+       if (!rec) {
+               /* every record is full */
+               size_t old_nr_records = nr_pkey_malloc_records;
+               size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
+               size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
+               dprintf2("new_nr_records: %zd\n", new_nr_records);
+               dprintf2("new_size: %zd\n", new_size);
+               pkey_malloc_records = realloc(pkey_malloc_records, new_size);
+               pkey_assert(pkey_malloc_records != NULL);
+               rec = &pkey_malloc_records[nr_pkey_malloc_records];
+               /*
+                * realloc() does not initialize memory, so zero it from
+                * the first new record all the way to the end.
+                */
+               for (i = 0; i < new_nr_records - old_nr_records; i++)
+                       memset(rec + i, 0, sizeof(*rec));
+       }
+       dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
+               (int)(rec - pkey_malloc_records), rec, ptr, size);
+       rec->ptr = ptr;
+       rec->size = size;
+       rec->prot = prot;
+       pkey_last_malloc_record = rec;
+       nr_pkey_malloc_records++;
+}
+
+/*
+ * Find the recorded allocation containing 'ptr' (which may point into
+ * the middle of the mapping), munmap it, and clear the record.
+ * Asserts if no record covers 'ptr'.
+ */
+void free_pkey_malloc(void *ptr)
+{
+       long i;
+       int ret;
+       dprintf3("%s(%p)\n", __func__, ptr);
+       for (i = 0; i < nr_pkey_malloc_records; i++) {
+               struct pkey_malloc_record *rec = &pkey_malloc_records[i];
+               dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
+                               ptr, i, rec, rec->ptr, rec->size);
+               /* skip records whose range does not contain ptr */
+               if ((ptr <  rec->ptr) ||
+                   (ptr >= rec->ptr + rec->size))
+                       continue;
+
+               dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
+                               ptr, i, rec, rec->ptr, rec->size);
+               nr_pkey_malloc_records--;
+               ret = munmap(rec->ptr, rec->size);
+               dprintf3("munmap ret: %d\n", ret);
+               pkey_assert(!ret);
+               dprintf3("clearing rec->ptr, rec: %p\n", rec);
+               rec->ptr = NULL;
+               dprintf3("done clearing rec->ptr, rec: %p\n", rec);
+               return;
+       }
+       pkey_assert(false);
+}
+
+
+/*
+ * Basic allocator: anonymous mmap, then pkey_mprotect().  Note only
+ * the first PAGE_SIZE bytes of the mapping get the pkey applied.
+ */
+void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int ret;
+
+       read_pkey_reg();
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       pkey_assert(pkey < NR_PKEYS);
+       ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       pkey_assert(ptr != (void *)-1);
+       ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+       pkey_assert(!ret);
+       record_pkey_malloc(ptr, size, prot);
+       read_pkey_reg();
+
+       dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+       return ptr;
+}
+
+/*
+ * Allocator targeting anonymous transparent huge pages: over-allocate
+ * by 2*HPAGE_SIZE, align up to a huge-page boundary, madvise
+ * MADV_HUGEPAGE, and touch the memory to fault it in.
+ */
+void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
+{
+       int ret;
+       void *ptr;
+
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       /*
+        * Guarantee we can fit at least one huge page in the resulting
+        * allocation by allocating space for 2:
+        */
+       size = ALIGN_UP(size, HPAGE_SIZE * 2);
+       ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       pkey_assert(ptr != (void *)-1);
+       /* record the unaligned base so free_pkey_malloc() unmaps it all */
+       record_pkey_malloc(ptr, size, prot);
+       mprotect_pkey(ptr, size, prot, pkey);
+
+       dprintf1("unaligned ptr: %p\n", ptr);
+       ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
+       dprintf1("  aligned ptr: %p\n", ptr);
+       ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
+       dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
+       ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
+       dprintf1("MADV_WILLNEED ret: %d\n", ret);
+       memset(ptr, 0, HPAGE_SIZE);
+
+       dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
+       return ptr;
+}
+
+/* Set once setup_hugetlbfs() confirms huge pages are available. */
+int hugetlb_setup_ok;
+#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
+#define GET_NR_HUGE_PAGES 10
+/*
+ * Reserve GET_NR_HUGE_PAGES PMD-sized huge pages via /proc and confirm
+ * through sysfs that they were actually reserved at that size.  Needs
+ * root; on any failure the hugetlb tests are simply skipped
+ * (hugetlb_setup_ok stays 0).
+ */
+void setup_hugetlbfs(void)
+{
+       int err;
+       int fd;
+       char buf[256];
+       long hpagesz_kb;
+       long hpagesz_mb;
+
+       if (geteuid() != 0) {
+               fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
+               return;
+       }
+
+       cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
+
+       /*
+        * Now go make sure that we got the pages and that they
+        * are PMD-level pages. Someone might have made PUD-level
+        * pages the default.
+        */
+       hpagesz_kb = HPAGE_SIZE / 1024;
+       hpagesz_mb = hpagesz_kb / 1024;
+       sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+       fd = open(buf, O_RDONLY);
+       if (fd < 0) {
+               fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+                       hpagesz_mb, strerror(errno));
+               return;
+       }
+
+       /* -1 to guarantee leaving the trailing \0 */
+       err = read(fd, buf, sizeof(buf)-1);
+       close(fd);
+       if (err <= 0) {
+               fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+                       hpagesz_mb, strerror(errno));
+               return;
+       }
+
+       if (atoi(buf) != GET_NR_HUGE_PAGES) {
+               fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
+                       hpagesz_mb, buf, GET_NR_HUGE_PAGES);
+               return;
+       }
+
+       hugetlb_setup_ok = 1;
+}
+
+/*
+ * Allocator backed by MAP_HUGETLB pages.  Returns PTR_ERR_ENOTSUP
+ * unless setup_hugetlbfs() succeeded earlier.
+ */
+void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+       if (!hugetlb_setup_ok)
+               return PTR_ERR_ENOTSUP;
+
+       dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+       size = ALIGN_UP(size, HPAGE_SIZE * 2);
+       pkey_assert(pkey < NR_PKEYS);
+       ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+       pkey_assert(ptr != (void *)-1);
+       mprotect_pkey(ptr, size, prot, pkey);
+
+       record_pkey_malloc(ptr, size, prot);
+
+       dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+       return ptr;
+}
+
+void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
+{
+       void *ptr;
+       int fd;
+
+       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+                       size, prot, pkey);
+       pkey_assert(pkey < NR_PKEYS);
+       fd = open("/dax/foo", O_RDWR);
+       pkey_assert(fd >= 0);
+
+       ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
+       pkey_assert(ptr != (void *)-1);
+
+       mprotect_pkey(ptr, size, prot, pkey);
+
+       record_pkey_malloc(ptr, size, prot);
+
+       dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
+       close(fd);
+       return ptr;
+}
+
+void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+       malloc_pkey_with_mprotect,
+       malloc_pkey_with_mprotect_subpage,
+       malloc_pkey_anon_huge,
+       malloc_pkey_hugetlb
+/* can not do direct with the pkey_mprotect() API:
+       malloc_pkey_mmap_direct,
+       malloc_pkey_mmap_dax,
+*/
+};
+
+void *malloc_pkey(long size, int prot, u16 pkey)
+{
+       void *ret;
+       static int malloc_type;
+       int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+
+       pkey_assert(pkey < NR_PKEYS);
+
+       while (1) {
+               pkey_assert(malloc_type < nr_malloc_types);
+
+               ret = pkey_malloc[malloc_type](size, prot, pkey);
+               pkey_assert(ret != (void *)-1);
+
+               malloc_type++;
+               if (malloc_type >= nr_malloc_types)
+                       malloc_type = (random()%nr_malloc_types);
+
+               /* try again if the malloc_type we tried is unsupported */
+               if (ret == PTR_ERR_ENOTSUP)
+                       continue;
+
+               break;
+       }
+
+       dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+                       size, prot, pkey, ret);
+       return ret;
+}
+
+int last_pkey_faults;
+#define UNKNOWN_PKEY -2
+void expected_pkey_fault(int pkey)
+{
+       dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
+                       __func__, last_pkey_faults, pkey_faults);
+       dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+       pkey_assert(last_pkey_faults + 1 == pkey_faults);
+
+       /*
+       * For exec-only memory, we do not know the pkey in
+       * advance, so skip this check.
+       */
+       if (pkey != UNKNOWN_PKEY)
+               pkey_assert(last_si_pkey == pkey);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+       /*
+        * The signal handler shold have cleared out PKEY register to let the
+        * test program continue.  We now have to restore it.
+        */
+       if (__read_pkey_reg() != 0)
+#else /* arch */
+       if (__read_pkey_reg() != shadow_pkey_reg)
+#endif /* arch */
+               pkey_assert(0);
+
+       __write_pkey_reg(shadow_pkey_reg);
+       dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
+                      "nuked it\n", __func__, shadow_pkey_reg);
+       last_pkey_faults = pkey_faults;
+       last_si_pkey = -1;
+}
+
+#define do_not_expect_pkey_fault(msg)  do {                    \
+       if (last_pkey_faults != pkey_faults)                    \
+               dprintf0("unexpected PKey fault: %s\n", msg);   \
+       pkey_assert(last_pkey_faults == pkey_faults);           \
+} while (0)
+
+int test_fds[10] = { -1 };
+int nr_test_fds;
+void __save_test_fd(int fd)
+{
+       pkey_assert(fd >= 0);
+       pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
+       test_fds[nr_test_fds] = fd;
+       nr_test_fds++;
+}
+
+int get_test_read_fd(void)
+{
+       int test_fd = open("/etc/passwd", O_RDONLY);
+       __save_test_fd(test_fd);
+       return test_fd;
+}
+
+void close_test_fds(void)
+{
+       int i;
+
+       for (i = 0; i < nr_test_fds; i++) {
+               if (test_fds[i] < 0)
+                       continue;
+               close(test_fds[i]);
+               test_fds[i] = -1;
+       }
+       nr_test_fds = 0;
+}
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+__attribute__((noinline)) int read_ptr(int *ptr)
+{
+       /*
+        * Keep GCC from optimizing this away somehow
+        */
+       barrier();
+       return *ptr;
+}
+
+void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+       int i, err;
+       int max_nr_pkey_allocs;
+       int alloced_pkeys[NR_PKEYS];
+       int nr_alloced = 0;
+       long size;
+
+       pkey_assert(pkey_last_malloc_record);
+       size = pkey_last_malloc_record->size;
+       /*
+        * This is a bit of a hack.  But mprotect() requires
+        * huge-page-aligned sizes when operating on hugetlbfs.
+        * So, make sure that we use something that's a multiple
+        * of a huge page when we can.
+        */
+       if (size >= HPAGE_SIZE)
+               size = HPAGE_SIZE;
+
+       /* allocate every possible key and make sure key-0 never got allocated */
+       max_nr_pkey_allocs = NR_PKEYS;
+       for (i = 0; i < max_nr_pkey_allocs; i++) {
+               int new_pkey = alloc_pkey();
+               pkey_assert(new_pkey != 0);
+
+               if (new_pkey < 0)
+                       break;
+               alloced_pkeys[nr_alloced++] = new_pkey;
+       }
+       /* free all the allocated keys */
+       for (i = 0; i < nr_alloced; i++) {
+               int free_ret;
+
+               if (!alloced_pkeys[i])
+                       continue;
+               free_ret = sys_pkey_free(alloced_pkeys[i]);
+               pkey_assert(!free_ret);
+       }
+
+       /* attach key-0 in various modes */
+       err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+       pkey_assert(!err);
+       err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+       pkey_assert(!err);
+       err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+       pkey_assert(!err);
+       err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+       pkey_assert(!err);
+       err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+       pkey_assert(!err);
+}
+
+void test_read_of_write_disabled_region(int *ptr, u16 pkey)
+{
+       int ptr_contents;
+
+       dprintf1("disabling write access to PKEY[1], doing read\n");
+       pkey_write_deny(pkey);
+       ptr_contents = read_ptr(ptr);
+       dprintf1("*ptr: %d\n", ptr_contents);
+       dprintf1("\n");
+}
+void test_read_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       int ptr_contents;
+
+       dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
+       read_pkey_reg();
+       pkey_access_deny(pkey);
+       ptr_contents = read_ptr(ptr);
+       dprintf1("*ptr: %d\n", ptr_contents);
+       expected_pkey_fault(pkey);
+}
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+               u16 pkey)
+{
+       int ptr_contents;
+
+       dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+                               pkey, ptr);
+       ptr_contents = read_ptr(ptr);
+       dprintf1("reading ptr before disabling the read : %d\n",
+                       ptr_contents);
+       read_pkey_reg();
+       pkey_access_deny(pkey);
+       ptr_contents = read_ptr(ptr);
+       dprintf1("*ptr: %d\n", ptr_contents);
+       expected_pkey_fault(pkey);
+}
+
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+               u16 pkey)
+{
+       *ptr = __LINE__;
+       dprintf1("disabling write access; after accessing the page, "
+               "to PKEY[%02d], doing write\n", pkey);
+       pkey_write_deny(pkey);
+       *ptr = __LINE__;
+       expected_pkey_fault(pkey);
+}
+
+void test_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+       dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
+       pkey_write_deny(pkey);
+       *ptr = __LINE__;
+       expected_pkey_fault(pkey);
+}
+void test_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
+       pkey_access_deny(pkey);
+       *ptr = __LINE__;
+       expected_pkey_fault(pkey);
+}
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+                       u16 pkey)
+{
+       *ptr = __LINE__;
+       dprintf1("disabling access; after accessing the page, "
+               " to PKEY[%02d], doing write\n", pkey);
+       pkey_access_deny(pkey);
+       *ptr = __LINE__;
+       expected_pkey_fault(pkey);
+}
+
+void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       int ret;
+       int test_fd = get_test_read_fd();
+
+       dprintf1("disabling access to PKEY[%02d], "
+                "having kernel read() to buffer\n", pkey);
+       pkey_access_deny(pkey);
+       ret = read(test_fd, ptr, 1);
+       dprintf1("read ret: %d\n", ret);
+       pkey_assert(ret);
+}
+void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+       int ret;
+       int test_fd = get_test_read_fd();
+
+       pkey_write_deny(pkey);
+       ret = read(test_fd, ptr, 100);
+       dprintf1("read ret: %d\n", ret);
+       if (ret < 0 && (DEBUG_LEVEL > 0))
+               perror("verbose read result (OK for this to be bad)");
+       pkey_assert(ret);
+}
+
+void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
+{
+       int pipe_ret, vmsplice_ret;
+       struct iovec iov;
+       int pipe_fds[2];
+
+       pipe_ret = pipe(pipe_fds);
+
+       pkey_assert(pipe_ret == 0);
+       dprintf1("disabling access to PKEY[%02d], "
+                "having kernel vmsplice from buffer\n", pkey);
+       pkey_access_deny(pkey);
+       iov.iov_base = ptr;
+       iov.iov_len = PAGE_SIZE;
+       vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
+       dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
+       pkey_assert(vmsplice_ret == -1);
+
+       close(pipe_fds[0]);
+       close(pipe_fds[1]);
+}
+
+void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
+{
+       int ignored = 0xdada;
+       int futex_ret;
+       int some_int = __LINE__;
+
+       dprintf1("disabling write to PKEY[%02d], "
+                "doing futex gunk in buffer\n", pkey);
+       *ptr = some_int;
+       pkey_write_deny(pkey);
+       futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
+                       &ignored, ignored);
+       if (DEBUG_LEVEL > 0)
+               perror("futex");
+       dprintf1("futex() ret: %d\n", futex_ret);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
+{
+       int err;
+       int i;
+
+       /* Note: 0 is the default pkey, so don't mess with it */
+       for (i = 1; i < NR_PKEYS; i++) {
+               if (pkey == i)
+                       continue;
+
+               dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
+               err = sys_pkey_free(i);
+               pkey_assert(err);
+
+               err = sys_pkey_free(i);
+               pkey_assert(err);
+
+               err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
+               pkey_assert(err);
+       }
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
+{
+       int err;
+       int bad_pkey = NR_PKEYS+99;
+
+       /* pass a known-invalid pkey in: */
+       err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+       pkey_assert(err);
+}
+
+void become_child(void)
+{
+       pid_t forkret;
+
+       forkret = fork();
+       pkey_assert(forkret >= 0);
+       dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+       if (!forkret) {
+               /* in the child */
+               return;
+       }
+       exit(0);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+       int err;
+       int allocated_pkeys[NR_PKEYS] = {0};
+       int nr_allocated_pkeys = 0;
+       int i;
+
+       for (i = 0; i < NR_PKEYS*3; i++) {
+               int new_pkey;
+               dprintf1("%s() alloc loop: %d\n", __func__, i);
+               new_pkey = alloc_pkey();
+               dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
+                               " shadow: 0x%016llx\n",
+                               __func__, __LINE__, err, __read_pkey_reg(),
+                               shadow_pkey_reg);
+               read_pkey_reg(); /* for shadow checking */
+               dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+               if ((new_pkey == -1) && (errno == ENOSPC)) {
+                       dprintf2("%s() failed to allocate pkey after %d tries\n",
+                               __func__, nr_allocated_pkeys);
+               } else {
+                       /*
+                        * Ensure the number of successes never
+                        * exceeds the number of keys supported
+                        * in the hardware.
+                        */
+                       pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+                       allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+               }
+
+               /*
+                * Make sure that allocation state is properly
+                * preserved across fork().
+                */
+               if (i == NR_PKEYS*2)
+                       become_child();
+       }
+
+       dprintf3("%s()::%d\n", __func__, __LINE__);
+
+       /*
+        * On x86:
+        * There are 16 pkeys supported in hardware.  Three are
+        * allocated by the time we get here:
+        *   1. The default key (0)
+        *   2. One possibly consumed by an execute-only mapping.
+        *   3. One allocated by the test code and passed in via
+        *      'pkey' to this function.
+        * Ensure that we can allocate at least another 13 (16-3).
+        *
+        * On powerpc:
+        * There are either 5, 28, 29 or 32 pkeys supported in
+        * hardware depending on the page size (4K or 64K) and
+        * platform (powernv or powervm). Four are allocated by
+        * the time we get here. These include pkey-0, pkey-1,
+        * exec-only pkey and the one allocated by the test code.
+        * Ensure that we can allocate the remaining.
+        */
+       pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
+
+       for (i = 0; i < nr_allocated_pkeys; i++) {
+               err = sys_pkey_free(allocated_pkeys[i]);
+               pkey_assert(!err);
+               read_pkey_reg(); /* for shadow checking */
+       }
+}
+
+void arch_force_pkey_reg_init(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+       u64 *buf;
+
+       /*
+        * All keys should be allocated and set to allow reads and
+        * writes, so the register should be all 0.  If not, just
+        * skip the test.
+        */
+       if (read_pkey_reg())
+               return;
+
+       /*
+        * Just allocate an absurd about of memory rather than
+        * doing the XSAVE size enumeration dance.
+        */
+       buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+
+       /* These __builtins require compiling with -mxsave */
+
+       /* XSAVE to build a valid buffer: */
+       __builtin_ia32_xsave(buf, XSTATE_PKEY);
+       /* Clear XSTATE_BV[PKRU]: */
+       buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
+       /* XRSTOR will likely get PKRU back to the init state: */
+       __builtin_ia32_xrstor(buf, XSTATE_PKEY);
+
+       munmap(buf, 1*MB);
+#endif
+}
+
+
+/*
+ * This is mostly useless on ppc for now.  But it will not
+ * hurt anything and should give some better coverage as
+ * a long-running test that continually checks the pkey
+ * register.
+ */
+void test_pkey_init_state(int *ptr, u16 pkey)
+{
+       int err;
+       int allocated_pkeys[NR_PKEYS] = {0};
+       int nr_allocated_pkeys = 0;
+       int i;
+
+       for (i = 0; i < NR_PKEYS; i++) {
+               int new_pkey = alloc_pkey();
+
+               if (new_pkey < 0)
+                       continue;
+               allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+       }
+
+       dprintf3("%s()::%d\n", __func__, __LINE__);
+
+       arch_force_pkey_reg_init();
+
+       /*
+        * Loop for a bit, hoping to get exercise the kernel
+        * context switch code.
+        */
+       for (i = 0; i < 1000000; i++)
+               read_pkey_reg();
+
+       for (i = 0; i < nr_allocated_pkeys; i++) {
+               err = sys_pkey_free(allocated_pkeys[i]);
+               pkey_assert(!err);
+               read_pkey_reg(); /* for shadow checking */
+       }
+}
+
+/*
+ * pkey 0 is special.  It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first.  Make sure that it
+ * is usable.
+ */
+void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+       long size;
+       int prot;
+
+       assert(pkey_last_malloc_record);
+       size = pkey_last_malloc_record->size;
+       /*
+        * This is a bit of a hack.  But mprotect() requires
+        * huge-page-aligned sizes when operating on hugetlbfs.
+        * So, make sure that we use something that's a multiple
+        * of a huge page when we can.
+        */
+       if (size >= HPAGE_SIZE)
+               size = HPAGE_SIZE;
+       prot = pkey_last_malloc_record->prot;
+
+       /* Use pkey 0 */
+       mprotect_pkey(ptr, size, prot, 0);
+
+       /* Make sure that we can set it back to the original pkey. */
+       mprotect_pkey(ptr, size, prot, pkey);
+}
+
+void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+       __attribute__((__unused__)) int peek_result;
+       pid_t child_pid;
+       void *ignored = 0;
+       long ret;
+       int status;
+       /*
+        * This is the "control" for our little expermient.  Make sure
+        * we can always access it when ptracing.
+        */
+       int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+       int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+       /*
+        * Fork a child which is an exact copy of this process, of course.
+        * That means we can do all of our tests via ptrace() and then plain
+        * memory access and ensure they work differently.
+        */
+       child_pid = fork_lazy_child();
+       dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
+
+       ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
+       if (ret)
+               perror("attach");
+       dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
+       pkey_assert(ret != -1);
+       ret = waitpid(child_pid, &status, WUNTRACED);
+       if ((ret != child_pid) || !(WIFSTOPPED(status))) {
+               fprintf(stderr, "weird waitpid result %ld stat %x\n",
+                               ret, status);
+               pkey_assert(0);
+       }
+       dprintf2("waitpid ret: %ld\n", ret);
+       dprintf2("waitpid status: %d\n", status);
+
+       pkey_access_deny(pkey);
+       pkey_write_deny(pkey);
+
+       /* Write access, untested for now:
+       ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
+       pkey_assert(ret != -1);
+       dprintf1("poke at %p: %ld\n", peek_at, ret);
+       */
+
+       /*
+        * Try to access the pkey-protected "ptr" via ptrace:
+        */
+       ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
+       /* expect it to work, without an error: */
+       pkey_assert(ret != -1);
+       /* Now access from the current task, and expect an exception: */
+       peek_result = read_ptr(ptr);
+       expected_pkey_fault(pkey);
+
+       /*
+        * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
+        */
+       ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
+       /* expect it to work, without an error: */
+       pkey_assert(ret != -1);
+       /* Now access from the current task, and expect NO exception: */
+       peek_result = read_ptr(plain_ptr);
+       do_not_expect_pkey_fault("read plain pointer after ptrace");
+
+       ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
+       pkey_assert(ret != -1);
+
+       ret = kill(child_pid, SIGKILL);
+       pkey_assert(ret != -1);
+
+       wait(&status);
+
+       free(plain_ptr_unaligned);
+}
+
+void *get_pointer_to_instructions(void)
+{
+       void *p1;
+
+       p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
+       dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
+       /* lots_o_noops_around_write should be page-aligned already */
+       assert(p1 == &lots_o_noops_around_write);
+
+       /* Point 'p1' at the *second* page of the function: */
+       p1 += PAGE_SIZE;
+
+       /*
+        * Try to ensure we fault this in on next touch to ensure
+        * we get an instruction fault as opposed to a data one
+        */
+       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+
+       return p1;
+}
+
+void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+       void *p1;
+       int scratch;
+       int ptr_contents;
+       int ret;
+
+       p1 = get_pointer_to_instructions();
+       lots_o_noops_around_write(&scratch);
+       ptr_contents = read_ptr(p1);
+       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+       ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
+       pkey_assert(!ret);
+       pkey_access_deny(pkey);
+
+       dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+       /*
+        * Make sure this is an *instruction* fault
+        */
+       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+       lots_o_noops_around_write(&scratch);
+       do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+       expect_fault_on_read_execonly_key(p1, pkey);
+}
+
+void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
+{
+       void *p1;
+       int scratch;
+       int ptr_contents;
+       int ret;
+
+       dprintf1("%s() start\n", __func__);
+
+       p1 = get_pointer_to_instructions();
+       lots_o_noops_around_write(&scratch);
+       ptr_contents = read_ptr(p1);
+       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+       /* Use a *normal* mprotect(), not mprotect_pkey(): */
+       ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
+       pkey_assert(!ret);
+
+       /*
+        * Reset the shadow, assuming that the above mprotect()
+        * correctly changed PKRU, but to an unknown value since
+        * the actual allocated pkey is unknown.
+        */
+       shadow_pkey_reg = __read_pkey_reg();
+
+       dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+       /* Make sure this is an *instruction* fault */
+       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+       lots_o_noops_around_write(&scratch);
+       do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+       expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
+
+       /*
+        * Put the memory back to non-PROT_EXEC.  Should clear the
+        * exec-only pkey off the VMA and allow it to be readable
+        * again.  Go to PROT_NONE first to check for a kernel bug
+        * that did not clear the pkey when doing PROT_NONE.
+        */
+       ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
+       pkey_assert(!ret);
+
+       ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
+       pkey_assert(!ret);
+       ptr_contents = read_ptr(p1);
+       do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+void test_ptrace_modifies_pkru(int *ptr, u16 pkey)
+{
+       u32 new_pkru;
+       pid_t child;
+       int status, ret;
+       int pkey_offset = pkey_reg_xstate_offset();
+       size_t xsave_size = cpu_max_xsave_size();
+       void *xsave;
+       u32 *pkey_register;
+       u64 *xstate_bv;
+       struct iovec iov;
+
+       new_pkru = ~read_pkey_reg();
+       /* Don't make PROT_EXEC mappings inaccessible */
+       new_pkru &= ~3;
+
+       child = fork();
+       pkey_assert(child >= 0);
+       dprintf3("[%d] fork() ret: %d\n", getpid(), child);
+       if (!child) {
+               ptrace(PTRACE_TRACEME, 0, 0, 0);
+               /* Stop and allow the tracer to modify PKRU directly */
+               raise(SIGSTOP);
+
+               /*
+                * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+                * checking
+                */
+               if (__read_pkey_reg() != new_pkru)
+                       exit(1);
+
+               /* Stop and allow the tracer to clear XSTATE_BV for PKRU */
+               raise(SIGSTOP);
+
+               if (__read_pkey_reg() != 0)
+                       exit(1);
+
+               /* Stop and allow the tracer to examine PKRU */
+               raise(SIGSTOP);
+
+               exit(0);
+       }
+
+       pkey_assert(child == waitpid(child, &status, 0));
+       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+       pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+       xsave = (void *)malloc(xsave_size);
+       pkey_assert(xsave > 0);
+
+       /* Modify the PKRU register directly */
+       iov.iov_base = xsave;
+       iov.iov_len = xsave_size;
+       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+
+       pkey_register = (u32 *)(xsave + pkey_offset);
+       pkey_assert(*pkey_register == read_pkey_reg());
+
+       *pkey_register = new_pkru;
+
+       ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+
+       /* Test that the modification is visible in ptrace before any execution */
+       memset(xsave, 0xCC, xsave_size);
+       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+       pkey_assert(*pkey_register == new_pkru);
+
+       /* Execute the tracee */
+       ret = ptrace(PTRACE_CONT, child, 0, 0);
+       pkey_assert(ret == 0);
+
+       /* Test that the tracee saw the PKRU value change */
+       pkey_assert(child == waitpid(child, &status, 0));
+       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+       pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+       /* Test that the modification is visible in ptrace after execution */
+       memset(xsave, 0xCC, xsave_size);
+       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+       pkey_assert(*pkey_register == new_pkru);
+
+       /* Clear the PKRU bit from XSTATE_BV */
+       xstate_bv = (u64 *)(xsave + 512);
+       *xstate_bv &= ~(1 << 9);
+
+       ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+
+       /* Test that the modification is visible in ptrace before any execution */
+       memset(xsave, 0xCC, xsave_size);
+       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+       pkey_assert(*pkey_register == 0);
+
+       ret = ptrace(PTRACE_CONT, child, 0, 0);
+       pkey_assert(ret == 0);
+
+       /* Test that the tracee saw the PKRU value go to 0 */
+       pkey_assert(child == waitpid(child, &status, 0));
+       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+       pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+       /* Test that the modification is visible in ptrace after execution */
+       memset(xsave, 0xCC, xsave_size);
+       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
+       pkey_assert(ret == 0);
+       pkey_assert(*pkey_register == 0);
+
+       ret = ptrace(PTRACE_CONT, child, 0, 0);
+       pkey_assert(ret == 0);
+       pkey_assert(child == waitpid(child, &status, 0));
+       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
+       pkey_assert(WIFEXITED(status));
+       pkey_assert(WEXITSTATUS(status) == 0);
+       free(xsave);
+}
+#endif
+
+void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
+{
+       int size = PAGE_SIZE;
+       int sret;
+
+       if (cpu_has_pkeys()) {
+               dprintf1("SKIP: %s: no CPU support\n", __func__);
+               return;
+       }
+
+       sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
+       pkey_assert(sret < 0);
+}
+
+void (*pkey_tests[])(int *ptr, u16 pkey) = {
+       test_read_of_write_disabled_region,
+       test_read_of_access_disabled_region,
+       test_read_of_access_disabled_region_with_page_already_mapped,
+       test_write_of_write_disabled_region,
+       test_write_of_write_disabled_region_with_page_already_mapped,
+       test_write_of_access_disabled_region,
+       test_write_of_access_disabled_region_with_page_already_mapped,
+       test_kernel_write_of_access_disabled_region,
+       test_kernel_write_of_write_disabled_region,
+       test_kernel_gup_of_access_disabled_region,
+       test_kernel_gup_write_to_write_disabled_region,
+       test_executing_on_unreadable_memory,
+       test_implicit_mprotect_exec_only_memory,
+       test_mprotect_with_pkey_0,
+       test_ptrace_of_child,
+       test_pkey_init_state,
+       test_pkey_syscalls_on_non_allocated_pkey,
+       test_pkey_syscalls_bad_args,
+       test_pkey_alloc_exhaust,
+       test_pkey_alloc_free_attach_pkey0,
+#if defined(__i386__) || defined(__x86_64__)
+       test_ptrace_modifies_pkru,
+#endif
+};
+
+void run_tests_once(void)
+{
+       int *ptr;
+       int prot = PROT_READ|PROT_WRITE;
+
+       for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
+               int pkey;
+               int orig_pkey_faults = pkey_faults;
+
+               dprintf1("======================\n");
+               dprintf1("test %d preparing...\n", test_nr);
+
+               tracing_on();
+               pkey = alloc_random_pkey();
+               dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
+               ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
+               dprintf1("test %d starting...\n", test_nr);
+               pkey_tests[test_nr](ptr, pkey);
+               dprintf1("freeing test memory: %p\n", ptr);
+               free_pkey_malloc(ptr);
+               sys_pkey_free(pkey);
+
+               dprintf1("pkey_faults: %d\n", pkey_faults);
+               dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
+
+               tracing_off();
+               close_test_fds();
+
+               printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
+               dprintf1("======================\n\n");
+       }
+       iteration_nr++;
+}
+
+void pkey_setup_shadow(void)
+{
+       shadow_pkey_reg = __read_pkey_reg();
+}
+
+int main(void)
+{
+       int nr_iterations = 22;
+       int pkeys_supported = is_pkeys_supported();
+
+       srand((unsigned int)time(NULL));
+
+       setup_handlers();
+
+       printf("has pkeys: %d\n", pkeys_supported);
+
+       if (!pkeys_supported) {
+               int size = PAGE_SIZE;
+               int *ptr;
+
+               printf("running PKEY tests for unsupported CPU/OS\n");
+
+               ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+               assert(ptr != (void *)-1);
+               test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+               exit(0);
+       }
+
+       pkey_setup_shadow();
+       printf("startup pkey_reg: %016llx\n", read_pkey_reg());
+       setup_hugetlbfs();
+
+       while (nr_iterations-- > 0)
+               run_tests_once();
+
+       printf("done (all tests OK)\n");
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
new file mode 100644 (file)
index 0000000..8984e0b
--- /dev/null
@@ -0,0 +1,274 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+exitcode=0
+
+usage() {
+       cat <<EOF
+usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
+  -t: specify specific categories to tests to run
+  -h: display this message
+
+The default behavior is to run all tests.
+
+Alternatively, specific groups tests can be run by passing a string
+to the -t argument containing one or more of the following categories
+separated by spaces:
+- mmap
+       tests for mmap(2)
+- gup_test
+       tests for gup using gup_test interface
+- userfaultfd
+       tests for  userfaultfd(2)
+- compaction
+       a test for the patch "Allow compaction of unevictable pages"
+- mlock
+       tests for mlock(2)
+- mremap
+       tests for mremap(2)
+- hugevm
+       tests for very large virtual address space
+- vmalloc
+       vmalloc smoke tests
+- hmm
+       hmm smoke tests
+- madv_populate
+       test memadvise(2) MADV_POPULATE_{READ,WRITE} options
+- memfd_secret
+       test memfd_secret(2)
+- process_mrelease
+       test process_mrelease(2)
+- ksm
+       ksm tests that do not require >=2 NUMA nodes
+- ksm_numa
+       ksm tests that require >=2 NUMA nodes
+- pkey
+       memory protection key tests
+- soft_dirty
+       test soft dirty page bit semantics
+- cow
+       test copy-on-write semantics
+example: ./run_vmtests.sh -t "hmm mmap ksm"
+EOF
+       exit 0
+}
+
+
+# Parse options: -h prints usage and exits, -t "<categories>" restricts the
+# run to the listed space-separated categories (see usage()).
+while getopts "ht:" OPT; do
+       case ${OPT} in
+               "h") usage ;;
+               "t") VM_SELFTEST_ITEMS=${OPTARG} ;;
+       esac
+done
+shift $((OPTIND -1))
+
+# default behavior: run all tests
+VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
+
+test_selected() {
+       if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
+               # If no VM_SELFTEST_ITEMS are specified, run all tests
+               return 0
+       fi
+       # If test selected argument is one of the test items
+       if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
+               return 0
+       else
+               return 1
+       fi
+}
+
+# get huge pagesize and freepages from /proc/meminfo
+while read -r name size unit; do
+       if [ "$name" = "HugePages_Free:" ]; then
+               freepgs="$size"
+       fi
+       if [ "$name" = "Hugepagesize:" ]; then
+               hpgsize_KB="$size"
+       fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size.  The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages.  Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
+nr_cpus=$(nproc)
+hpgsize_MB=$((hpgsize_KB / 1024))
+# Round nr_cpus worth of huge pages up to a multiple of 128MB; this is
+# the per-side size handed to the userfaultfd hugetlb tests below.
+half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
+needmem_KB=$((half_ufd_size_MB * 2 * 1024))
+
+# set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
+       # nr_hugepgs saves the original pool size; it is restored after the
+       # userfaultfd tests further down.
+       nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+       needpgs=$((needmem_KB / hpgsize_KB))
+       tries=2
+       while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do
+               lackpgs=$((needpgs - freepgs))
+               echo 3 > /proc/sys/vm/drop_caches
+               if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
+                       echo "Please run this test as root"
+                       exit $ksft_skip
+               fi
+               # Re-read the free-page count after growing the pool.
+               while read -r name size unit; do
+                       if [ "$name" = "HugePages_Free:" ]; then
+                               freepgs=$size
+                       fi
+               done < /proc/meminfo
+               tries=$((tries - 1))
+       done
+       if [ "$freepgs" -lt "$needpgs" ]; then
+               printf "Not enough huge pages available (%d < %d)\n" \
+                      "$freepgs" "$needpgs"
+               exit 1
+       fi
+else
+       echo "no hugetlbfs support in kernel?"
+       exit 1
+fi
+
+# filter 64bit architectures
+ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
+if [ -z "$ARCH" ]; then
+       ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
+fi
+# VADDR64=1 when running on a known 64-bit architecture; gates the hugevm
+# large-address-space tests below.
+VADDR64=0
+echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
+
+# Usage: run_test [test binary] [arbitrary test arguments...]
+run_test() {
+       if test_selected ${CATEGORY}; then
+               local title="running $*"
+               local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
+               printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
+
+               "$@"
+               local ret=$?
+               if [ $ret -eq 0 ]; then
+                       echo "[PASS]"
+               elif [ $ret -eq $ksft_skip ]; then
+                       echo "[SKIP]"
+                       exitcode=$ksft_skip
+               else
+                       echo "[FAIL]"
+                       exitcode=1
+               fi
+       fi # test_selected
+}
+
+CATEGORY="hugetlb" run_test ./hugepage-mmap
+
+# Temporarily raise shmmax/shmall for the SysV SHM hugepage test, then
+# restore the saved values.
+shmmax=$(cat /proc/sys/kernel/shmmax)
+shmall=$(cat /proc/sys/kernel/shmall)
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
+CATEGORY="hugetlb" run_test ./hugepage-shm
+echo "$shmmax" > /proc/sys/kernel/shmmax
+echo "$shmall" > /proc/sys/kernel/shmall
+
+CATEGORY="hugetlb" run_test ./map_hugetlb
+CATEGORY="hugetlb" run_test ./hugepage-mremap
+CATEGORY="hugetlb" run_test ./hugepage-vmemmap
+CATEGORY="hugetlb" run_test ./hugetlb-madvise
+
+if test_selected "hugetlb"; then
+       echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
+       echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
+       echo "      hugetlb regression testing."
+fi
+
+CATEGORY="mmap" run_test ./map_fixed_noreplace
+
+# get_user_pages_fast() benchmark
+CATEGORY="gup_test" run_test ./gup_test -u
+# pin_user_pages_fast() benchmark
+CATEGORY="gup_test" run_test ./gup_test -a
+# Dump pages 0, 19, and 4096, using pin_user_pages:
+CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
+
+# Run each userfaultfd mode plain and via the /dev/userfaultfd device (":dev").
+uffd_mods=("" ":dev")
+for mod in "${uffd_mods[@]}"; do
+       CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
+       # Hugetlb tests require source and destination huge pages. Pass in half
+       # the size ($half_ufd_size_MB), which is used for *each*.
+       CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
+       CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
+       CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
+done
+# cleanup: restore the nr_hugepages value saved during setup above
+echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+
+CATEGORY="compaction" run_test ./compaction_test
+
+# on-fault-limit must run without CAP_IPC_LOCK, hence the nobody user
+CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
+
+CATEGORY="mmap" run_test ./map_populate
+
+CATEGORY="mlock" run_test ./mlock-random-test
+
+CATEGORY="mlock" run_test ./mlock2-tests
+
+CATEGORY="process_mrelease" run_test ./mrelease_test
+
+CATEGORY="mremap" run_test ./mremap_test
+
+CATEGORY="hugetlb" run_test ./thuge-gen
+
+if [ $VADDR64 -ne 0 ]; then
+       CATEGORY="hugevm" run_test ./virtual_address_range
+
+       # virtual address 128TB switch test
+       CATEGORY="hugevm" run_test ./va_128TBswitch.sh
+fi # VADDR64
+
+# vmalloc stability smoke test
+CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
+
+CATEGORY="mremap" run_test ./mremap_dontunmap
+
+CATEGORY="hmm" run_test ./test_hmm.sh smoke
+
+# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
+CATEGORY="madv_populate" run_test ./madv_populate
+
+CATEGORY="memfd_secret" run_test ./memfd_secret
+
+# KSM MADV_MERGEABLE test with 10 identical pages
+CATEGORY="ksm" run_test ./ksm_tests -M -p 10
+# KSM unmerge test
+CATEGORY="ksm" run_test ./ksm_tests -U
+# KSM test with 10 zero pages and use_zero_pages = 0
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
+# KSM test with 10 zero pages and use_zero_pages = 1
+CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 1
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
+# KSM test with 2 NUMA nodes and merge_across_nodes = 0
+CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
+# NOTE: the second, category-less invocation of ksm_functional_tests was
+# dropped: with no CATEGORY it bypassed the -t filter and ran the suite a
+# second time unconditionally.
+CATEGORY="ksm" run_test ./ksm_functional_tests
+
+# protection_keys tests: only run the variants that were actually built
+# for this architecture/bitness.
+if [ -x ./protection_keys_32 ]
+then
+       CATEGORY="pkey" run_test ./protection_keys_32
+fi
+
+if [ -x ./protection_keys_64 ]
+then
+       CATEGORY="pkey" run_test ./protection_keys_64
+fi
+
+CATEGORY="soft_dirty" run_test ./soft-dirty
+
+# COW tests
+CATEGORY="cow" run_test ./cow
+
+# 0 if all selected tests passed, $ksft_skip or 1 otherwise (see run_test)
+exit $exitcode
diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings
new file mode 100644 (file)
index 0000000..9abfc60
--- /dev/null
@@ -0,0 +1 @@
+timeout=45
diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c
new file mode 100644 (file)
index 0000000..21d8830
--- /dev/null
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <sys/mman.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PAGEMAP_FILE_PATH "/proc/self/pagemap"
+#define TEST_ITERATIONS 10000
+
+/*
+ * Repeatedly verify that the soft-dirty bit of a freshly cleared anonymous
+ * page stays 0 until the page is written, and reads as 1 after a write.
+ */
+static void test_simple(int pagemap_fd, int pagesize)
+{
+       int i;
+       char *map;
+
+       map = aligned_alloc(pagesize, pagesize);
+       if (!map)
+               ksft_exit_fail_msg("aligned_alloc failed\n");
+
+       clear_softdirty();
+
+       for (i = 0 ; i < TEST_ITERATIONS; i++) {
+               if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+                       ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+                       break;
+               }
+
+               clear_softdirty();
+               /* Write something to the page to get the dirty bit enabled on the page */
+               map[0]++;
+
+               if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+                       ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+                       break;
+               }
+
+               clear_softdirty();
+       }
+       free(map);
+
+       /* Passes only if no iteration broke out of the loop early. */
+       ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__);
+}
+
+/*
+ * Verify that a brand-new mapping is born soft-dirty, and that this also
+ * holds for a second mapping that happens to reuse the same address.
+ */
+static void test_vma_reuse(int pagemap_fd, int pagesize)
+{
+       char *map, *map2;
+
+       map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
+       if (map == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+
+       // The kernel always marks new regions as soft dirty
+       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+                        "Test %s dirty bit of allocated page\n", __func__);
+
+       clear_softdirty();
+       munmap(map, pagesize);
+
+       map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
+       if (map2 == MAP_FAILED)
+               ksft_exit_fail_msg("mmap failed\n");
+
+       // Dirty bit is set for new regions even if they are reused
+       if (map == map2)
+               ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
+                                "Test %s dirty bit of reused address page\n", __func__);
+       else
+               ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__);
+
+       munmap(map2, pagesize);
+}
+
+/*
+ * Soft-dirty semantics on a THP-backed buffer: after clearing, the bit must
+ * stay 0 until a write and read as 1 afterwards.  Both sub-results are
+ * skipped when a huge page cannot be allocated.
+ */
+static void test_hugepage(int pagemap_fd, int pagesize)
+{
+       char *map;
+       int i, ret;
+       size_t hpage_len = read_pmd_pagesize();
+
+       map = memalign(hpage_len, hpage_len);
+       if (!map)
+               ksft_exit_fail_msg("memalign failed\n");
+
+       ret = madvise(map, hpage_len, MADV_HUGEPAGE);
+       if (ret)
+               ksft_exit_fail_msg("madvise failed %d\n", ret);
+
+       /* Touch every byte so the range can be collapsed into a THP. */
+       for (i = 0; i < hpage_len; i++)
+               map[i] = (char)i;
+
+       if (check_huge_anon(map, 1, hpage_len)) {
+               ksft_test_result_pass("Test %s huge page allocation\n", __func__);
+
+               clear_softdirty();
+               for (i = 0 ; i < TEST_ITERATIONS ; i++) {
+                       if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
+                               ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
+                               break;
+                       }
+
+                       clear_softdirty();
+                       // Write something to the page to get the dirty bit enabled on the page
+                       map[0]++;
+
+                       if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
+                               ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
+                               break;
+                       }
+                       clear_softdirty();
+               }
+
+               ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__);
+       } else {
+               // hugepage allocation failed. skip these tests
+               ksft_test_result_skip("Test %s huge page allocation\n", __func__);
+               ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
+       }
+       free(map);
+}
+
+/*
+ * Verify that mprotect() transitions do not disturb the soft-dirty bit:
+ * the bit is set by the initial write, cleared by clear_refs, must remain
+ * clear across PROT_READ and PROT_READ|PROT_WRITE changes, and be set
+ * again only by a new write.  Runs on an anonymous or file-backed mapping.
+ */
+static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
+{
+       const char *type[] = {"file", "anon"};
+       const char *fname = "./soft-dirty-test-file";
+       int test_fd;
+       char *map;
+
+       if (anon) {
+               map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+                          MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+               /* mmap() reports failure as MAP_FAILED, never NULL */
+               if (map == MAP_FAILED)
+                       ksft_exit_fail_msg("anon mmap failed\n");
+       } else {
+               /* O_CREAT requires an explicit mode; omitting it is undefined */
+               test_fd = open(fname, O_RDWR | O_CREAT, 0644);
+               if (test_fd < 0) {
+                       ksft_test_result_skip("Test %s open() file failed\n", __func__);
+                       return;
+               }
+               /* unlink immediately so the file vanishes when the fd closes */
+               unlink(fname);
+               ftruncate(test_fd, pagesize);
+               map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
+                          MAP_SHARED, test_fd, 0);
+               if (map == MAP_FAILED)
+                       ksft_exit_fail_msg("file mmap failed\n");
+       }
+
+       *map = 1;
+       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+                        "Test %s-%s dirty bit of new written page\n",
+                        __func__, type[anon]);
+       clear_softdirty();
+       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+                        "Test %s-%s soft-dirty clear after clear_refs\n",
+                        __func__, type[anon]);
+       mprotect(map, pagesize, PROT_READ);
+       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+                        "Test %s-%s soft-dirty clear after marking RO\n",
+                        __func__, type[anon]);
+       mprotect(map, pagesize, PROT_READ|PROT_WRITE);
+       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
+                        "Test %s-%s soft-dirty clear after marking RW\n",
+                        __func__, type[anon]);
+       *map = 2;
+       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
+                        "Test %s-%s soft-dirty after rewritten\n",
+                        __func__, type[anon]);
+
+       munmap(map, pagesize);
+
+       if (!anon)
+               close(test_fd);
+}
+
+/* Run the mprotect soft-dirty checks against a private anonymous mapping. */
+static void test_mprotect_anon(int pagemap_fd, int pagesize)
+{
+       test_mprotect(pagemap_fd, pagesize, true);
+}
+
+/* Run the mprotect soft-dirty checks against a shared file-backed mapping. */
+static void test_mprotect_file(int pagemap_fd, int pagesize)
+{
+       test_mprotect(pagemap_fd, pagesize, false);
+}
+
+int main(int argc, char **argv)
+{
+       int pagemap_fd;
+       int pagesize;
+
+       ksft_print_header();
+       /* 15 = 1 (simple) + 2 (vma_reuse) + 2 (hugepage) + 5 + 5 (mprotect) */
+       ksft_set_plan(15);
+
+       /* soft-dirty bits are read from this process's own pagemap */
+       pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
+       if (pagemap_fd < 0)
+               ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
+
+       pagesize = getpagesize();
+
+       test_simple(pagemap_fd, pagesize);
+       test_vma_reuse(pagemap_fd, pagesize);
+       test_hugepage(pagemap_fd, pagesize);
+       test_mprotect_anon(pagemap_fd, pagesize);
+       test_mprotect_file(pagemap_fd, pagesize);
+
+       close(pagemap_fd);
+
+       return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
new file mode 100644 (file)
index 0000000..76e1c36
--- /dev/null
@@ -0,0 +1,309 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
+ * address range in a process via <debugfs>/split_huge_pages interface.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <malloc.h>
+#include <stdbool.h>
+#include "vm_util.h"
+
+uint64_t pagesize;
+unsigned int pageshift;
+uint64_t pmd_pagesize;
+
+#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
+#define INPUT_MAX 80
+
+#define PID_FMT "%d,0x%lx,0x%lx"
+#define PATH_FMT "%s,0x%lx,0x%lx"
+
+#define PFN_MASK     ((1UL<<55)-1)
+#define KPF_THP      (1UL<<22)
+
+/*
+ * Return 1 when the page backing vaddr is part of a THP, else 0.
+ *
+ * pagemap_file / kpageflags_file are open fds for /proc/<pid>/pagemap and
+ * /proc/kpageflags; a negative fd disables the lookup.
+ */
+int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+{
+       uint64_t paddr;
+       uint64_t page_flags;
+
+       /* fd 0 is a valid descriptor; only reject negative fds */
+       if (pagemap_file >= 0) {
+               if (pread(pagemap_file, &paddr, sizeof(paddr),
+                         ((long)vaddr >> pageshift) * sizeof(paddr)) != sizeof(paddr))
+                       return 0;
+
+               if (kpageflags_file >= 0) {
+                       if (pread(kpageflags_file, &page_flags, sizeof(page_flags),
+                                 (paddr & PFN_MASK) * sizeof(page_flags)) != sizeof(page_flags))
+                               return 0;
+
+                       return !!(page_flags & KPF_THP);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Write buflen - 1 bytes of buf to the existing file at path.
+ * Returns the number of bytes written, or 0 on any failure.
+ */
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+       ssize_t written;
+       int fd = open(path, O_WRONLY);
+
+       if (fd == -1)
+               return 0;
+
+       written = write(fd, buf, buflen - 1);
+       close(fd);
+
+       return written < 1 ? 0 : (unsigned int)written;
+}
+
+/*
+ * printf-style helper: format a command into a bounded buffer and write it
+ * to <debugfs>/split_huge_pages; exits the test on truncation or write error.
+ */
+static void write_debugfs(const char *fmt, ...)
+{
+       char input[INPUT_MAX];
+       int ret;
+       va_list argp;
+
+       va_start(argp, fmt);
+       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
+       va_end(argp);
+
+       /* vsnprintf returns the would-be length; >= INPUT_MAX means truncated */
+       if (ret >= INPUT_MAX) {
+               printf("%s: Debugfs input is too long\n", __func__);
+               exit(EXIT_FAILURE);
+       }
+
+       /* ret + 1 so write_file (which writes buflen - 1) sends exactly ret bytes */
+       if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
+               perror(SPLIT_DEBUGFS);
+               exit(EXIT_FAILURE);
+       }
+}
+
+/*
+ * Allocate 4 PMD-sized THPs, ask debugfs to split them for this PID's
+ * address range, and verify the data survives and no THPs remain.
+ */
+void split_pmd_thp(void)
+{
+       char *one_page;
+       size_t len = 4 * pmd_pagesize;
+       size_t i;
+
+       one_page = memalign(pmd_pagesize, len);
+
+       if (!one_page) {
+               printf("Fail to allocate memory\n");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(one_page, len, MADV_HUGEPAGE);
+
+       /* touch every byte so the range is populated and THP-collapsible */
+       for (i = 0; i < len; i++)
+               one_page[i] = (char)i;
+
+       if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+               printf("No THP is allocated\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* split all THPs */
+       write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+               (uint64_t)one_page + len);
+
+       /* data must be intact after the split */
+       for (i = 0; i < len; i++)
+               if (one_page[i] != (char)i) {
+                       printf("%ld byte corrupted\n", i);
+                       exit(EXIT_FAILURE);
+               }
+
+
+       if (check_huge_anon(one_page, 0, pmd_pagesize)) {
+               printf("Still AnonHugePages not split\n");
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Split huge pages successful\n");
+       free(one_page);
+}
+
+/*
+ * Build 4 PTE-mapped THPs by mremap()ing one page out of each PMD THP,
+ * split them via debugfs, then verify data integrity and that kpageflags
+ * no longer reports THP backing.
+ */
+void split_pte_mapped_thp(void)
+{
+       char *one_page, *pte_mapped, *pte_mapped2;
+       size_t len = 4 * pmd_pagesize;
+       uint64_t thp_size;
+       size_t i;
+       const char *pagemap_template = "/proc/%d/pagemap";
+       const char *kpageflags_proc = "/proc/kpageflags";
+       char pagemap_proc[255];
+       int pagemap_fd;
+       int kpageflags_fd;
+
+       if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
+               perror("get pagemap proc error");
+               exit(EXIT_FAILURE);
+       }
+       pagemap_fd = open(pagemap_proc, O_RDONLY);
+
+       if (pagemap_fd == -1) {
+               perror("read pagemap:");
+               exit(EXIT_FAILURE);
+       }
+
+       kpageflags_fd = open(kpageflags_proc, O_RDONLY);
+
+       if (kpageflags_fd == -1) {
+               perror("read kpageflags:");
+               exit(EXIT_FAILURE);
+       }
+
+       one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+       /* mmap failure was previously unchecked */
+       if (one_page == MAP_FAILED) {
+               perror("mmap failed");
+               exit(EXIT_FAILURE);
+       }
+
+       madvise(one_page, len, MADV_HUGEPAGE);
+
+       for (i = 0; i < len; i++)
+               one_page[i] = (char)i;
+
+       if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
+               printf("No THP is allocated\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* remap the first pagesize of first THP */
+       pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
+       if (pte_mapped == MAP_FAILED) {
+               perror("mremap failed");
+               exit(EXIT_FAILURE);
+       }
+
+       /* remap the Nth pagesize of Nth THP */
+       for (i = 1; i < 4; i++) {
+               pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
+                                    pagesize, pagesize,
+                                    MREMAP_MAYMOVE|MREMAP_FIXED,
+                                    pte_mapped + pagesize * i);
+               /* use MAP_FAILED, not a hand-rolled (char *)-1 */
+               if (pte_mapped2 == MAP_FAILED) {
+                       perror("mremap failed");
+                       exit(EXIT_FAILURE);
+               }
+       }
+
+       /* smap does not show THPs after mremap, use kpageflags instead */
+       thp_size = 0;
+       for (i = 0; i < pagesize * 4; i++)
+               if (i % pagesize == 0 &&
+                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+                       thp_size++;
+
+       if (thp_size != 4) {
+               printf("Some THPs are missing during mremap\n");
+               exit(EXIT_FAILURE);
+       }
+
+       /* split all remapped THPs */
+       write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
+                     (uint64_t)pte_mapped + pagesize * 4);
+
+       /* smap does not show THPs after mremap, use kpageflags instead */
+       thp_size = 0;
+       for (i = 0; i < pagesize * 4; i++) {
+               if (pte_mapped[i] != (char)i) {
+                       printf("%ld byte corrupted\n", i);
+                       exit(EXIT_FAILURE);
+               }
+               if (i % pagesize == 0 &&
+                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
+                       thp_size++;
+       }
+
+       if (thp_size) {
+               printf("Still %ld THPs not split\n", thp_size);
+               exit(EXIT_FAILURE);
+       }
+
+       printf("Split PTE-mapped huge pages successful\n");
+       munmap(one_page, len);
+       close(pagemap_fd);
+       close(kpageflags_fd);
+}
+
+/*
+ * Create a file on a huge=always tmpfs, ask debugfs to split its THPs by
+ * path and pgoff range, then tear the tmpfs down again.  The actual split
+ * result must be checked in dmesg.
+ */
+void split_file_backed_thp(void)
+{
+       int status;
+       int fd;
+       ssize_t num_written;
+       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
+       const char *tmpfs_loc = mkdtemp(tmpfs_template);
+       char testfile[INPUT_MAX];
+       uint64_t pgoff_start = 0, pgoff_end = 1024;
+
+       printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
+
+       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
+
+       if (status) {
+               printf("Unable to create a tmpfs for testing\n");
+               exit(EXIT_FAILURE);
+       }
+
+       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
+       if (status >= INPUT_MAX) {
+               printf("Fail to create file-backed THP split testing file\n");
+               goto cleanup;
+       }
+
+       /* O_CREAT requires an explicit mode argument; omitting it is undefined */
+       fd = open(testfile, O_CREAT|O_WRONLY, 0664);
+       if (fd == -1) {
+               perror("Cannot open testing file\n");
+               goto cleanup;
+       }
+
+       /* write something to the file, so a file-backed THP can be allocated */
+       num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
+       close(fd);
+
+       if (num_written < 1) {
+               printf("Fail to write data to testing file\n");
+               goto cleanup;
+       }
+
+       /* split the file-backed THP */
+       write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
+
+       status = unlink(testfile);
+       if (status)
+               perror("Cannot remove testing file\n");
+
+cleanup:
+       status = umount(tmpfs_loc);
+       if (status) {
+               printf("Unable to umount %s\n", tmpfs_loc);
+               exit(EXIT_FAILURE);
+       }
+       status = rmdir(tmpfs_loc);
+       if (status) {
+               perror("cannot remove tmp dir");
+               exit(EXIT_FAILURE);
+       }
+
+       printf("file-backed THP split test done, please check dmesg for more information\n");
+}
+
+int main(int argc, char **argv)
+{
+       /* debugfs writes and kpageflags reads require root */
+       if (geteuid() != 0) {
+               printf("Please run the benchmark as root\n");
+               exit(EXIT_FAILURE);
+       }
+
+       pagesize = getpagesize();
+       /* page shift derived from the page size (lowest set bit) */
+       pageshift = ffs(pagesize) - 1;
+       pmd_pagesize = read_pmd_pagesize();
+
+       split_pmd_thp();
+       split_pte_mapped_thp();
+       split_file_backed_thp();
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh
new file mode 100644 (file)
index 0000000..46e19b5
--- /dev/null
@@ -0,0 +1,105 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a smoke-test script for the test_hmm kernel driver (heterogeneous
+# memory management). It is just a kernel module loader: it loads test_hmm,
+# optionally with SPM device addresses, runs the hmm-tests binary against
+# the driver, and unloads the module again.
+
+TEST_NAME="test_hmm"
+DRIVER="test_hmm"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Skip (exit $ksft_skip) unless running as root, modprobe is available, and
+# the test_hmm module is present.
+check_test_requirements()
+{
+       uid=$(id -u)
+       if [ $uid -ne 0 ]; then
+               echo "$0: Must be run as root"
+               exit $ksft_skip
+       fi
+
+       if ! which modprobe > /dev/null 2>&1; then
+               echo "$0: You need modprobe installed"
+               exit $ksft_skip
+       fi
+
+       if ! modinfo $DRIVER > /dev/null 2>&1; then
+               echo "$0: You must have the following enabled in your kernel:"
+               echo "CONFIG_TEST_HMM=m"
+               exit $ksft_skip
+       fi
+}
+
+# load_driver [spm_addr_dev0 spm_addr_dev1]
+# Load the test_hmm module, optionally with two SPM device addresses.
+load_driver()
+{
+       if [ $# -eq 0 ]; then
+               modprobe $DRIVER > /dev/null 2>&1
+       else
+               if [ $# -eq 2 ]; then
+                       # The redirection must stay on the modprobe line: on a
+                       # line of its own it is a separate null command and
+                       # modprobe's output is not silenced.
+                       modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1
+               else
+                       echo "Missing module parameters. Make sure pass"\
+                       "spm_addr_dev0 and spm_addr_dev1"
+                       usage
+               fi
+       fi
+}
+
+# Remove the test_hmm module, ignoring output and errors.
+unload_driver()
+{
+       modprobe -r $DRIVER > /dev/null 2>&1
+}
+
+# run_smoke [spm_addr_dev0 spm_addr_dev1]
+# Load the driver, run the hmm-tests binary from the script's directory,
+# then unload the driver again.
+run_smoke()
+{
+       echo "Running smoke test. Note, this test provides basic coverage."
+
+       load_driver $1 $2
+       $(dirname "${BASH_SOURCE[0]}")/hmm-tests
+       unload_driver
+}
+
+# Print usage examples and exit 0.
+usage()
+{
+       echo -n "Usage: $0"
+       echo
+       echo "Example usage:"
+       echo
+       echo "# Shows help message"
+       echo "./${TEST_NAME}.sh"
+       echo
+       echo "# Smoke testing"
+       echo "./${TEST_NAME}.sh smoke"
+       echo
+       echo "# Smoke testing with SPM enabled"
+       echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+       echo
+       exit 0
+}
+
+function run_test()
+{
+       if [ $# -eq 0 ]; then
+               usage
+       else
+               if [ "$1" = "smoke" ]; then
+                       run_smoke $2 $3
+               else
+                       usage
+               fi
+       fi
+}
+
+check_test_requirements
+# Quote "$@" so arguments containing spaces survive word splitting.
+run_test "$@"
+
+exit 0
diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh
new file mode 100644 (file)
index 0000000..d73b846
--- /dev/null
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+#     a) analyse performance of vmalloc allocations;
+#     b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# Also it is possible to pass any supported parameters manualy.
+#
+PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
+
+# Skip (exit $ksft_skip) unless running as root, modprobe is available, and
+# the test_vmalloc module is present.
+check_test_requirements()
+{
+       uid=$(id -u)
+       if [ $uid -ne 0 ]; then
+               echo "$0: Must be run as root"
+               exit $ksft_skip
+       fi
+
+       if ! which modprobe > /dev/null 2>&1; then
+               echo "$0: You need modprobe installed"
+               exit $ksft_skip
+       fi
+
+       if ! modinfo $DRIVER > /dev/null 2>&1; then
+               echo "$0: You must have the following enabled in your kernel:"
+               echo "CONFIG_TEST_VMALLOC=m"
+               exit $ksft_skip
+       fi
+}
+
+run_perfformance_check()
+{
+       echo "Run performance tests to evaluate how fast vmalloc allocation is."
+       echo "It runs all test cases on one single CPU with sequential order."
+
+       modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+       echo "Done."
+       echo "Ccheck the kernel message buffer to see the summary."
+}
+
+# Load test_vmalloc with the stress parameters; one worker per CPU.
+run_stability_check()
+{
+       echo "Run stability tests. In order to stress vmalloc subsystem all"
+       echo "available test cases are run by NUM_CPUS workers simultaneously."
+       echo "It will take time, so be patient."
+
+       modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+       echo "Done."
+       echo "Check the kernel ring buffer to see the summary."
+}
+
+# Load test_vmalloc with the lightweight smoke parameters.
+run_smoke_check()
+{
+       echo "Run smoke test. Note, this test provides basic coverage."
+       echo "Please check $0 output how it can be used"
+       echo "for deep performance analysis as well as stress testing."
+
+       modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+       echo "Done."
+       echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+       echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
+       echo "manual parameters"
+       echo
+       echo "Valid tests and parameters:"
+       echo
+       modinfo $DRIVER
+       echo
+       echo "Example usage:"
+       echo
+       echo "# Shows help message"
+       echo "./${DRIVER}.sh"
+       echo
+       echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers"
+       echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
+       echo
+       echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
+       echo "sequential order"
+       echo -n "./${DRIVER}.sh sequential_test_order=1 "
+       echo "run_test_mask=23"
+       echo
+       echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
+       echo "20 times"
+       echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
+       echo
+       echo "# Performance analysis"
+       echo "./${DRIVER}.sh performance"
+       echo
+       echo "# Stress testing"
+       echo "./${DRIVER}.sh stress"
+       echo
+       exit 0
+}
+
+# Check every key=value argument against the module's declared parameters;
+# exit with $exitcode on the first unknown key or non-positive value.
+function validate_passed_args()
+{
+       # Parameter names extracted from "modinfo" output (the part before ':').
+       VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+       #
+       # Something has been passed, check it.
+       #
+       for passed_arg in $@; do
+               key=${passed_arg//=*/}
+               val="${passed_arg:$((${#key}+1))}"
+               valid=0
+
+               for valid_arg in $VALID_ARGS; do
+                       if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+                               valid=1
+                               break
+                       fi
+               done
+
+               if [[ $valid -ne 1 ]]; then
+                       echo "Error: key or value is not correct: ${key} $val"
+                       exit $exitcode
+               fi
+       done
+}
+
+# Validate and forward user-supplied key=value module parameters to modprobe.
+function run_manual_check()
+{
+       #
+       # Validate passed parameters. If there is wrong one,
+       # the script exits and does not execute further.
+       #
+       validate_passed_args $@
+
+       echo "Run the test with following parameters: $@"
+       modprobe $DRIVER $@ > /dev/null 2>&1
+       echo "Done."
+       echo "Check the kernel ring buffer to see the summary."
+}
+
+function run_test()
+{
+       if [ $# -eq 0 ]; then
+               usage
+       else
+               if [[ "$1" = "performance" ]]; then
+                       run_perfformance_check
+               elif [[ "$1" = "stress" ]]; then
+                       run_stability_check
+               elif [[ "$1" = "smoke" ]]; then
+                       run_smoke_check
+               else
+                       run_manual_check $@
+               fi
+       fi
+}
+
+check_test_requirements
+# Quote "$@" so arguments containing spaces survive word splitting.
+run_test "$@"
+
+exit 0
diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c
new file mode 100644 (file)
index 0000000..361ef71
--- /dev/null
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+   Before running this huge pages for each huge page size must have been
+   reserved.
+   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+   Also shmmax must be increased.
+   And you need to run as root to work around some weird permissions in shm.
+   And nothing using huge pages should run in parallel.
+   When the program aborts you may need to clean up the shm segments with
+   ipcrm -m by hand, like this
+   sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
+   (warning this will remove all if someone else uses them) */
+
+#define _GNU_SOURCE 1
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define err(x) perror(x), exit(1)
+
+#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_SHIFT  26
+#define MAP_HUGE_MASK   0x3f
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB    0x40000
+#endif
+
+#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
+#define SHM_HUGE_SHIFT  26
+#define SHM_HUGE_MASK   0x3f
+#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
+
+#define NUM_PAGESIZES   5
+
+#define NUM_PAGES 4
+
+#define Dprintf(fmt...) // printf(fmt)
+
+unsigned long page_sizes[NUM_PAGESIZES];
+int num_page_sizes;
+
/* Return ceil(log2(v)): the smallest l such that (1UL << l) >= v. */
int ilog2(unsigned long v)
{
	int bits = 0;

	while (v > (1UL << bits))
		bits++;
	return bits;
}
+
+void find_pagesizes(void)
+{
+       glob_t g;
+       int i;
+       glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+       assert(g.gl_pathc <= NUM_PAGESIZES);
+       for (i = 0; i < g.gl_pathc; i++) {
+               sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+                               &page_sizes[i]);
+               page_sizes[i] <<= 10;
+               printf("Found %luMB\n", page_sizes[i] >> 20);
+       }
+       num_page_sizes = g.gl_pathc;
+       globfree(&g);
+}
+
/*
 * Return the kernel's default huge page size in bytes, parsed from the
 * "Hugepagesize:" line of /proc/meminfo, or 0 if it cannot be read.
 */
unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	char *line = NULL;
	size_t linelen = 0;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;
	while (getline(&line, &linelen, f) > 0) {
		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
			hps <<= 10;	/* kB -> bytes */
			break;
		}
	}
	free(line);
	fclose(f);	/* was leaked before this fix */
	return hps;
}
+
/*
 * Print the current free-huge-page count for page size @ps (bytes) by
 * cat-ing the matching sysfs file; skipped for the base page size.
 */
void show(unsigned long ps)
{
	char cmd[100];

	if (ps == getpagesize())
		return;

	printf("%luMB: ", ps >> 20);
	fflush(stdout);
	snprintf(cmd, sizeof(cmd),
		 "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
		 ps >> 10);
	system(cmd);
}
+
/*
 * Read the first unsigned long from the file whose path is built from
 * printf-style @fmt.  Returns 0 (optionally warning if @warn) when the
 * file is missing or unparsable.
 */
unsigned long read_sysfs(int warn, char *fmt, ...)
{
	char path[100];
	char *line = NULL;
	size_t linelen = 0;
	unsigned long val = 0;
	va_list args;
	FILE *f;

	va_start(args, fmt);
	vsnprintf(path, sizeof(path), fmt, args);
	va_end(args);

	f = fopen(path, "r");
	if (!f) {
		if (warn)
			printf("missing %s\n", path);
		return 0;
	}

	if (getline(&line, &linelen, f) > 0)
		sscanf(line, "%lu", &val);

	fclose(f);
	free(line);
	return val;
}
+
/*
 * Free huge pages of size @ps (bytes), read from sysfs.  Only warn about
 * a missing sysfs file when @ps is an actual huge page size rather than
 * the base page size.
 */
unsigned long read_free(unsigned long ps)
{
	return read_sysfs(ps != getpagesize(),
			"/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
			ps >> 10);
}
+
+void test_mmap(unsigned long size, unsigned flags)
+{
+       char *map;
+       unsigned long before, after;
+       int err;
+
+       before = read_free(size);
+       map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+                       MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+
+       if (map == (char *)-1) err("mmap");
+       memset(map, 0xff, size*NUM_PAGES);
+       after = read_free(size);
+       Dprintf("before %lu after %lu diff %ld size %lu\n",
+               before, after, before - after, size);
+       assert(size == getpagesize() || (before - after) == NUM_PAGES);
+       show(size);
+       err = munmap(map, size);
+       assert(!err);
+}
+
/*
 * shmget() NUM_PAGES pages of size @size with the extra shmget @flags,
 * attach and touch the segment, and verify via the sysfs free counters
 * that exactly NUM_PAGES huge pages were consumed.
 */
void test_shmget(unsigned long size, unsigned flags)
{
	int id;
	unsigned long before, after;
	int err;

	before = read_free(size);
	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
	if (id < 0) err("shmget");

	struct shm_info i;
	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);


	Dprintf("id %d\n", id);
	char *map = shmat(id, NULL, 0600);
	if (map == (char *)-1) err("shmat");

	/* Mark for destruction now so the segment disappears on detach/exit. */
	shmctl(id, IPC_RMID, NULL);

	memset(map, 0xff, size*NUM_PAGES);
	after = read_free(size);

	Dprintf("before %lu after %lu diff %ld size %lu\n",
		before, after, before - after, size);
	assert(size == getpagesize() || (before - after) == NUM_PAGES);
	show(size);
	err = shmdt(map);
	assert(!err);
}
+
/*
 * Check preconditions before testing: enough free huge pages of every
 * discovered size, shmmax large enough for the shm tests, and (x86-64
 * only) 1GB pages available.  Exits with status 0 when unmet.
 */
void sanity_checks(void)
{
	int i;
	unsigned long largest = getpagesize();

	for (i = 0; i < num_page_sizes; i++) {
		if (page_sizes[i] > largest)
			largest = page_sizes[i];

		if (read_free(page_sizes[i]) < NUM_PAGES) {
			printf("Not enough huge pages for page size %lu MB, need %u\n",
				page_sizes[i] >> 20,
				NUM_PAGES);
			exit(0);
		}
	}

	/* read_sysfs() just parses the first integer; works on procfs too. */
	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
		printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
		exit(0);
	}

#if defined(__x86_64__)
	/* NOTE(review): bails unless the largest size is exactly 1GB —
	 * confirm this is intended when other sizes are configured. */
	if (largest != 1U<<30) {
		printf("No GB pages available on x86-64\n"
		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
		exit(0);
	}
#endif
}
+
+int main(void)
+{
+       int i;
+       unsigned default_hps = default_huge_page_size();
+
+       find_pagesizes();
+
+       sanity_checks();
+
+       for (i = 0; i < num_page_sizes; i++) {
+               unsigned long ps = page_sizes[i];
+               int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+               printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+               test_mmap(ps, MAP_HUGETLB | arg);
+       }
+       printf("Testing default huge mmap\n");
+       test_mmap(default_hps, SHM_HUGETLB);
+
+       puts("Testing non-huge shmget");
+       test_shmget(getpagesize(), 0);
+
+       for (i = 0; i < num_page_sizes; i++) {
+               unsigned long ps = page_sizes[i];
+               int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+               printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+               test_shmget(ps, SHM_HUGETLB | arg);
+       }
+       puts("default huge shmget");
+       test_shmget(default_hps, SHM_HUGETLB);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c
new file mode 100644 (file)
index 0000000..e3f00ad
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include "util.h"
+
+int backing_fd = -1;
+int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
+#define PROT_RW (PROT_READ | PROT_WRITE)
+
/*
 * Endless stress loop: fault in transhuge pages across a large mapping,
 * track which physical PFNs back them, then split each huge page again
 * with MADV_DONTNEED.  Reports allocation success/failure and timing per
 * pass.  Runs until killed.  Usage: [-f file] [size in MiB].
 */
int main(int argc, char **argv)
{
	size_t ram, len;
	void *ptr, *p;
	struct timespec a, b;
	int i = 0;
	char *name = NULL;
	double s;
	uint8_t *map;
	size_t map_len;
	int pagemap_fd;

	/* Default working-set size: all of RAM (clamped against overflow). */
	ram = sysconf(_SC_PHYS_PAGES);
	if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
		ram = SIZE_MAX / 4;
	else
		ram *= sysconf(_SC_PAGESIZE);
	len = ram;

	while (++i < argc) {
		if (!strcmp(argv[i], "-h"))
			errx(1, "usage: %s [size in MiB]", argv[0]);
		else if (!strcmp(argv[i], "-f"))
			name = argv[++i];
		else
			len = atoll(argv[i]) << 20;
	}

	if (name) {
		backing_fd = open(name, O_RDWR);
		if (backing_fd == -1)
			errx(2, "open %s", name);
		mmap_flags = MAP_SHARED;
	}

	warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
	      " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
	      ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));

	/* pagemap is needed by allocate_transhuge() to resolve PFNs. */
	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
	if (pagemap_fd < 0)
		err(2, "open pagemap");

	/* Over-map by one huge page, then align the start up (void-pointer
	 * arithmetic is a GNU extension; byte-sized). */
	len -= len % HPAGE_SIZE;
	ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
	if (ptr == MAP_FAILED)
		err(2, "initial mmap");
	ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;

	if (madvise(ptr, len, MADV_HUGEPAGE))
		err(2, "MADV_HUGEPAGE");

	/* One byte per huge-page-frame of RAM, to count distinct PFNs. */
	map_len = ram >> (HPAGE_SHIFT - 1);
	map = malloc(map_len);
	if (!map)
		errx(2, "map malloc");

	while (1) {
		int nr_succeed = 0, nr_failed = 0, nr_pages = 0;

		memset(map, 0, map_len);

		clock_gettime(CLOCK_MONOTONIC, &a);
		for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
			int64_t pfn;

			pfn = allocate_transhuge(p, pagemap_fd);

			if (pfn < 0) {
				nr_failed++;
			} else {
				size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);

				nr_succeed++;
				/* Grow the PFN bitmap on demand (exits on OOM,
				 * so losing the old pointer is harmless). */
				if (idx >= map_len) {
					map = realloc(map, idx + 1);
					if (!map)
						errx(2, "map realloc");
					memset(map + map_len, 0, idx + 1 - map_len);
					map_len = idx + 1;
				}
				if (!map[idx])
					nr_pages++;
				map[idx] = 1;
			}

			/* split transhuge page, keep last page */
			if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
				err(2, "MADV_DONTNEED");
		}
		clock_gettime(CLOCK_MONOTONIC, &b);
		s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;

		warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
		      "%4d succeed, %4d failed, %4d different pages",
		      s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
		      nr_succeed, nr_failed, nr_pages);
	}
}
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
new file mode 100644 (file)
index 0000000..7f22844
--- /dev/null
@@ -0,0 +1,1858 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stress userfaultfd syscall.
+ *
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ *    page of the area_dst (while the physical page may still be in
+ *    area_src), and increments a per-page counter in the same page,
+ *    and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ *    thread 1 above. userfaultfd blocking reads or poll() modes are
+ *    exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ *    at maximum bandwidth (if not already transferred by thread
+ *    2). Each cpu thread takes cares of transferring a portion of the
+ *    area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <sys/random.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#ifdef __NR_userfaultfd
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
+
+#define BOUNCE_RANDOM          (1<<0)
+#define BOUNCE_RACINGFAULTS    (1<<1)
+#define BOUNCE_VERIFY          (1<<2)
+#define BOUNCE_POLL            (1<<3)
+static int bounces;
+
+#define TEST_ANON      1
+#define TEST_HUGETLB   2
+#define TEST_SHMEM     3
+static int test_type;
+
+#define UFFD_FLAGS     (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
+
+#define BASE_PMD_ADDR ((void *)(1UL << 30))
+
+/* test using /dev/userfaultfd, instead of userfaultfd(2) */
+static bool test_dev_userfaultfd;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static volatile bool test_uffdio_copy_eexist = true;
+static volatile bool test_uffdio_zeropage_eexist = true;
+/* Whether to test uffd write-protection */
+static bool test_uffdio_wp = true;
+/* Whether to test uffd minor faults */
+static bool test_uffdio_minor = false;
+static bool map_shared;
+static int mem_fd;
+static unsigned long long *count_verify;
+static int uffd = -1;
+static int uffd_flags, finished, *pipefd;
+static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+static char *zeropage;
+pthread_attr_t attr;
+static bool test_collapse;
+
+/* Userfaultfd test statistics */
+struct uffd_stats {
+       int cpu;
+       unsigned long missing_faults;
+       unsigned long wp_faults;
+       unsigned long minor_faults;
+};
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr)                                     \
+       ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr)                                     \
+       ((volatile unsigned long long *) ((unsigned long)               \
+                                ((___area) + (___nr)*page_size +       \
+                                 sizeof(pthread_mutex_t) +             \
+                                 sizeof(unsigned long long) - 1) &     \
+                                ~(unsigned long)(sizeof(unsigned long long) \
+                                                 -  1)))
+
+#define swap(a, b) \
+       do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))
+
+const char *examples =
+    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+    "./userfaultfd anon 100 99999\n\n"
+    "# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
+    "./userfaultfd anon:dev 100 99999\n\n"
+    "# Run share memory test on 1GiB region with 99 bounces:\n"
+    "./userfaultfd shmem 1000 99\n\n"
+    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
+    "./userfaultfd hugetlb 256 50\n\n"
+    "# Run the same hugetlb test but using shared file:\n"
+    "./userfaultfd hugetlb_shared 256 50\n\n"
+    "# 10MiB-~6GiB 999 bounces anonymous test, "
+    "continue forever unless an error triggers\n"
+    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+
/* Print CLI usage, supported test types/mods and examples, then exit(1). */
static void usage(void)
{
	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
		"[hugetlbfs_file]\n\n");
	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
		"hugetlb_shared, shmem\n\n");
	fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
		"Supported mods:\n");
	fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
	fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
	fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n"
		"memory\n");
	fprintf(stderr, "\nExample test mod usage:\n");
	fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
	fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");

	fprintf(stderr, "Examples:\n\n");
	fprintf(stderr, "%s", examples);
	exit(1);
}
+
+#define _err(fmt, ...)                                         \
+       do {                                                    \
+               int ret = errno;                                \
+               fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);  \
+               fprintf(stderr, " (errno=%d, line=%d)\n",       \
+                       ret, __LINE__);                         \
+       } while (0)
+
+#define errexit(exitcode, fmt, ...)            \
+       do {                                    \
+               _err(fmt, ##__VA_ARGS__);       \
+               exit(exitcode);                 \
+       } while (0)
+
+#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
+
+static void uffd_stats_reset(struct uffd_stats *uffd_stats,
+                            unsigned long n_cpus)
+{
+       int i;
+
+       for (i = 0; i < n_cpus; i++) {
+               uffd_stats[i].cpu = i;
+               uffd_stats[i].missing_faults = 0;
+               uffd_stats[i].wp_faults = 0;
+               uffd_stats[i].minor_faults = 0;
+       }
+}
+
/*
 * Print aggregated missing/wp/minor fault totals plus the per-CPU
 * breakdown; the "\b" backspaces over the trailing '+' separator.
 */
static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
{
	int i;
	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;

	for (i = 0; i < n_cpus; i++) {
		miss_total += stats[i].missing_faults;
		wp_total += stats[i].wp_faults;
		minor_total += stats[i].minor_faults;
	}

	printf("userfaults: ");
	if (miss_total) {
		printf("%llu missing (", miss_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].missing_faults);
		printf("\b) ");
	}
	if (wp_total) {
		printf("%llu wp (", wp_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].wp_faults);
		printf("\b) ");
	}
	if (minor_total) {
		printf("%llu minor (", minor_total);
		for (i = 0; i < n_cpus; i++)
			printf("%lu+", stats[i].minor_faults);
		printf("\b)");
	}
	printf("\n");
}
+
/* Drop all pages of an anonymous area so future accesses fault again. */
static void anon_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
		err("madvise(MADV_DONTNEED) failed");
}
+
+static void anon_allocate_area(void **alloc_area, bool is_src)
+{
+       *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+                          MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+}
+
/* Anonymous memory has no alias mapping; intentionally a no-op. */
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
}
+
/*
 * Release hugetlb pages: MADV_DONTNEED suffices for private mappings,
 * while shared file-backed mappings need MADV_REMOVE to punch the pages
 * out of the backing file.
 */
static void hugetlb_release_pages(char *rel_area)
{
	if (!map_shared) {
		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
			err("madvise(MADV_DONTNEED) failed");
	} else {
		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
			err("madvise(MADV_REMOVE) failed");
	}
}
+
+static void hugetlb_allocate_area(void **alloc_area, bool is_src)
+{
+       off_t size = nr_pages * page_size;
+       off_t offset = is_src ? 0 : size;
+       void *area_alias = NULL;
+       char **alloc_area_alias;
+
+       *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                          (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+                          (is_src ? 0 : MAP_NORESERVE),
+                          mem_fd, offset);
+       if (*alloc_area == MAP_FAILED)
+               err("mmap of hugetlbfs file failed");
+
+       if (map_shared) {
+               area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                                 MAP_SHARED, mem_fd, offset);
+               if (area_alias == MAP_FAILED)
+                       err("mmap of hugetlb file alias failed");
+       }
+
+       if (is_src) {
+               alloc_area_alias = &area_src_alias;
+       } else {
+               alloc_area_alias = &area_dst_alias;
+       }
+       if (area_alias)
+               *alloc_area_alias = area_alias;
+}
+
/* For shared hugetlb only: redirect @start into area_dst's alias mapping. */
static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	if (!map_shared)
		return;

	*start = (unsigned long) area_dst_alias + offset;
}
+
/* Punch all pages out of the shmem file backing the area. */
static void shmem_release_pages(char *rel_area)
{
	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
		err("madvise(MADV_REMOVE) failed");
}
+
/*
 * Map a src/dst area plus an alias mapping from the shmem memfd (src at
 * offset 0, dst after it).  For the collapse test the mappings must land
 * at fixed, PMD-aligned addresses, laid out as
 * src | src-alias | gap | dst | dst-alias, so MAP_FAILED-style address
 * checks below also verify placement.
 */
static void shmem_allocate_area(void **alloc_area, bool is_src)
{
	void *area_alias = NULL;
	size_t bytes = nr_pages * page_size;
	unsigned long offset = is_src ? 0 : bytes;
	char *p = NULL, *p_alias = NULL;

	if (test_collapse) {
		p = BASE_PMD_ADDR;
		if (!is_src)
			/* src map + alias + interleaved hpages */
			p += 2 * (bytes + hpage_size);
		p_alias = p;
		p_alias += bytes;
		p_alias += hpage_size;	/* Prevent src/dst VMA merge */
	}

	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			   mem_fd, offset);
	if (*alloc_area == MAP_FAILED)
		err("mmap of memfd failed");
	if (test_collapse && *alloc_area != p)
		err("mmap of memfd failed at %p", p);

	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
			  mem_fd, offset);
	if (area_alias == MAP_FAILED)
		err("mmap of memfd alias failed");
	if (test_collapse && area_alias != p_alias)
		err("mmap of anonymous memory failed at %p", p_alias);

	if (is_src)
		area_src_alias = area_alias;
	else
		area_dst_alias = area_alias;
}
+
/* Redirect @start into area_dst's shmem alias mapping. */
static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
{
	*start = (unsigned long)area_dst_alias + offset;
}
+
/*
 * Assert that the destination alias is backed by the expected number of
 * huge pages.  NOTE(review): checks area_dst_alias rather than @p —
 * confirm the parameter is intentionally unused.
 */
static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
{
	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
		err("Did not find expected %d number of hugepages",
		    expect_nr_hpages);
}
+
+struct uffd_test_ops {
+       void (*allocate_area)(void **alloc_area, bool is_src);
+       void (*release_pages)(char *rel_area);
+       void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+       void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
+};
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+       .allocate_area  = anon_allocate_area,
+       .release_pages  = anon_release_pages,
+       .alias_mapping = noop_alias_mapping,
+       .check_pmd_mapping = NULL,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+       .allocate_area  = shmem_allocate_area,
+       .release_pages  = shmem_release_pages,
+       .alias_mapping = shmem_alias_mapping,
+       .check_pmd_mapping = shmem_check_pmd_mapping,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+       .allocate_area  = hugetlb_allocate_area,
+       .release_pages  = hugetlb_release_pages,
+       .alias_mapping = hugetlb_alias_mapping,
+       .check_pmd_mapping = NULL,
+};
+
+static struct uffd_test_ops *uffd_test_ops;
+
+static inline uint64_t uffd_minor_feature(void)
+{
+       if (test_type == TEST_HUGETLB && map_shared)
+               return UFFD_FEATURE_MINOR_HUGETLBFS;
+       else if (test_type == TEST_SHMEM)
+               return UFFD_FEATURE_MINOR_SHMEM;
+       else
+               return 0;
+}
+
/*
 * Compute which UFFDIO_* range ioctls the kernel should advertise for a
 * registration @mode, given the test type and enabled feature toggles.
 * NOTE(review): the `1 <<` shifts are int-typed — fine while _UFFDIO_*
 * values stay below 31; confirm if new ioctl bits are added.
 */
static uint64_t get_expected_ioctls(uint64_t mode)
{
	uint64_t ioctls = UFFD_API_RANGE_IOCTLS;

	if (test_type == TEST_HUGETLB)
		ioctls &= ~(1 << _UFFDIO_ZEROPAGE);

	if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
		ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);

	if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
		ioctls &= ~(1 << _UFFDIO_CONTINUE);

	return ioctls;
}
+
/* Die if @ioctls lacks any ioctl bit expected for registration @mode. */
static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
{
	uint64_t required = get_expected_ioctls(mode);

	if ((ioctls & required) != required)
		err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
		    required, ioctls & required);
}
+
/*
 * Create a userfaultfd via the /dev/userfaultfd character device.
 * Skips the whole test (KSFT_SKIP) when the device is absent or the
 * ioctl is unsupported (ENOTTY).
 */
static int __userfaultfd_open_dev(void)
{
	int fd, _uffd;

	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
	if (fd < 0)
		errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");

	_uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
	if (_uffd < 0)
		errexit(errno == ENOTTY ? KSFT_SKIP : 1,
			"creating userfaultfd failed");
	close(fd);
	return _uffd;
}
+
/*
 * Open the global uffd (via syscall or /dev node, per test_dev_userfaultfd)
 * and perform the UFFDIO_API handshake.  On return, *features holds the
 * feature bits actually granted by the kernel.
 */
static void userfaultfd_open(uint64_t *features)
{
	struct uffdio_api uffdio_api;

	if (test_dev_userfaultfd)
		uffd = __userfaultfd_open_dev();
	else {
		uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
		if (uffd < 0)
			errexit(errno == ENOSYS ? KSFT_SKIP : 1,
				"creating userfaultfd failed");
	}
	uffd_flags = fcntl(uffd, F_GETFD, NULL);

	uffdio_api.api = UFFD_API;
	uffdio_api.features = *features;
	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
		err("UFFDIO_API failed.\nPlease make sure to "
		    "run with either root or ptrace capability.");
	if (uffdio_api.api != UFFD_API)
		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);

	*features = uffdio_api.features;
}
+
+static inline void munmap_area(void **area)
+{
+       if (*area)
+               if (munmap(*area, nr_pages * page_size))
+                       err("munmap");
+
+       *area = NULL;
+}
+
/*
 * Tear down everything a previous test round created: the per-CPU pipes,
 * the verification array, the uffd itself and every mapped area.  Safe
 * to call when nothing is set up (all state is NULL/-1 checked).
 */
static void uffd_test_ctx_clear(void)
{
	size_t i;

	if (pipefd) {
		for (i = 0; i < nr_cpus * 2; ++i) {
			if (close(pipefd[i]))
				err("close pipefd");
		}
		free(pipefd);
		pipefd = NULL;
	}

	if (count_verify) {
		free(count_verify);
		count_verify = NULL;
	}

	if (uffd != -1) {
		if (close(uffd))
			err("close uffd");
		uffd = -1;
	}

	munmap_area((void **)&area_src);
	munmap_area((void **)&area_src_alias);
	munmap_area((void **)&area_dst);
	munmap_area((void **)&area_dst_alias);
	munmap_area((void **)&area_remap);
}
+
/*
 * Set up a fresh test context: tear down any previous one, allocate the
 * src/dst areas, open the uffd requesting @features, seed the per-page
 * mutexes and counters in area_src, and create the per-CPU wakeup pipes.
 */
static void uffd_test_ctx_init(uint64_t features)
{
	unsigned long nr, cpu;

	uffd_test_ctx_clear();

	uffd_test_ops->allocate_area((void **)&area_src, true);
	uffd_test_ops->allocate_area((void **)&area_dst, false);

	userfaultfd_open(&features);

	count_verify = malloc(nr_pages * sizeof(unsigned long long));
	if (!count_verify)
		err("count_verify");

	for (nr = 0; nr < nr_pages; nr++) {
		*area_mutex(area_src, nr) =
			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition between 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below always non-zero
		 * after the count, to avoid my_bcmp to trigger false
		 * positives.
		 */
		*(area_count(area_src, nr) + 1) = 1;
	}

	/*
	 * After initialization of area_src, we must explicitly release pages
	 * for area_dst to make sure it's fully empty.  Otherwise we could have
	 * some area_dst pages be errornously initialized with zero pages,
	 * hence we could hit memory corruption later in the test.
	 *
	 * One example is when THP is globally enabled, above allocate_area()
	 * calls could have the two areas merged into a single VMA (as they
	 * will have the same VMA flags so they're mergeable).  When we
	 * initialize the area_src above, it's possible that some part of
	 * area_dst could have been faulted in via one huge THP that will be
	 * shared between area_src and area_dst.  It could cause some of the
	 * area_dst won't be trapped by missing userfaults.
	 *
	 * This release_pages() will guarantee even if that happened, we'll
	 * proactively split the thp and drop any accidentally initialized
	 * pages within area_dst.
	 */
	uffd_test_ops->release_pages(area_dst);

	/* One read/write pipe pair per CPU, non-blocking. */
	pipefd = malloc(sizeof(int) * nr_cpus * 2);
	if (!pipefd)
		err("pipefd");
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
			err("pipe");
}
+
/* Byte-wise compare: 0 when the first @n bytes match, 1 otherwise. */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	size_t pos;

	for (pos = 0; pos < n; pos++) {
		if (str1[pos] != str2[pos])
			return 1;
	}
	return 0;
}
+
/*
 * Set (@wp true) or clear (@wp false, which also wakes waiters) write
 * protection on [start, start+len) via UFFDIO_WRITEPROTECT.
 */
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
{
	struct uffdio_writeprotect prms;

	/* Write protection page faults */
	prms.range.start = start;
	prms.range.len = len;
	/* Undo write-protect, do wakeup after that */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
}
+
/*
 * Resolve a minor fault on [start, start + len) with UFFDIO_CONTINUE,
 * then deliberately re-issue the ioctl to exercise the kernel's
 * -EEXIST error path.  Any unexpected outcome is fatal.
 */
static void continue_range(int ufd, __u64 start, __u64 len)
{
	struct uffdio_continue req;
	int ret;

	req.range.start = start;
	req.range.len = len;
	req.mode = 0;

	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
		    (uint64_t)start);

	/*
	 * Error handling within the kernel for continue is subtly different
	 * from copy or zeropage, so it may be a source of bugs. Trigger an
	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
	 */
	req.mapped = 0;
	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
	if (ret >= 0 || req.mapped != -EEXIST)
		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
		    ret, (int64_t) req.mapped);
}
+
/*
 * Per-cpu stress thread: repeatedly picks a page of area_dst (randomly
 * or sequentially depending on the bounce flags), locks the pthread
 * mutex embedded in that page -- the touch that raises the userfault
 * on a missing page -- then verifies and bumps the per-page counter.
 * Runs until 'finished' is set by stress().
 */
static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	unsigned long page_nr;
	unsigned long long count;

	if (!(bounces & BOUNCE_RANDOM)) {
		/* Sequential mode: vary the starting page per bounce run */
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			/* Non-racing: keep each cpu on its own page range */
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
				err("getrandom failed");
		} else
			page_nr += 1;
		page_nr %= nr_pages;
		/* May block in the uffd handler until the page is resolved */
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr])
			err("page_nr %lu memory corruption %llu %llu",
			    page_nr, count, count_verify[page_nr]);
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
	}

	return NULL;
}
+
/*
 * Re-issue a UFFDIO_COPY that must fail with -EEXIST (the page was
 * already copied).  uffd_test_ops->alias_mapping() first adjusts the
 * destination for the memory type's alias mapping (see the per-type
 * ops implementation).  Any other outcome, including success, is fatal.
 */
static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
				     uffdio_copy->len,
				     offset);
	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy->copy != -EEXIST)
			err("UFFDIO_COPY retry error: %"PRId64,
			    (int64_t)uffdio_copy->copy);
	} else {
		err("UFFDIO_COPY retry unexpected: %"PRId64,
		    (int64_t)uffdio_copy->copy);
	}
}
+
/*
 * Explicitly wake userfaultfd waiters blocked on [addr, addr + len).
 *
 * Needed after a UFFDIO_COPY that lost a race (-EEXIST): the copy did
 * not happen here, so the kernel will not wake the faulting thread
 * for us.
 */
static void wake_range(int ufd, unsigned long addr, unsigned long len)
{
	struct uffdio_range uffdio_wake;

	uffdio_wake.start = addr;
	uffdio_wake.len = len;

	/*
	 * Report failures through the file-wide err() helper rather than
	 * the old open-coded "fprintf(), exit(1)" comma expression, for
	 * consistency with every other helper in this file.
	 */
	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
		err("error waking %lu", addr);
}
+
/*
 * Resolve the missing page at 'offset' with UFFDIO_COPY.
 *
 * Returns 1 when this call actually copied the page, 0 when another
 * resolver raced us (-EEXIST) -- in that case we only wake the blocked
 * faulters.  With 'retry', additionally exercises the -EEXIST retry
 * path once per run (gated by test_uffdio_copy_eexist).
 */
static int __copy_page(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_copy uffdio_copy;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu\n", offset);
	uffdio_copy.dst = (unsigned long) area_dst + offset;
	uffdio_copy.src = (unsigned long) area_src + offset;
	uffdio_copy.len = page_size;
	/* Install the page write-protected when uffd-wp is under test */
	if (test_uffdio_wp)
		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
	else
		uffdio_copy.mode = 0;
	uffdio_copy.copy = 0;
	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* real retval in ufdio_copy.copy */
		if (uffdio_copy.copy != -EEXIST)
			err("UFFDIO_COPY error: %"PRId64,
			    (int64_t)uffdio_copy.copy);
		/* Lost the race: wake the faulting thread ourselves */
		wake_range(ufd, uffdio_copy.dst, page_size);
	} else if (uffdio_copy.copy != page_size) {
		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
	} else {
		if (test_uffdio_copy_eexist && retry) {
			test_uffdio_copy_eexist = false;
			retry_copy_page(ufd, &uffdio_copy, offset);
		}
		return 1;
	}
	return 0;
}
+
/* Copy one page, also exercising the -EEXIST retry path (stress mode). */
static int copy_page_retry(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, true);
}
+
/* Copy one page without exercising the retry path. */
static int copy_page(int ufd, unsigned long offset)
{
	return __copy_page(ufd, offset, false);
}
+
/*
 * Read one event from the userfaultfd file descriptor.
 *
 * Returns 0 when a full message was read into *msg, 1 when the caller
 * should simply retry (EAGAIN on a non-blocking fd, or EINTR).  Any
 * other failure, including a short read, is fatal.
 */
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
{
	/*
	 * Bug fix: read from the fd the caller passed in rather than the
	 * global 'uffd'.  All current callers pass the global, so this was
	 * latent, but honoring the parameter matches the signature and
	 * keeps the helper usable with a forked child's fd.
	 */
	int ret = read(ufd, msg, sizeof(*msg));

	if (ret != sizeof(*msg)) {
		if (ret < 0) {
			if (errno == EAGAIN || errno == EINTR)
				return 1;
			err("blocking read error");
		} else {
			err("short read");
		}
	}

	return 0;
}
+
/*
 * Service one userfaultfd message, which must be a pagefault event:
 *  - uffd-wp faults: drop write-protection on the page (with wakeup);
 *  - minor faults: bit-flip the page through its non-registered
 *    mapping, then UFFDIO_CONTINUE;
 *  - missing faults: resolve with UFFDIO_COPY.
 * The matching counter in *stats is incremented.
 */
static void uffd_handle_page_fault(struct uffd_msg *msg,
				   struct uffd_stats *stats)
{
	unsigned long offset;

	if (msg->event != UFFD_EVENT_PAGEFAULT)
		err("unexpected msg event %u", msg->event);

	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
		/* Write protect page faults */
		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
		stats->wp_faults++;
	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
		uint8_t *area;
		int b;

		/*
		 * Minor page faults
		 *
		 * To prove we can modify the original range for testing
		 * purposes, we're going to bit flip this range before
		 * continuing.
		 *
		 * Note that this requires all minor page fault tests operate on
		 * area_dst (non-UFFD-registered) and area_dst_alias
		 * (UFFD-registered).
		 */

		/* Translate the faulting alias address back into area_dst */
		area = (uint8_t *)(area_dst +
				   ((char *)msg->arg.pagefault.address -
				    area_dst_alias));
		for (b = 0; b < page_size; ++b)
			area[b] = ~area[b];
		continue_range(uffd, msg->arg.pagefault.address, page_size);
		stats->minor_faults++;
	} else {
		/*
		 * Missing page faults.
		 *
		 * Here we force a write check for each of the missing mode
		 * faults.  It's guaranteed because the only threads that
		 * will trigger uffd faults are the locking threads, and
		 * their first instruction to touch the missing page will
		 * always be pthread_mutex_lock().
		 *
		 * Note that here we relied on an NPTL glibc impl detail to
		 * always read the lock type at the entry of the lock op
		 * (pthread_mutex_t.__data.__type, offset 0x10) before
		 * doing any locking operations to guarantee that.  It's
		 * actually not good to rely on this impl detail because
		 * logically a pthread-compatible lib can implement the
		 * locks without types and we can fail when linking with
		 * them.  However since we used to find bugs with this
		 * strict check we still keep it around.  Hopefully this
		 * could be a good hint when it fails again.  If one day
		 * it'll break on some other impl of glibc we'll revisit.
		 */
		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			err("unexpected write fault");

		/* Page-align the fault address to get the copy offset */
		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
		offset &= ~(page_size-1);

		if (copy_page(uffd, offset))
			stats->missing_faults++;
	}
}
+
/*
 * Event loop of one monitor thread: waits on both the userfaultfd and
 * this cpu's control pipe.  Services PAGEFAULT events plus the
 * non-cooperative FORK (switch to the child's uffd), REMOVE
 * (unregister the range) and REMAP (record the new address) events.
 * A single byte written to the pipe terminates the loop.
 */
static void *uffd_poll_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	unsigned long cpu = stats->cpu;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	char tmp_chr;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		/* Infinite timeout, so ret == 0 should not happen */
		ret = poll(pollfd, 2, -1);
		if (ret <= 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			err("poll error: %d", ret);
		}
		/* A byte on the control pipe means "shut down" */
		if (pollfd[1].revents & POLLIN) {
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				err("read pipefd error");
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			err("pollfd[0].revents %d", pollfd[0].revents);
		/* Returns non-zero on EAGAIN/EINTR: just poll again */
		if (uffd_read_msg(uffd, &msg))
			continue;
		switch (msg.event) {
		default:
			err("unexpected msg event %u\n", msg.event);
			break;
		case UFFD_EVENT_PAGEFAULT:
			uffd_handle_page_fault(&msg, stats);
			break;
		case UFFD_EVENT_FORK:
			/* Follow the child: monitor its uffd from now on */
			close(uffd);
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_REMOVE:
			uffd_reg.range.start = msg.arg.remove.start;
			uffd_reg.range.len = msg.arg.remove.end -
				msg.arg.remove.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				err("remove failure");
			break;
		case UFFD_EVENT_REMAP:
			area_remap = area_dst;	/* save for later unmap */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}

	return NULL;
}
+
/*
 * Startup handshake for uffd_read_thread(): stress() locks this after
 * spawning the thread, and the thread unlocks it once it is ready to
 * be cancelled.  NOTE(review): this relies on unlocking a mutex from a
 * different thread than the one that locked it, which works on the
 * default Linux mutex but is formally undefined -- confirm intent.
 */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
/*
 * Blocking-read variant of the fault servicing thread (used when
 * BOUNCE_POLL is not set).  Loops forever handling pagefault messages;
 * stress() terminates it with pthread_cancel() while it is blocked in
 * read().  The mutex unlock signals the spawner that startup is done.
 */
static void *uffd_read_thread(void *arg)
{
	struct uffd_stats *stats = (struct uffd_stats *)arg;
	struct uffd_msg msg;

	pthread_mutex_unlock(&uffd_read_mutex);
	/* from here cancellation is ok */

	for (;;) {
		/* Non-zero return means retry (EAGAIN/EINTR) */
		if (uffd_read_msg(uffd, &msg))
			continue;
		uffd_handle_page_fault(&msg, stats);
	}

	return NULL;
}
+
+static void *background_thread(void *arg)
+{
+       unsigned long cpu = (unsigned long) arg;
+       unsigned long page_nr, start_nr, mid_nr, end_nr;
+
+       start_nr = cpu * nr_pages_per_cpu;
+       end_nr = (cpu+1) * nr_pages_per_cpu;
+       mid_nr = (start_nr + end_nr) / 2;
+
+       /* Copy the first half of the pages */
+       for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
+               copy_page_retry(uffd, page_nr * page_size);
+
+       /*
+        * If we need to test uffd-wp, set it up now.  Then we'll have
+        * at least the first half of the pages mapped already which
+        * can be write-protected for testing
+        */
+       if (test_uffdio_wp)
+               wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
+                       nr_pages_per_cpu * page_size, true);
+
+       /*
+        * Continue the 2nd half of the page copying, handling write
+        * protection faults if any
+        */
+       for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
+               copy_page_retry(uffd, page_nr * page_size);
+
+       return NULL;
+}
+
/*
 * Run one bounce of the stress test.  Per cpu we spawn: a locking
 * thread hammering area_dst, a userfault servicing thread (poll-based
 * or blocking-read-based depending on BOUNCE_POLL), and a background
 * thread UFFDIO_COPYing the cpu's share of pages in.  Fault counters
 * accumulate into uffd_stats[].  Returns non-zero on any thread
 * management failure.
 */
static int stress(struct uffd_stats *uffd_stats)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   (void *)&uffd_stats[cpu]))
				return 1;
			/* Wait until the reader signals it has started */
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src, the whole area has
	 * been transferred already by the background threads. The
	 * area_src could then be faulted in a racy way by still
	 * running uffdio_threads reading zeropages after we zapped
	 * area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
	uffd_test_ops->release_pages(area_src);

	/* Tell the locking threads to stop, then reap them */
	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			/* Poll threads exit on a control-pipe write... */
			if (write(pipefd[cpu*2+1], &c, 1) != 1)
				err("pipefd write error");
			if (pthread_join(uffd_threads[cpu],
					 (void *)&uffd_stats[cpu]))
				return 1;
		} else {
			/* ...read threads block in read() and need cancelling */
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	return 0;
}
+
/* Jump target for the SIGBUS handler; sigbuf is non-NULL only while a
 * signal test has armed it via sigsetjmp() in faulting_process(). */
sigjmp_buf jbuf, *sigbuf;
+
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+       if (sig == SIGBUS) {
+               if (sigbuf)
+                       siglongjmp(*sigbuf, 1);
+               abort();
+       }
+}
+
/*
 * For the non-cooperative userfaultfd tests we fork() a process that
 * generates pagefaults, mremap()s the area monitored by the
 * userfaultfd, and finally releases the monitored area.
 * For anonymous and shared memory the area is divided into two parts:
 * the first part is accessed before mremap, the second part after.
 * Since hugetlbfs does not support mremap, the entire monitored area
 * is accessed in a single pass for HUGETLB_TEST.
 * Releasing the pages currently generates an event only for shmem and
 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked for
 * hugetlb.
 * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: we
 * register the monitored area, generate pagefaults and test that the
 * signal is delivered, using UFFDIO_COPY to allocate the missing page
 * and retry.  For signal_test = 2 we test the robustness use case: we
 * release the monitored area and fork a process that will generate
 * pagefaults, verifying the signal is generated.  This also tests the
 * UFFD_FEATURE_EVENT_FORK event along with the signal feature; using a
 * monitor thread, we verify no userfault events are generated.
 */
static int faulting_process(int signal_test)
{
	unsigned long nr;
	unsigned long long count;
	unsigned long split_nr_pages;
	unsigned long lastnr;
	struct sigaction act;
	/* volatile: must survive siglongjmp() back into this frame */
	volatile unsigned long signalled = 0;

	split_nr_pages = (nr_pages + 1) / 2;

	if (signal_test) {
		sigbuf = &jbuf;
		memset(&act, 0, sizeof(act));
		act.sa_sigaction = sighndl;
		act.sa_flags = SA_SIGINFO;
		if (sigaction(SIGBUS, &act, 0))
			err("sigaction");
		/* Sentinel: no page has signalled yet */
		lastnr = (unsigned long)-1;
	}

	for (nr = 0; nr < split_nr_pages; nr++) {
		/* volatile: modified between sigsetjmp() and the re-entry */
		volatile int steps = 1;
		unsigned long offset = nr * page_size;

		if (signal_test) {
			if (sigsetjmp(*sigbuf, 1) != 0) {
				/* Same page signalling twice in step 1 means
				 * the fault was not resolved -- fatal. */
				if (steps == 1 && nr == lastnr)
					err("Signal repeated");

				lastnr = nr;
				if (signal_test == 1) {
					if (steps == 1) {
						/* This is a MISSING request */
						steps++;
						if (copy_page(uffd, offset))
							signalled++;
					} else {
						/* This is a WP request */
						assert(steps == 2);
						wp_range(uffd,
							 (__u64)area_dst +
							 offset,
							 page_size, false);
					}
				} else {
					/* signal_test == 2: only count it */
					signalled++;
					continue;
				}
			}
		}

		count = *area_count(area_dst, nr);
		if (count != count_verify[nr])
			err("nr %lu memory corruption %llu %llu\n",
			    nr, count, count_verify[nr]);
		/*
		 * Trigger write protection if there is by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	if (signal_test)
		return signalled != split_nr_pages;

	/* Move area_src over area_dst; monitored via UFFD_EVENT_REMAP */
	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
	if (area_dst == MAP_FAILED)
		err("mremap");
	/* Reset area_src since we just clobbered it */
	area_src = NULL;

	/* Touch the second half through the remapped address */
	for (; nr < nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			err("nr %lu memory corruption %llu %llu\n",
			    nr, count, count_verify[nr]);
		}
		/*
		 * Trigger write protection if there is by writing
		 * the same value back.
		 */
		*area_count(area_dst, nr) = count;
	}

	/* Generates UFFD_EVENT_REMOVE (for shmem/anon) */
	uffd_test_ops->release_pages(area_dst);

	/* After release every page must read back as zeroes */
	for (nr = 0; nr < nr_pages; nr++)
		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
			err("nr %lu is not zero", nr);

	return 0;
}
+
/*
 * Re-issue a UFFDIO_ZEROPAGE that must fail with -EEXIST (the page was
 * installed already).  uffd_test_ops->alias_mapping() first adjusts
 * range.start for the memory type's alias mapping.  Any other outcome,
 * including success, is fatal.
 */
static void retry_uffdio_zeropage(int ufd,
				  struct uffdio_zeropage *uffdio_zeropage,
				  unsigned long offset)
{
	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
				     uffdio_zeropage->range.len,
				     offset);
	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
		if (uffdio_zeropage->zeropage != -EEXIST)
			err("UFFDIO_ZEROPAGE error: %"PRId64,
			    (int64_t)uffdio_zeropage->zeropage);
	} else {
		err("UFFDIO_ZEROPAGE error: %"PRId64,
		    (int64_t)uffdio_zeropage->zeropage);
	}
}
+
/*
 * Resolve the page at 'offset' with UFFDIO_ZEROPAGE.
 *
 * Returns 1 when a zeropage was installed.  When the memory type does
 * not advertise zeropage support (has_zeropage == false), the ioctl
 * must instead fail with -EINVAL.  With 'retry', exercises the -EEXIST
 * retry path once per run (gated by test_uffdio_zeropage_eexist).
 */
static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
{
	struct uffdio_zeropage uffdio_zeropage;
	int ret;
	/* NOTE(review): '1 <<' assumes _UFFDIO_ZEROPAGE < 32; 1ULL would
	 * be safer if the ioctls bitmask ever grows past bit 31. */
	bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
	__s64 res;

	if (offset >= nr_pages * page_size)
		err("unexpected offset %lu", offset);
	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
	uffdio_zeropage.range.len = page_size;
	uffdio_zeropage.mode = 0;
	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
	res = uffdio_zeropage.zeropage;
	if (ret) {
		/* real retval in ufdio_zeropage.zeropage */
		if (has_zeropage)
			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
		else if (res != -EINVAL)
			err("UFFDIO_ZEROPAGE not -EINVAL");
	} else if (has_zeropage) {
		if (res != page_size) {
			err("UFFDIO_ZEROPAGE unexpected size");
		} else {
			if (test_uffdio_zeropage_eexist && retry) {
				test_uffdio_zeropage_eexist = false;
				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
						      offset);
			}
			return 1;
		}
	} else
		err("UFFDIO_ZEROPAGE succeeded");

	return 0;
}
+
/* Non-retrying wrapper around __uffdio_zeropage(). */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	return __uffdio_zeropage(ufd, offset, false);
}
+
/* Exercise UFFDIO_ZEROPAGE: register area_dst in MISSING mode and
 * verify zeropage resolution (where supported) yields a zero page. */
static int userfaultfd_zeropage_test(void)
{
	struct uffdio_register uffdio_register;

	printf("testing UFFDIO_ZEROPAGE: ");
	fflush(stdout);

	/* No extra uffd features needed for this test */
	uffd_test_ctx_init(0);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	/* Returns 1 only when a zeropage was actually installed */
	if (uffdio_zeropage(uffd, 0))
		if (my_bcmp(area_dst, zeropage, page_size))
			err("zeropage is not zero");

	printf("done.\n");
	return 0;
}
+
/*
 * Exercise the non-cooperative events: register area_dst, fork a child
 * running faulting_process(0) (which faults, mremaps and releases the
 * area) and let uffd_poll_thread service the resulting PAGEFAULT, FORK,
 * REMAP and REMOVE events.  Passes when every page was resolved via a
 * missing fault.
 */
static int userfaultfd_events_test(void)
{
	struct uffdio_register uffdio_register;
	pthread_t uffd_mon;
	/* NOTE(review): local 'err' shadows the err() helper below; this
	 * presumably only compiles because err() is a function-like macro
	 * -- confirm before renaming either. */
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing events (fork, remap, remove): ");
	fflush(stdout);

	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		UFFD_FEATURE_EVENT_REMOVE;
	uffd_test_ctx_init(features);

	/* Non-blocking so the poll loop never stalls in read() */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	pid = fork();
	if (pid < 0)
		err("fork");

	if (!pid)
		exit(faulting_process(0));

	waitpid(pid, &err, 0);
	if (err)
		err("faulting process failed");
	/* One byte on the control pipe stops the monitor thread */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		return 1;

	uffd_stats_report(&stats, 1);

	return stats.missing_faults != nr_pages;
}
+
/*
 * Exercise UFFD_FEATURE_SIGBUS: first in-process (faulting_process(1):
 * every fault must raise SIGBUS, resolved by hand via UFFDIO_COPY /
 * wp_range), then from a forked child after releasing the area
 * (faulting_process(2)), verifying via the monitor thread that no
 * userfault events leak through.
 */
static int userfaultfd_sig_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long userfaults;
	pthread_t uffd_mon;
	/* NOTE(review): local 'err' shadows the err() helper below; this
	 * presumably only compiles because err() is a function-like macro. */
	int err, features;
	pid_t pid;
	char c;
	struct uffd_stats stats = { 0 };

	printf("testing signal delivery: ");
	fflush(stdout);

	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
	uffd_test_ctx_init(features);

	/* Non-blocking so the poll loop never stalls in read() */
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (test_uffdio_wp)
		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	if (faulting_process(1))
		err("faulting process failed");

	/* Zap the area so the child's accesses fault again */
	uffd_test_ops->release_pages(area_dst);

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	pid = fork();
	if (pid < 0)
		err("fork");

	if (!pid)
		exit(faulting_process(2));

	waitpid(pid, &err, 0);
	if (err)
		err("faulting process failed");
	/* One byte on the control pipe stops the monitor thread */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	/* NOTE(review): the monitor returns NULL, so 'userfaults' should be
	 * 0 on success; relies on sizeof(void *) == sizeof(long). */
	if (pthread_join(uffd_mon, (void **)&userfaults))
		return 1;

	printf("done.\n");
	if (userfaults)
		err("Signal test failed, userfaults: %ld", userfaults);

	return userfaults != 0;
}
+
/*
 * Verify every page of 'p' holds the pattern established by the minor
 * fault path: userfaultfd_minor_test() memsets page i with
 * (i % ((uint8_t)-1)), and uffd_handle_page_fault() bit-flips each
 * byte before UFFDIO_CONTINUE, so the expected byte is the complement.
 */
void check_memory_contents(char *p)
{
	unsigned long i;
	uint8_t expected_byte;
	void *expected_page;

	/* Scratch page used to build the expected per-page pattern */
	if (posix_memalign(&expected_page, page_size, page_size))
		err("out of memory");

	for (i = 0; i < nr_pages; ++i) {
		expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
		memset(expected_page, expected_byte, page_size);
		if (my_bcmp(expected_page, p + (i * page_size), page_size))
			err("unexpected page contents after minor fault");
	}

	free(expected_page);
}
+
/*
 * Exercise minor faults: register area_dst_alias in MINOR mode,
 * populate through the non-registered mapping (area_dst), then read
 * through the registered alias so every first touch raises a minor
 * fault which uffd_poll_thread resolves (bit-flip + UFFDIO_CONTINUE).
 * With test_collapse, additionally MADV_COLLAPSEs the area into
 * PMD-mapped THPs and re-verifies the contents.
 */
static int userfaultfd_minor_test(void)
{
	unsigned long p;
	struct uffdio_register uffdio_register;
	pthread_t uffd_mon;
	char c;
	struct uffd_stats stats = { 0 };

	if (!test_uffdio_minor)
		return 0;

	printf("testing minor faults: ");
	fflush(stdout);

	uffd_test_ctx_init(uffd_minor_feature());

	uffdio_register.range.start = (unsigned long)area_dst_alias;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		err("register failure");

	assert_expected_ioctls_present(
		uffdio_register.mode, uffdio_register.ioctls);

	/*
	 * After registering with UFFD, populate the non-UFFD-registered side of
	 * the shared mapping. This should *not* trigger any UFFD minor faults.
	 */
	for (p = 0; p < nr_pages; ++p) {
		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
		       page_size);
	}

	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
		err("uffd_poll_thread create");

	/*
	 * Read each of the pages back using the UFFD-registered mapping. We
	 * expect that the first time we touch a page, it will result in a minor
	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
	 * page's contents, and then issuing a CONTINUE ioctl.
	 */
	check_memory_contents(area_dst_alias);

	/* One byte on the control pipe stops the monitor thread */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		err("pipe write");
	if (pthread_join(uffd_mon, NULL))
		return 1;

	uffd_stats_report(&stats, 1);

	if (test_collapse) {
		printf("testing collapse of uffd memory into PMD-mapped THPs:");
		if (madvise(area_dst_alias, nr_pages * page_size,
			    MADV_COLLAPSE))
			err("madvise(MADV_COLLAPSE)");

		uffd_test_ops->check_pmd_mapping(area_dst,
						 nr_pages * page_size /
						 hpage_size);
		/*
		 * This won't cause uffd-fault - it purely just makes sure there
		 * was no corruption.
		 */
		check_memory_contents(area_dst_alias);
		printf(" done.\n");
	}

	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
}
+
/* Helper for building 64-bit single-bit masks */
#define BIT_ULL(nr)                   (1ULL << (nr))
/* Bits of a /proc/<pid>/pagemap entry -- see
 * Documentation/admin-guide/mm/pagemap.rst for the layout. */
#define PM_SOFT_DIRTY                 BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
#define PM_UFFD_WP                    BIT_ULL(57)
#define PM_FILE                       BIT_ULL(61)
#define PM_SWAP                       BIT_ULL(62)
#define PM_PRESENT                    BIT_ULL(63)
+
/* Open this process's pagemap interface; failure is fatal. */
static int pagemap_open(void)
{
	int pm_fd = open("/proc/self/pagemap", O_RDONLY);

	if (pm_fd < 0)
		err("open pagemap");
	return pm_fd;
}
+
/*
 * Read the 64-bit pagemap entry covering 'vaddr' from an already-open
 * /proc/<pid>/pagemap fd.
 *
 * NOTE(review): '>> 12' hard-codes a 4K base page size; on a system
 * with a different base page size the wrong entry would be read --
 * confirm whether this test is 4K-only or should use the runtime page
 * size instead.
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
	uint64_t value;
	int ret;

	/* One u64 entry per virtual page frame number */
	ret = pread(fd, &value, sizeof(uint64_t),
		    ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
	if (ret != sizeof(uint64_t))
		err("pread() on pagemap failed");

	return value;
}
+
/* Kept as a macro so err() reports the caller's __LINE__, not this one. */
#define  pagemap_check_wp(value, wp) do {				\
		if (!!(value & PM_UFFD_WP) != wp)			\
			err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
	} while (0)
+
+static int pagemap_test_fork(bool present)
+{
+       pid_t child = fork();
+       uint64_t value;
+       int fd, result;
+
+       if (!child) {
+               /* Open the pagemap fd of the child itself */
+               fd = pagemap_open();
+               value = pagemap_read_vaddr(fd, area_dst);
+               /*
+                * After fork() uffd-wp bit should be gone as long as we're
+                * without UFFD_FEATURE_EVENT_FORK
+                */
+               pagemap_check_wp(value, false);
+               /* Succeed */
+               exit(0);
+       }
+       waitpid(child, &result, 0);
+       return result;
+}
+
+static void userfaultfd_pagemap_test(unsigned int test_pgsize)
+{
+       struct uffdio_register uffdio_register;
+       int pagemap_fd;
+       uint64_t value;
+
+       /* Pagemap tests uffd-wp only */
+       if (!test_uffdio_wp)
+               return;
+
+       /* Not enough memory to test this page size */
+       if (test_pgsize > nr_pages * page_size)
+               return;
+
+       printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
+       /* Flush so it doesn't flush twice in parent/child later */
+       fflush(stdout);
+
+       uffd_test_ctx_init(0);
+
+       if (test_pgsize > page_size) {
+               /* This is a thp test */
+               if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
+                       err("madvise(MADV_HUGEPAGE) failed");
+       } else if (test_pgsize == page_size) {
+               /* This is normal page test; force no thp */
+               if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
+                       err("madvise(MADV_NOHUGEPAGE) failed");
+       }
+
+       uffdio_register.range.start = (unsigned long) area_dst;
+       uffdio_register.range.len = nr_pages * page_size;
+       uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+               err("register failed");
+
+       pagemap_fd = pagemap_open();
+
+       /* Touch the page */
+       *area_dst = 1;
+       wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
+       value = pagemap_read_vaddr(pagemap_fd, area_dst);
+       pagemap_check_wp(value, true);
+       /* Make sure the uffd-wp bit is dropped after fork() */
+       if (pagemap_test_fork(true))
+               err("Detected stall uffd-wp bit in child");
+
+       /* Exclusive required or PAGEOUT won't work */
+       if (!(value & PM_MMAP_EXCLUSIVE))
+               err("multiple mapping detected: 0x%"PRIx64, value);
+
+       if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
+               err("madvise(MADV_PAGEOUT) failed");
+
+       /* Uffd-wp should persist even swapped out */
+       value = pagemap_read_vaddr(pagemap_fd, area_dst);
+       pagemap_check_wp(value, true);
+       /* Make sure the uffd-wp bit is dropped after fork() */
+       if (pagemap_test_fork(false))
+               err("Detected stall uffd-wp bit in child");
+
+       /* Unprotect; this tests swap pte modifications */
+       wp_range(uffd, (uint64_t)area_dst, page_size, false);
+       value = pagemap_read_vaddr(pagemap_fd, area_dst);
+       pagemap_check_wp(value, false);
+
+       /* Fault in the page from disk */
+       *area_dst = 2;
+       value = pagemap_read_vaddr(pagemap_fd, area_dst);
+       pagemap_check_wp(value, false);
+
+       close(pagemap_fd);
+       printf("done\n");
+}
+
+static int userfaultfd_stress(void)
+{
+       void *area;
+       unsigned long nr;
+       struct uffdio_register uffdio_register;
+       struct uffd_stats uffd_stats[nr_cpus];
+
+       uffd_test_ctx_init(0);
+
+       if (posix_memalign(&area, page_size, page_size))
+               err("out of memory");
+       zeropage = area;
+       bzero(zeropage, page_size);
+
+       pthread_mutex_lock(&uffd_read_mutex);
+
+       pthread_attr_init(&attr);
+       pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+       while (bounces--) {
+               printf("bounces: %d, mode:", bounces);
+               if (bounces & BOUNCE_RANDOM)
+                       printf(" rnd");
+               if (bounces & BOUNCE_RACINGFAULTS)
+                       printf(" racing");
+               if (bounces & BOUNCE_VERIFY)
+                       printf(" ver");
+               if (bounces & BOUNCE_POLL)
+                       printf(" poll");
+               else
+                       printf(" read");
+               printf(", ");
+               fflush(stdout);
+
+               if (bounces & BOUNCE_POLL)
+                       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+               else
+                       fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+               /* register */
+               uffdio_register.range.start = (unsigned long) area_dst;
+               uffdio_register.range.len = nr_pages * page_size;
+               uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+               if (test_uffdio_wp)
+                       uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+               if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+                       err("register failure");
+               assert_expected_ioctls_present(
+                       uffdio_register.mode, uffdio_register.ioctls);
+
+               if (area_dst_alias) {
+                       uffdio_register.range.start = (unsigned long)
+                               area_dst_alias;
+                       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+                               err("register failure alias");
+               }
+
+               /*
+                * The madvise done previously isn't enough: some
+                * uffd_thread could have read userfaults (one of
+                * those already resolved by the background thread)
+                * and it may be in the process of calling
+                * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+                * area_src and it would map a zero page in it (of
+                * course such a UFFDIO_COPY is perfectly safe as it'd
+                * return -EEXIST). The problem comes at the next
+                * bounce though: that racing UFFDIO_COPY would
+                * generate zeropages in the area_src, so invalidating
+                * the previous MADV_DONTNEED. Without this additional
+                * MADV_DONTNEED those zeropages leftovers in the
+                * area_src would lead to -EEXIST failure during the
+                * next bounce, effectively leaving a zeropage in the
+                * area_dst.
+                *
+                * Try commenting out this madvise to see the memory
+                * corruption being caught pretty quickly.
+                *
+                * khugepaged is also inhibited to collapse THP after
+                * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+                * required to MADV_DONTNEED here.
+                */
+               uffd_test_ops->release_pages(area_dst);
+
+               uffd_stats_reset(uffd_stats, nr_cpus);
+
+               /* bounce pass */
+               if (stress(uffd_stats))
+                       return 1;
+
+               /* Clear all the write protections if there is any */
+               if (test_uffdio_wp)
+                       wp_range(uffd, (unsigned long)area_dst,
+                                nr_pages * page_size, false);
+
+               /* unregister */
+               if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
+                       err("unregister failure");
+               if (area_dst_alias) {
+                       uffdio_register.range.start = (unsigned long) area_dst;
+                       if (ioctl(uffd, UFFDIO_UNREGISTER,
+                                 &uffdio_register.range))
+                               err("unregister failure alias");
+               }
+
+               /* verification */
+               if (bounces & BOUNCE_VERIFY)
+                       for (nr = 0; nr < nr_pages; nr++)
+                               if (*area_count(area_dst, nr) != count_verify[nr])
+                                       err("error area_count %llu %llu %lu\n",
+                                           *area_count(area_src, nr),
+                                           count_verify[nr], nr);
+
+               /* prepare next bounce */
+               swap(area_src, area_dst);
+
+               swap(area_src_alias, area_dst_alias);
+
+               uffd_stats_report(uffd_stats, nr_cpus);
+       }
+
+       if (test_type == TEST_ANON) {
+               /*
+                * shmem/hugetlb won't be able to run since they have different
+                * behavior on fork() (file-backed memory normally drops ptes
+                * directly when fork), meanwhile the pagemap test will verify
+                * pgtable entry of fork()ed child.
+                */
+               userfaultfd_pagemap_test(page_size);
+               /*
+                * Hard-code for x86_64 for now for 2M THP, as x86_64 is
+                * currently the only one that supports uffd-wp
+                */
+               userfaultfd_pagemap_test(page_size * 512);
+       }
+
+       return userfaultfd_zeropage_test() || userfaultfd_sig_test()
+               || userfaultfd_events_test() || userfaultfd_minor_test();
+}
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+       unsigned long hps = 0;
+       char *line = NULL;
+       size_t linelen = 0;
+       FILE *f = fopen("/proc/meminfo", "r");
+
+       if (!f)
+               return 0;
+       while (getline(&line, &linelen, f) > 0) {
+               if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
+                       hps <<= 10;
+                       break;
+               }
+       }
+
+       free(line);
+       fclose(f);
+       return hps;
+}
+
+static void set_test_type(const char *type)
+{
+       if (!strcmp(type, "anon")) {
+               test_type = TEST_ANON;
+               uffd_test_ops = &anon_uffd_test_ops;
+       } else if (!strcmp(type, "hugetlb")) {
+               test_type = TEST_HUGETLB;
+               uffd_test_ops = &hugetlb_uffd_test_ops;
+       } else if (!strcmp(type, "hugetlb_shared")) {
+               map_shared = true;
+               test_type = TEST_HUGETLB;
+               uffd_test_ops = &hugetlb_uffd_test_ops;
+               /* Minor faults require shared hugetlb; only enable here. */
+               test_uffdio_minor = true;
+       } else if (!strcmp(type, "shmem")) {
+               map_shared = true;
+               test_type = TEST_SHMEM;
+               uffd_test_ops = &shmem_uffd_test_ops;
+               test_uffdio_minor = true;
+       }
+}
+
+static void parse_test_type_arg(const char *raw_type)
+{
+       char *buf = strdup(raw_type);
+       uint64_t features = UFFD_API_FEATURES;
+
+       while (buf) {
+               const char *token = strsep(&buf, ":");
+
+               if (!test_type)
+                       set_test_type(token);
+               else if (!strcmp(token, "dev"))
+                       test_dev_userfaultfd = true;
+               else if (!strcmp(token, "syscall"))
+                       test_dev_userfaultfd = false;
+               else if (!strcmp(token, "collapse"))
+                       test_collapse = true;
+               else
+                       err("unrecognized test mod '%s'", token);
+       }
+
+       if (!test_type)
+               err("failed to parse test type argument: '%s'", raw_type);
+
+       if (test_collapse && test_type != TEST_SHMEM)
+               err("Unsupported test: %s", raw_type);
+
+       if (test_type == TEST_HUGETLB)
+               page_size = hpage_size;
+       else
+               page_size = sysconf(_SC_PAGE_SIZE);
+
+       if (!page_size)
+               err("Unable to determine page size");
+       if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+           > page_size)
+               err("Impossible to run this test");
+
+       /*
+        * Whether we can test certain features depends not just on test type,
+        * but also on whether or not this particular kernel supports the
+        * feature.
+        */
+
+       userfaultfd_open(&features);
+
+       test_uffdio_wp = test_uffdio_wp &&
+               (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
+       test_uffdio_minor = test_uffdio_minor &&
+               (features & uffd_minor_feature());
+
+       close(uffd);
+       uffd = -1;
+}
+
+static void sigalrm(int sig)
+{
+       if (sig != SIGALRM)
+               abort();
+       test_uffdio_copy_eexist = true;
+       test_uffdio_zeropage_eexist = true;
+       alarm(ALARM_INTERVAL_SECS);
+}
+
+int main(int argc, char **argv)
+{
+       size_t bytes;
+
+       if (argc < 4)
+               usage();
+
+       if (signal(SIGALRM, sigalrm) == SIG_ERR)
+               err("failed to arm SIGALRM");
+       alarm(ALARM_INTERVAL_SECS);
+
+       hpage_size = default_huge_page_size();
+       parse_test_type_arg(argv[1]);
+       bytes = atol(argv[2]) * 1024 * 1024;
+
+       if (test_collapse && bytes & (hpage_size - 1))
+               err("MiB must be multiple of %lu if :collapse mod set",
+                   hpage_size >> 20);
+
+       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+       if (test_collapse) {
+               /* nr_cpus must divide (bytes / page_size), otherwise,
+                * area allocations of (nr_pages * page_size) won't be a
+                * multiple of hpage_size, even if bytes is a multiple of
+                * hpage_size.
+                *
+                * This means that nr_cpus must divide (N * (2 << (H - P)))
+                * where:
+                *      bytes = hpage_size * N
+                *      hpage_size = 2 << H
+                *      page_size = 2 << P
+                *
+                * And we want to choose nr_cpus to be the largest value
+                * satisfying this constraint, not larger than the number
+                * of online CPUs. Unfortunately, prime factorization of
+                * N and nr_cpus may be arbitrary, so have to search for it.
+                * Instead, just use the highest power of 2 dividing both
+                * nr_cpus and (bytes / page_size).
+                */
+               int x = factor_of_2(nr_cpus);
+               int y = factor_of_2(bytes / page_size);
+
+               nr_cpus = x < y ? x : y;
+       }
+       nr_pages_per_cpu = bytes / page_size / nr_cpus;
+       if (!nr_pages_per_cpu) {
+               _err("invalid MiB");
+               usage();
+       }
+
+       bounces = atoi(argv[3]);
+       if (bounces <= 0) {
+               _err("invalid bounces");
+               usage();
+       }
+       nr_pages = nr_pages_per_cpu * nr_cpus;
+
+       if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
+               unsigned int memfd_flags = 0;
+
+               if (test_type == TEST_HUGETLB)
+                       memfd_flags = MFD_HUGETLB;
+               mem_fd = memfd_create(argv[0], memfd_flags);
+               if (mem_fd < 0)
+                       err("memfd_create");
+               if (ftruncate(mem_fd, nr_pages * page_size * 2))
+                       err("ftruncate");
+               if (fallocate(mem_fd,
+                             FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+                             nr_pages * page_size * 2))
+                       err("fallocate");
+       }
+       printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+              nr_pages, nr_pages_per_cpu);
+       return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+int main(void)
+{
+       printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
+       return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/mm/util.h b/tools/testing/selftests/mm/util.h
new file mode 100644 (file)
index 0000000..b27d261
--- /dev/null
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KSELFTEST_VM_UTIL_H
+#define __KSELFTEST_VM_UTIL_H
+
+#include <stdint.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <string.h> /* ffsl() */
+#include <unistd.h> /* _SC_PAGESIZE */
+
+static unsigned int __page_size;
+static unsigned int __page_shift;
+
+static inline unsigned int page_size(void)
+{
+       if (!__page_size)
+               __page_size = sysconf(_SC_PAGESIZE);
+       return __page_size;
+}
+
+static inline unsigned int page_shift(void)
+{
+       if (!__page_shift)
+               __page_shift = (ffsl(page_size()) - 1);
+       return __page_shift;
+}
+
+#define PAGE_SHIFT     (page_shift())
+#define PAGE_SIZE      (page_size())
+/*
+ * On ppc64 this will only work with radix 2M hugepage size
+ */
+#define HPAGE_SHIFT 21
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent)       ((ent) & ((1ull << 55) - 1))
+
+
+static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd)
+{
+       uint64_t ent[2];
+
+       /* drop pmd */
+       if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+                MAP_FIXED | MAP_ANONYMOUS |
+                MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+               errx(2, "mmap transhuge");
+
+       if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+               err(2, "MADV_HUGEPAGE");
+
+       /* allocate transparent huge page */
+       *(volatile void **)ptr = ptr;
+
+       if (pread(pagemap_fd, ent, sizeof(ent),
+                 (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+               err(2, "read pagemap");
+
+       if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+           PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+           !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+               return PAGEMAP_PFN(ent[0]);
+
+       return -1;
+}
+
+#endif
diff --git a/tools/testing/selftests/mm/va_128TBswitch.c b/tools/testing/selftests/mm/va_128TBswitch.c
new file mode 100644 (file)
index 0000000..1d20689
--- /dev/null
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#ifdef __powerpc64__
+#define PAGE_SIZE      (64 << 10)
+/*
+ * This will work with 16M and 2M hugepage size
+ */
+#define HUGETLB_SIZE   (16 << 20)
+#else
+#define PAGE_SIZE      (4 << 10)
+#define HUGETLB_SIZE   (2 << 20)
+#endif
+
+/*
+ * >= 128TB is the hint addr value we used to select
+ * large address space.
+ */
+#define ADDR_SWITCH_HINT (1UL << 47)
+#define LOW_ADDR       ((void *) (1UL << 30))
+#define HIGH_ADDR      ((void *) (1UL << 48))
+
+struct testcase {
+       void *addr;
+       unsigned long size;
+       unsigned long flags;
+       const char *msg;
+       unsigned int low_addr_required:1;
+       unsigned int keep_mapped:1;
+};
+
+static struct testcase testcases[] = {
+       {
+               /*
+                * If stack is moved, we could possibly allocate
+                * this at the requested address.
+                */
+               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+               .low_addr_required = 1,
+       },
+       {
+               /*
+                * We should never allocate at the requested address or above it
+                * The len crosses the 128TB boundary. Without MAP_FIXED
+                * we will always search in the lower address space.
+                */
+               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
+               .low_addr_required = 1,
+       },
+       {
+               /*
+                * Exact mapping at 128TB; the area is free, so we should get it
+                * even without MAP_FIXED.
+                */
+               .addr = ((void *)(ADDR_SWITCH_HINT)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+       },
+       {
+               .addr = NULL,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(NULL)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = LOW_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(LOW_ADDR)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR) again",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
+       },
+       {
+               .addr = (void *) -1,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *) -1,
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1) again",
+       },
+       {
+               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
+               .low_addr_required = 1,
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
+               .low_addr_required = 1,
+               .keep_mapped = 1,
+       },
+       {
+               .addr = ((void *)(ADDR_SWITCH_HINT)),
+               .size = PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT),
+               .size = 2 * PAGE_SIZE,
+               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+       },
+};
+
+static struct testcase hugetlb_testcases[] = {
+       {
+               .addr = NULL,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(NULL, MAP_HUGETLB)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = LOW_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
+               .low_addr_required = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = HIGH_ADDR,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
+       },
+       {
+               .addr = (void *) -1,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1, MAP_HUGETLB)",
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *) -1,
+               .size = HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(-1, MAP_HUGETLB) again",
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+               .size = 2 * HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
+               .low_addr_required = 1,
+               .keep_mapped = 1,
+       },
+       {
+               .addr = (void *)(ADDR_SWITCH_HINT),
+               .size = 2 * HUGETLB_SIZE,
+               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+               .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
+       },
+};
+
+static int run_test(struct testcase *test, int count)
+{
+       void *p;
+       int i, ret = KSFT_PASS;
+
+       for (i = 0; i < count; i++) {
+               struct testcase *t = test + i;
+
+               p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
+
+               printf("%s: %p - ", t->msg, p);
+
+               if (p == MAP_FAILED) {
+                       printf("FAILED\n");
+                       ret = KSFT_FAIL;
+                       continue;
+               }
+
+               if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
+                       printf("FAILED\n");
+                       ret = KSFT_FAIL;
+               } else {
+                       /*
+                        * Do a dereference of the address returned so that we catch
+                        * bugs in page fault handling
+                        */
+                       memset(p, 0, t->size);
+                       printf("OK\n");
+               }
+               if (!t->keep_mapped)
+                       munmap(p, t->size);
+       }
+
+       return ret;
+}
+
+static int supported_arch(void)
+{
+#if defined(__powerpc64__)
+       return 1;
+#elif defined(__x86_64__)
+       return 1;
+#else
+       return 0;
+#endif
+}
+
+int main(int argc, char **argv)
+{
+       int ret;
+
+       if (!supported_arch())
+               return KSFT_SKIP;
+
+       ret = run_test(testcases, ARRAY_SIZE(testcases));
+       if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+               ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+       return ret;
+}
diff --git a/tools/testing/selftests/mm/va_128TBswitch.sh b/tools/testing/selftests/mm/va_128TBswitch.sh
new file mode 100644 (file)
index 0000000..4158075
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2022 Adam Sindelar (Meta) <adam@wowsignal.io>
+#
+# This is a test for mmap behavior with 5-level paging. This script wraps the
+# real test to check that the kernel is configured to support at least 5
+# pagetable levels.
+
+# 1 means the test failed
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+fail()
+{
+       echo "$1"
+       exit $exitcode
+}
+
+check_supported_x86_64()
+{
+       local config="/proc/config.gz"
+       [[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
+       [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
+
+       # gzip -dcfq automatically handles both compressed and plaintext input.
+       # See man 1 gzip under '-f'.
+       local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
+
+       if [[ "${pg_table_levels}" -lt 5 ]]; then
+               echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
+               exit $ksft_skip
+       fi
+}
+
+check_test_requirements()
+{
+       # The test supports x86_64 and powerpc64. We currently have no useful
+       # eligibility check for powerpc64, and the test itself will reject other
+       # architectures.
+       case `uname -m` in
+               "x86_64")
+                       check_supported_x86_64
+               ;;
+               *)
+                       return 0
+               ;;
+       esac
+}
+
+check_test_requirements
+./va_128TBswitch
diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c
new file mode 100644 (file)
index 0000000..c059264
--- /dev/null
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Anshuman Khandual, IBM Corp.
+ *
+ * Works on architectures which support 128TB virtual
+ * address range and beyond.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+/*
+ * Maximum address range mapped with a single mmap()
+ * call is a little bit more than 16GB. Hence 16GB is
+ * chosen as the single chunk size for address space
+ * mapping.
+ */
+#define MAP_CHUNK_SIZE   17179869184UL /* 16GB */
+
+/*
+ * Address space till 128TB is mapped without any hint
+ * and is enabled by default. Address space beyond 128TB
+ * till 512TB is obtained by passing hint address as the
+ * first argument into mmap() system call.
+ *
+ * The process heap address space is divided into two
+ * different areas one below 128TB and one above 128TB
+ * till it reaches 512TB. One with size 128TB and the
+ * other being 384TB.
+ *
+ * On Arm64 the address space is 256TB and no high mappings
+ * are supported so far.
+ */
+
+#define NR_CHUNKS_128TB   8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_256TB   (NR_CHUNKS_128TB * 2UL)
+#define NR_CHUNKS_384TB   (NR_CHUNKS_128TB * 3UL)
+
+#define ADDR_MARK_128TB  (1UL << 47) /* First address beyond 128TB */
+#define ADDR_MARK_256TB  (1UL << 48) /* First address beyond 256TB */
+
+#ifdef __aarch64__
+#define HIGH_ADDR_MARK  ADDR_MARK_256TB
+#define HIGH_ADDR_SHIFT 49
+#define NR_CHUNKS_LOW   NR_CHUNKS_256TB
+#define NR_CHUNKS_HIGH  0
+#else
+#define HIGH_ADDR_MARK  ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
+#endif
+
+static char *hind_addr(void)
+{
+       int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
+
+       return (char *) (1UL << bits);
+}
+
+static int validate_addr(char *ptr, int high_addr)
+{
+       unsigned long addr = (unsigned long) ptr;
+
+       if (high_addr) {
+               if (addr < HIGH_ADDR_MARK) {
+                       printf("Bad address %lx\n", addr);
+                       return 1;
+               }
+               return 0;
+       }
+
+       if (addr > HIGH_ADDR_MARK) {
+               printf("Bad address %lx\n", addr);
+               return 1;
+       }
+       return 0;
+}
+
+static int validate_lower_address_hint(void)
+{
+       char *ptr;
+
+       ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
+                       PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+       if (ptr == MAP_FAILED)
+               return 0;
+
+       return 1;
+}
+
+int main(int argc, char *argv[])
+{
+       char *ptr[NR_CHUNKS_LOW];
+       char *hptr[NR_CHUNKS_HIGH];
+       char *hint;
+       unsigned long i, lchunks, hchunks;
+
+       for (i = 0; i < NR_CHUNKS_LOW; i++) {
+               ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+               if (ptr[i] == MAP_FAILED) {
+                       if (validate_lower_address_hint())
+                               return 1;
+                       break;
+               }
+
+               if (validate_addr(ptr[i], 0))
+                       return 1;
+       }
+       lchunks = i;
+
+       for (i = 0; i < NR_CHUNKS_HIGH; i++) {
+               hint = hind_addr();
+               hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+               if (hptr[i] == MAP_FAILED)
+                       break;
+
+               if (validate_addr(hptr[i], 1))
+                       return 1;
+       }
+       hchunks = i;
+
+       for (i = 0; i < lchunks; i++)
+               munmap(ptr[i], MAP_CHUNK_SIZE);
+
+       for (i = 0; i < hchunks; i++)
+               munmap(hptr[i], MAP_CHUNK_SIZE);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c
new file mode 100644 (file)
index 0000000..40e7956
--- /dev/null
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <fcntl.h>
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
+#define SMAP_FILE_PATH "/proc/self/smaps"
+#define MAX_LINE_LENGTH 500
+
+uint64_t pagemap_get_entry(int fd, char *start)
+{
+       const unsigned long pfn = (unsigned long)start / getpagesize();
+       uint64_t entry;
+       int ret;
+
+       ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
+       if (ret != sizeof(entry))
+               ksft_exit_fail_msg("reading pagemap failed\n");
+       return entry;
+}
+
+bool pagemap_is_softdirty(int fd, char *start)
+{
+       uint64_t entry = pagemap_get_entry(fd, start);
+
+       // Check if the soft-dirty bit (bit 55) is set
+       return entry & 0x0080000000000000ull;
+}
+
+bool pagemap_is_swapped(int fd, char *start)
+{
+       uint64_t entry = pagemap_get_entry(fd, start);
+
+       return entry & 0x4000000000000000ull;
+}
+
+bool pagemap_is_populated(int fd, char *start)
+{
+       uint64_t entry = pagemap_get_entry(fd, start);
+
+       /* Present or swapped. */
+       return entry & 0xc000000000000000ull;
+}
+
+unsigned long pagemap_get_pfn(int fd, char *start)
+{
+       uint64_t entry = pagemap_get_entry(fd, start);
+
+       /* If present (bit 63 set), the PFN occupies bits 0-54. */
+       if (entry & 0x8000000000000000ull)
+               return entry & 0x007fffffffffffffull;
+       return -1ul;
+}
+
+void clear_softdirty(void)
+{
+       int ret;
+       const char *ctrl = "4";
+       int fd = open("/proc/self/clear_refs", O_WRONLY);
+
+       if (fd < 0)
+               ksft_exit_fail_msg("opening clear_refs failed\n");
+       ret = write(fd, ctrl, strlen(ctrl));
+       close(fd);
+       if (ret != strlen(ctrl))
+               ksft_exit_fail_msg("writing clear_refs failed\n");
+}
+
+bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
+{
+       while (fgets(buf, len, fp)) {
+               if (!strncmp(buf, pattern, strlen(pattern)))
+                       return true;
+       }
+       return false;
+}
+
+uint64_t read_pmd_pagesize(void)
+{
+       int fd;
+       char buf[20];
+       ssize_t num_read;
+
+       fd = open(PMD_SIZE_FILE_PATH, O_RDONLY);
+       if (fd == -1)
+               ksft_exit_fail_msg("Open hpage_pmd_size failed\n");
+
+       num_read = read(fd, buf, 19);
+       if (num_read < 1) {
+               close(fd);
+               ksft_exit_fail_msg("Read hpage_pmd_size failed\n");
+       }
+       buf[num_read] = '\0';
+       close(fd);
+
+       return strtoul(buf, NULL, 10);
+}
+
+bool __check_huge(void *addr, char *pattern, int nr_hpages,
+                 uint64_t hpage_size)
+{
+       uint64_t thp = -1;
+       int ret;
+       FILE *fp;
+       char buffer[MAX_LINE_LENGTH];
+       char addr_pattern[MAX_LINE_LENGTH];
+
+       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+                      (unsigned long) addr);
+       if (ret >= MAX_LINE_LENGTH)
+               ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
+
+       fp = fopen(SMAP_FILE_PATH, "r");
+       if (!fp)
+               ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH);
+
+       if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
+               goto err_out;
+
+       /*
+        * Fetch the pattern in the same block and check the number of
+        * hugepages.
+        */
+       if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer)))
+               goto err_out;
+
+       snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern);
+
+       if (sscanf(buffer, addr_pattern, &thp) != 1)
+               ksft_exit_fail_msg("Reading smap error\n");
+
+err_out:
+       fclose(fp);
+       return thp == (nr_hpages * (hpage_size >> 10));
+}
+
+bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+       return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size);
+}
+
+bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+       return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size);
+}
+
+bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size)
+{
+       return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size);
+}
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h
new file mode 100644 (file)
index 0000000..1995ee9
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdint.h>
+#include <stdbool.h>
+
+uint64_t pagemap_get_entry(int fd, char *start);
+bool pagemap_is_softdirty(int fd, char *start);
+bool pagemap_is_swapped(int fd, char *start);
+bool pagemap_is_populated(int fd, char *start);
+unsigned long pagemap_get_pfn(int fd, char *start);
+void clear_softdirty(void);
+bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
+uint64_t read_pmd_pagesize(void);
+bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
+bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
+bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh
new file mode 100644 (file)
index 0000000..70a0230
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+size=$1
+populate=$2
+write=$3
+cgroup=$4
+path=$5
+method=$6
+private=$7
+want_sleep=$8
+reserve=$9
+
+echo "Putting task in cgroup '$cgroup'"
+echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs
+
+echo "Method is $method"
+
+set +e
+./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \
+      "$private" "$want_sleep" "$reserve"
diff --git a/tools/testing/selftests/mm/write_to_hugetlbfs.c b/tools/testing/selftests/mm/write_to_hugetlbfs.c
new file mode 100644 (file)
index 0000000..6a2caba
--- /dev/null
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program reserves and uses hugetlb memory, supporting a bunch of
+ * scenarios needed by the charge_reserved_hugetlb.sh test.
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+enum method {
+       HUGETLBFS,
+       MMAP_MAP_HUGETLB,
+       SHM,
+       MAX_METHOD
+};
+
+
+/* Global variables. */
+static const char *self;
+static char *shmaddr;
+static int shmid;
+
+/*
+ * Show usage and exit.
+ */
+static void exit_usage(void)
+{
+       printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> "
+              "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] "
+              "[-o] [-w] [-n]\n",
+              self);
+       exit(EXIT_FAILURE);
+}
+
+void sig_handler(int signo)
+{
+       printf("Received %d.\n", signo);
+       if (signo == SIGINT) {
+               printf("Deleting the memory\n");
+               if (shmdt((const void *)shmaddr) != 0) {
+                       perror("Detach failure");
+                       shmctl(shmid, IPC_RMID, NULL);
+                       exit(4);
+               }
+
+               shmctl(shmid, IPC_RMID, NULL);
+               printf("Done deleting the memory\n");
+       }
+       exit(2);
+}
+
+int main(int argc, char **argv)
+{
+       int fd = 0;
+       int key = 0;
+       int *ptr = NULL;
+       int c = 0;
+       int size = 0;
+       char path[256] = "";
+       enum method method = MAX_METHOD;
+       int want_sleep = 0, private = 0;
+       int populate = 0;
+       int write = 0;
+       int reserve = 1;
+
+       if (signal(SIGINT, sig_handler) == SIG_ERR)
+               err(1, "\ncan't catch SIGINT\n");
+
+       /* Parse command-line arguments. */
+       setvbuf(stdout, NULL, _IONBF, 0);
+       self = argv[0];
+
+       while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
+               switch (c) {
+               case 's':
+                       size = atoi(optarg);
+                       break;
+               case 'p':
+                       strncpy(path, optarg, sizeof(path));
+                       break;
+               case 'm':
+                       if (atoi(optarg) >= MAX_METHOD) {
+                               errno = EINVAL;
+                               perror("Invalid -m.");
+                               exit_usage();
+                       }
+                       method = atoi(optarg);
+                       break;
+               case 'o':
+                       populate = 1;
+                       break;
+               case 'w':
+                       write = 1;
+                       break;
+               case 'l':
+                       want_sleep = 1;
+                       break;
+               case 'r':
+                   private
+                       = 1;
+                       break;
+               case 'n':
+                       reserve = 0;
+                       break;
+               default:
+                       errno = EINVAL;
+                       perror("Invalid arg");
+                       exit_usage();
+               }
+       }
+
+       if (strncmp(path, "", sizeof(path)) != 0) {
+               printf("Writing to this path: %s\n", path);
+       } else {
+               errno = EINVAL;
+               perror("path not found");
+               exit_usage();
+       }
+
+       if (size != 0) {
+               printf("Writing this size: %d\n", size);
+       } else {
+               errno = EINVAL;
+               perror("size not found");
+               exit_usage();
+       }
+
+       if (!populate)
+               printf("Not populating.\n");
+       else
+               printf("Populating.\n");
+
+       if (!write)
+               printf("Not writing to memory.\n");
+
+       if (method == MAX_METHOD) {
+               errno = EINVAL;
+               perror("-m Invalid");
+               exit_usage();
+       } else
+               printf("Using method=%d\n", method);
+
+       if (!private)
+               printf("Shared mapping.\n");
+       else
+               printf("Private mapping.\n");
+
+       if (!reserve)
+               printf("NO_RESERVE mapping.\n");
+       else
+               printf("RESERVE mapping.\n");
+
+       switch (method) {
+       case HUGETLBFS:
+               printf("Allocating using HUGETLBFS.\n");
+               fd = open(path, O_CREAT | O_RDWR, 0777);
+               if (fd == -1)
+                       err(1, "Failed to open file.");
+
+               ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                          (private ? MAP_PRIVATE : MAP_SHARED) |
+                                  (populate ? MAP_POPULATE : 0) |
+                                  (reserve ? 0 : MAP_NORESERVE),
+                          fd, 0);
+
+               if (ptr == MAP_FAILED) {
+                       close(fd);
+                       err(1, "Error mapping the file");
+               }
+               break;
+       case MMAP_MAP_HUGETLB:
+               printf("Allocating using MAP_HUGETLB.\n");
+               ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+                          (private ? (MAP_PRIVATE | MAP_ANONYMOUS) :
+                                     MAP_SHARED) |
+                                  MAP_HUGETLB | (populate ? MAP_POPULATE : 0) |
+                                  (reserve ? 0 : MAP_NORESERVE),
+                          -1, 0);
+
+               if (ptr == MAP_FAILED)
+                       err(1, "mmap");
+
+               printf("Returned address is %p\n", ptr);
+               break;
+       case SHM:
+               printf("Allocating using SHM.\n");
+               shmid = shmget(key, size,
+                              SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+               if (shmid < 0) {
+                       shmid = shmget(++key, size,
+                                      SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+                       if (shmid < 0)
+                               err(1, "shmget");
+               }
+               printf("shmid: 0x%x, shmget key:%d\n", shmid, key);
+
+               ptr = shmat(shmid, NULL, 0);
+               if (ptr == (int *)-1) {
+                       perror("Shared memory attach failure");
+                       shmctl(shmid, IPC_RMID, NULL);
+                       exit(2);
+               }
+               printf("shmaddr: %p\n", ptr);
+
+               break;
+       default:
+               errno = EINVAL;
+               err(1, "Invalid method.");
+       }
+
+       if (write) {
+               printf("Writing to memory.\n");
+               memset(ptr, 1, size);
+       }
+
+       if (want_sleep) {
+               /* Signal to caller that we're done. */
+               printf("DONE\n");
+
+               /* Hold memory until external kill signal is delivered. */
+               while (1)
+                       sleep(100);
+       }
+
+       if (method == HUGETLBFS)
+               close(fd);
+
+       return 0;
+}
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
deleted file mode 100644 (file)
index 1f8c36a..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-cow
-hugepage-mmap
-hugepage-mremap
-hugepage-shm
-hugepage-vmemmap
-hugetlb-madvise
-khugepaged
-map_hugetlb
-map_populate
-thuge-gen
-compaction_test
-migration
-mlock2-tests
-mrelease_test
-mremap_dontunmap
-mremap_test
-on-fault-limit
-transhuge-stress
-protection_keys
-protection_keys_32
-protection_keys_64
-madv_populate
-userfaultfd
-mlock-intersect-test
-mlock-random-test
-virtual_address_range
-gup_test
-va_128TBswitch
-map_fixed_noreplace
-write_to_hugetlbfs
-hmm-tests
-memfd_secret
-soft-dirty
-split_huge_page_test
-ksm_tests
-local_config.h
-local_config.mk
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
deleted file mode 100644 (file)
index 89c14e4..0000000
+++ /dev/null
@@ -1,180 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Makefile for vm selftests
-
-LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h
-
-include local_config.mk
-
-uname_M := $(shell uname -m 2>/dev/null || echo not)
-MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
-
-# Without this, failed build products remain, with up-to-date timestamps,
-# thus tricking Make (and you!) into believing that All Is Well, in subsequent
-# make invocations:
-.DELETE_ON_ERROR:
-
-# Avoid accidental wrong builds, due to built-in rules working just a little
-# bit too well--but not quite as well as required for our situation here.
-#
-# In other words, "make userfaultfd" is supposed to fail to build at all,
-# because this Makefile only supports either "make" (all), or "make /full/path".
-# However,  the built-in rules, if not suppressed, will pick up CFLAGS and the
-# initial LDLIBS (but not the target-specific LDLIBS, because those are only
-# set for the full path target!). This causes it to get pretty far into building
-# things despite using incorrect values such as an *occasionally* incomplete
-# LDLIBS.
-MAKEFLAGS += --no-builtin-rules
-
-CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
-LDLIBS = -lrt -lpthread
-TEST_GEN_FILES = cow
-TEST_GEN_FILES += compaction_test
-TEST_GEN_FILES += gup_test
-TEST_GEN_FILES += hmm-tests
-TEST_GEN_FILES += hugetlb-madvise
-TEST_GEN_FILES += hugepage-mmap
-TEST_GEN_FILES += hugepage-mremap
-TEST_GEN_FILES += hugepage-shm
-TEST_GEN_FILES += hugepage-vmemmap
-TEST_GEN_FILES += khugepaged
-TEST_GEN_PROGS = madv_populate
-TEST_GEN_FILES += map_fixed_noreplace
-TEST_GEN_FILES += map_hugetlb
-TEST_GEN_FILES += map_populate
-TEST_GEN_FILES += memfd_secret
-TEST_GEN_FILES += migration
-TEST_GEN_FILES += mlock-random-test
-TEST_GEN_FILES += mlock2-tests
-TEST_GEN_FILES += mrelease_test
-TEST_GEN_FILES += mremap_dontunmap
-TEST_GEN_FILES += mremap_test
-TEST_GEN_FILES += on-fault-limit
-TEST_GEN_FILES += thuge-gen
-TEST_GEN_FILES += transhuge-stress
-TEST_GEN_FILES += userfaultfd
-TEST_GEN_PROGS += soft-dirty
-TEST_GEN_PROGS += split_huge_page_test
-TEST_GEN_FILES += ksm_tests
-TEST_GEN_PROGS += ksm_functional_tests
-
-ifeq ($(MACHINE),x86_64)
-CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
-CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c)
-CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
-
-VMTARGETS := protection_keys
-BINARIES_32 := $(VMTARGETS:%=%_32)
-BINARIES_64 := $(VMTARGETS:%=%_64)
-
-ifeq ($(CAN_BUILD_WITH_NOPIE),1)
-CFLAGS += -no-pie
-endif
-
-ifeq ($(CAN_BUILD_I386),1)
-TEST_GEN_FILES += $(BINARIES_32)
-endif
-
-ifeq ($(CAN_BUILD_X86_64),1)
-TEST_GEN_FILES += $(BINARIES_64)
-endif
-else
-
-ifneq (,$(findstring $(MACHINE),ppc64))
-TEST_GEN_FILES += protection_keys
-endif
-
-endif
-
-ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
-TEST_GEN_FILES += va_128TBswitch
-TEST_GEN_FILES += virtual_address_range
-TEST_GEN_FILES += write_to_hugetlbfs
-endif
-
-TEST_PROGS := run_vmtests.sh
-
-TEST_FILES := test_vmalloc.sh
-TEST_FILES += test_hmm.sh
-TEST_FILES += va_128TBswitch.sh
-
-include ../lib.mk
-
-$(OUTPUT)/cow: vm_util.c
-$(OUTPUT)/khugepaged: vm_util.c
-$(OUTPUT)/ksm_functional_tests: vm_util.c
-$(OUTPUT)/madv_populate: vm_util.c
-$(OUTPUT)/soft-dirty: vm_util.c
-$(OUTPUT)/split_huge_page_test: vm_util.c
-$(OUTPUT)/userfaultfd: vm_util.c
-
-ifeq ($(MACHINE),x86_64)
-BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
-BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
-
-define gen-target-rule-32
-$(1) $(1)_32: $(OUTPUT)/$(1)_32
-.PHONY: $(1) $(1)_32
-endef
-
-define gen-target-rule-64
-$(1) $(1)_64: $(OUTPUT)/$(1)_64
-.PHONY: $(1) $(1)_64
-endef
-
-ifeq ($(CAN_BUILD_I386),1)
-$(BINARIES_32): CFLAGS += -m32 -mxsave
-$(BINARIES_32): LDLIBS += -lrt -ldl -lm
-$(BINARIES_32): $(OUTPUT)/%_32: %.c
-       $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
-$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
-endif
-
-ifeq ($(CAN_BUILD_X86_64),1)
-$(BINARIES_64): CFLAGS += -m64 -mxsave
-$(BINARIES_64): LDLIBS += -lrt -ldl
-$(BINARIES_64): $(OUTPUT)/%_64: %.c
-       $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
-$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
-endif
-
-# x86_64 users should be encouraged to install 32-bit libraries
-ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
-all: warn_32bit_failure
-
-warn_32bit_failure:
-       @echo "Warning: you seem to have a broken 32-bit build" 2>&1;           \
-       echo  "environment. This will reduce test coverage of 64-bit" 2>&1;     \
-       echo  "kernels. If you are using a Debian-like distribution," 2>&1;     \
-       echo  "try:"; 2>&1;                                                     \
-       echo  "";                                                               \
-       echo  "  apt-get install gcc-multilib libc6-i386 libc6-dev-i386";       \
-       echo  "";                                                               \
-       echo  "If you are using a Fedora-like distribution, try:";              \
-       echo  "";                                                               \
-       echo  "  yum install glibc-devel.*i686";                                \
-       exit 0;
-endif
-endif
-
-# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
-$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS)
-
-$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap
-
-$(OUTPUT)/ksm_tests: LDLIBS += -lnuma
-
-$(OUTPUT)/migration: LDLIBS += -lnuma
-
-local_config.mk local_config.h: check_config.sh
-       /bin/sh ./check_config.sh $(CC)
-
-EXTRA_CLEAN += local_config.mk local_config.h
-
-ifeq ($(COW_EXTRA_LIBS),)
-all: warn_missing_liburing
-
-warn_missing_liburing:
-       @echo ; \
-       echo "Warning: missing liburing support. Some COW tests will be skipped." ; \
-       echo
-endif
diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
deleted file mode 100644 (file)
index a5cb4b0..0000000
+++ /dev/null
@@ -1,584 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-set -e
-
-if [[ $(id -u) -ne 0 ]]; then
-  echo "This test must be run as root. Skipping..."
-  exit $ksft_skip
-fi
-
-fault_limit_file=limit_in_bytes
-reservation_limit_file=rsvd.limit_in_bytes
-fault_usage_file=usage_in_bytes
-reservation_usage_file=rsvd.usage_in_bytes
-
-if [[ "$1" == "-cgroup-v2" ]]; then
-  cgroup2=1
-  fault_limit_file=max
-  reservation_limit_file=rsvd.max
-  fault_usage_file=current
-  reservation_usage_file=rsvd.current
-fi
-
-if [[ $cgroup2 ]]; then
-  cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
-  if [[ -z "$cgroup_path" ]]; then
-    cgroup_path=/dev/cgroup/memory
-    mount -t cgroup2 none $cgroup_path
-    do_umount=1
-  fi
-  echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
-else
-  cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
-  if [[ -z "$cgroup_path" ]]; then
-    cgroup_path=/dev/cgroup/memory
-    mount -t cgroup memory,hugetlb $cgroup_path
-    do_umount=1
-  fi
-fi
-export cgroup_path
-
-function cleanup() {
-  if [[ $cgroup2 ]]; then
-    echo $$ >$cgroup_path/cgroup.procs
-  else
-    echo $$ >$cgroup_path/tasks
-  fi
-
-  if [[ -e /mnt/huge ]]; then
-    rm -rf /mnt/huge/*
-    umount /mnt/huge || echo error
-    rmdir /mnt/huge
-  fi
-  if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then
-    rmdir $cgroup_path/hugetlb_cgroup_test
-  fi
-  if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then
-    rmdir $cgroup_path/hugetlb_cgroup_test1
-  fi
-  if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then
-    rmdir $cgroup_path/hugetlb_cgroup_test2
-  fi
-  echo 0 >/proc/sys/vm/nr_hugepages
-  echo CLEANUP DONE
-}
-
-function expect_equal() {
-  local expected="$1"
-  local actual="$2"
-  local error="$3"
-
-  if [[ "$expected" != "$actual" ]]; then
-    echo "expected ($expected) != actual ($actual): $3"
-    cleanup
-    exit 1
-  fi
-}
-
-function get_machine_hugepage_size() {
-  hpz=$(grep -i hugepagesize /proc/meminfo)
-  kb=${hpz:14:-3}
-  mb=$(($kb / 1024))
-  echo $mb
-}
-
-MB=$(get_machine_hugepage_size)
-
-function setup_cgroup() {
-  local name="$1"
-  local cgroup_limit="$2"
-  local reservation_limit="$3"
-
-  mkdir $cgroup_path/$name
-
-  echo writing cgroup limit: "$cgroup_limit"
-  echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file
-
-  echo writing reseravation limit: "$reservation_limit"
-  echo "$reservation_limit" > \
-    $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file
-
-  if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then
-    echo 0 >$cgroup_path/$name/cpuset.cpus
-  fi
-  if [ -e "$cgroup_path/$name/cpuset.mems" ]; then
-    echo 0 >$cgroup_path/$name/cpuset.mems
-  fi
-}
-
-function wait_for_hugetlb_memory_to_get_depleted() {
-  local cgroup="$1"
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
-  # Wait for hugetlbfs memory to get depleted.
-  while [ $(cat $path) != 0 ]; do
-    echo Waiting for hugetlb memory to get depleted.
-    cat $path
-    sleep 0.5
-  done
-}
-
-function wait_for_hugetlb_memory_to_get_reserved() {
-  local cgroup="$1"
-  local size="$2"
-
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
-  # Wait for hugetlbfs memory to get written.
-  while [ $(cat $path) != $size ]; do
-    echo Waiting for hugetlb memory reservation to reach size $size.
-    cat $path
-    sleep 0.5
-  done
-}
-
-function wait_for_hugetlb_memory_to_get_written() {
-  local cgroup="$1"
-  local size="$2"
-
-  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
-  # Wait for hugetlbfs memory to get written.
-  while [ $(cat $path) != $size ]; do
-    echo Waiting for hugetlb memory to reach size $size.
-    cat $path
-    sleep 0.5
-  done
-}
-
-function write_hugetlbfs_and_get_usage() {
-  local cgroup="$1"
-  local size="$2"
-  local populate="$3"
-  local write="$4"
-  local path="$5"
-  local method="$6"
-  local private="$7"
-  local expect_failure="$8"
-  local reserve="$9"
-
-  # Function return values.
-  reservation_failed=0
-  oom_killed=0
-  hugetlb_difference=0
-  reserved_difference=0
-
-  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
-  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
-
-  local hugetlb_before=$(cat $hugetlb_usage)
-  local reserved_before=$(cat $reserved_usage)
-
-  echo
-  echo Starting:
-  echo hugetlb_usage="$hugetlb_before"
-  echo reserved_usage="$reserved_before"
-  echo expect_failure is "$expect_failure"
-
-  output=$(mktemp)
-  set +e
-  if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] ||
-    [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then
-
-    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
-      "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output &
-
-    local write_result=$?
-    local write_pid=$!
-
-    until grep -q -i "DONE" $output; do
-      echo waiting for DONE signal.
-      if ! ps $write_pid > /dev/null
-      then
-        echo "FAIL: The write died"
-        cleanup
-        exit 1
-      fi
-      sleep 0.5
-    done
-
-    echo ================= write_hugetlb_memory.sh output is:
-    cat $output
-    echo ================= end output.
-
-    if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
-      wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
-    elif [[ "$reserve" != "-n" ]]; then
-      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
-    else
-      # This case doesn't produce visible effects, but we still have
-      # to wait for the async process to start and execute...
-      sleep 0.5
-    fi
-
-    echo write_result is $write_result
-  else
-    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
-      "$cgroup" "$path" "$method" "$private" "$reserve"
-    local write_result=$?
-
-    if [[ "$reserve" != "-n" ]]; then
-      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
-    fi
-  fi
-  set -e
-
-  if [[ "$write_result" == 1 ]]; then
-    reservation_failed=1
-  fi
-
-  # On linus/master, the above process gets SIGBUS'd on oomkill, with
-  # return code 135. On earlier kernels, it gets actual oomkill, with return
-  # code 137, so just check for both conditions in case we're testing
-  # against an earlier kernel.
-  if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
-    oom_killed=1
-  fi
-
-  local hugetlb_after=$(cat $hugetlb_usage)
-  local reserved_after=$(cat $reserved_usage)
-
-  echo After write:
-  echo hugetlb_usage="$hugetlb_after"
-  echo reserved_usage="$reserved_after"
-
-  hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
-  reserved_difference=$(($reserved_after - $reserved_before))
-}
-
-function cleanup_hugetlb_memory() {
-  set +e
-  local cgroup="$1"
-  if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then
-    echo killing write_to_hugetlbfs
-    killall -2 write_to_hugetlbfs
-    wait_for_hugetlb_memory_to_get_depleted $cgroup
-  fi
-  set -e
-
-  if [[ -e /mnt/huge ]]; then
-    rm -rf /mnt/huge/*
-    umount /mnt/huge
-    rmdir /mnt/huge
-  fi
-}
-
-function run_test() {
-  local size=$(($1 * ${MB} * 1024 * 1024))
-  local populate="$2"
-  local write="$3"
-  local cgroup_limit=$(($4 * ${MB} * 1024 * 1024))
-  local reservation_limit=$(($5 * ${MB} * 1024 * 1024))
-  local nr_hugepages="$6"
-  local method="$7"
-  local private="$8"
-  local expect_failure="$9"
-  local reserve="${10}"
-
-  # Function return values.
-  hugetlb_difference=0
-  reserved_difference=0
-  reservation_failed=0
-  oom_killed=0
-
-  echo nr hugepages = "$nr_hugepages"
-  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
-
-  setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
-
-  mkdir -p /mnt/huge
-  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
-
-  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
-    "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
-    "$reserve"
-
-  cleanup_hugetlb_memory "hugetlb_cgroup_test"
-
-  local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file)
-  local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file)
-
-  echo $hugetlb_difference
-  echo $reserved_difference
-  expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
-  expect_equal "0" "$final_reservation" "final reservation is not zero"
-}
-
-function run_multiple_cgroup_test() {
-  local size1="$1"
-  local populate1="$2"
-  local write1="$3"
-  local cgroup_limit1="$4"
-  local reservation_limit1="$5"
-
-  local size2="$6"
-  local populate2="$7"
-  local write2="$8"
-  local cgroup_limit2="$9"
-  local reservation_limit2="${10}"
-
-  local nr_hugepages="${11}"
-  local method="${12}"
-  local private="${13}"
-  local expect_failure="${14}"
-  local reserve="${15}"
-
-  # Function return values.
-  hugetlb_difference1=0
-  reserved_difference1=0
-  reservation_failed1=0
-  oom_killed1=0
-
-  hugetlb_difference2=0
-  reserved_difference2=0
-  reservation_failed2=0
-  oom_killed2=0
-
-  echo nr hugepages = "$nr_hugepages"
-  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
-
-  setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
-  setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
-
-  mkdir -p /mnt/huge
-  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
-
-  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
-    "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
-    "$expect_failure" "$reserve"
-
-  hugetlb_difference1=$hugetlb_difference
-  reserved_difference1=$reserved_difference
-  reservation_failed1=$reservation_failed
-  oom_killed1=$oom_killed
-
-  local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
-  local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
-  local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
-  local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
-
-  local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
-  local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
-
-  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
-    "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
-    "$expect_failure" "$reserve"
-
-  hugetlb_difference2=$hugetlb_difference
-  reserved_difference2=$reserved_difference
-  reservation_failed2=$reservation_failed
-  oom_killed2=$oom_killed
-
-  expect_equal "$usage_before_second_write" \
-    "$(cat $cgroup1_hugetlb_usage)" "Usage changed."
-  expect_equal "$reservation_usage_before_second_write" \
-    "$(cat $cgroup1_reservation_usage)" "Reservation usage changed."
-
-  cleanup_hugetlb_memory
-
-  local final_hugetlb=$(cat $cgroup1_hugetlb_usage)
-  local final_reservation=$(cat $cgroup1_reservation_usage)
-
-  expect_equal "0" "$final_hugetlb" \
-    "hugetlbt_cgroup_test1 final hugetlb is not zero"
-  expect_equal "0" "$final_reservation" \
-    "hugetlbt_cgroup_test1 final reservation is not zero"
-
-  local final_hugetlb=$(cat $cgroup2_hugetlb_usage)
-  local final_reservation=$(cat $cgroup2_reservation_usage)
-
-  expect_equal "0" "$final_hugetlb" \
-    "hugetlb_cgroup_test2 final hugetlb is not zero"
-  expect_equal "0" "$final_reservation" \
-    "hugetlb_cgroup_test2 final reservation is not zero"
-}
-
-cleanup
-
-for populate in "" "-o"; do
-  for method in 0 1 2; do
-    for private in "" "-r"; do
-      for reserve in "" "-n"; do
-
-        # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
-        if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
-          continue
-        fi
-
-        # Skip populated shmem tests. Doesn't seem to be supported.
-        if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
-          continue
-        fi
-
-        if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
-          continue
-        fi
-
-        cleanup
-        echo
-        echo
-        echo
-        echo Test normal case.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb=$hugetlb_difference
-        echo Memory charged to reservation=$reserved_difference
-
-        if [[ "$populate" == "-o" ]]; then
-          expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
-            "Reserved memory charged to hugetlb cgroup."
-        else
-          expect_equal "0" "$hugetlb_difference" \
-            "Reserved memory charged to hugetlb cgroup."
-        fi
-
-        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
-          expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
-            "Reserved memory not charged to reservation usage."
-        else
-          expect_equal "0" "$reserved_difference" \
-            "Reserved memory not charged to reservation usage."
-        fi
-
-        echo 'PASS'
-
-        cleanup
-        echo
-        echo
-        echo
-        echo Test normal case with write.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb=$hugetlb_difference
-        echo Memory charged to reservation=$reserved_difference
-
-        expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
-          "Reserved memory charged to hugetlb cgroup."
-
-        expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
-          "Reserved memory not charged to reservation usage."
-
-        echo 'PASS'
-
-        cleanup
-        continue
-        echo
-        echo
-        echo
-        echo Test more than reservation case.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-
-        if [ "$reserve" != "-n" ]; then
-          run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
-            "$reserve"
-
-          expect_equal "1" "$reservation_failed" "Reservation succeeded."
-        fi
-
-        echo 'PASS'
-
-        cleanup
-
-        echo
-        echo
-        echo
-        echo Test more than cgroup limit case.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-
-        # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
-        if [[ "$method" != 2 ]]; then
-          run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"
-
-          expect_equal "1" "$oom_killed" "Not oom killed."
-        fi
-        echo 'PASS'
-
-        cleanup
-
-        echo
-        echo
-        echo
-        echo Test normal case, multiple cgroups.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
-          "$populate" "" "10" "10" "10" \
-          "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb1=$hugetlb_difference1
-        echo Memory charged to reservation1=$reserved_difference1
-        echo Memory charged to hugtlb2=$hugetlb_difference2
-        echo Memory charged to reservation2=$reserved_difference2
-
-        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
-          expect_equal "3" "$reserved_difference1" \
-            "Incorrect reservations charged to cgroup 1."
-
-          expect_equal "5" "$reserved_difference2" \
-            "Incorrect reservation charged to cgroup 2."
-
-        else
-          expect_equal "0" "$reserved_difference1" \
-            "Incorrect reservations charged to cgroup 1."
-
-          expect_equal "0" "$reserved_difference2" \
-            "Incorrect reservation charged to cgroup 2."
-        fi
-
-        if [[ "$populate" == "-o" ]]; then
-          expect_equal "3" "$hugetlb_difference1" \
-            "Incorrect hugetlb charged to cgroup 1."
-
-          expect_equal "5" "$hugetlb_difference2" \
-            "Incorrect hugetlb charged to cgroup 2."
-
-        else
-          expect_equal "0" "$hugetlb_difference1" \
-            "Incorrect hugetlb charged to cgroup 1."
-
-          expect_equal "0" "$hugetlb_difference2" \
-            "Incorrect hugetlb charged to cgroup 2."
-        fi
-        echo 'PASS'
-
-        cleanup
-        echo
-        echo
-        echo
-        echo Test normal case with write, multiple cgroups.
-        echo private=$private, populate=$populate, method=$method, reserve=$reserve
-        run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
-          "$populate" "-w" "10" "10" "10" \
-          "$method" "$private" "0" "$reserve"
-
-        echo Memory charged to hugtlb1=$hugetlb_difference1
-        echo Memory charged to reservation1=$reserved_difference1
-        echo Memory charged to hugtlb2=$hugetlb_difference2
-        echo Memory charged to reservation2=$reserved_difference2
-
-        expect_equal "3" "$hugetlb_difference1" \
-          "Incorrect hugetlb charged to cgroup 1."
-
-        expect_equal "3" "$reserved_difference1" \
-          "Incorrect reservation charged to cgroup 1."
-
-        expect_equal "5" "$hugetlb_difference2" \
-          "Incorrect hugetlb charged to cgroup 2."
-
-        expect_equal "5" "$reserved_difference2" \
-          "Incorrected reservation charged to cgroup 2."
-        echo 'PASS'
-
-        cleanup
-
-      done # reserve
-    done   # private
-  done     # populate
-done       # method
-
-if [[ $do_umount ]]; then
-  umount $cgroup_path
-  rmdir $cgroup_path
-fi
diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh
deleted file mode 100644 (file)
index bcba3af..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-#
-# Probe for libraries and create header files to record the results. Both C
-# header files and Makefile include fragments are created.
-
-OUTPUT_H_FILE=local_config.h
-OUTPUT_MKFILE=local_config.mk
-
-tmpname=$(mktemp)
-tmpfile_c=${tmpname}.c
-tmpfile_o=${tmpname}.o
-
-# liburing
-echo "#include <sys/types.h>"        > $tmpfile_c
-echo "#include <liburing.h>"        >> $tmpfile_c
-echo "int func(void) { return 0; }" >> $tmpfile_c
-
-CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
-$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
-
-if [ -f $tmpfile_o ]; then
-    echo "#define LOCAL_CONFIG_HAVE_LIBURING 1"  > $OUTPUT_H_FILE
-    echo "COW_EXTRA_LIBS = -luring"              > $OUTPUT_MKFILE
-else
-    echo "// No liburing support found"          > $OUTPUT_H_FILE
-    echo "# No liburing support found, so:"      > $OUTPUT_MKFILE
-    echo "COW_EXTRA_LIBS = "                    >> $OUTPUT_MKFILE
-fi
-
-rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
deleted file mode 100644 (file)
index 9b42014..0000000
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- *
- * A test for the patch "Allow compaction of unevictable pages".
- * With this patch we should be able to allocate at least 1/4
- * of RAM in huge pages. Without the patch much less is
- * allocated.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/resource.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <unistd.h>
-#include <string.h>
-
-#include "../kselftest.h"
-
-#define MAP_SIZE_MB    100
-#define MAP_SIZE       (MAP_SIZE_MB * 1024 * 1024)
-
-struct map_list {
-       void *map;
-       struct map_list *next;
-};
-
-int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
-{
-       char  buffer[256] = {0};
-       char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'";
-       FILE *cmdfile = popen(cmd, "r");
-
-       if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
-               perror("Failed to read meminfo\n");
-               return -1;
-       }
-
-       pclose(cmdfile);
-
-       *memfree = atoll(buffer);
-       cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'";
-       cmdfile = popen(cmd, "r");
-
-       if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
-               perror("Failed to read meminfo\n");
-               return -1;
-       }
-
-       pclose(cmdfile);
-       *hugepagesize = atoll(buffer);
-
-       return 0;
-}
-
-int prereq(void)
-{
-       char allowed;
-       int fd;
-
-       fd = open("/proc/sys/vm/compact_unevictable_allowed",
-                 O_RDONLY | O_NONBLOCK);
-       if (fd < 0) {
-               perror("Failed to open\n"
-                      "/proc/sys/vm/compact_unevictable_allowed\n");
-               return -1;
-       }
-
-       if (read(fd, &allowed, sizeof(char)) != sizeof(char)) {
-               perror("Failed to read from\n"
-                      "/proc/sys/vm/compact_unevictable_allowed\n");
-               close(fd);
-               return -1;
-       }
-
-       close(fd);
-       if (allowed == '1')
-               return 0;
-
-       return -1;
-}
-
-int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
-{
-       int fd;
-       int compaction_index = 0;
-       char initial_nr_hugepages[10] = {0};
-       char nr_hugepages[10] = {0};
-
-       /* We want to test with 80% of available memory. Else, OOM killer comes
-          in to play */
-       mem_free = mem_free * 0.8;
-
-       fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
-       if (fd < 0) {
-               perror("Failed to open /proc/sys/vm/nr_hugepages");
-               return -1;
-       }
-
-       if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) {
-               perror("Failed to read from /proc/sys/vm/nr_hugepages");
-               goto close_fd;
-       }
-
-       /* Start with the initial condition of 0 huge pages*/
-       if (write(fd, "0", sizeof(char)) != sizeof(char)) {
-               perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
-               goto close_fd;
-       }
-
-       lseek(fd, 0, SEEK_SET);
-
-       /* Request a large number of huge pages. The Kernel will allocate
-          as much as it can */
-       if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
-               perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
-               goto close_fd;
-       }
-
-       lseek(fd, 0, SEEK_SET);
-
-       if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) {
-               perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
-               goto close_fd;
-       }
-
-       /* We should have been able to request at least 1/3 rd of the memory in
-          huge pages */
-       compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size);
-
-       if (compaction_index > 3) {
-               printf("No of huge pages allocated = %d\n",
-                      (atoi(nr_hugepages)));
-               fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
-                       "as huge pages\n", compaction_index);
-               goto close_fd;
-       }
-
-       printf("No of huge pages allocated = %d\n",
-              (atoi(nr_hugepages)));
-
-       lseek(fd, 0, SEEK_SET);
-
-       if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
-           != strlen(initial_nr_hugepages)) {
-               perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
-               goto close_fd;
-       }
-
-       close(fd);
-       return 0;
-
- close_fd:
-       close(fd);
-       printf("Not OK. Compaction test failed.");
-       return -1;
-}
-
-
-int main(int argc, char **argv)
-{
-       struct rlimit lim;
-       struct map_list *list, *entry;
-       size_t page_size, i;
-       void *map = NULL;
-       unsigned long mem_free = 0;
-       unsigned long hugepage_size = 0;
-       long mem_fragmentable_MB = 0;
-
-       if (prereq() != 0) {
-               printf("Either the sysctl compact_unevictable_allowed is not\n"
-                      "set to 1 or couldn't read the proc file.\n"
-                      "Skipping the test\n");
-               return KSFT_SKIP;
-       }
-
-       lim.rlim_cur = RLIM_INFINITY;
-       lim.rlim_max = RLIM_INFINITY;
-       if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
-               perror("Failed to set rlimit:\n");
-               return -1;
-       }
-
-       page_size = getpagesize();
-
-       list = NULL;
-
-       if (read_memory_info(&mem_free, &hugepage_size) != 0) {
-               printf("ERROR: Cannot read meminfo\n");
-               return -1;
-       }
-
-       mem_fragmentable_MB = mem_free * 0.8 / 1024;
-
-       while (mem_fragmentable_MB > 0) {
-               map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
-                          MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
-               if (map == MAP_FAILED)
-                       break;
-
-               entry = malloc(sizeof(struct map_list));
-               if (!entry) {
-                       munmap(map, MAP_SIZE);
-                       break;
-               }
-               entry->map = map;
-               entry->next = list;
-               list = entry;
-
-               /* Write something (in this case the address of the map) to
-                * ensure that KSM can't merge the mapped pages
-                */
-               for (i = 0; i < MAP_SIZE; i += page_size)
-                       *(unsigned long *)(map + i) = (unsigned long)map + i;
-
-               mem_fragmentable_MB -= MAP_SIZE_MB;
-       }
-
-       for (entry = list; entry != NULL; entry = entry->next) {
-               munmap(entry->map, MAP_SIZE);
-               if (!entry->next)
-                       break;
-               entry = entry->next;
-       }
-
-       if (check_compaction(mem_free, hugepage_size) == 0)
-               return 0;
-
-       return -1;
-}
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
deleted file mode 100644 (file)
index be087c4..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-CONFIG_SYSVIPC=y
-CONFIG_USERFAULTFD=y
-CONFIG_TEST_VMALLOC=m
-CONFIG_DEVICE_PRIVATE=y
-CONFIG_TEST_HMM=m
-CONFIG_GUP_TEST=y
-CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_MEM_SOFT_DIRTY=y
diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c
deleted file mode 100644 (file)
index 16216d8..0000000
+++ /dev/null
@@ -1,1764 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * COW (Copy On Write) tests.
- *
- * Copyright 2022, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <dirent.h>
-#include <assert.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-#include <sys/wait.h>
-#include <linux/memfd.h>
-
-#include "local_config.h"
-#ifdef LOCAL_CONFIG_HAVE_LIBURING
-#include <liburing.h>
-#endif /* LOCAL_CONFIG_HAVE_LIBURING */
-
-#include "../../../../mm/gup_test.h"
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifndef MADV_COLLAPSE
-#define MADV_COLLAPSE 25
-#endif
-
-static size_t pagesize;
-static int pagemap_fd;
-static size_t thpsize;
-static int nr_hugetlbsizes;
-static size_t hugetlbsizes[10];
-static int gup_fd;
-static bool has_huge_zeropage;
-
-static void detect_thpsize(void)
-{
-       int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size",
-                     O_RDONLY);
-       size_t size = 0;
-       char buf[15];
-       int ret;
-
-       if (fd < 0)
-               return;
-
-       ret = pread(fd, buf, sizeof(buf), 0);
-       if (ret > 0 && ret < sizeof(buf)) {
-               buf[ret] = 0;
-
-               size = strtoul(buf, NULL, 10);
-               if (size < pagesize)
-                       size = 0;
-               if (size > 0) {
-                       thpsize = size;
-                       ksft_print_msg("[INFO] detected THP size: %zu KiB\n",
-                                      thpsize / 1024);
-               }
-       }
-
-       close(fd);
-}
-
-static void detect_huge_zeropage(void)
-{
-       int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page",
-                     O_RDONLY);
-       size_t enabled = 0;
-       char buf[15];
-       int ret;
-
-       if (fd < 0)
-               return;
-
-       ret = pread(fd, buf, sizeof(buf), 0);
-       if (ret > 0 && ret < sizeof(buf)) {
-               buf[ret] = 0;
-
-               enabled = strtoul(buf, NULL, 10);
-               if (enabled == 1) {
-                       has_huge_zeropage = true;
-                       ksft_print_msg("[INFO] huge zeropage is enabled\n");
-               }
-       }
-
-       close(fd);
-}
-
-static void detect_hugetlbsizes(void)
-{
-       DIR *dir = opendir("/sys/kernel/mm/hugepages/");
-
-       if (!dir)
-               return;
-
-       while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) {
-               struct dirent *entry = readdir(dir);
-               size_t kb;
-
-               if (!entry)
-                       break;
-               if (entry->d_type != DT_DIR)
-                       continue;
-               if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1)
-                       continue;
-               hugetlbsizes[nr_hugetlbsizes] = kb * 1024;
-               nr_hugetlbsizes++;
-               ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n",
-                              kb);
-       }
-       closedir(dir);
-}
-
-static bool range_is_swapped(void *addr, size_t size)
-{
-       for (; size; addr += pagesize, size -= pagesize)
-               if (!pagemap_is_swapped(pagemap_fd, addr))
-                       return false;
-       return true;
-}
-
-struct comm_pipes {
-       int child_ready[2];
-       int parent_ready[2];
-};
-
-static int setup_comm_pipes(struct comm_pipes *comm_pipes)
-{
-       if (pipe(comm_pipes->child_ready) < 0)
-               return -errno;
-       if (pipe(comm_pipes->parent_ready) < 0) {
-               close(comm_pipes->child_ready[0]);
-               close(comm_pipes->child_ready[1]);
-               return -errno;
-       }
-
-       return 0;
-}
-
-static void close_comm_pipes(struct comm_pipes *comm_pipes)
-{
-       close(comm_pipes->child_ready[0]);
-       close(comm_pipes->child_ready[1]);
-       close(comm_pipes->parent_ready[0]);
-       close(comm_pipes->parent_ready[1]);
-}
-
-static int child_memcmp_fn(char *mem, size_t size,
-                          struct comm_pipes *comm_pipes)
-{
-       char *old = malloc(size);
-       char buf;
-
-       /* Backup the original content. */
-       memcpy(old, mem, size);
-
-       /* Wait until the parent modified the page. */
-       write(comm_pipes->child_ready[1], "0", 1);
-       while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
-               ;
-
-       /* See if we still read the old values. */
-       return memcmp(old, mem, size);
-}
-
-static int child_vmsplice_memcmp_fn(char *mem, size_t size,
-                                   struct comm_pipes *comm_pipes)
-{
-       struct iovec iov = {
-               .iov_base = mem,
-               .iov_len = size,
-       };
-       ssize_t cur, total, transferred;
-       char *old, *new;
-       int fds[2];
-       char buf;
-
-       old = malloc(size);
-       new = malloc(size);
-
-       /* Backup the original content. */
-       memcpy(old, mem, size);
-
-       if (pipe(fds) < 0)
-               return -errno;
-
-       /* Trigger a read-only pin. */
-       transferred = vmsplice(fds[1], &iov, 1, 0);
-       if (transferred < 0)
-               return -errno;
-       if (transferred == 0)
-               return -EINVAL;
-
-       /* Unmap it from our page tables. */
-       if (munmap(mem, size) < 0)
-               return -errno;
-
-       /* Wait until the parent modified it. */
-       write(comm_pipes->child_ready[1], "0", 1);
-       while (read(comm_pipes->parent_ready[0], &buf, 1) != 1)
-               ;
-
-       /* See if we still read the old values via the pipe. */
-       for (total = 0; total < transferred; total += cur) {
-               cur = read(fds[0], new + total, transferred - total);
-               if (cur < 0)
-                       return -errno;
-       }
-
-       return memcmp(old, new, transferred);
-}
-
-typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes);
-
-static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect,
-                                 child_fn fn)
-{
-       struct comm_pipes comm_pipes;
-       char buf;
-       int ret;
-
-       ret = setup_comm_pipes(&comm_pipes);
-       if (ret) {
-               ksft_test_result_fail("pipe() failed\n");
-               return;
-       }
-
-       ret = fork();
-       if (ret < 0) {
-               ksft_test_result_fail("fork() failed\n");
-               goto close_comm_pipes;
-       } else if (!ret) {
-               exit(fn(mem, size, &comm_pipes));
-       }
-
-       while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-               ;
-
-       if (do_mprotect) {
-               /*
-                * mprotect() optimizations might try avoiding
-                * write-faults by directly mapping pages writable.
-                */
-               ret = mprotect(mem, size, PROT_READ);
-               ret |= mprotect(mem, size, PROT_READ|PROT_WRITE);
-               if (ret) {
-                       ksft_test_result_fail("mprotect() failed\n");
-                       write(comm_pipes.parent_ready[1], "0", 1);
-                       wait(&ret);
-                       goto close_comm_pipes;
-               }
-       }
-
-       /* Modify the page. */
-       memset(mem, 0xff, size);
-       write(comm_pipes.parent_ready[1], "0", 1);
-
-       wait(&ret);
-       if (WIFEXITED(ret))
-               ret = WEXITSTATUS(ret);
-       else
-               ret = -EINVAL;
-
-       ksft_test_result(!ret, "No leak from parent into child\n");
-close_comm_pipes:
-       close_comm_pipes(&comm_pipes);
-}
-
-static void test_cow_in_parent(char *mem, size_t size)
-{
-       do_test_cow_in_parent(mem, size, false, child_memcmp_fn);
-}
-
-static void test_cow_in_parent_mprotect(char *mem, size_t size)
-{
-       do_test_cow_in_parent(mem, size, true, child_memcmp_fn);
-}
-
-static void test_vmsplice_in_child(char *mem, size_t size)
-{
-       do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn);
-}
-
-static void test_vmsplice_in_child_mprotect(char *mem, size_t size)
-{
-       do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn);
-}
-
-static void do_test_vmsplice_in_parent(char *mem, size_t size,
-                                      bool before_fork)
-{
-       struct iovec iov = {
-               .iov_base = mem,
-               .iov_len = size,
-       };
-       ssize_t cur, total, transferred;
-       struct comm_pipes comm_pipes;
-       char *old, *new;
-       int ret, fds[2];
-       char buf;
-
-       old = malloc(size);
-       new = malloc(size);
-
-       memcpy(old, mem, size);
-
-       ret = setup_comm_pipes(&comm_pipes);
-       if (ret) {
-               ksft_test_result_fail("pipe() failed\n");
-               goto free;
-       }
-
-       if (pipe(fds) < 0) {
-               ksft_test_result_fail("pipe() failed\n");
-               goto close_comm_pipes;
-       }
-
-       if (before_fork) {
-               transferred = vmsplice(fds[1], &iov, 1, 0);
-               if (transferred <= 0) {
-                       ksft_test_result_fail("vmsplice() failed\n");
-                       goto close_pipe;
-               }
-       }
-
-       ret = fork();
-       if (ret < 0) {
-               ksft_test_result_fail("fork() failed\n");
-               goto close_pipe;
-       } else if (!ret) {
-               write(comm_pipes.child_ready[1], "0", 1);
-               while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
-                       ;
-               /* Modify page content in the child. */
-               memset(mem, 0xff, size);
-               exit(0);
-       }
-
-       if (!before_fork) {
-               transferred = vmsplice(fds[1], &iov, 1, 0);
-               if (transferred <= 0) {
-                       ksft_test_result_fail("vmsplice() failed\n");
-                       wait(&ret);
-                       goto close_pipe;
-               }
-       }
-
-       while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-               ;
-       if (munmap(mem, size) < 0) {
-               ksft_test_result_fail("munmap() failed\n");
-               goto close_pipe;
-       }
-       write(comm_pipes.parent_ready[1], "0", 1);
-
-       /* Wait until the child is done writing. */
-       wait(&ret);
-       if (!WIFEXITED(ret)) {
-               ksft_test_result_fail("wait() failed\n");
-               goto close_pipe;
-       }
-
-       /* See if we still read the old values. */
-       for (total = 0; total < transferred; total += cur) {
-               cur = read(fds[0], new + total, transferred - total);
-               if (cur < 0) {
-                       ksft_test_result_fail("read() failed\n");
-                       goto close_pipe;
-               }
-       }
-
-       ksft_test_result(!memcmp(old, new, transferred),
-                        "No leak from child into parent\n");
-close_pipe:
-       close(fds[0]);
-       close(fds[1]);
-close_comm_pipes:
-       close_comm_pipes(&comm_pipes);
-free:
-       free(old);
-       free(new);
-}
-
-static void test_vmsplice_before_fork(char *mem, size_t size)
-{
-       do_test_vmsplice_in_parent(mem, size, true);
-}
-
-static void test_vmsplice_after_fork(char *mem, size_t size)
-{
-       do_test_vmsplice_in_parent(mem, size, false);
-}
-
-#ifdef LOCAL_CONFIG_HAVE_LIBURING
-static void do_test_iouring(char *mem, size_t size, bool use_fork)
-{
-       struct comm_pipes comm_pipes;
-       struct io_uring_cqe *cqe;
-       struct io_uring_sqe *sqe;
-       struct io_uring ring;
-       ssize_t cur, total;
-       struct iovec iov;
-       char *buf, *tmp;
-       int ret, fd;
-       FILE *file;
-
-       ret = setup_comm_pipes(&comm_pipes);
-       if (ret) {
-               ksft_test_result_fail("pipe() failed\n");
-               return;
-       }
-
-       file = tmpfile();
-       if (!file) {
-               ksft_test_result_fail("tmpfile() failed\n");
-               goto close_comm_pipes;
-       }
-       fd = fileno(file);
-       assert(fd);
-
-       tmp = malloc(size);
-       if (!tmp) {
-               ksft_test_result_fail("malloc() failed\n");
-               goto close_file;
-       }
-
-       /* Skip on errors, as we might just lack kernel support. */
-       ret = io_uring_queue_init(1, &ring, 0);
-       if (ret < 0) {
-               ksft_test_result_skip("io_uring_queue_init() failed\n");
-               goto free_tmp;
-       }
-
-       /*
-        * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN
-        * | FOLL_LONGTERM the range.
-        *
-        * Skip on errors, as we might just lack kernel support or might not
-        * have sufficient MEMLOCK permissions.
-        */
-       iov.iov_base = mem;
-       iov.iov_len = size;
-       ret = io_uring_register_buffers(&ring, &iov, 1);
-       if (ret) {
-               ksft_test_result_skip("io_uring_register_buffers() failed\n");
-               goto queue_exit;
-       }
-
-       if (use_fork) {
-               /*
-                * fork() and keep the child alive until we're done. Note that
-                * we expect the pinned page to not get shared with the child.
-                */
-               ret = fork();
-               if (ret < 0) {
-                       ksft_test_result_fail("fork() failed\n");
-                       goto unregister_buffers;
-               } else if (!ret) {
-                       write(comm_pipes.child_ready[1], "0", 1);
-                       while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
-                               ;
-                       exit(0);
-               }
-
-               while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-                       ;
-       } else {
-               /*
-                * Map the page R/O into the page table. Enable softdirty
-                * tracking to stop the page from getting mapped R/W immediately
-                * again by mprotect() optimizations. Note that we don't have an
-                * easy way to test if that worked (the pagemap does not export
-                * if the page is mapped R/O vs. R/W).
-                */
-               ret = mprotect(mem, size, PROT_READ);
-               clear_softdirty();
-               ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
-               if (ret) {
-                       ksft_test_result_fail("mprotect() failed\n");
-                       goto unregister_buffers;
-               }
-       }
-
-       /*
-        * Modify the page and write page content as observed by the fixed
-        * buffer pin to the file so we can verify it.
-        */
-       memset(mem, 0xff, size);
-       sqe = io_uring_get_sqe(&ring);
-       if (!sqe) {
-               ksft_test_result_fail("io_uring_get_sqe() failed\n");
-               goto quit_child;
-       }
-       io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0);
-
-       ret = io_uring_submit(&ring);
-       if (ret < 0) {
-               ksft_test_result_fail("io_uring_submit() failed\n");
-               goto quit_child;
-       }
-
-       ret = io_uring_wait_cqe(&ring, &cqe);
-       if (ret < 0) {
-               ksft_test_result_fail("io_uring_wait_cqe() failed\n");
-               goto quit_child;
-       }
-
-       if (cqe->res != size) {
-               ksft_test_result_fail("write_fixed failed\n");
-               goto quit_child;
-       }
-       io_uring_cqe_seen(&ring, cqe);
-
-       /* Read back the file content to the temporary buffer. */
-       total = 0;
-       while (total < size) {
-               cur = pread(fd, tmp + total, size - total, total);
-               if (cur < 0) {
-                       ksft_test_result_fail("pread() failed\n");
-                       goto quit_child;
-               }
-               total += cur;
-       }
-
-       /* Finally, check if we read what we expected. */
-       ksft_test_result(!memcmp(mem, tmp, size),
-                        "Longterm R/W pin is reliable\n");
-
-quit_child:
-       if (use_fork) {
-               write(comm_pipes.parent_ready[1], "0", 1);
-               wait(&ret);
-       }
-unregister_buffers:
-       io_uring_unregister_buffers(&ring);
-queue_exit:
-       io_uring_queue_exit(&ring);
-free_tmp:
-       free(tmp);
-close_file:
-       fclose(file);
-close_comm_pipes:
-       close_comm_pipes(&comm_pipes);
-}
-
-static void test_iouring_ro(char *mem, size_t size)
-{
-       do_test_iouring(mem, size, false);
-}
-
-static void test_iouring_fork(char *mem, size_t size)
-{
-       do_test_iouring(mem, size, true);
-}
-
-#endif /* LOCAL_CONFIG_HAVE_LIBURING */
-
-enum ro_pin_test {
-       RO_PIN_TEST,
-       RO_PIN_TEST_SHARED,
-       RO_PIN_TEST_PREVIOUSLY_SHARED,
-       RO_PIN_TEST_RO_EXCLUSIVE,
-};
-
-static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test,
-                          bool fast)
-{
-       struct pin_longterm_test args;
-       struct comm_pipes comm_pipes;
-       char *tmp, buf;
-       __u64 tmp_val;
-       int ret;
-
-       if (gup_fd < 0) {
-               ksft_test_result_skip("gup_test not available\n");
-               return;
-       }
-
-       tmp = malloc(size);
-       if (!tmp) {
-               ksft_test_result_fail("malloc() failed\n");
-               return;
-       }
-
-       ret = setup_comm_pipes(&comm_pipes);
-       if (ret) {
-               ksft_test_result_fail("pipe() failed\n");
-               goto free_tmp;
-       }
-
-       switch (test) {
-       case RO_PIN_TEST:
-               break;
-       case RO_PIN_TEST_SHARED:
-       case RO_PIN_TEST_PREVIOUSLY_SHARED:
-               /*
-                * Share the pages with our child. As the pages are not pinned,
-                * this should just work.
-                */
-               ret = fork();
-               if (ret < 0) {
-                       ksft_test_result_fail("fork() failed\n");
-                       goto close_comm_pipes;
-               } else if (!ret) {
-                       write(comm_pipes.child_ready[1], "0", 1);
-                       while (read(comm_pipes.parent_ready[0], &buf, 1) != 1)
-                               ;
-                       exit(0);
-               }
-
-               /* Wait until our child is ready. */
-               while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-                       ;
-
-               if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) {
-                       /*
-                        * Tell the child to quit now and wait until it quit.
-                        * The pages should now be mapped R/O into our page
-                        * tables, but they are no longer shared.
-                        */
-                       write(comm_pipes.parent_ready[1], "0", 1);
-                       wait(&ret);
-                       if (!WIFEXITED(ret))
-                               ksft_print_msg("[INFO] wait() failed\n");
-               }
-               break;
-       case RO_PIN_TEST_RO_EXCLUSIVE:
-               /*
-                * Map the page R/O into the page table. Enable softdirty
-                * tracking to stop the page from getting mapped R/W immediately
-                * again by mprotect() optimizations. Note that we don't have an
-                * easy way to test if that worked (the pagemap does not export
-                * if the page is mapped R/O vs. R/W).
-                */
-               ret = mprotect(mem, size, PROT_READ);
-               clear_softdirty();
-               ret |= mprotect(mem, size, PROT_READ | PROT_WRITE);
-               if (ret) {
-                       ksft_test_result_fail("mprotect() failed\n");
-                       goto close_comm_pipes;
-               }
-               break;
-       default:
-               assert(false);
-       }
-
-       /* Take a R/O pin. This should trigger unsharing. */
-       args.addr = (__u64)(uintptr_t)mem;
-       args.size = size;
-       args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0;
-       ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args);
-       if (ret) {
-               if (errno == EINVAL)
-                       ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n");
-               else
-                       ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n");
-               goto wait;
-       }
-
-       /* Modify the page. */
-       memset(mem, 0xff, size);
-
-       /*
-        * Read back the content via the pin to the temporary buffer and
-        * test if we observed the modification.
-        */
-       tmp_val = (__u64)(uintptr_t)tmp;
-       ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val);
-       if (ret)
-               ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n");
-       else
-               ksft_test_result(!memcmp(mem, tmp, size),
-                                "Longterm R/O pin is reliable\n");
-
-       ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP);
-       if (ret)
-               ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n");
-wait:
-       switch (test) {
-       case RO_PIN_TEST_SHARED:
-               write(comm_pipes.parent_ready[1], "0", 1);
-               wait(&ret);
-               if (!WIFEXITED(ret))
-                       ksft_print_msg("[INFO] wait() failed\n");
-               break;
-       default:
-               break;
-       }
-close_comm_pipes:
-       close_comm_pipes(&comm_pipes);
-free_tmp:
-       free(tmp);
-}
-
-static void test_ro_pin_on_shared(char *mem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false);
-}
-
-static void test_ro_fast_pin_on_shared(char *mem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true);
-}
-
-static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false);
-}
-
-static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true);
-}
-
-static void test_ro_pin_on_ro_exclusive(char *mem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false);
-}
-
-static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true);
-}
-
-typedef void (*test_fn)(char *mem, size_t size);
-
-static void do_run_with_base_page(test_fn fn, bool swapout)
-{
-       char *mem;
-       int ret;
-
-       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
-                  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               return;
-       }
-
-       ret = madvise(mem, pagesize, MADV_NOHUGEPAGE);
-       /* Ignore if not around on a kernel. */
-       if (ret && errno != EINVAL) {
-               ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
-               goto munmap;
-       }
-
-       /* Populate a base page. */
-       memset(mem, 0, pagesize);
-
-       if (swapout) {
-               madvise(mem, pagesize, MADV_PAGEOUT);
-               if (!pagemap_is_swapped(pagemap_fd, mem)) {
-                       ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
-                       goto munmap;
-               }
-       }
-
-       fn(mem, pagesize);
-munmap:
-       munmap(mem, pagesize);
-}
-
-static void run_with_base_page(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with base page\n", desc);
-       do_run_with_base_page(fn, false);
-}
-
-static void run_with_base_page_swap(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc);
-       do_run_with_base_page(fn, true);
-}
-
-enum thp_run {
-       THP_RUN_PMD,
-       THP_RUN_PMD_SWAPOUT,
-       THP_RUN_PTE,
-       THP_RUN_PTE_SWAPOUT,
-       THP_RUN_SINGLE_PTE,
-       THP_RUN_SINGLE_PTE_SWAPOUT,
-       THP_RUN_PARTIAL_MREMAP,
-       THP_RUN_PARTIAL_SHARED,
-};
-
-static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
-{
-       char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
-       size_t size, mmap_size, mremap_size;
-       int ret;
-
-       /* For alignment purposes, we need twice the thp size. */
-       mmap_size = 2 * thpsize;
-       mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       if (mmap_mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               return;
-       }
-
-       /* We need a THP-aligned memory area. */
-       mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
-
-       ret = madvise(mem, thpsize, MADV_HUGEPAGE);
-       if (ret) {
-               ksft_test_result_fail("MADV_HUGEPAGE failed\n");
-               goto munmap;
-       }
-
-       /*
-        * Try to populate a THP. Touch the first sub-page and test if we get
-        * another sub-page populated automatically.
-        */
-       mem[0] = 0;
-       if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
-               ksft_test_result_skip("Did not get a THP populated\n");
-               goto munmap;
-       }
-       memset(mem, 0, thpsize);
-
-       size = thpsize;
-       switch (thp_run) {
-       case THP_RUN_PMD:
-       case THP_RUN_PMD_SWAPOUT:
-               break;
-       case THP_RUN_PTE:
-       case THP_RUN_PTE_SWAPOUT:
-               /*
-                * Trigger PTE-mapping the THP by temporarily mapping a single
-                * subpage R/O.
-                */
-               ret = mprotect(mem + pagesize, pagesize, PROT_READ);
-               if (ret) {
-                       ksft_test_result_fail("mprotect() failed\n");
-                       goto munmap;
-               }
-               ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
-               if (ret) {
-                       ksft_test_result_fail("mprotect() failed\n");
-                       goto munmap;
-               }
-               break;
-       case THP_RUN_SINGLE_PTE:
-       case THP_RUN_SINGLE_PTE_SWAPOUT:
-               /*
-                * Discard all but a single subpage of that PTE-mapped THP. What
-                * remains is a single PTE mapping a single subpage.
-                */
-               ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
-               if (ret) {
-                       ksft_test_result_fail("MADV_DONTNEED failed\n");
-                       goto munmap;
-               }
-               size = pagesize;
-               break;
-       case THP_RUN_PARTIAL_MREMAP:
-               /*
-                * Remap half of the THP. We need some new memory location
-                * for that.
-                */
-               mremap_size = thpsize / 2;
-               mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
-                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-               if (mem == MAP_FAILED) {
-                       ksft_test_result_fail("mmap() failed\n");
-                       goto munmap;
-               }
-               tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
-                            MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
-               if (tmp != mremap_mem) {
-                       ksft_test_result_fail("mremap() failed\n");
-                       goto munmap;
-               }
-               size = mremap_size;
-               break;
-       case THP_RUN_PARTIAL_SHARED:
-               /*
-                * Share the first page of the THP with a child and quit the
-                * child. This will result in some parts of the THP never
-                * have been shared.
-                */
-               ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
-               if (ret) {
-                       ksft_test_result_fail("MADV_DONTFORK failed\n");
-                       goto munmap;
-               }
-               ret = fork();
-               if (ret < 0) {
-                       ksft_test_result_fail("fork() failed\n");
-                       goto munmap;
-               } else if (!ret) {
-                       exit(0);
-               }
-               wait(&ret);
-               /* Allow for sharing all pages again. */
-               ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
-               if (ret) {
-                       ksft_test_result_fail("MADV_DOFORK failed\n");
-                       goto munmap;
-               }
-               break;
-       default:
-               assert(false);
-       }
-
-       switch (thp_run) {
-       case THP_RUN_PMD_SWAPOUT:
-       case THP_RUN_PTE_SWAPOUT:
-       case THP_RUN_SINGLE_PTE_SWAPOUT:
-               madvise(mem, size, MADV_PAGEOUT);
-               if (!range_is_swapped(mem, size)) {
-                       ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
-                       goto munmap;
-               }
-               break;
-       default:
-               break;
-       }
-
-       fn(mem, size);
-munmap:
-       munmap(mmap_mem, mmap_size);
-       if (mremap_mem != MAP_FAILED)
-               munmap(mremap_mem, mremap_size);
-}
-
-static void run_with_thp(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_PMD);
-}
-
-static void run_with_thp_swap(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
-}
-
-static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_PTE);
-}
-
-static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
-}
-
-static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
-}
-
-static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
-}
-
-static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
-}
-
-static void run_with_partial_shared_thp(test_fn fn, const char *desc)
-{
-       ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
-       do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
-}
-
-static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
-{
-       int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
-       char *mem, *dummy;
-
-       ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
-                      hugetlbsize / 1024);
-
-       flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
-
-       mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_skip("need more free huge pages\n");
-               return;
-       }
-
-       /* Populate an huge page. */
-       memset(mem, 0, hugetlbsize);
-
-       /*
-        * We need a total of two hugetlb pages to handle COW/unsharing
-        * properly, otherwise we might get zapped by a SIGBUS.
-        */
-       dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
-       if (dummy == MAP_FAILED) {
-               ksft_test_result_skip("need more free huge pages\n");
-               goto munmap;
-       }
-       munmap(dummy, hugetlbsize);
-
-       fn(mem, hugetlbsize);
-munmap:
-       munmap(mem, hugetlbsize);
-}
-
-struct test_case {
-       const char *desc;
-       test_fn fn;
-};
-
-/*
- * Test cases that are specific to anonymous pages: pages in private mappings
- * that may get shared via COW during fork().
- */
-static const struct test_case anon_test_cases[] = {
-       /*
-        * Basic COW tests for fork() without any GUP. If we miss to break COW,
-        * either the child can observe modifications by the parent or the
-        * other way around.
-        */
-       {
-               "Basic COW after fork()",
-               test_cow_in_parent,
-       },
-       /*
-        * Basic test, but do an additional mprotect(PROT_READ)+
-        * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
-        */
-       {
-               "Basic COW after fork() with mprotect() optimization",
-               test_cow_in_parent_mprotect,
-       },
-       /*
-        * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
-        * we miss to break COW, the child observes modifications by the parent.
-        * This is CVE-2020-29374 reported by Jann Horn.
-        */
-       {
-               "vmsplice() + unmap in child",
-               test_vmsplice_in_child
-       },
-       /*
-        * vmsplice() test, but do an additional mprotect(PROT_READ)+
-        * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
-        */
-       {
-               "vmsplice() + unmap in child with mprotect() optimization",
-               test_vmsplice_in_child_mprotect
-       },
-       /*
-        * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
-        * fork(); modify in the child. If we miss to break COW, the parent
-        * observes modifications by the child.
-        */
-       {
-               "vmsplice() before fork(), unmap in parent after fork()",
-               test_vmsplice_before_fork,
-       },
-       /*
-        * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
-        * child. If we miss to break COW, the parent observes modifications by
-        * the child.
-        */
-       {
-               "vmsplice() + unmap in parent after fork()",
-               test_vmsplice_after_fork,
-       },
-#ifdef LOCAL_CONFIG_HAVE_LIBURING
-       /*
-        * Take a R/W longterm pin and then map the page R/O into the page
-        * table to trigger a write fault on next access. When modifying the
-        * page, the page content must be visible via the pin.
-        */
-       {
-               "R/O-mapping a page registered as iouring fixed buffer",
-               test_iouring_ro,
-       },
-       /*
-        * Take a R/W longterm pin and then fork() a child. When modifying the
-        * page, the page content must be visible via the pin. We expect the
-        * pinned page to not get shared with the child.
-        */
-       {
-               "fork() with an iouring fixed buffer",
-               test_iouring_fork,
-       },
-
-#endif /* LOCAL_CONFIG_HAVE_LIBURING */
-       /*
-        * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
-        * When modifying the page via the page table, the page content change
-        * must be visible via the pin.
-        */
-       {
-               "R/O GUP pin on R/O-mapped shared page",
-               test_ro_pin_on_shared,
-       },
-       /* Same as above, but using GUP-fast. */
-       {
-               "R/O GUP-fast pin on R/O-mapped shared page",
-               test_ro_fast_pin_on_shared,
-       },
-       /*
-        * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
-        * was previously shared. When modifying the page via the page table,
-        * the page content change must be visible via the pin.
-        */
-       {
-               "R/O GUP pin on R/O-mapped previously-shared page",
-               test_ro_pin_on_ro_previously_shared,
-       },
-       /* Same as above, but using GUP-fast. */
-       {
-               "R/O GUP-fast pin on R/O-mapped previously-shared page",
-               test_ro_fast_pin_on_ro_previously_shared,
-       },
-       /*
-        * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
-        * When modifying the page via the page table, the page content change
-        * must be visible via the pin.
-        */
-       {
-               "R/O GUP pin on R/O-mapped exclusive page",
-               test_ro_pin_on_ro_exclusive,
-       },
-       /* Same as above, but using GUP-fast. */
-       {
-               "R/O GUP-fast pin on R/O-mapped exclusive page",
-               test_ro_fast_pin_on_ro_exclusive,
-       },
-};
-
-static void run_anon_test_case(struct test_case const *test_case)
-{
-       int i;
-
-       run_with_base_page(test_case->fn, test_case->desc);
-       run_with_base_page_swap(test_case->fn, test_case->desc);
-       if (thpsize) {
-               run_with_thp(test_case->fn, test_case->desc);
-               run_with_thp_swap(test_case->fn, test_case->desc);
-               run_with_pte_mapped_thp(test_case->fn, test_case->desc);
-               run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc);
-               run_with_single_pte_of_thp(test_case->fn, test_case->desc);
-               run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc);
-               run_with_partial_mremap_thp(test_case->fn, test_case->desc);
-               run_with_partial_shared_thp(test_case->fn, test_case->desc);
-       }
-       for (i = 0; i < nr_hugetlbsizes; i++)
-               run_with_hugetlb(test_case->fn, test_case->desc,
-                                hugetlbsizes[i]);
-}
-
-static void run_anon_test_cases(void)
-{
-       int i;
-
-       ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n");
-
-       for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++)
-               run_anon_test_case(&anon_test_cases[i]);
-}
-
-static int tests_per_anon_test_case(void)
-{
-       int tests = 2 + nr_hugetlbsizes;
-
-       if (thpsize)
-               tests += 8;
-       return tests;
-}
-
-enum anon_thp_collapse_test {
-       ANON_THP_COLLAPSE_UNSHARED,
-       ANON_THP_COLLAPSE_FULLY_SHARED,
-       ANON_THP_COLLAPSE_LOWER_SHARED,
-       ANON_THP_COLLAPSE_UPPER_SHARED,
-};
-
-static void do_test_anon_thp_collapse(char *mem, size_t size,
-                                     enum anon_thp_collapse_test test)
-{
-       struct comm_pipes comm_pipes;
-       char buf;
-       int ret;
-
-       ret = setup_comm_pipes(&comm_pipes);
-       if (ret) {
-               ksft_test_result_fail("pipe() failed\n");
-               return;
-       }
-
-       /*
-        * Trigger PTE-mapping the THP by temporarily mapping a single subpage
-        * R/O, such that we can try collapsing it later.
-        */
-       ret = mprotect(mem + pagesize, pagesize, PROT_READ);
-       if (ret) {
-               ksft_test_result_fail("mprotect() failed\n");
-               goto close_comm_pipes;
-       }
-       ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
-       if (ret) {
-               ksft_test_result_fail("mprotect() failed\n");
-               goto close_comm_pipes;
-       }
-
-       switch (test) {
-       case ANON_THP_COLLAPSE_UNSHARED:
-               /* Collapse before actually COW-sharing the page. */
-               ret = madvise(mem, size, MADV_COLLAPSE);
-               if (ret) {
-                       ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
-                                             strerror(errno));
-                       goto close_comm_pipes;
-               }
-               break;
-       case ANON_THP_COLLAPSE_FULLY_SHARED:
-               /* COW-share the full PTE-mapped THP. */
-               break;
-       case ANON_THP_COLLAPSE_LOWER_SHARED:
-               /* Don't COW-share the upper part of the THP. */
-               ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK);
-               if (ret) {
-                       ksft_test_result_fail("MADV_DONTFORK failed\n");
-                       goto close_comm_pipes;
-               }
-               break;
-       case ANON_THP_COLLAPSE_UPPER_SHARED:
-               /* Don't COW-share the lower part of the THP. */
-               ret = madvise(mem, size / 2, MADV_DONTFORK);
-               if (ret) {
-                       ksft_test_result_fail("MADV_DONTFORK failed\n");
-                       goto close_comm_pipes;
-               }
-               break;
-       default:
-               assert(false);
-       }
-
-       ret = fork();
-       if (ret < 0) {
-               ksft_test_result_fail("fork() failed\n");
-               goto close_comm_pipes;
-       } else if (!ret) {
-               switch (test) {
-               case ANON_THP_COLLAPSE_UNSHARED:
-               case ANON_THP_COLLAPSE_FULLY_SHARED:
-                       exit(child_memcmp_fn(mem, size, &comm_pipes));
-                       break;
-               case ANON_THP_COLLAPSE_LOWER_SHARED:
-                       exit(child_memcmp_fn(mem, size / 2, &comm_pipes));
-                       break;
-               case ANON_THP_COLLAPSE_UPPER_SHARED:
-                       exit(child_memcmp_fn(mem + size / 2, size / 2,
-                                            &comm_pipes));
-                       break;
-               default:
-                       assert(false);
-               }
-       }
-
-       while (read(comm_pipes.child_ready[0], &buf, 1) != 1)
-               ;
-
-       switch (test) {
-       case ANON_THP_COLLAPSE_UNSHARED:
-               break;
-       case ANON_THP_COLLAPSE_UPPER_SHARED:
-       case ANON_THP_COLLAPSE_LOWER_SHARED:
-               /*
-                * Revert MADV_DONTFORK such that we merge the VMAs and are
-                * able to actually collapse.
-                */
-               ret = madvise(mem, size, MADV_DOFORK);
-               if (ret) {
-                       ksft_test_result_fail("MADV_DOFORK failed\n");
-                       write(comm_pipes.parent_ready[1], "0", 1);
-                       wait(&ret);
-                       goto close_comm_pipes;
-               }
-               /* FALLTHROUGH */
-       case ANON_THP_COLLAPSE_FULLY_SHARED:
-               /* Collapse before anyone modified the COW-shared page. */
-               ret = madvise(mem, size, MADV_COLLAPSE);
-               if (ret) {
-                       ksft_test_result_skip("MADV_COLLAPSE failed: %s\n",
-                                             strerror(errno));
-                       write(comm_pipes.parent_ready[1], "0", 1);
-                       wait(&ret);
-                       goto close_comm_pipes;
-               }
-               break;
-       default:
-               assert(false);
-       }
-
-       /* Modify the page. */
-       memset(mem, 0xff, size);
-       write(comm_pipes.parent_ready[1], "0", 1);
-
-       wait(&ret);
-       if (WIFEXITED(ret))
-               ret = WEXITSTATUS(ret);
-       else
-               ret = -EINVAL;
-
-       ksft_test_result(!ret, "No leak from parent into child\n");
-close_comm_pipes:
-       close_comm_pipes(&comm_pipes);
-}
-
-static void test_anon_thp_collapse_unshared(char *mem, size_t size)
-{
-       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED);
-}
-
-static void test_anon_thp_collapse_fully_shared(char *mem, size_t size)
-{
-       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED);
-}
-
-static void test_anon_thp_collapse_lower_shared(char *mem, size_t size)
-{
-       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED);
-}
-
-static void test_anon_thp_collapse_upper_shared(char *mem, size_t size)
-{
-       do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED);
-}
-
-/*
- * Test cases that are specific to anonymous THP: pages in private mappings
- * that may get shared via COW during fork().
- */
-static const struct test_case anon_thp_test_cases[] = {
-       /*
-        * Basic COW test for fork() without any GUP when collapsing a THP
-        * before fork().
-        *
-        * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place
-        * collapse") might easily get COW handling wrong when not collapsing
-        * exclusivity information properly.
-        */
-       {
-               "Basic COW after fork() when collapsing before fork()",
-               test_anon_thp_collapse_unshared,
-       },
-       /* Basic COW test, but collapse after COW-sharing a full THP. */
-       {
-               "Basic COW after fork() when collapsing after fork() (fully shared)",
-               test_anon_thp_collapse_fully_shared,
-       },
-       /*
-        * Basic COW test, but collapse after COW-sharing the lower half of a
-        * THP.
-        */
-       {
-               "Basic COW after fork() when collapsing after fork() (lower shared)",
-               test_anon_thp_collapse_lower_shared,
-       },
-       /*
-        * Basic COW test, but collapse after COW-sharing the upper half of a
-        * THP.
-        */
-       {
-               "Basic COW after fork() when collapsing after fork() (upper shared)",
-               test_anon_thp_collapse_upper_shared,
-       },
-};
-
-static void run_anon_thp_test_cases(void)
-{
-       int i;
-
-       if (!thpsize)
-               return;
-
-       ksft_print_msg("[INFO] Anonymous THP tests\n");
-
-       for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
-               struct test_case const *test_case = &anon_thp_test_cases[i];
-
-               ksft_print_msg("[RUN] %s\n", test_case->desc);
-               do_run_with_thp(test_case->fn, THP_RUN_PMD);
-       }
-}
-
-static int tests_per_anon_thp_test_case(void)
-{
-       return thpsize ? 1 : 0;
-}
-
-typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
-
-static void test_cow(char *mem, const char *smem, size_t size)
-{
-       char *old = malloc(size);
-
-       /* Backup the original content. */
-       memcpy(old, smem, size);
-
-       /* Modify the page. */
-       memset(mem, 0xff, size);
-
-       /* See if we still read the old values via the other mapping. */
-       ksft_test_result(!memcmp(smem, old, size),
-                        "Other mapping not modified\n");
-       free(old);
-}
-
-static void test_ro_pin(char *mem, const char *smem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST, false);
-}
-
-static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
-{
-       do_test_ro_pin(mem, size, RO_PIN_TEST, true);
-}
-
-static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
-{
-       char *mem, *smem, tmp;
-
-       ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
-
-       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
-                  MAP_PRIVATE | MAP_ANON, -1, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               return;
-       }
-
-       smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto munmap;
-       }
-
-       /* Read from the page to populate the shared zeropage. */
-       tmp = *mem + *smem;
-       asm volatile("" : "+r" (tmp));
-
-       fn(mem, smem, pagesize);
-munmap:
-       munmap(mem, pagesize);
-       if (smem != MAP_FAILED)
-               munmap(smem, pagesize);
-}
-
-static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
-{
-       char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
-       size_t mmap_size;
-       int ret;
-
-       ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
-
-       if (!has_huge_zeropage) {
-               ksft_test_result_skip("Huge zeropage not enabled\n");
-               return;
-       }
-
-       /* For alignment purposes, we need twice the thp size. */
-       mmap_size = 2 * thpsize;
-       mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       if (mmap_mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               return;
-       }
-       mmap_smem = mmap(NULL, mmap_size, PROT_READ,
-                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       if (mmap_smem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto munmap;
-       }
-
-       /* We need a THP-aligned memory area. */
-       mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
-       smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
-
-       ret = madvise(mem, thpsize, MADV_HUGEPAGE);
-       ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
-       if (ret) {
-               ksft_test_result_fail("MADV_HUGEPAGE failed\n");
-               goto munmap;
-       }
-
-       /*
-        * Read from the memory to populate the huge shared zeropage. Read from
-        * the first sub-page and test if we get another sub-page populated
-        * automatically.
-        */
-       tmp = *mem + *smem;
-       asm volatile("" : "+r" (tmp));
-       if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
-           !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
-               ksft_test_result_skip("Did not get THPs populated\n");
-               goto munmap;
-       }
-
-       fn(mem, smem, thpsize);
-munmap:
-       munmap(mmap_mem, mmap_size);
-       if (mmap_smem != MAP_FAILED)
-               munmap(mmap_smem, mmap_size);
-}
-
-static void run_with_memfd(non_anon_test_fn fn, const char *desc)
-{
-       char *mem, *smem, tmp;
-       int fd;
-
-       ksft_print_msg("[RUN] %s ... with memfd\n", desc);
-
-       fd = memfd_create("test", 0);
-       if (fd < 0) {
-               ksft_test_result_fail("memfd_create() failed\n");
-               return;
-       }
-
-       /* File consists of a single page filled with zeroes. */
-       if (fallocate(fd, 0, 0, pagesize)) {
-               ksft_test_result_fail("fallocate() failed\n");
-               goto close;
-       }
-
-       /* Create a private mapping of the memfd. */
-       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto close;
-       }
-       smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto munmap;
-       }
-
-       /* Fault the page in. */
-       tmp = *mem + *smem;
-       asm volatile("" : "+r" (tmp));
-
-       fn(mem, smem, pagesize);
-munmap:
-       munmap(mem, pagesize);
-       if (smem != MAP_FAILED)
-               munmap(smem, pagesize);
-close:
-       close(fd);
-}
-
-static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
-{
-       char *mem, *smem, tmp;
-       FILE *file;
-       int fd;
-
-       ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
-
-       file = tmpfile();
-       if (!file) {
-               ksft_test_result_fail("tmpfile() failed\n");
-               return;
-       }
-
-       fd = fileno(file);
-       if (fd < 0) {
-               ksft_test_result_skip("fileno() failed\n");
-               return;
-       }
-
-       /* File consists of a single page filled with zeroes. */
-       if (fallocate(fd, 0, 0, pagesize)) {
-               ksft_test_result_fail("fallocate() failed\n");
-               goto close;
-       }
-
-       /* Create a private mapping of the memfd. */
-       mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto close;
-       }
-       smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto munmap;
-       }
-
-       /* Fault the page in. */
-       tmp = *mem + *smem;
-       asm volatile("" : "+r" (tmp));
-
-       fn(mem, smem, pagesize);
-munmap:
-       munmap(mem, pagesize);
-       if (smem != MAP_FAILED)
-               munmap(smem, pagesize);
-close:
-       fclose(file);
-}
-
-static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
-                                  size_t hugetlbsize)
-{
-       int flags = MFD_HUGETLB;
-       char *mem, *smem, tmp;
-       int fd;
-
-       ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
-                      hugetlbsize / 1024);
-
-       flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
-
-       fd = memfd_create("test", flags);
-       if (fd < 0) {
-               ksft_test_result_skip("memfd_create() failed\n");
-               return;
-       }
-
-       /* File consists of a single page filled with zeroes. */
-       if (fallocate(fd, 0, 0, hugetlbsize)) {
-               ksft_test_result_skip("need more free huge pages\n");
-               goto close;
-       }
-
-       /* Create a private mapping of the memfd. */
-       mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
-                  0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_skip("need more free huge pages\n");
-               goto close;
-       }
-       smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
-       if (mem == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               goto munmap;
-       }
-
-       /* Fault the page in. */
-       tmp = *mem + *smem;
-       asm volatile("" : "+r" (tmp));
-
-       fn(mem, smem, hugetlbsize);
-munmap:
-       munmap(mem, hugetlbsize);
-       if (mem != MAP_FAILED)
-               munmap(smem, hugetlbsize);
-close:
-       close(fd);
-}
-
-struct non_anon_test_case {
-       const char *desc;
-       non_anon_test_fn fn;
-};
-
-/*
- * Test cases that target any pages in private mappings that are not anonymous:
- * pages that may get shared via COW ndependent of fork(). This includes
- * the shared zeropage(s), pagecache pages, ...
- */
-static const struct non_anon_test_case non_anon_test_cases[] = {
-       /*
-        * Basic COW test without any GUP. If we miss to break COW, changes are
-        * visible via other private/shared mappings.
-        */
-       {
-               "Basic COW",
-               test_cow,
-       },
-       /*
-        * Take a R/O longterm pin. When modifying the page via the page table,
-        * the page content change must be visible via the pin.
-        */
-       {
-               "R/O longterm GUP pin",
-               test_ro_pin,
-       },
-       /* Same as above, but using GUP-fast. */
-       {
-               "R/O longterm GUP-fast pin",
-               test_ro_fast_pin,
-       },
-};
-
-static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
-{
-       int i;
-
-       run_with_zeropage(test_case->fn, test_case->desc);
-       run_with_memfd(test_case->fn, test_case->desc);
-       run_with_tmpfile(test_case->fn, test_case->desc);
-       if (thpsize)
-               run_with_huge_zeropage(test_case->fn, test_case->desc);
-       for (i = 0; i < nr_hugetlbsizes; i++)
-               run_with_memfd_hugetlb(test_case->fn, test_case->desc,
-                                      hugetlbsizes[i]);
-}
-
-static void run_non_anon_test_cases(void)
-{
-       int i;
-
-       ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n");
-
-       for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
-               run_non_anon_test_case(&non_anon_test_cases[i]);
-}
-
-static int tests_per_non_anon_test_case(void)
-{
-       int tests = 3 + nr_hugetlbsizes;
-
-       if (thpsize)
-               tests += 1;
-       return tests;
-}
-
-int main(int argc, char **argv)
-{
-       int err;
-
-       pagesize = getpagesize();
-       detect_thpsize();
-       detect_hugetlbsizes();
-       detect_huge_zeropage();
-
-       ksft_print_header();
-       ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
-                     ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
-                     ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
-
-       gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
-       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-       if (pagemap_fd < 0)
-               ksft_exit_fail_msg("opening pagemap failed\n");
-
-       run_anon_test_cases();
-       run_anon_thp_test_cases();
-       run_non_anon_test_cases();
-
-       err = ksft_get_fail_cnt();
-       if (err)
-               ksft_exit_fail_msg("%d out of %d tests failed\n",
-                                  err, ksft_test_num());
-       return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c
deleted file mode 100644 (file)
index e438792..0000000
+++ /dev/null
@@ -1,271 +0,0 @@
-#include <fcntl.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <pthread.h>
-#include <assert.h>
-#include <mm/gup_test.h>
-#include "../kselftest.h"
-
-#include "util.h"
-
-#define MB (1UL << 20)
-
-/* Just the flags we need, copied from mm.h: */
-#define FOLL_WRITE     0x01    /* check pte is writable */
-#define FOLL_TOUCH     0x02    /* mark page accessed */
-
-#define GUP_TEST_FILE "/sys/kernel/debug/gup_test"
-
-static unsigned long cmd = GUP_FAST_BENCHMARK;
-static int gup_fd, repeats = 1;
-static unsigned long size = 128 * MB;
-/* Serialize prints */
-static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static char *cmd_to_str(unsigned long cmd)
-{
-       switch (cmd) {
-       case GUP_FAST_BENCHMARK:
-               return "GUP_FAST_BENCHMARK";
-       case PIN_FAST_BENCHMARK:
-               return "PIN_FAST_BENCHMARK";
-       case PIN_LONGTERM_BENCHMARK:
-               return "PIN_LONGTERM_BENCHMARK";
-       case GUP_BASIC_TEST:
-               return "GUP_BASIC_TEST";
-       case PIN_BASIC_TEST:
-               return "PIN_BASIC_TEST";
-       case DUMP_USER_PAGES_TEST:
-               return "DUMP_USER_PAGES_TEST";
-       }
-       return "Unknown command";
-}
-
-void *gup_thread(void *data)
-{
-       struct gup_test gup = *(struct gup_test *)data;
-       int i;
-
-       /* Only report timing information on the *_BENCHMARK commands: */
-       if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
-            (cmd == PIN_LONGTERM_BENCHMARK)) {
-               for (i = 0; i < repeats; i++) {
-                       gup.size = size;
-                       if (ioctl(gup_fd, cmd, &gup))
-                               perror("ioctl"), exit(1);
-
-                       pthread_mutex_lock(&print_mutex);
-                       printf("%s: Time: get:%lld put:%lld us",
-                              cmd_to_str(cmd), gup.get_delta_usec,
-                              gup.put_delta_usec);
-                       if (gup.size != size)
-                               printf(", truncated (size: %lld)", gup.size);
-                       printf("\n");
-                       pthread_mutex_unlock(&print_mutex);
-               }
-       } else {
-               gup.size = size;
-               if (ioctl(gup_fd, cmd, &gup)) {
-                       perror("ioctl");
-                       exit(1);
-               }
-
-               pthread_mutex_lock(&print_mutex);
-               printf("%s: done\n", cmd_to_str(cmd));
-               if (gup.size != size)
-                       printf("Truncated (size: %lld)\n", gup.size);
-               pthread_mutex_unlock(&print_mutex);
-       }
-
-       return NULL;
-}
-
-int main(int argc, char **argv)
-{
-       struct gup_test gup = { 0 };
-       int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret;
-       int flags = MAP_PRIVATE, touch = 0;
-       char *file = "/dev/zero";
-       pthread_t *tid;
-       char *p;
-
-       while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) {
-               switch (opt) {
-               case 'a':
-                       cmd = PIN_FAST_BENCHMARK;
-                       break;
-               case 'b':
-                       cmd = PIN_BASIC_TEST;
-                       break;
-               case 'L':
-                       cmd = PIN_LONGTERM_BENCHMARK;
-                       break;
-               case 'c':
-                       cmd = DUMP_USER_PAGES_TEST;
-                       /*
-                        * Dump page 0 (index 1). May be overridden later, by
-                        * user's non-option arguments.
-                        *
-                        * .which_pages is zero-based, so that zero can mean "do
-                        * nothing".
-                        */
-                       gup.which_pages[0] = 1;
-                       break;
-               case 'p':
-                       /* works only with DUMP_USER_PAGES_TEST */
-                       gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
-                       break;
-               case 'F':
-                       /* strtol, so you can pass flags in hex form */
-                       gup.gup_flags = strtol(optarg, 0, 0);
-                       break;
-               case 'j':
-                       nthreads = atoi(optarg);
-                       break;
-               case 'm':
-                       size = atoi(optarg) * MB;
-                       break;
-               case 'r':
-                       repeats = atoi(optarg);
-                       break;
-               case 'n':
-                       nr_pages = atoi(optarg);
-                       break;
-               case 't':
-                       thp = 1;
-                       break;
-               case 'T':
-                       thp = 0;
-                       break;
-               case 'U':
-                       cmd = GUP_BASIC_TEST;
-                       break;
-               case 'u':
-                       cmd = GUP_FAST_BENCHMARK;
-                       break;
-               case 'w':
-                       write = 1;
-                       break;
-               case 'W':
-                       write = 0;
-                       break;
-               case 'f':
-                       file = optarg;
-                       break;
-               case 'S':
-                       flags &= ~MAP_PRIVATE;
-                       flags |= MAP_SHARED;
-                       break;
-               case 'H':
-                       flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
-                       break;
-               case 'z':
-                       /* fault pages in gup, do not fault in userland */
-                       touch = 1;
-                       break;
-               default:
-                       return -1;
-               }
-       }
-
-       if (optind < argc) {
-               int extra_arg_count = 0;
-               /*
-                * For example:
-                *
-                *   ./gup_test -c 0 1 0x1001
-                *
-                * ...to dump pages 0, 1, and 4097
-                */
-
-               while ((optind < argc) &&
-                      (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
-                       /*
-                        * Do the 1-based indexing here, so that the user can
-                        * use normal 0-based indexing on the command line.
-                        */
-                       long page_index = strtol(argv[optind], 0, 0) + 1;
-
-                       gup.which_pages[extra_arg_count] = page_index;
-                       extra_arg_count++;
-                       optind++;
-               }
-       }
-
-       filed = open(file, O_RDWR|O_CREAT);
-       if (filed < 0) {
-               perror("open");
-               exit(filed);
-       }
-
-       gup.nr_pages_per_call = nr_pages;
-       if (write)
-               gup.gup_flags |= FOLL_WRITE;
-
-       gup_fd = open(GUP_TEST_FILE, O_RDWR);
-       if (gup_fd == -1) {
-               switch (errno) {
-               case EACCES:
-                       if (getuid())
-                               printf("Please run this test as root\n");
-                       break;
-               case ENOENT:
-                       if (opendir("/sys/kernel/debug") == NULL) {
-                               printf("mount debugfs at /sys/kernel/debug\n");
-                               break;
-                       }
-                       printf("check if CONFIG_GUP_TEST is enabled in kernel config\n");
-                       break;
-               default:
-                       perror("failed to open " GUP_TEST_FILE);
-                       break;
-               }
-               exit(KSFT_SKIP);
-       }
-
-       p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
-       if (p == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-       gup.addr = (unsigned long)p;
-
-       if (thp == 1)
-               madvise(p, size, MADV_HUGEPAGE);
-       else if (thp == 0)
-               madvise(p, size, MADV_NOHUGEPAGE);
-
-       /*
-        * FOLL_TOUCH, in gup_test, is used as an either/or case: either
-        * fault pages in from the kernel via FOLL_TOUCH, or fault them
-        * in here, from user space. This allows comparison of performance
-        * between those two cases.
-        */
-       if (touch) {
-               gup.gup_flags |= FOLL_TOUCH;
-       } else {
-               for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
-                       p[0] = 0;
-       }
-
-       tid = malloc(sizeof(pthread_t) * nthreads);
-       assert(tid);
-       for (i = 0; i < nthreads; i++) {
-               ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
-               assert(ret == 0);
-       }
-       for (i = 0; i < nthreads; i++) {
-               ret = pthread_join(tid[i], NULL);
-               assert(ret == 0);
-       }
-       free(tid);
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
deleted file mode 100644 (file)
index 4adaad1..0000000
+++ /dev/null
@@ -1,2054 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
- * the linux kernel to help device drivers mirror a process address space in
- * the device. This allows the device to use the same address space which
- * makes communication and data exchange a lot easier.
- *
- * This framework's sole purpose is to exercise various code paths inside
- * the kernel to make sure that HMM performs as expected and to flush out any
- * bugs.
- */
-
-#include "../kselftest_harness.h"
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <strings.h>
-#include <time.h>
-#include <pthread.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-
-
-/*
- * This is a private UAPI to the kernel test module so it isn't exported
- * in the usual include/uapi/... directory.
- */
-#include <lib/test_hmm_uapi.h>
-#include <mm/gup_test.h>
-
-struct hmm_buffer {
-       void            *ptr;
-       void            *mirror;
-       unsigned long   size;
-       int             fd;
-       uint64_t        cpages;
-       uint64_t        faults;
-};
-
-enum {
-       HMM_PRIVATE_DEVICE_ONE,
-       HMM_PRIVATE_DEVICE_TWO,
-       HMM_COHERENCE_DEVICE_ONE,
-       HMM_COHERENCE_DEVICE_TWO,
-};
-
-#define TWOMEG         (1 << 21)
-#define HMM_BUFFER_SIZE (1024 << 12)
-#define HMM_PATH_MAX    64
-#define NTIMES         10
-
-#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
-/* Just the flags we need, copied from mm.h: */
-#define FOLL_WRITE     0x01    /* check pte is writable */
-#define FOLL_LONGTERM   0x10000 /* mapping lifetime is indefinite */
-
-FIXTURE(hmm)
-{
-       int             fd;
-       unsigned int    page_size;
-       unsigned int    page_shift;
-};
-
-FIXTURE_VARIANT(hmm)
-{
-       int     device_number;
-};
-
-FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
-{
-       .device_number = HMM_PRIVATE_DEVICE_ONE,
-};
-
-FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
-{
-       .device_number = HMM_COHERENCE_DEVICE_ONE,
-};
-
-FIXTURE(hmm2)
-{
-       int             fd0;
-       int             fd1;
-       unsigned int    page_size;
-       unsigned int    page_shift;
-};
-
-FIXTURE_VARIANT(hmm2)
-{
-       int     device_number0;
-       int     device_number1;
-};
-
-FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
-{
-       .device_number0 = HMM_PRIVATE_DEVICE_ONE,
-       .device_number1 = HMM_PRIVATE_DEVICE_TWO,
-};
-
-FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
-{
-       .device_number0 = HMM_COHERENCE_DEVICE_ONE,
-       .device_number1 = HMM_COHERENCE_DEVICE_TWO,
-};
-
-static int hmm_open(int unit)
-{
-       char pathname[HMM_PATH_MAX];
-       int fd;
-
-       snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit);
-       fd = open(pathname, O_RDWR, 0);
-       if (fd < 0)
-               fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
-                       pathname);
-       return fd;
-}
-
-static bool hmm_is_coherent_type(int dev_num)
-{
-       return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
-}
-
-FIXTURE_SETUP(hmm)
-{
-       self->page_size = sysconf(_SC_PAGE_SIZE);
-       self->page_shift = ffs(self->page_size) - 1;
-
-       self->fd = hmm_open(variant->device_number);
-       if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
-               SKIP(exit(0), "DEVICE_COHERENT not available");
-       ASSERT_GE(self->fd, 0);
-}
-
-FIXTURE_SETUP(hmm2)
-{
-       self->page_size = sysconf(_SC_PAGE_SIZE);
-       self->page_shift = ffs(self->page_size) - 1;
-
-       self->fd0 = hmm_open(variant->device_number0);
-       if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
-               SKIP(exit(0), "DEVICE_COHERENT not available");
-       ASSERT_GE(self->fd0, 0);
-       self->fd1 = hmm_open(variant->device_number1);
-       ASSERT_GE(self->fd1, 0);
-}
-
-FIXTURE_TEARDOWN(hmm)
-{
-       int ret = close(self->fd);
-
-       ASSERT_EQ(ret, 0);
-       self->fd = -1;
-}
-
-FIXTURE_TEARDOWN(hmm2)
-{
-       int ret = close(self->fd0);
-
-       ASSERT_EQ(ret, 0);
-       self->fd0 = -1;
-
-       ret = close(self->fd1);
-       ASSERT_EQ(ret, 0);
-       self->fd1 = -1;
-}
-
-static int hmm_dmirror_cmd(int fd,
-                          unsigned long request,
-                          struct hmm_buffer *buffer,
-                          unsigned long npages)
-{
-       struct hmm_dmirror_cmd cmd;
-       int ret;
-
-       /* Simulate a device reading system memory. */
-       cmd.addr = (__u64)buffer->ptr;
-       cmd.ptr = (__u64)buffer->mirror;
-       cmd.npages = npages;
-
-       for (;;) {
-               ret = ioctl(fd, request, &cmd);
-               if (ret == 0)
-                       break;
-               if (errno == EINTR)
-                       continue;
-               return -errno;
-       }
-       buffer->cpages = cmd.cpages;
-       buffer->faults = cmd.faults;
-
-       return 0;
-}
-
-static void hmm_buffer_free(struct hmm_buffer *buffer)
-{
-       if (buffer == NULL)
-               return;
-
-       if (buffer->ptr)
-               munmap(buffer->ptr, buffer->size);
-       free(buffer->mirror);
-       free(buffer);
-}
-
-/*
- * Create a temporary file that will be deleted on close.
- */
-static int hmm_create_file(unsigned long size)
-{
-       char path[HMM_PATH_MAX];
-       int fd;
-
-       strcpy(path, "/tmp");
-       fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600);
-       if (fd >= 0) {
-               int r;
-
-               do {
-                       r = ftruncate(fd, size);
-               } while (r == -1 && errno == EINTR);
-               if (!r)
-                       return fd;
-               close(fd);
-       }
-       return -1;
-}
-
-/*
- * Return a random unsigned number.
- */
-static unsigned int hmm_random(void)
-{
-       static int fd = -1;
-       unsigned int r;
-
-       if (fd < 0) {
-               fd = open("/dev/urandom", O_RDONLY);
-               if (fd < 0) {
-                       fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
-                                       __FILE__, __LINE__);
-                       return ~0U;
-               }
-       }
-       read(fd, &r, sizeof(r));
-       return r;
-}
-
-static void hmm_nanosleep(unsigned int n)
-{
-       struct timespec t;
-
-       t.tv_sec = 0;
-       t.tv_nsec = n;
-       nanosleep(&t, NULL);
-}
-
-static int hmm_migrate_sys_to_dev(int fd,
-                                  struct hmm_buffer *buffer,
-                                  unsigned long npages)
-{
-       return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
-}
-
-static int hmm_migrate_dev_to_sys(int fd,
-                                  struct hmm_buffer *buffer,
-                                  unsigned long npages)
-{
-       return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
-}
-
-/*
- * Simple NULL test of device open/close.
- */
-TEST_F(hmm, open_close)
-{
-}
-
-/*
- * Read private anonymous memory.
- */
-TEST_F(hmm, anon_read)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-       int val;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /*
-        * Initialize buffer in system memory but leave the first two pages
-        * zero (pte_none and pfn_zero).
-        */
-       i = 2 * self->page_size / sizeof(*ptr);
-       for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Set buffer permission to read-only. */
-       ret = mprotect(buffer->ptr, size, PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Populate the CPU page table with a special zero page. */
-       val = *(int *)(buffer->ptr + self->page_size);
-       ASSERT_EQ(val, 0);
-
-       /* Simulate a device reading system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device read. */
-       ptr = buffer->mirror;
-       for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], 0);
-       for (; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Read private anonymous memory which has been protected with
- * mprotect() PROT_NONE.
- */
-TEST_F(hmm, anon_read_prot)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Initialize mirror buffer so we can verify it isn't written. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = -i;
-
-       /* Protect buffer from reading. */
-       ret = mprotect(buffer->ptr, size, PROT_NONE);
-       ASSERT_EQ(ret, 0);
-
-       /* Simulate a device reading system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-       ASSERT_EQ(ret, -EFAULT);
-
-       /* Allow CPU to read the buffer so we can check it. */
-       ret = mprotect(buffer->ptr, size, PROT_READ);
-       ASSERT_EQ(ret, 0);
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], -i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Write private anonymous memory.
- */
-TEST_F(hmm, anon_write)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Write private anonymous memory which has been protected with
- * mprotect() PROT_READ.
- */
-TEST_F(hmm, anon_write_prot)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Simulate a device reading a zero page of memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, 1);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, -EPERM);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], 0);
-
-       /* Now allow writing and see that the zero page is replaced. */
-       ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Check that a device writing an anonymous private mapping
- * will copy-on-write if a child process inherits the mapping.
- */
-TEST_F(hmm, anon_write_child)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       pid_t pid;
-       int child_fd;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer->ptr so we can tell if it is written. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = -i;
-
-       pid = fork();
-       if (pid == -1)
-               ASSERT_EQ(pid, 0);
-       if (pid != 0) {
-               waitpid(pid, &ret, 0);
-               ASSERT_EQ(WIFEXITED(ret), 1);
-
-               /* Check that the parent's buffer did not change. */
-               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-                       ASSERT_EQ(ptr[i], i);
-               return;
-       }
-
-       /* Check that we see the parent's values. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], -i);
-
-       /* The child process needs its own mirror to its own mm. */
-       child_fd = hmm_open(0);
-       ASSERT_GE(child_fd, 0);
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], -i);
-
-       close(child_fd);
-       exit(0);
-}
-
-/*
- * Check that a device writing an anonymous shared mapping
- * will not copy-on-write if a child process inherits the mapping.
- */
-TEST_F(hmm, anon_write_child_shared)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       pid_t pid;
-       int child_fd;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_SHARED | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer->ptr so we can tell if it is written. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = -i;
-
-       pid = fork();
-       if (pid == -1)
-               ASSERT_EQ(pid, 0);
-       if (pid != 0) {
-               waitpid(pid, &ret, 0);
-               ASSERT_EQ(WIFEXITED(ret), 1);
-
-               /* Check that the parent's buffer did change. */
-               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-                       ASSERT_EQ(ptr[i], -i);
-               return;
-       }
-
-       /* Check that we see the parent's values. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], -i);
-
-       /* The child process needs its own mirror to its own mm. */
-       child_fd = hmm_open(0);
-       ASSERT_GE(child_fd, 0);
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], -i);
-
-       close(child_fd);
-       exit(0);
-}
-
-/*
- * Write private anonymous huge page.
- */
-TEST_F(hmm, anon_write_huge)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       void *old_ptr;
-       void *map;
-       int *ptr;
-       int ret;
-
-       size = 2 * TWOMEG;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       size = TWOMEG;
-       npages = size >> self->page_shift;
-       map = (void *)ALIGN((uintptr_t)buffer->ptr, size);
-       ret = madvise(map, size, MADV_HUGEPAGE);
-       ASSERT_EQ(ret, 0);
-       old_ptr = buffer->ptr;
-       buffer->ptr = map;
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       buffer->ptr = old_ptr;
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Read numeric data from raw and tagged kernel status files.  Used to read
- * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag).
- */
-static long file_read_ulong(char *file, const char *tag)
-{
-       int fd;
-       char buf[2048];
-       int len;
-       char *p, *q;
-       long val;
-
-       fd = open(file, O_RDONLY);
-       if (fd < 0) {
-               /* Error opening the file */
-               return -1;
-       }
-
-       len = read(fd, buf, sizeof(buf));
-       close(fd);
-       if (len < 0) {
-               /* Error in reading the file */
-               return -1;
-       }
-       if (len == sizeof(buf)) {
-               /* Error file is too large */
-               return -1;
-       }
-       buf[len] = '\0';
-
-       /* Search for a tag if provided */
-       if (tag) {
-               p = strstr(buf, tag);
-               if (!p)
-                       return -1; /* looks like the line we want isn't there */
-               p += strlen(tag);
-       } else
-               p = buf;
-
-       val = strtol(p, &q, 0);
-       if (*q != ' ') {
-               /* Error parsing the file */
-               return -1;
-       }
-
-       return val;
-}
-
-/*
- * Write huge TLBFS page.
- */
-TEST_F(hmm, anon_write_hugetlbfs)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long default_hsize;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
-       if (default_hsize < 0 || default_hsize*1024 < default_hsize)
-               SKIP(return, "Huge page size could not be determined");
-       default_hsize = default_hsize*1024; /* KB to B */
-
-       size = ALIGN(TWOMEG, default_hsize);
-       npages = size >> self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                                  PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-                                  -1, 0);
-       if (buffer->ptr == MAP_FAILED) {
-               free(buffer);
-               SKIP(return, "Huge page could not be allocated");
-       }
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       munmap(buffer->ptr, buffer->size);
-       buffer->ptr = NULL;
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Read mmap'ed file memory.
- */
-TEST_F(hmm, file_read)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-       int fd;
-       ssize_t len;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       fd = hmm_create_file(size);
-       ASSERT_GE(fd, 0);
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = fd;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       /* Write initial contents of the file. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-       len = pwrite(fd, buffer->mirror, size, 0);
-       ASSERT_EQ(len, size);
-       memset(buffer->mirror, 0, size);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ,
-                          MAP_SHARED,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Simulate a device reading system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Write mmap'ed file memory.
- */
-TEST_F(hmm, file_write)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-       int fd;
-       ssize_t len;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       fd = hmm_create_file(size);
-       ASSERT_GE(fd, 0);
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = fd;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_SHARED,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize data that the device will write to buffer->ptr. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device wrote. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Check that the device also wrote the file. */
-       len = pread(fd, buffer->mirror, size, 0);
-       ASSERT_EQ(len, size);
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous memory to device private memory.
- */
-TEST_F(hmm, migrate)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Migrate memory to device. */
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous memory to device private memory and fault some of it back
- * to system memory, then try migrating the resulting mix of system and device
- * private memory to the device.
- */
-TEST_F(hmm, migrate_fault)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Migrate memory to device. */
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Fault half the pages back to system memory and check them. */
-       for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Migrate memory to the device again. */
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-TEST_F(hmm, migrate_release)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Migrate memory to device. */
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Release device memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-
-       /* Fault pages back to system memory and check them. */
-       for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous shared memory to device private memory.
- */
-TEST_F(hmm, migrate_shared)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_SHARED | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Migrate memory to device. */
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, -ENOENT);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Try to migrate various memory types to device private memory.
- */
-TEST_F(hmm2, migrate_mixed)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       int *ptr;
-       unsigned char *p;
-       int ret;
-       int val;
-
-       npages = 6;
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       /* Reserve a range of addresses. */
-       buffer->ptr = mmap(NULL, size,
-                          PROT_NONE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-       p = buffer->ptr;
-
-       /* Migrating a protected area should be an error. */
-       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
-       ASSERT_EQ(ret, -EINVAL);
-
-       /* Punch a hole after the first page address. */
-       ret = munmap(buffer->ptr + self->page_size, self->page_size);
-       ASSERT_EQ(ret, 0);
-
-       /* We expect an error if the vma doesn't cover the range. */
-       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
-       ASSERT_EQ(ret, -EINVAL);
-
-       /* Page 2 will be a read-only zero page. */
-       ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
-                               PROT_READ);
-       ASSERT_EQ(ret, 0);
-       ptr = (int *)(buffer->ptr + 2 * self->page_size);
-       val = *ptr + 3;
-       ASSERT_EQ(val, 3);
-
-       /* Page 3 will be read-only. */
-       ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-                               PROT_READ | PROT_WRITE);
-       ASSERT_EQ(ret, 0);
-       ptr = (int *)(buffer->ptr + 3 * self->page_size);
-       *ptr = val;
-       ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-                               PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Page 4-5 will be read-write. */
-       ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
-                               PROT_READ | PROT_WRITE);
-       ASSERT_EQ(ret, 0);
-       ptr = (int *)(buffer->ptr + 4 * self->page_size);
-       *ptr = val;
-       ptr = (int *)(buffer->ptr + 5 * self->page_size);
-       *ptr = val;
-
-       /* Now try to migrate pages 2-5 to device 1. */
-       buffer->ptr = p + 2 * self->page_size;
-       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, 4);
-
-       /* Page 5 won't be migrated to device 0 because it's on device 1. */
-       buffer->ptr = p + 5 * self->page_size;
-       ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
-       ASSERT_EQ(ret, -ENOENT);
-       buffer->ptr = p;
-
-       buffer->ptr = p;
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Migrate anonymous memory to device memory and back to system memory
- * multiple times. In case of private zone configuration, this is done
- * through fault pages accessed by CPU. In case of coherent zone configuration,
- * the pages from the device should be explicitly migrated back to system memory.
- * The reason is Coherent device zone has coherent access by CPU, therefore
- * it will not generate any page fault.
- */
-TEST_F(hmm, migrate_multiple)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       unsigned long c;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       for (c = 0; c < NTIMES; c++) {
-               buffer = malloc(sizeof(*buffer));
-               ASSERT_NE(buffer, NULL);
-
-               buffer->fd = -1;
-               buffer->size = size;
-               buffer->mirror = malloc(size);
-               ASSERT_NE(buffer->mirror, NULL);
-
-               buffer->ptr = mmap(NULL, size,
-                                  PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANONYMOUS,
-                                  buffer->fd, 0);
-               ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-               /* Initialize buffer in system memory. */
-               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-                       ptr[i] = i;
-
-               /* Migrate memory to device. */
-               ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-               ASSERT_EQ(ret, 0);
-               ASSERT_EQ(buffer->cpages, npages);
-
-               /* Check what the device read. */
-               for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-                       ASSERT_EQ(ptr[i], i);
-
-               /* Migrate back to system memory and check them. */
-               if (hmm_is_coherent_type(variant->device_number)) {
-                       ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages);
-                       ASSERT_EQ(ret, 0);
-                       ASSERT_EQ(buffer->cpages, npages);
-               }
-
-               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-                       ASSERT_EQ(ptr[i], i);
-
-               hmm_buffer_free(buffer);
-       }
-}
-
-/*
- * Read anonymous memory multiple times.
- */
-TEST_F(hmm, anon_read_multiple)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       unsigned long c;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       for (c = 0; c < NTIMES; c++) {
-               buffer = malloc(sizeof(*buffer));
-               ASSERT_NE(buffer, NULL);
-
-               buffer->fd = -1;
-               buffer->size = size;
-               buffer->mirror = malloc(size);
-               ASSERT_NE(buffer->mirror, NULL);
-
-               buffer->ptr = mmap(NULL, size,
-                                  PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANONYMOUS,
-                                  buffer->fd, 0);
-               ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-               /* Initialize buffer in system memory. */
-               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-                       ptr[i] = i + c;
-
-               /* Simulate a device reading system memory. */
-               ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
-                                     npages);
-               ASSERT_EQ(ret, 0);
-               ASSERT_EQ(buffer->cpages, npages);
-               ASSERT_EQ(buffer->faults, 1);
-
-               /* Check what the device read. */
-               for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-                       ASSERT_EQ(ptr[i], i + c);
-
-               hmm_buffer_free(buffer);
-       }
-}
-
-void *unmap_buffer(void *p)
-{
-       struct hmm_buffer *buffer = p;
-
-       /* Delay for a bit and then unmap buffer while it is being read. */
-       hmm_nanosleep(hmm_random() % 32000);
-       munmap(buffer->ptr + buffer->size / 2, buffer->size / 2);
-       buffer->ptr = NULL;
-
-       return NULL;
-}
-
-/*
- * Try reading anonymous memory while it is being unmapped.
- */
-TEST_F(hmm, anon_teardown)
-{
-       unsigned long npages;
-       unsigned long size;
-       unsigned long c;
-       void *ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       for (c = 0; c < NTIMES; ++c) {
-               pthread_t thread;
-               struct hmm_buffer *buffer;
-               unsigned long i;
-               int *ptr;
-               int rc;
-
-               buffer = malloc(sizeof(*buffer));
-               ASSERT_NE(buffer, NULL);
-
-               buffer->fd = -1;
-               buffer->size = size;
-               buffer->mirror = malloc(size);
-               ASSERT_NE(buffer->mirror, NULL);
-
-               buffer->ptr = mmap(NULL, size,
-                                  PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANONYMOUS,
-                                  buffer->fd, 0);
-               ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-               /* Initialize buffer in system memory. */
-               for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-                       ptr[i] = i + c;
-
-               rc = pthread_create(&thread, NULL, unmap_buffer, buffer);
-               ASSERT_EQ(rc, 0);
-
-               /* Simulate a device reading system memory. */
-               rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
-                                    npages);
-               if (rc == 0) {
-                       ASSERT_EQ(buffer->cpages, npages);
-                       ASSERT_EQ(buffer->faults, 1);
-
-                       /* Check what the device read. */
-                       for (i = 0, ptr = buffer->mirror;
-                            i < size / sizeof(*ptr);
-                            ++i)
-                               ASSERT_EQ(ptr[i], i + c);
-               }
-
-               pthread_join(thread, &ret);
-               hmm_buffer_free(buffer);
-       }
-}
-
-/*
- * Test memory snapshot without faulting in pages accessed by the device.
- */
-TEST_F(hmm, mixedmap)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned char *m;
-       int ret;
-
-       npages = 1;
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(npages);
-       ASSERT_NE(buffer->mirror, NULL);
-
-
-       /* Reserve a range of addresses. */
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE,
-                          self->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Simulate a device snapshotting CPU pagetables. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device saw. */
-       m = buffer->mirror;
-       ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Test memory snapshot without faulting in pages accessed by the device.
- */
-TEST_F(hmm2, snapshot)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       int *ptr;
-       unsigned char *p;
-       unsigned char *m;
-       int ret;
-       int val;
-
-       npages = 7;
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(npages);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       /* Reserve a range of addresses. */
-       buffer->ptr = mmap(NULL, size,
-                          PROT_NONE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-       p = buffer->ptr;
-
-       /* Punch a hole after the first page address. */
-       ret = munmap(buffer->ptr + self->page_size, self->page_size);
-       ASSERT_EQ(ret, 0);
-
-       /* Page 2 will be read-only zero page. */
-       ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
-                               PROT_READ);
-       ASSERT_EQ(ret, 0);
-       ptr = (int *)(buffer->ptr + 2 * self->page_size);
-       val = *ptr + 3;
-       ASSERT_EQ(val, 3);
-
-       /* Page 3 will be read-only. */
-       ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-                               PROT_READ | PROT_WRITE);
-       ASSERT_EQ(ret, 0);
-       ptr = (int *)(buffer->ptr + 3 * self->page_size);
-       *ptr = val;
-       ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
-                               PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Page 4-6 will be read-write. */
-       ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size,
-                               PROT_READ | PROT_WRITE);
-       ASSERT_EQ(ret, 0);
-       ptr = (int *)(buffer->ptr + 4 * self->page_size);
-       *ptr = val;
-
-       /* Page 5 will be migrated to device 0. */
-       buffer->ptr = p + 5 * self->page_size;
-       ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, 1);
-
-       /* Page 6 will be migrated to device 1. */
-       buffer->ptr = p + 6 * self->page_size;
-       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, 1);
-
-       /* Simulate a device snapshotting CPU pagetables. */
-       buffer->ptr = p;
-       ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device saw. */
-       m = buffer->mirror;
-       ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
-       ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
-       ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
-       ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
-       ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
-       if (!hmm_is_coherent_type(variant->device_number0)) {
-               ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
-                               HMM_DMIRROR_PROT_WRITE);
-               ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
-       } else {
-               ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL |
-                               HMM_DMIRROR_PROT_WRITE);
-               ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE |
-                               HMM_DMIRROR_PROT_WRITE);
-       }
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
- * should be mapped by a large page table entry.
- */
-TEST_F(hmm, compound)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long default_hsize;
-       int *ptr;
-       unsigned char *m;
-       int ret;
-       unsigned long i;
-
-       /* Skip test if we can't allocate a hugetlbfs page. */
-
-       default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:");
-       if (default_hsize < 0 || default_hsize*1024 < default_hsize)
-               SKIP(return, "Huge page size could not be determined");
-       default_hsize = default_hsize*1024; /* KB to B */
-
-       size = ALIGN(TWOMEG, default_hsize);
-       npages = size >> self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                                  PROT_READ | PROT_WRITE,
-                                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-                                  -1, 0);
-       if (buffer->ptr == MAP_FAILED) {
-               free(buffer);
-               return;
-       }
-
-       buffer->size = size;
-       buffer->mirror = malloc(npages);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       /* Initialize the pages the device will snapshot in buffer->ptr. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Simulate a device snapshotting CPU pagetables. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device saw. */
-       m = buffer->mirror;
-       for (i = 0; i < npages; ++i)
-               ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
-                               HMM_DMIRROR_PROT_PMD);
-
-       /* Make the region read-only. */
-       ret = mprotect(buffer->ptr, size, PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Simulate a device snapshotting CPU pagetables. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device saw. */
-       m = buffer->mirror;
-       for (i = 0; i < npages; ++i)
-               ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
-                               HMM_DMIRROR_PROT_PMD);
-
-       munmap(buffer->ptr, buffer->size);
-       buffer->ptr = NULL;
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Test two devices reading the same memory (double mapped).
- */
-TEST_F(hmm2, double_map)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = 6;
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(npages);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       /* Reserve a range of addresses. */
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Make region read-only. */
-       ret = mprotect(buffer->ptr, size, PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Simulate device 0 reading system memory. */
-       ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Simulate device 1 reading system memory. */
-       ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Migrate pages to device 1 and try to read from device 0. */
-       ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       ASSERT_EQ(buffer->faults, 1);
-
-       /* Check what device 0 read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Basic check of exclusive faulting.
- */
-TEST_F(hmm, exclusive)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Map memory exclusively for device access. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       /* Fault pages back to system memory and check them. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i]++, i);
-
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i+1);
-
-       /* Check atomic access revoked */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-
-       hmm_buffer_free(buffer);
-}
-
-TEST_F(hmm, exclusive_mprotect)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Map memory exclusively for device access. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       ret = mprotect(buffer->ptr, size, PROT_READ);
-       ASSERT_EQ(ret, 0);
-
-       /* Simulate a device writing system memory. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
-       ASSERT_EQ(ret, -EPERM);
-
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Check copy-on-write works.
- */
-TEST_F(hmm, exclusive_cow)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-
-       npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
-       ASSERT_NE(npages, 0);
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Map memory exclusively for device access. */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       fork();
-
-       /* Fault pages back to system memory and check them. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i]++, i);
-
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i+1);
-
-       hmm_buffer_free(buffer);
-}
-
-static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
-                        int npages, int size, int flags)
-{
-       struct gup_test gup = {
-               .nr_pages_per_call      = npages,
-               .addr                   = addr,
-               .gup_flags              = FOLL_WRITE | flags,
-               .size                   = size,
-       };
-
-       if (ioctl(gup_fd, cmd, &gup)) {
-               perror("ioctl on error\n");
-               return errno;
-       }
-
-       return 0;
-}
-
-/*
- * Test get user device pages through gup_test. Setting PIN_LONGTERM flag.
- * This should trigger a migration back to system memory for both, private
- * and coherent type pages.
- * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added
- * to your configuration before you run it.
- */
-TEST_F(hmm, hmm_gup_test)
-{
-       struct hmm_buffer *buffer;
-       int gup_fd;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-       unsigned char *m;
-
-       gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
-       if (gup_fd == -1)
-               SKIP(return, "Skipping test, could not find gup_test driver");
-
-       npages = 4;
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Migrate memory to device. */
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       /* Check what the device read. */
-       for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       ASSERT_EQ(gup_test_exec(gup_fd,
-                               (unsigned long)buffer->ptr,
-                               GUP_BASIC_TEST, 1, self->page_size, 0), 0);
-       ASSERT_EQ(gup_test_exec(gup_fd,
-                               (unsigned long)buffer->ptr + 1 * self->page_size,
-                               GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
-       ASSERT_EQ(gup_test_exec(gup_fd,
-                               (unsigned long)buffer->ptr + 2 * self->page_size,
-                               PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
-       ASSERT_EQ(gup_test_exec(gup_fd,
-                               (unsigned long)buffer->ptr + 3 * self->page_size,
-                               PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
-
-       /* Take snapshot to CPU pagetables */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       m = buffer->mirror;
-       if (hmm_is_coherent_type(variant->device_number)) {
-               ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
-               ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
-       } else {
-               ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
-               ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
-       }
-       ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
-       ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
-       /*
-        * Check again the content on the pages. Make sure there's no
-        * corrupted data.
-        */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ASSERT_EQ(ptr[i], i);
-
-       close(gup_fd);
-       hmm_buffer_free(buffer);
-}
-
-/*
- * Test copy-on-write in device pages.
- * In case of writing to COW private page(s), a page fault will migrate pages
- * back to system memory first. Then, these pages will be duplicated. In case
- * of COW device coherent type, pages are duplicated directly from device
- * memory.
- */
-TEST_F(hmm, hmm_cow_in_device)
-{
-       struct hmm_buffer *buffer;
-       unsigned long npages;
-       unsigned long size;
-       unsigned long i;
-       int *ptr;
-       int ret;
-       unsigned char *m;
-       pid_t pid;
-       int status;
-
-       npages = 4;
-       size = npages << self->page_shift;
-
-       buffer = malloc(sizeof(*buffer));
-       ASSERT_NE(buffer, NULL);
-
-       buffer->fd = -1;
-       buffer->size = size;
-       buffer->mirror = malloc(size);
-       ASSERT_NE(buffer->mirror, NULL);
-
-       buffer->ptr = mmap(NULL, size,
-                          PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS,
-                          buffer->fd, 0);
-       ASSERT_NE(buffer->ptr, MAP_FAILED);
-
-       /* Initialize buffer in system memory. */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Migrate memory to device. */
-
-       ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-
-       pid = fork();
-       if (pid == -1)
-               ASSERT_EQ(pid, 0);
-       if (!pid) {
-               /* Child process waitd for SIGTERM from the parent. */
-               while (1) {
-               }
-               perror("Should not reach this\n");
-               exit(0);
-       }
-       /* Parent process writes to COW pages(s) and gets a
-        * new copy in system. In case of device private pages,
-        * this write causes a migration to system mem first.
-        */
-       for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
-               ptr[i] = i;
-
-       /* Terminate child and wait */
-       EXPECT_EQ(0, kill(pid, SIGTERM));
-       EXPECT_EQ(pid, waitpid(pid, &status, 0));
-       EXPECT_NE(0, WIFSIGNALED(status));
-       EXPECT_EQ(SIGTERM, WTERMSIG(status));
-
-       /* Take snapshot to CPU pagetables */
-       ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
-       ASSERT_EQ(ret, 0);
-       ASSERT_EQ(buffer->cpages, npages);
-       m = buffer->mirror;
-       for (i = 0; i < npages; i++)
-               ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
-
-       hmm_buffer_free(buffer);
-}
-TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c
deleted file mode 100644 (file)
index 955ef87..0000000
+++ /dev/null
@@ -1,91 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-mmap:
- *
- * Example of using huge page memory in a user application using the mmap
- * system call.  Before running this application, make sure that the
- * administrator has mounted the hugetlbfs filesystem (on some directory
- * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
- * example, the app is requesting memory of size 256MB that is backed by
- * huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_SHARED | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
-#define FLAGS (MAP_SHARED)
-#endif
-
-static void check_bytes(char *addr)
-{
-       printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr)
-{
-       unsigned long i;
-
-       for (i = 0; i < LENGTH; i++)
-               *(addr + i) = (char)i;
-}
-
-static int read_bytes(char *addr)
-{
-       unsigned long i;
-
-       check_bytes(addr);
-       for (i = 0; i < LENGTH; i++)
-               if (*(addr + i) != (char)i) {
-                       printf("Mismatch at %lu\n", i);
-                       return 1;
-               }
-       return 0;
-}
-
-int main(void)
-{
-       void *addr;
-       int fd, ret;
-
-       fd = memfd_create("hugepage-mmap", MFD_HUGETLB);
-       if (fd < 0) {
-               perror("memfd_create() failed");
-               exit(1);
-       }
-
-       addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               close(fd);
-               exit(1);
-       }
-
-       printf("Returned address is %p\n", addr);
-       check_bytes(addr);
-       write_bytes(addr);
-       ret = read_bytes(addr);
-
-       munmap(addr, LENGTH);
-       close(fd);
-
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c
deleted file mode 100644 (file)
index e53b5ea..0000000
+++ /dev/null
@@ -1,188 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-mremap:
- *
- * Example of remapping huge page memory in a user application using the
- * mremap system call.  The path to a file in a hugetlbfs filesystem must
- * be passed as the last argument to this test.  The amount of memory used
- * by this test in MBs can optionally be passed as an argument.  If no memory
- * amount is passed, the default amount is 10MB.
- *
- * To make sure the test triggers pmd sharing and goes through the 'unshare'
- * path in the mremap code use 1GB (1024) or more.
- */
-
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <errno.h>
-#include <fcntl.h> /* Definition of O_* constants */
-#include <sys/syscall.h> /* Definition of SYS_* constants */
-#include <linux/userfaultfd.h>
-#include <sys/ioctl.h>
-#include <string.h>
-
-#define DEFAULT_LENGTH_MB 10UL
-#define MB_TO_BYTES(x) (x * 1024 * 1024)
-
-#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
-#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
-
-static void check_bytes(char *addr)
-{
-       printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-static void write_bytes(char *addr, size_t len)
-{
-       unsigned long i;
-
-       for (i = 0; i < len; i++)
-               *(addr + i) = (char)i;
-}
-
-static int read_bytes(char *addr, size_t len)
-{
-       unsigned long i;
-
-       check_bytes(addr);
-       for (i = 0; i < len; i++)
-               if (*(addr + i) != (char)i) {
-                       printf("Mismatch at %lu\n", i);
-                       return 1;
-               }
-       return 0;
-}
-
-static void register_region_with_uffd(char *addr, size_t len)
-{
-       long uffd; /* userfaultfd file descriptor */
-       struct uffdio_api uffdio_api;
-       struct uffdio_register uffdio_register;
-
-       /* Create and enable userfaultfd object. */
-
-       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-       if (uffd == -1) {
-               perror("userfaultfd");
-               exit(1);
-       }
-
-       uffdio_api.api = UFFD_API;
-       uffdio_api.features = 0;
-       if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
-               perror("ioctl-UFFDIO_API");
-               exit(1);
-       }
-
-       /* Create a private anonymous mapping. The memory will be
-        * demand-zero paged--that is, not yet allocated. When we
-        * actually touch the memory, it will be allocated via
-        * the userfaultfd.
-        */
-
-       addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
-                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       printf("Address returned by mmap() = %p\n", addr);
-
-       /* Register the memory range of the mapping we just created for
-        * handling by the userfaultfd object. In mode, we request to track
-        * missing pages (i.e., pages that have not yet been faulted in).
-        */
-
-       uffdio_register.range.start = (unsigned long)addr;
-       uffdio_register.range.len = len;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
-               perror("ioctl-UFFDIO_REGISTER");
-               exit(1);
-       }
-}
-
-int main(int argc, char *argv[])
-{
-       size_t length = 0;
-       int ret = 0, fd;
-
-       if (argc >= 2 && !strcmp(argv[1], "-h")) {
-               printf("Usage: %s [length_in_MB]\n", argv[0]);
-               exit(1);
-       }
-
-       /* Read memory length as the first arg if valid, otherwise fallback to
-        * the default length.
-        */
-       if (argc >= 2)
-               length = (size_t)atoi(argv[1]);
-       else
-               length = DEFAULT_LENGTH_MB;
-
-       length = MB_TO_BYTES(length);
-       fd = memfd_create(argv[0], MFD_HUGETLB);
-       if (fd < 0) {
-               perror("Open failed");
-               exit(1);
-       }
-
-       /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
-       unsigned long suggested_addr = 0x7eaa40000000;
-       void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
-                          MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
-       printf("Map haddr: Returned address is %p\n", haddr);
-       if (haddr == MAP_FAILED) {
-               perror("mmap1");
-               exit(1);
-       }
-
-       /* mmap again to a dummy address to hopefully trigger pmd sharing. */
-       suggested_addr = 0x7daa40000000;
-       void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
-                          MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
-       printf("Map daddr: Returned address is %p\n", daddr);
-       if (daddr == MAP_FAILED) {
-               perror("mmap3");
-               exit(1);
-       }
-
-       suggested_addr = 0x7faa40000000;
-       void *vaddr =
-               mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
-       printf("Map vaddr: Returned address is %p\n", vaddr);
-       if (vaddr == MAP_FAILED) {
-               perror("mmap2");
-               exit(1);
-       }
-
-       register_region_with_uffd(haddr, length);
-
-       void *addr = mremap(haddr, length, length,
-                           MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
-       if (addr == MAP_FAILED) {
-               perror("mremap");
-               exit(1);
-       }
-
-       printf("Mremap: Returned address is %p\n", addr);
-       check_bytes(addr);
-       write_bytes(addr, length);
-       ret = read_bytes(addr, length);
-
-       munmap(addr, length);
-
-       addr = mremap(addr, length, length, 0);
-       if (addr != MAP_FAILED) {
-               printf("mremap: Expected failure, but call succeeded\n");
-               exit(1);
-       }
-
-       close(fd);
-
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c
deleted file mode 100644 (file)
index e2527f3..0000000
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-shm:
- *
- * Example of using huge page memory in a user application using Sys V shared
- * memory system calls.  In this example the app is requesting 256MB of
- * memory that is backed by huge pages.  The application uses the flag
- * SHM_HUGETLB in the shmget system call to inform the kernel that it is
- * requesting huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- *
- * Note: The default shared memory limit is quite low on many kernels,
- * you may need to increase it via:
- *
- * echo 268435456 > /proc/sys/kernel/shmmax
- *
- * This will increase the maximum size per shared memory segment to 256MB.
- * The other limit that you will hit eventually is shmall which is the
- * total amount of shared memory in pages. To set it to 16GB on a system
- * with a 4kB pagesize do:
- *
- * echo 4194304 > /proc/sys/kernel/shmall
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/mman.h>
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-#define LENGTH (256UL*1024*1024)
-
-#define dprintf(x)  printf(x)
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define SHMAT_FLAGS (SHM_RND)
-#else
-#define ADDR (void *)(0x0UL)
-#define SHMAT_FLAGS (0)
-#endif
-
-int main(void)
-{
-       int shmid;
-       unsigned long i;
-       char *shmaddr;
-
-       shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-       if (shmid < 0) {
-               perror("shmget");
-               exit(1);
-       }
-       printf("shmid: 0x%x\n", shmid);
-
-       shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
-       if (shmaddr == (char *)-1) {
-               perror("Shared memory attach failure");
-               shmctl(shmid, IPC_RMID, NULL);
-               exit(2);
-       }
-       printf("shmaddr: %p\n", shmaddr);
-
-       dprintf("Starting the writes:\n");
-       for (i = 0; i < LENGTH; i++) {
-               shmaddr[i] = (char)(i);
-               if (!(i % (1024 * 1024)))
-                       dprintf(".");
-       }
-       dprintf("\n");
-
-       dprintf("Starting the Check...");
-       for (i = 0; i < LENGTH; i++)
-               if (shmaddr[i] != (char)i) {
-                       printf("\nIndex %lu mismatched\n", i);
-                       exit(3);
-               }
-       dprintf("Done.\n");
-
-       if (shmdt((const void *)shmaddr) != 0) {
-               perror("Detach failure");
-               shmctl(shmid, IPC_RMID, NULL);
-               exit(4);
-       }
-
-       shmctl(shmid, IPC_RMID, NULL);
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c
deleted file mode 100644 (file)
index 557bdbd..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * A test case of using hugepage memory in a user application using the
- * mmap system call with MAP_HUGETLB flag.  Before running this program
- * make sure the administrator has allocated enough default sized huge
- * pages to cover the 2 MB allocation.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define MAP_LENGTH             (2UL * 1024 * 1024)
-
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB            0x40000 /* arch specific */
-#endif
-
-#define PAGE_SIZE              4096
-
-#define PAGE_COMPOUND_HEAD     (1UL << 15)
-#define PAGE_COMPOUND_TAIL     (1UL << 16)
-#define PAGE_HUGE              (1UL << 17)
-
-#define HEAD_PAGE_FLAGS                (PAGE_COMPOUND_HEAD | PAGE_HUGE)
-#define TAIL_PAGE_FLAGS                (PAGE_COMPOUND_TAIL | PAGE_HUGE)
-
-#define PM_PFRAME_BITS         55
-#define PM_PFRAME_MASK         ~((1UL << PM_PFRAME_BITS) - 1)
-
-/*
- * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
- * That means the addresses starting with 0x800000... will need to be
- * specified.  Specifying a fixed address is not required on ppc64, i386
- * or x86_64.
- */
-#ifdef __ia64__
-#define MAP_ADDR               (void *)(0x8000000000000000UL)
-#define MAP_FLAGS              (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
-#else
-#define MAP_ADDR               NULL
-#define MAP_FLAGS              (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
-#endif
-
-static void write_bytes(char *addr, size_t length)
-{
-       unsigned long i;
-
-       for (i = 0; i < length; i++)
-               *(addr + i) = (char)i;
-}
-
-static unsigned long virt_to_pfn(void *addr)
-{
-       int fd;
-       unsigned long pagemap;
-
-       fd = open("/proc/self/pagemap", O_RDONLY);
-       if (fd < 0)
-               return -1UL;
-
-       lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
-       read(fd, &pagemap, sizeof(pagemap));
-       close(fd);
-
-       return pagemap & ~PM_PFRAME_MASK;
-}
-
-static int check_page_flags(unsigned long pfn)
-{
-       int fd, i;
-       unsigned long pageflags;
-
-       fd = open("/proc/kpageflags", O_RDONLY);
-       if (fd < 0)
-               return -1;
-
-       lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
-
-       read(fd, &pageflags, sizeof(pageflags));
-       if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
-               close(fd);
-               printf("Head page flags (%lx) is invalid\n", pageflags);
-               return -1;
-       }
-
-       /*
-        * pages other than the first page must be tail and shouldn't be head;
-        * this also verifies kernel has correctly set the fake page_head to tail
-        * while hugetlb_free_vmemmap is enabled.
-        */
-       for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
-               read(fd, &pageflags, sizeof(pageflags));
-               if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
-                   (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
-                       close(fd);
-                       printf("Tail page flags (%lx) is invalid\n", pageflags);
-                       return -1;
-               }
-       }
-
-       close(fd);
-
-       return 0;
-}
-
-int main(int argc, char **argv)
-{
-       void *addr;
-       unsigned long pfn;
-
-       addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       /* Trigger allocation of HugeTLB page. */
-       write_bytes(addr, MAP_LENGTH);
-
-       pfn = virt_to_pfn(addr);
-       if (pfn == -1UL) {
-               munmap(addr, MAP_LENGTH);
-               perror("virt_to_pfn");
-               exit(1);
-       }
-
-       printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
-
-       if (check_page_flags(pfn) < 0) {
-               munmap(addr, MAP_LENGTH);
-               perror("check_page_flags");
-               exit(1);
-       }
-
-       /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
-       if (munmap(addr, MAP_LENGTH)) {
-               perror("munmap");
-               exit(1);
-       }
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c
deleted file mode 100644 (file)
index a634f47..0000000
+++ /dev/null
@@ -1,406 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * hugepage-madvise:
- *
- * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
- * on hugetlb mappings.
- *
- * Before running this test, make sure the administrator has pre-allocated
- * at least MIN_FREE_PAGES hugetlb pages and they are free.  In addition,
- * the test takes an argument that is the path to a file in a hugetlbfs
- * filesystem.  Therefore, a hugetlbfs filesystem must be mounted on some
- * directory.
- */
-
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#define __USE_GNU
-#include <fcntl.h>
-
-#define MIN_FREE_PAGES 20
-#define NR_HUGE_PAGES  10      /* common number of pages to map/allocate */
-
-#define validate_free_pages(exp_free)                                  \
-       do {                                                            \
-               int fhp = get_free_hugepages();                         \
-               if (fhp != (exp_free)) {                                \
-                       printf("Unexpected number of free huge "        \
-                               "pages line %d\n", __LINE__);           \
-                       exit(1);                                        \
-               }                                                       \
-       } while (0)
-
-unsigned long huge_page_size;
-unsigned long base_page_size;
-
-/*
- * default_huge_page_size copied from mlock2-tests.c
- */
-unsigned long default_huge_page_size(void)
-{
-       unsigned long hps = 0;
-       char *line = NULL;
-       size_t linelen = 0;
-       FILE *f = fopen("/proc/meminfo", "r");
-
-       if (!f)
-               return 0;
-       while (getline(&line, &linelen, f) > 0) {
-               if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-                       hps <<= 10;
-                       break;
-               }
-       }
-
-       free(line);
-       fclose(f);
-       return hps;
-}
-
-unsigned long get_free_hugepages(void)
-{
-       unsigned long fhp = 0;
-       char *line = NULL;
-       size_t linelen = 0;
-       FILE *f = fopen("/proc/meminfo", "r");
-
-       if (!f)
-               return fhp;
-       while (getline(&line, &linelen, f) > 0) {
-               if (sscanf(line, "HugePages_Free:      %lu", &fhp) == 1)
-                       break;
-       }
-
-       free(line);
-       fclose(f);
-       return fhp;
-}
-
-void write_fault_pages(void *addr, unsigned long nr_pages)
-{
-       unsigned long i;
-
-       for (i = 0; i < nr_pages; i++)
-               *((unsigned long *)(addr + (i * huge_page_size))) = i;
-}
-
-void read_fault_pages(void *addr, unsigned long nr_pages)
-{
-       unsigned long dummy = 0;
-       unsigned long i;
-
-       for (i = 0; i < nr_pages; i++)
-               dummy += *((unsigned long *)(addr + (i * huge_page_size)));
-}
-
-int main(int argc, char **argv)
-{
-       unsigned long free_hugepages;
-       void *addr, *addr2;
-       int fd;
-       int ret;
-
-       huge_page_size = default_huge_page_size();
-       if (!huge_page_size) {
-               printf("Unable to determine huge page size, exiting!\n");
-               exit(1);
-       }
-       base_page_size = sysconf(_SC_PAGE_SIZE);
-       if (!huge_page_size) {
-               printf("Unable to determine base page size, exiting!\n");
-               exit(1);
-       }
-
-       free_hugepages = get_free_hugepages();
-       if (free_hugepages < MIN_FREE_PAGES) {
-               printf("Not enough free huge pages to test, exiting!\n");
-               exit(1);
-       }
-
-       fd = memfd_create(argv[0], MFD_HUGETLB);
-       if (fd < 0) {
-               perror("memfd_create() failed");
-               exit(1);
-       }
-
-       /*
-        * Test validity of MADV_DONTNEED addr and length arguments.  mmap
-        * size is NR_HUGE_PAGES + 2.  One page at the beginning and end of
-        * the mapping will be unmapped so we KNOW there is nothing mapped
-        * there.
-        */
-       addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-                       -1, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-       if (munmap(addr, huge_page_size) ||
-                       munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size,
-                               huge_page_size)) {
-               perror("munmap");
-               exit(1);
-       }
-       addr = addr + huge_page_size;
-
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* addr before mapping should fail */
-       ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size,
-               MADV_DONTNEED);
-       if (!ret) {
-               printf("Unexpected success of madvise call with invalid addr line %d\n",
-                               __LINE__);
-                       exit(1);
-       }
-
-       /* addr + length after mapping should fail */
-       ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size,
-               MADV_DONTNEED);
-       if (!ret) {
-               printf("Unexpected success of madvise call with invalid length line %d\n",
-                               __LINE__);
-                       exit(1);
-       }
-
-       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-       /*
-        * Test alignment of MADV_DONTNEED addr and length arguments
-        */
-       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-                       -1, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* addr is not huge page size aligned and should fail */
-       ret = madvise(addr + base_page_size,
-                       NR_HUGE_PAGES * huge_page_size - base_page_size,
-                       MADV_DONTNEED);
-       if (!ret) {
-               printf("Unexpected success of madvise call with unaligned start address %d\n",
-                               __LINE__);
-                       exit(1);
-       }
-
-       /* addr + length should be aligned down to huge page size */
-       if (madvise(addr,
-                       ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size,
-                       MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-
-       /* should free all but last page in mapping */
-       validate_free_pages(free_hugepages - 1);
-
-       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-       validate_free_pages(free_hugepages);
-
-       /*
-        * Test MADV_DONTNEED on anonymous private mapping
-        */
-       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
-                       -1, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-
-       /* should free all pages in mapping */
-       validate_free_pages(free_hugepages);
-
-       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-       /*
-        * Test MADV_DONTNEED on private mapping of hugetlb file
-        */
-       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-               perror("fallocate");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE, fd, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       /* read should not consume any pages */
-       read_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* madvise should not free any pages */
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* writes should allocate private pages */
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-       /* madvise should free private pages */
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* writes should allocate private pages */
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-       /*
-        * The fallocate below certainly should free the pages associated
-        * with the file.  However, pages in the private mapping are also
-        * freed.  This is not the 'correct' behavior, but is expected
-        * because this is how it has worked since the initial hugetlb
-        * implementation.
-        */
-       if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-                                       0, NR_HUGE_PAGES * huge_page_size)) {
-               perror("fallocate");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages);
-
-       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-       /*
-        * Test MADV_DONTNEED on shared mapping of hugetlb file
-        */
-       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-               perror("fallocate");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_SHARED, fd, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       /* write should not consume any pages */
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* madvise should not free any pages */
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /*
-        * Test MADV_REMOVE on shared mapping of hugetlb file
-        *
-        * madvise is same as hole punch and should free all pages.
-        */
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages);
-       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-
-       /*
-        * Test MADV_REMOVE on shared and private mapping of hugetlb file
-        */
-       if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) {
-               perror("fallocate");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_SHARED, fd, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       /* shared write should not consume any additional pages */
-       write_fault_pages(addr, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size,
-                       PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE, fd, 0);
-       if (addr2 == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       /* private read should not consume any pages */
-       read_fault_pages(addr2, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* private write should consume additional pages */
-       write_fault_pages(addr2, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-       /* madvise of shared mapping should not free any pages */
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-       /* madvise of private mapping should free private pages */
-       if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages - NR_HUGE_PAGES);
-
-       /* private write should consume additional pages again */
-       write_fault_pages(addr2, NR_HUGE_PAGES);
-       validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES));
-
-       /*
-        * madvise should free both file and private pages although this is
-        * not correct.  private pages should not be freed, but this is
-        * expected.  See comment associated with FALLOC_FL_PUNCH_HOLE call.
-        */
-       if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) {
-               perror("madvise");
-               exit(1);
-       }
-       validate_free_pages(free_hugepages);
-
-       (void)munmap(addr, NR_HUGE_PAGES * huge_page_size);
-       (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size);
-
-       close(fd);
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
deleted file mode 100644 (file)
index bf2d2a6..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-set -e
-
-if [[ $(id -u) -ne 0 ]]; then
-  echo "This test must be run as root. Skipping..."
-  exit $ksft_skip
-fi
-
-usage_file=usage_in_bytes
-
-if [[ "$1" == "-cgroup-v2" ]]; then
-  cgroup2=1
-  usage_file=current
-fi
-
-
-if [[ $cgroup2 ]]; then
-  CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}')
-  if [[ -z "$CGROUP_ROOT" ]]; then
-    CGROUP_ROOT=/dev/cgroup/memory
-    mount -t cgroup2 none $CGROUP_ROOT
-    do_umount=1
-  fi
-  echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control
-else
-  CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}')
-  if [[ -z "$CGROUP_ROOT" ]]; then
-    CGROUP_ROOT=/dev/cgroup/memory
-    mount -t cgroup memory,hugetlb $CGROUP_ROOT
-    do_umount=1
-  fi
-fi
-MNT='/mnt/huge/'
-
-function get_machine_hugepage_size() {
-  hpz=$(grep -i hugepagesize /proc/meminfo)
-  kb=${hpz:14:-3}
-  mb=$(($kb / 1024))
-  echo $mb
-}
-
-MB=$(get_machine_hugepage_size)
-
-function cleanup() {
-  echo cleanup
-  set +e
-  rm -rf "$MNT"/* 2>/dev/null
-  umount "$MNT" 2>/dev/null
-  rmdir "$MNT" 2>/dev/null
-  rmdir "$CGROUP_ROOT"/a/b 2>/dev/null
-  rmdir "$CGROUP_ROOT"/a 2>/dev/null
-  rmdir "$CGROUP_ROOT"/test1 2>/dev/null
-  echo 0 >/proc/sys/vm/nr_hugepages
-  set -e
-}
-
-function assert_state() {
-  local expected_a="$1"
-  local expected_a_hugetlb="$2"
-  local expected_b=""
-  local expected_b_hugetlb=""
-
-  if [ ! -z ${3:-} ] && [ ! -z ${4:-} ]; then
-    expected_b="$3"
-    expected_b_hugetlb="$4"
-  fi
-  local tolerance=$((5 * 1024 * 1024))
-
-  local actual_a
-  actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)"
-  if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] ||
-    [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then
-    echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB
-    echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  local actual_a_hugetlb
-  actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] ||
-    [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then
-    echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB
-    echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
-    return
-  fi
-
-  local actual_b
-  actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)"
-  if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] ||
-    [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then
-    echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB
-    echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-
-  local actual_b_hugetlb
-  actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)"
-  if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] ||
-    [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then
-    echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB
-    echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB
-    echo fail
-
-    cleanup
-    exit 1
-  fi
-}
-
-function setup() {
-  echo 100 >/proc/sys/vm/nr_hugepages
-  mkdir "$CGROUP_ROOT"/a
-  sleep 1
-  if [[ $cgroup2 ]]; then
-    echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control
-  else
-    echo 0 >$CGROUP_ROOT/a/cpuset.mems
-    echo 0 >$CGROUP_ROOT/a/cpuset.cpus
-  fi
-
-  mkdir "$CGROUP_ROOT"/a/b
-
-  if [[ ! $cgroup2 ]]; then
-    echo 0 >$CGROUP_ROOT/a/b/cpuset.mems
-    echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus
-  fi
-
-  mkdir -p "$MNT"
-  mount -t hugetlbfs none "$MNT"
-}
-
-write_hugetlbfs() {
-  local cgroup="$1"
-  local path="$2"
-  local size="$3"
-
-  if [[ $cgroup2 ]]; then
-    echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs
-  else
-    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
-    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
-    echo $$ >"$CGROUP_ROOT/$cgroup/tasks"
-  fi
-  ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
-  if [[ $cgroup2 ]]; then
-    echo $$ >$CGROUP_ROOT/cgroup.procs
-  else
-    echo $$ >"$CGROUP_ROOT/tasks"
-  fi
-  echo
-}
-
-set -e
-
-size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
-
-cleanup
-
-echo
-echo
-echo Test charge, rmdir, uncharge
-setup
-echo mkdir
-mkdir $CGROUP_ROOT/test1
-
-echo write
-write_hugetlbfs test1 "$MNT"/test $size
-
-echo rmdir
-rmdir $CGROUP_ROOT/test1
-mkdir $CGROUP_ROOT/test1
-
-echo uncharge
-rm -rf /mnt/huge/*
-
-cleanup
-
-echo done
-echo
-echo
-if [[ ! $cgroup2 ]]; then
-  echo "Test parent and child hugetlb usage"
-  setup
-
-  echo write
-  write_hugetlbfs a "$MNT"/test $size
-
-  echo Assert memory charged correctly for parent use.
-  assert_state 0 $size 0 0
-
-  write_hugetlbfs a/b "$MNT"/test2 $size
-
-  echo Assert memory charged correctly for child use.
-  assert_state 0 $(($size * 2)) 0 $size
-
-  rmdir "$CGROUP_ROOT"/a/b
-  sleep 5
-  echo Assert memory reparent correctly.
-  assert_state 0 $(($size * 2))
-
-  rm -rf "$MNT"/*
-  umount "$MNT"
-  echo Assert memory uncharged correctly.
-  assert_state 0 0
-
-  cleanup
-fi
-
-echo
-echo
-echo "Test child only hugetlb usage"
-echo setup
-setup
-
-echo write
-write_hugetlbfs a/b "$MNT"/test2 $size
-
-echo Assert memory charged correctly for child only use.
-assert_state 0 $(($size)) 0 $size
-
-rmdir "$CGROUP_ROOT"/a/b
-echo Assert memory reparent correctly.
-assert_state 0 $size
-
-rm -rf "$MNT"/*
-umount "$MNT"
-echo Assert memory uncharged correctly.
-assert_state 0 0
-
-cleanup
-
-echo ALL PASS
-
-umount $CGROUP_ROOT
-rm -rf $CGROUP_ROOT
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
deleted file mode 100644 (file)
index 64126c8..0000000
+++ /dev/null
@@ -1,1558 +0,0 @@
-#define _GNU_SOURCE
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <dirent.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <sys/mman.h>
-#include <sys/wait.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-
-#include "linux/magic.h"
-
-#include "vm_util.h"
-
-#ifndef MADV_PAGEOUT
-#define MADV_PAGEOUT 21
-#endif
-#ifndef MADV_POPULATE_READ
-#define MADV_POPULATE_READ 22
-#endif
-#ifndef MADV_COLLAPSE
-#define MADV_COLLAPSE 25
-#endif
-
-#define BASE_ADDR ((void *)(1UL << 30))
-static unsigned long hpage_pmd_size;
-static unsigned long page_size;
-static int hpage_pmd_nr;
-
-#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
-#define PID_SMAPS "/proc/self/smaps"
-#define TEST_FILE "collapse_test_file"
-
-#define MAX_LINE_LENGTH 500
-
-enum vma_type {
-       VMA_ANON,
-       VMA_FILE,
-       VMA_SHMEM,
-};
-
-struct mem_ops {
-       void *(*setup_area)(int nr_hpages);
-       void (*cleanup_area)(void *p, unsigned long size);
-       void (*fault)(void *p, unsigned long start, unsigned long end);
-       bool (*check_huge)(void *addr, int nr_hpages);
-       const char *name;
-};
-
-static struct mem_ops *file_ops;
-static struct mem_ops *anon_ops;
-static struct mem_ops *shmem_ops;
-
-struct collapse_context {
-       void (*collapse)(const char *msg, char *p, int nr_hpages,
-                        struct mem_ops *ops, bool expect);
-       bool enforce_pte_scan_limits;
-       const char *name;
-};
-
-static struct collapse_context *khugepaged_context;
-static struct collapse_context *madvise_context;
-
-struct file_info {
-       const char *dir;
-       char path[PATH_MAX];
-       enum vma_type type;
-       int fd;
-       char dev_queue_read_ahead_path[PATH_MAX];
-};
-
-static struct file_info finfo;
-
-enum thp_enabled {
-       THP_ALWAYS,
-       THP_MADVISE,
-       THP_NEVER,
-};
-
-static const char *thp_enabled_strings[] = {
-       "always",
-       "madvise",
-       "never",
-       NULL
-};
-
-enum thp_defrag {
-       THP_DEFRAG_ALWAYS,
-       THP_DEFRAG_DEFER,
-       THP_DEFRAG_DEFER_MADVISE,
-       THP_DEFRAG_MADVISE,
-       THP_DEFRAG_NEVER,
-};
-
-static const char *thp_defrag_strings[] = {
-       "always",
-       "defer",
-       "defer+madvise",
-       "madvise",
-       "never",
-       NULL
-};
-
-enum shmem_enabled {
-       SHMEM_ALWAYS,
-       SHMEM_WITHIN_SIZE,
-       SHMEM_ADVISE,
-       SHMEM_NEVER,
-       SHMEM_DENY,
-       SHMEM_FORCE,
-};
-
-static const char *shmem_enabled_strings[] = {
-       "always",
-       "within_size",
-       "advise",
-       "never",
-       "deny",
-       "force",
-       NULL
-};
-
-struct khugepaged_settings {
-       bool defrag;
-       unsigned int alloc_sleep_millisecs;
-       unsigned int scan_sleep_millisecs;
-       unsigned int max_ptes_none;
-       unsigned int max_ptes_swap;
-       unsigned int max_ptes_shared;
-       unsigned long pages_to_scan;
-};
-
-struct settings {
-       enum thp_enabled thp_enabled;
-       enum thp_defrag thp_defrag;
-       enum shmem_enabled shmem_enabled;
-       bool use_zero_page;
-       struct khugepaged_settings khugepaged;
-       unsigned long read_ahead_kb;
-};
-
-static struct settings saved_settings;
-static bool skip_settings_restore;
-
-static int exit_status;
-
-static void success(const char *msg)
-{
-       printf(" \e[32m%s\e[0m\n", msg);
-}
-
-static void fail(const char *msg)
-{
-       printf(" \e[31m%s\e[0m\n", msg);
-       exit_status++;
-}
-
-static void skip(const char *msg)
-{
-       printf(" \e[33m%s\e[0m\n", msg);
-}
-
-static int read_file(const char *path, char *buf, size_t buflen)
-{
-       int fd;
-       ssize_t numread;
-
-       fd = open(path, O_RDONLY);
-       if (fd == -1)
-               return 0;
-
-       numread = read(fd, buf, buflen - 1);
-       if (numread < 1) {
-               close(fd);
-               return 0;
-       }
-
-       buf[numread] = '\0';
-       close(fd);
-
-       return (unsigned int) numread;
-}
-
-static int write_file(const char *path, const char *buf, size_t buflen)
-{
-       int fd;
-       ssize_t numwritten;
-
-       fd = open(path, O_WRONLY);
-       if (fd == -1) {
-               printf("open(%s)\n", path);
-               exit(EXIT_FAILURE);
-               return 0;
-       }
-
-       numwritten = write(fd, buf, buflen - 1);
-       close(fd);
-       if (numwritten < 1) {
-               printf("write(%s)\n", buf);
-               exit(EXIT_FAILURE);
-               return 0;
-       }
-
-       return (unsigned int) numwritten;
-}
-
-static int read_string(const char *name, const char *strings[])
-{
-       char path[PATH_MAX];
-       char buf[256];
-       char *c;
-       int ret;
-
-       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-       if (ret >= PATH_MAX) {
-               printf("%s: Pathname is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-
-       if (!read_file(path, buf, sizeof(buf))) {
-               perror(path);
-               exit(EXIT_FAILURE);
-       }
-
-       c = strchr(buf, '[');
-       if (!c) {
-               printf("%s: Parse failure\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-
-       c++;
-       memmove(buf, c, sizeof(buf) - (c - buf));
-
-       c = strchr(buf, ']');
-       if (!c) {
-               printf("%s: Parse failure\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-       *c = '\0';
-
-       ret = 0;
-       while (strings[ret]) {
-               if (!strcmp(strings[ret], buf))
-                       return ret;
-               ret++;
-       }
-
-       printf("Failed to parse %s\n", name);
-       exit(EXIT_FAILURE);
-}
-
-static void write_string(const char *name, const char *val)
-{
-       char path[PATH_MAX];
-       int ret;
-
-       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-       if (ret >= PATH_MAX) {
-               printf("%s: Pathname is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-
-       if (!write_file(path, val, strlen(val) + 1)) {
-               perror(path);
-               exit(EXIT_FAILURE);
-       }
-}
-
-static const unsigned long _read_num(const char *path)
-{
-       char buf[21];
-
-       if (read_file(path, buf, sizeof(buf)) < 0) {
-               perror("read_file(read_num)");
-               exit(EXIT_FAILURE);
-       }
-
-       return strtoul(buf, NULL, 10);
-}
-
-static const unsigned long read_num(const char *name)
-{
-       char path[PATH_MAX];
-       int ret;
-
-       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-       if (ret >= PATH_MAX) {
-               printf("%s: Pathname is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-       return _read_num(path);
-}
-
-static void _write_num(const char *path, unsigned long num)
-{
-       char buf[21];
-
-       sprintf(buf, "%ld", num);
-       if (!write_file(path, buf, strlen(buf) + 1)) {
-               perror(path);
-               exit(EXIT_FAILURE);
-       }
-}
-
-static void write_num(const char *name, unsigned long num)
-{
-       char path[PATH_MAX];
-       int ret;
-
-       ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
-       if (ret >= PATH_MAX) {
-               printf("%s: Pathname is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-       _write_num(path, num);
-}
-
-static void write_settings(struct settings *settings)
-{
-       struct khugepaged_settings *khugepaged = &settings->khugepaged;
-
-       write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
-       write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
-       write_string("shmem_enabled",
-                       shmem_enabled_strings[settings->shmem_enabled]);
-       write_num("use_zero_page", settings->use_zero_page);
-
-       write_num("khugepaged/defrag", khugepaged->defrag);
-       write_num("khugepaged/alloc_sleep_millisecs",
-                       khugepaged->alloc_sleep_millisecs);
-       write_num("khugepaged/scan_sleep_millisecs",
-                       khugepaged->scan_sleep_millisecs);
-       write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
-       write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
-       write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
-       write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
-
-       if (file_ops && finfo.type == VMA_FILE)
-               _write_num(finfo.dev_queue_read_ahead_path,
-                          settings->read_ahead_kb);
-}
-
-#define MAX_SETTINGS_DEPTH 4
-static struct settings settings_stack[MAX_SETTINGS_DEPTH];
-static int settings_index;
-
-static struct settings *current_settings(void)
-{
-       if (!settings_index) {
-               printf("Fail: No settings set");
-               exit(EXIT_FAILURE);
-       }
-       return settings_stack + settings_index - 1;
-}
-
-static void push_settings(struct settings *settings)
-{
-       if (settings_index >= MAX_SETTINGS_DEPTH) {
-               printf("Fail: Settings stack exceeded");
-               exit(EXIT_FAILURE);
-       }
-       settings_stack[settings_index++] = *settings;
-       write_settings(current_settings());
-}
-
-static void pop_settings(void)
-{
-       if (settings_index <= 0) {
-               printf("Fail: Settings stack empty");
-               exit(EXIT_FAILURE);
-       }
-       --settings_index;
-       write_settings(current_settings());
-}
-
-static void restore_settings(int sig)
-{
-       if (skip_settings_restore)
-               goto out;
-
-       printf("Restore THP and khugepaged settings...");
-       write_settings(&saved_settings);
-       success("OK");
-       if (sig)
-               exit(EXIT_FAILURE);
-out:
-       exit(exit_status);
-}
-
-static void save_settings(void)
-{
-       printf("Save THP and khugepaged settings...");
-       saved_settings = (struct settings) {
-               .thp_enabled = read_string("enabled", thp_enabled_strings),
-               .thp_defrag = read_string("defrag", thp_defrag_strings),
-               .shmem_enabled =
-                       read_string("shmem_enabled", shmem_enabled_strings),
-               .use_zero_page = read_num("use_zero_page"),
-       };
-       saved_settings.khugepaged = (struct khugepaged_settings) {
-               .defrag = read_num("khugepaged/defrag"),
-               .alloc_sleep_millisecs =
-                       read_num("khugepaged/alloc_sleep_millisecs"),
-               .scan_sleep_millisecs =
-                       read_num("khugepaged/scan_sleep_millisecs"),
-               .max_ptes_none = read_num("khugepaged/max_ptes_none"),
-               .max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
-               .max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
-               .pages_to_scan = read_num("khugepaged/pages_to_scan"),
-       };
-       if (file_ops && finfo.type == VMA_FILE)
-               saved_settings.read_ahead_kb =
-                               _read_num(finfo.dev_queue_read_ahead_path);
-
-       success("OK");
-
-       signal(SIGTERM, restore_settings);
-       signal(SIGINT, restore_settings);
-       signal(SIGHUP, restore_settings);
-       signal(SIGQUIT, restore_settings);
-}
-
-static void get_finfo(const char *dir)
-{
-       struct stat path_stat;
-       struct statfs fs;
-       char buf[1 << 10];
-       char path[PATH_MAX];
-       char *str, *end;
-
-       finfo.dir = dir;
-       stat(finfo.dir, &path_stat);
-       if (!S_ISDIR(path_stat.st_mode)) {
-               printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
-               exit(EXIT_FAILURE);
-       }
-       if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
-                    finfo.dir) >= sizeof(finfo.path)) {
-               printf("%s: Pathname is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-       if (statfs(finfo.dir, &fs)) {
-               perror("statfs()");
-               exit(EXIT_FAILURE);
-       }
-       finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
-       if (finfo.type == VMA_SHMEM)
-               return;
-
-       /* Find owning device's queue/read_ahead_kb control */
-       if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
-                    major(path_stat.st_dev), minor(path_stat.st_dev))
-           >= sizeof(path)) {
-               printf("%s: Pathname is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-       if (read_file(path, buf, sizeof(buf)) < 0) {
-               perror("read_file(read_num)");
-               exit(EXIT_FAILURE);
-       }
-       if (strstr(buf, "DEVTYPE=disk")) {
-               /* Found it */
-               if (snprintf(finfo.dev_queue_read_ahead_path,
-                            sizeof(finfo.dev_queue_read_ahead_path),
-                            "/sys/dev/block/%d:%d/queue/read_ahead_kb",
-                            major(path_stat.st_dev), minor(path_stat.st_dev))
-                   >= sizeof(finfo.dev_queue_read_ahead_path)) {
-                       printf("%s: Pathname is too long\n", __func__);
-                       exit(EXIT_FAILURE);
-               }
-               return;
-       }
-       if (!strstr(buf, "DEVTYPE=partition")) {
-               printf("%s: Unknown device type: %s\n", __func__, path);
-               exit(EXIT_FAILURE);
-       }
-       /*
-        * Partition of block device - need to find actual device.
-        * Using naming convention that devnameN is partition of
-        * device devname.
-        */
-       str = strstr(buf, "DEVNAME=");
-       if (!str) {
-               printf("%s: Could not read: %s", __func__, path);
-               exit(EXIT_FAILURE);
-       }
-       str += 8;
-       end = str;
-       while (*end) {
-               if (isdigit(*end)) {
-                       *end = '\0';
-                       if (snprintf(finfo.dev_queue_read_ahead_path,
-                                    sizeof(finfo.dev_queue_read_ahead_path),
-                                    "/sys/block/%s/queue/read_ahead_kb",
-                                    str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
-                               printf("%s: Pathname is too long\n", __func__);
-                               exit(EXIT_FAILURE);
-                       }
-                       return;
-               }
-               ++end;
-       }
-       printf("%s: Could not read: %s\n", __func__, path);
-       exit(EXIT_FAILURE);
-}
-
-static bool check_swap(void *addr, unsigned long size)
-{
-       bool swap = false;
-       int ret;
-       FILE *fp;
-       char buffer[MAX_LINE_LENGTH];
-       char addr_pattern[MAX_LINE_LENGTH];
-
-       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
-                      (unsigned long) addr);
-       if (ret >= MAX_LINE_LENGTH) {
-               printf("%s: Pattern is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-
-
-       fp = fopen(PID_SMAPS, "r");
-       if (!fp) {
-               printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
-               exit(EXIT_FAILURE);
-       }
-       if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
-               goto err_out;
-
-       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
-                      size >> 10);
-       if (ret >= MAX_LINE_LENGTH) {
-               printf("%s: Pattern is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-       /*
-        * Fetch the Swap: in the same block and check whether it got
-        * the expected number of hugeepages next.
-        */
-       if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
-               goto err_out;
-
-       if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
-               goto err_out;
-
-       swap = true;
-err_out:
-       fclose(fp);
-       return swap;
-}
-
-static void *alloc_mapping(int nr)
-{
-       void *p;
-
-       p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
-                MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-       if (p != BASE_ADDR) {
-               printf("Failed to allocate VMA at %p\n", BASE_ADDR);
-               exit(EXIT_FAILURE);
-       }
-
-       return p;
-}
-
-static void fill_memory(int *p, unsigned long start, unsigned long end)
-{
-       int i;
-
-       for (i = start / page_size; i < end / page_size; i++)
-               p[i * page_size / sizeof(*p)] = i + 0xdead0000;
-}
-
-/*
- * MADV_COLLAPSE is a best-effort request and may fail if an internal
- * resource is temporarily unavailable, in which case it will set errno to
- * EAGAIN.  In such a case, immediately reattempt the operation one more
- * time.
- */
-static int madvise_collapse_retry(void *p, unsigned long size)
-{
-       bool retry = true;
-       int ret;
-
-retry:
-       ret = madvise(p, size, MADV_COLLAPSE);
-       if (ret && errno == EAGAIN && retry) {
-               retry = false;
-               goto retry;
-       }
-       return ret;
-}
-
-/*
- * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
- * validate_memory()'able contents.
- */
-static void *alloc_hpage(struct mem_ops *ops)
-{
-       void *p = ops->setup_area(1);
-
-       ops->fault(p, 0, hpage_pmd_size);
-
-       /*
-        * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
-        * The latter is ineligible for collapse by MADV_COLLAPSE
-        * while the former might cause MADV_COLLAPSE to race with
-        * khugepaged on low-load system (like a test machine), which
-        * would cause MADV_COLLAPSE to fail with EAGAIN.
-        */
-       printf("Allocate huge page...");
-       if (madvise_collapse_retry(p, hpage_pmd_size)) {
-               perror("madvise(MADV_COLLAPSE)");
-               exit(EXIT_FAILURE);
-       }
-       if (!ops->check_huge(p, 1)) {
-               perror("madvise(MADV_COLLAPSE)");
-               exit(EXIT_FAILURE);
-       }
-       if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
-               perror("madvise(MADV_HUGEPAGE)");
-               exit(EXIT_FAILURE);
-       }
-       success("OK");
-       return p;
-}
-
-static void validate_memory(int *p, unsigned long start, unsigned long end)
-{
-       int i;
-
-       for (i = start / page_size; i < end / page_size; i++) {
-               if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
-                       printf("Page %d is corrupted: %#x\n",
-                                       i, p[i * page_size / sizeof(*p)]);
-                       exit(EXIT_FAILURE);
-               }
-       }
-}
-
-static void *anon_setup_area(int nr_hpages)
-{
-       return alloc_mapping(nr_hpages);
-}
-
-static void anon_cleanup_area(void *p, unsigned long size)
-{
-       munmap(p, size);
-}
-
-static void anon_fault(void *p, unsigned long start, unsigned long end)
-{
-       fill_memory(p, start, end);
-}
-
-static bool anon_check_huge(void *addr, int nr_hpages)
-{
-       return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
-}
-
-static void *file_setup_area(int nr_hpages)
-{
-       int fd;
-       void *p;
-       unsigned long size;
-
-       unlink(finfo.path);  /* Cleanup from previous failed tests */
-       printf("Creating %s for collapse%s...", finfo.path,
-              finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
-       fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
-                 777);
-       if (fd < 0) {
-               perror("open()");
-               exit(EXIT_FAILURE);
-       }
-
-       size = nr_hpages * hpage_pmd_size;
-       p = alloc_mapping(nr_hpages);
-       fill_memory(p, 0, size);
-       write(fd, p, size);
-       close(fd);
-       munmap(p, size);
-       success("OK");
-
-       printf("Opening %s read only for collapse...", finfo.path);
-       finfo.fd = open(finfo.path, O_RDONLY, 777);
-       if (finfo.fd < 0) {
-               perror("open()");
-               exit(EXIT_FAILURE);
-       }
-       p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
-                MAP_PRIVATE, finfo.fd, 0);
-       if (p == MAP_FAILED || p != BASE_ADDR) {
-               perror("mmap()");
-               exit(EXIT_FAILURE);
-       }
-
-       /* Drop page cache */
-       write_file("/proc/sys/vm/drop_caches", "3", 2);
-       success("OK");
-       return p;
-}
-
-static void file_cleanup_area(void *p, unsigned long size)
-{
-       munmap(p, size);
-       close(finfo.fd);
-       unlink(finfo.path);
-}
-
-static void file_fault(void *p, unsigned long start, unsigned long end)
-{
-       if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
-               perror("madvise(MADV_POPULATE_READ");
-               exit(EXIT_FAILURE);
-       }
-}
-
-static bool file_check_huge(void *addr, int nr_hpages)
-{
-       switch (finfo.type) {
-       case VMA_FILE:
-               return check_huge_file(addr, nr_hpages, hpage_pmd_size);
-       case VMA_SHMEM:
-               return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
-       default:
-               exit(EXIT_FAILURE);
-               return false;
-       }
-}
-
-static void *shmem_setup_area(int nr_hpages)
-{
-       void *p;
-       unsigned long size = nr_hpages * hpage_pmd_size;
-
-       finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
-       if (finfo.fd < 0)  {
-               perror("memfd_create()");
-               exit(EXIT_FAILURE);
-       }
-       if (ftruncate(finfo.fd, size)) {
-               perror("ftruncate()");
-               exit(EXIT_FAILURE);
-       }
-       p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
-                0);
-       if (p != BASE_ADDR) {
-               perror("mmap()");
-               exit(EXIT_FAILURE);
-       }
-       return p;
-}
-
-static void shmem_cleanup_area(void *p, unsigned long size)
-{
-       munmap(p, size);
-       close(finfo.fd);
-}
-
-static bool shmem_check_huge(void *addr, int nr_hpages)
-{
-       return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
-}
-
-static struct mem_ops __anon_ops = {
-       .setup_area = &anon_setup_area,
-       .cleanup_area = &anon_cleanup_area,
-       .fault = &anon_fault,
-       .check_huge = &anon_check_huge,
-       .name = "anon",
-};
-
-static struct mem_ops __file_ops = {
-       .setup_area = &file_setup_area,
-       .cleanup_area = &file_cleanup_area,
-       .fault = &file_fault,
-       .check_huge = &file_check_huge,
-       .name = "file",
-};
-
-static struct mem_ops __shmem_ops = {
-       .setup_area = &shmem_setup_area,
-       .cleanup_area = &shmem_cleanup_area,
-       .fault = &anon_fault,
-       .check_huge = &shmem_check_huge,
-       .name = "shmem",
-};
-
-static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
-                              struct mem_ops *ops, bool expect)
-{
-       int ret;
-       struct settings settings = *current_settings();
-
-       printf("%s...", msg);
-
-       /*
-        * Prevent khugepaged interference and tests that MADV_COLLAPSE
-        * ignores /sys/kernel/mm/transparent_hugepage/enabled
-        */
-       settings.thp_enabled = THP_NEVER;
-       settings.shmem_enabled = SHMEM_NEVER;
-       push_settings(&settings);
-
-       /* Clear VM_NOHUGEPAGE */
-       madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
-       ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
-       if (((bool)ret) == expect)
-               fail("Fail: Bad return value");
-       else if (!ops->check_huge(p, expect ? nr_hpages : 0))
-               fail("Fail: check_huge()");
-       else
-               success("OK");
-
-       pop_settings();
-}
-
-static void madvise_collapse(const char *msg, char *p, int nr_hpages,
-                            struct mem_ops *ops, bool expect)
-{
-       /* Sanity check */
-       if (!ops->check_huge(p, 0)) {
-               printf("Unexpected huge page\n");
-               exit(EXIT_FAILURE);
-       }
-       __madvise_collapse(msg, p, nr_hpages, ops, expect);
-}
-
-#define TICK 500000
-static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
-                         struct mem_ops *ops)
-{
-       int full_scans;
-       int timeout = 6; /* 3 seconds */
-
-       /* Sanity check */
-       if (!ops->check_huge(p, 0)) {
-               printf("Unexpected huge page\n");
-               exit(EXIT_FAILURE);
-       }
-
-       madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
-
-       /* Wait until the second full_scan completed */
-       full_scans = read_num("khugepaged/full_scans") + 2;
-
-       printf("%s...", msg);
-       while (timeout--) {
-               if (ops->check_huge(p, nr_hpages))
-                       break;
-               if (read_num("khugepaged/full_scans") >= full_scans)
-                       break;
-               printf(".");
-               usleep(TICK);
-       }
-
-       madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
-
-       return timeout == -1;
-}
-
-static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
-                               struct mem_ops *ops, bool expect)
-{
-       if (wait_for_scan(msg, p, nr_hpages, ops)) {
-               if (expect)
-                       fail("Timeout");
-               else
-                       success("OK");
-               return;
-       }
-
-       /*
-        * For file and shmem memory, khugepaged only retracts pte entries after
-        * putting the new hugepage in the page cache. The hugepage must be
-        * subsequently refaulted to install the pmd mapping for the mm.
-        */
-       if (ops != &__anon_ops)
-               ops->fault(p, 0, nr_hpages * hpage_pmd_size);
-
-       if (ops->check_huge(p, expect ? nr_hpages : 0))
-               success("OK");
-       else
-               fail("Fail");
-}
-
-static struct collapse_context __khugepaged_context = {
-       .collapse = &khugepaged_collapse,
-       .enforce_pte_scan_limits = true,
-       .name = "khugepaged",
-};
-
-static struct collapse_context __madvise_context = {
-       .collapse = &madvise_collapse,
-       .enforce_pte_scan_limits = false,
-       .name = "madvise",
-};
-
-static bool is_tmpfs(struct mem_ops *ops)
-{
-       return ops == &__file_ops && finfo.type == VMA_SHMEM;
-}
-
-static void alloc_at_fault(void)
-{
-       struct settings settings = *current_settings();
-       char *p;
-
-       settings.thp_enabled = THP_ALWAYS;
-       push_settings(&settings);
-
-       p = alloc_mapping(1);
-       *p = 1;
-       printf("Allocate huge page on fault...");
-       if (check_huge_anon(p, 1, hpage_pmd_size))
-               success("OK");
-       else
-               fail("Fail");
-
-       pop_settings();
-
-       madvise(p, page_size, MADV_DONTNEED);
-       printf("Split huge PMD on MADV_DONTNEED...");
-       if (check_huge_anon(p, 0, hpage_pmd_size))
-               success("OK");
-       else
-               fail("Fail");
-       munmap(p, hpage_pmd_size);
-}
-
-static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-       int nr_hpages = 4;
-       unsigned long size = nr_hpages * hpage_pmd_size;
-
-       p = ops->setup_area(nr_hpages);
-       ops->fault(p, 0, size);
-       c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
-                   ops, true);
-       validate_memory(p, 0, size);
-       ops->cleanup_area(p, size);
-}
-
-static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-
-       p = ops->setup_area(1);
-       c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-
-       p = ops->setup_area(1);
-       ops->fault(p, 0, page_size);
-       c->collapse("Collapse PTE table with single PTE entry present", p,
-                   1, ops, true);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
-{
-       int max_ptes_none = hpage_pmd_nr / 2;
-       struct settings settings = *current_settings();
-       void *p;
-
-       settings.khugepaged.max_ptes_none = max_ptes_none;
-       push_settings(&settings);
-
-       p = ops->setup_area(1);
-
-       if (is_tmpfs(ops)) {
-               /* shmem pages always in the page cache */
-               printf("tmpfs...");
-               skip("Skip");
-               goto skip;
-       }
-
-       ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
-       c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
-                   ops, !c->enforce_pte_scan_limits);
-       validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
-
-       if (c->enforce_pte_scan_limits) {
-               ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
-               c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
-                           true);
-               validate_memory(p, 0,
-                               (hpage_pmd_nr - max_ptes_none) * page_size);
-       }
-skip:
-       ops->cleanup_area(p, hpage_pmd_size);
-       pop_settings();
-}
-
-static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-
-       p = ops->setup_area(1);
-       ops->fault(p, 0, hpage_pmd_size);
-
-       printf("Swapout one page...");
-       if (madvise(p, page_size, MADV_PAGEOUT)) {
-               perror("madvise(MADV_PAGEOUT)");
-               exit(EXIT_FAILURE);
-       }
-       if (check_swap(p, page_size)) {
-               success("OK");
-       } else {
-               fail("Fail");
-               goto out;
-       }
-
-       c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
-                   true);
-       validate_memory(p, 0, hpage_pmd_size);
-out:
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
-{
-       int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
-       void *p;
-
-       p = ops->setup_area(1);
-       ops->fault(p, 0, hpage_pmd_size);
-
-       printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
-       if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
-               perror("madvise(MADV_PAGEOUT)");
-               exit(EXIT_FAILURE);
-       }
-       if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
-               success("OK");
-       } else {
-               fail("Fail");
-               goto out;
-       }
-
-       c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
-                   !c->enforce_pte_scan_limits);
-       validate_memory(p, 0, hpage_pmd_size);
-
-       if (c->enforce_pte_scan_limits) {
-               ops->fault(p, 0, hpage_pmd_size);
-               printf("Swapout %d of %d pages...", max_ptes_swap,
-                      hpage_pmd_nr);
-               if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
-                       perror("madvise(MADV_PAGEOUT)");
-                       exit(EXIT_FAILURE);
-               }
-               if (check_swap(p, max_ptes_swap * page_size)) {
-                       success("OK");
-               } else {
-                       fail("Fail");
-                       goto out;
-               }
-
-               c->collapse("Collapse with max_ptes_swap pages swapped out", p,
-                           1, ops, true);
-               validate_memory(p, 0, hpage_pmd_size);
-       }
-out:
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-
-       p = alloc_hpage(ops);
-
-       if (is_tmpfs(ops)) {
-               /* MADV_DONTNEED won't evict tmpfs pages */
-               printf("tmpfs...");
-               skip("Skip");
-               goto skip;
-       }
-
-       madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-       printf("Split huge page leaving single PTE mapping compound page...");
-       madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
-       if (ops->check_huge(p, 0))
-               success("OK");
-       else
-               fail("Fail");
-
-       c->collapse("Collapse PTE table with single PTE mapping compound page",
-                   p, 1, ops, true);
-       validate_memory(p, 0, page_size);
-skip:
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-
-       p = alloc_hpage(ops);
-       printf("Split huge page leaving single PTE page table full of compound pages...");
-       madvise(p, page_size, MADV_NOHUGEPAGE);
-       madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-       if (ops->check_huge(p, 0))
-               success("OK");
-       else
-               fail("Fail");
-
-       c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
-                   true);
-       validate_memory(p, 0, hpage_pmd_size);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
-{
-       void *p;
-       int i;
-
-       p = ops->setup_area(1);
-       for (i = 0; i < hpage_pmd_nr; i++) {
-               printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
-                               i + 1, hpage_pmd_nr);
-
-               madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
-               ops->fault(BASE_ADDR, 0, hpage_pmd_size);
-               if (!ops->check_huge(BASE_ADDR, 1)) {
-                       printf("Failed to allocate huge page\n");
-                       exit(EXIT_FAILURE);
-               }
-               madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
-
-               p = mremap(BASE_ADDR - i * page_size,
-                               i * page_size + hpage_pmd_size,
-                               (i + 1) * page_size,
-                               MREMAP_MAYMOVE | MREMAP_FIXED,
-                               BASE_ADDR + 2 * hpage_pmd_size);
-               if (p == MAP_FAILED) {
-                       perror("mremap+unmap");
-                       exit(EXIT_FAILURE);
-               }
-
-               p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
-                               (i + 1) * page_size,
-                               (i + 1) * page_size + hpage_pmd_size,
-                               MREMAP_MAYMOVE | MREMAP_FIXED,
-                               BASE_ADDR - (i + 1) * page_size);
-               if (p == MAP_FAILED) {
-                       perror("mremap+alloc");
-                       exit(EXIT_FAILURE);
-               }
-       }
-
-       ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
-       ops->fault(p, 0, hpage_pmd_size);
-       if (!ops->check_huge(p, 1))
-               success("OK");
-       else
-               fail("Fail");
-
-       c->collapse("Collapse PTE table full of different compound pages", p, 1,
-                   ops, true);
-
-       validate_memory(p, 0, hpage_pmd_size);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
-{
-       int wstatus;
-       void *p;
-
-       p = ops->setup_area(1);
-
-       printf("Allocate small page...");
-       ops->fault(p, 0, page_size);
-       if (ops->check_huge(p, 0))
-               success("OK");
-       else
-               fail("Fail");
-
-       printf("Share small page over fork()...");
-       if (!fork()) {
-               /* Do not touch settings on child exit */
-               skip_settings_restore = true;
-               exit_status = 0;
-
-               if (ops->check_huge(p, 0))
-                       success("OK");
-               else
-                       fail("Fail");
-
-               ops->fault(p, page_size, 2 * page_size);
-               c->collapse("Collapse PTE table with single page shared with parent process",
-                           p, 1, ops, true);
-
-               validate_memory(p, 0, page_size);
-               ops->cleanup_area(p, hpage_pmd_size);
-               exit(exit_status);
-       }
-
-       wait(&wstatus);
-       exit_status += WEXITSTATUS(wstatus);
-
-       printf("Check if parent still has small page...");
-       if (ops->check_huge(p, 0))
-               success("OK");
-       else
-               fail("Fail");
-       validate_memory(p, 0, page_size);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
-{
-       int wstatus;
-       void *p;
-
-       p = alloc_hpage(ops);
-       printf("Share huge page over fork()...");
-       if (!fork()) {
-               /* Do not touch settings on child exit */
-               skip_settings_restore = true;
-               exit_status = 0;
-
-               if (ops->check_huge(p, 1))
-                       success("OK");
-               else
-                       fail("Fail");
-
-               printf("Split huge page PMD in child process...");
-               madvise(p, page_size, MADV_NOHUGEPAGE);
-               madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
-               if (ops->check_huge(p, 0))
-                       success("OK");
-               else
-                       fail("Fail");
-               ops->fault(p, 0, page_size);
-
-               write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
-               c->collapse("Collapse PTE table full of compound pages in child",
-                           p, 1, ops, true);
-               write_num("khugepaged/max_ptes_shared",
-                         current_settings()->khugepaged.max_ptes_shared);
-
-               validate_memory(p, 0, hpage_pmd_size);
-               ops->cleanup_area(p, hpage_pmd_size);
-               exit(exit_status);
-       }
-
-       wait(&wstatus);
-       exit_status += WEXITSTATUS(wstatus);
-
-       printf("Check if parent still has huge page...");
-       if (ops->check_huge(p, 1))
-               success("OK");
-       else
-               fail("Fail");
-       validate_memory(p, 0, hpage_pmd_size);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
-{
-       int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
-       int wstatus;
-       void *p;
-
-       p = alloc_hpage(ops);
-       printf("Share huge page over fork()...");
-       if (!fork()) {
-               /* Do not touch settings on child exit */
-               skip_settings_restore = true;
-               exit_status = 0;
-
-               if (ops->check_huge(p, 1))
-                       success("OK");
-               else
-                       fail("Fail");
-
-               printf("Trigger CoW on page %d of %d...",
-                               hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
-               ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
-               if (ops->check_huge(p, 0))
-                       success("OK");
-               else
-                       fail("Fail");
-
-               c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
-                           1, ops, !c->enforce_pte_scan_limits);
-
-               if (c->enforce_pte_scan_limits) {
-                       printf("Trigger CoW on page %d of %d...",
-                              hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
-                       ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
-                                   page_size);
-                       if (ops->check_huge(p, 0))
-                               success("OK");
-                       else
-                               fail("Fail");
-
-                       c->collapse("Collapse with max_ptes_shared PTEs shared",
-                                   p, 1, ops, true);
-               }
-
-               validate_memory(p, 0, hpage_pmd_size);
-               ops->cleanup_area(p, hpage_pmd_size);
-               exit(exit_status);
-       }
-
-       wait(&wstatus);
-       exit_status += WEXITSTATUS(wstatus);
-
-       printf("Check if parent still has huge page...");
-       if (ops->check_huge(p, 1))
-               success("OK");
-       else
-               fail("Fail");
-       validate_memory(p, 0, hpage_pmd_size);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-static void madvise_collapse_existing_thps(struct collapse_context *c,
-                                          struct mem_ops *ops)
-{
-       void *p;
-
-       p = ops->setup_area(1);
-       ops->fault(p, 0, hpage_pmd_size);
-       c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
-       validate_memory(p, 0, hpage_pmd_size);
-
-       /* c->collapse() will find a hugepage and complain - call directly. */
-       __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
-       validate_memory(p, 0, hpage_pmd_size);
-       ops->cleanup_area(p, hpage_pmd_size);
-}
-
-/*
- * Test race with khugepaged where page tables have been retracted and
- * pmd cleared.
- */
-static void madvise_retracted_page_tables(struct collapse_context *c,
-                                         struct mem_ops *ops)
-{
-       void *p;
-       int nr_hpages = 1;
-       unsigned long size = nr_hpages * hpage_pmd_size;
-
-       p = ops->setup_area(nr_hpages);
-       ops->fault(p, 0, size);
-
-       /* Let khugepaged collapse and leave pmd cleared */
-       if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
-                         ops)) {
-               fail("Timeout");
-               return;
-       }
-       success("OK");
-       c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
-                   true);
-       validate_memory(p, 0, size);
-       ops->cleanup_area(p, size);
-}
-
-static void usage(void)
-{
-       fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n");
-       fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n");
-       fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n");
-       fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n");
-       fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n");
-       fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n");
-       fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n");
-       fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n");
-       fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n");
-       exit(1);
-}
-
-static void parse_test_type(int argc, const char **argv)
-{
-       char *buf;
-       const char *token;
-
-       if (argc == 1) {
-               /* Backwards compatibility */
-               khugepaged_context =  &__khugepaged_context;
-               madvise_context =  &__madvise_context;
-               anon_ops = &__anon_ops;
-               return;
-       }
-
-       buf = strdup(argv[1]);
-       token = strsep(&buf, ":");
-
-       if (!strcmp(token, "all")) {
-               khugepaged_context =  &__khugepaged_context;
-               madvise_context =  &__madvise_context;
-       } else if (!strcmp(token, "khugepaged")) {
-               khugepaged_context =  &__khugepaged_context;
-       } else if (!strcmp(token, "madvise")) {
-               madvise_context =  &__madvise_context;
-       } else {
-               usage();
-       }
-
-       if (!buf)
-               usage();
-
-       if (!strcmp(buf, "all")) {
-               file_ops =  &__file_ops;
-               anon_ops = &__anon_ops;
-               shmem_ops = &__shmem_ops;
-       } else if (!strcmp(buf, "anon")) {
-               anon_ops = &__anon_ops;
-       } else if (!strcmp(buf, "file")) {
-               file_ops =  &__file_ops;
-       } else if (!strcmp(buf, "shmem")) {
-               shmem_ops = &__shmem_ops;
-       } else {
-               usage();
-       }
-
-       if (!file_ops)
-               return;
-
-       if (argc != 3)
-               usage();
-}
-
-int main(int argc, const char **argv)
-{
-       struct settings default_settings = {
-               .thp_enabled = THP_MADVISE,
-               .thp_defrag = THP_DEFRAG_ALWAYS,
-               .shmem_enabled = SHMEM_ADVISE,
-               .use_zero_page = 0,
-               .khugepaged = {
-                       .defrag = 1,
-                       .alloc_sleep_millisecs = 10,
-                       .scan_sleep_millisecs = 10,
-               },
-               /*
-                * When testing file-backed memory, the collapse path
-                * looks at how many pages are found in the page cache, not
-                * what pages are mapped. Disable read ahead optimization so
-                * pages don't find their way into the page cache unless
-                * we mem_ops->fault() them in.
-                */
-               .read_ahead_kb = 0,
-       };
-
-       parse_test_type(argc, argv);
-
-       if (file_ops)
-               get_finfo(argv[2]);
-
-       setbuf(stdout, NULL);
-
-       page_size = getpagesize();
-       hpage_pmd_size = read_pmd_pagesize();
-       hpage_pmd_nr = hpage_pmd_size / page_size;
-
-       default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
-       default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
-       default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
-       default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
-
-       save_settings();
-       push_settings(&default_settings);
-
-       alloc_at_fault();
-
-#define TEST(t, c, o) do { \
-       if (c && o) { \
-               printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
-               t(c, o); \
-       } \
-       } while (0)
-
-       TEST(collapse_full, khugepaged_context, anon_ops);
-       TEST(collapse_full, khugepaged_context, file_ops);
-       TEST(collapse_full, khugepaged_context, shmem_ops);
-       TEST(collapse_full, madvise_context, anon_ops);
-       TEST(collapse_full, madvise_context, file_ops);
-       TEST(collapse_full, madvise_context, shmem_ops);
-
-       TEST(collapse_empty, khugepaged_context, anon_ops);
-       TEST(collapse_empty, madvise_context, anon_ops);
-
-       TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
-       TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
-       TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
-       TEST(collapse_single_pte_entry, madvise_context, anon_ops);
-       TEST(collapse_single_pte_entry, madvise_context, file_ops);
-       TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
-
-       TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
-       TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
-       TEST(collapse_max_ptes_none, madvise_context, anon_ops);
-       TEST(collapse_max_ptes_none, madvise_context, file_ops);
-
-       TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
-       TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
-       TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
-       TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
-
-       TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
-       TEST(collapse_full_of_compound, khugepaged_context, file_ops);
-       TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
-       TEST(collapse_full_of_compound, madvise_context, anon_ops);
-       TEST(collapse_full_of_compound, madvise_context, file_ops);
-       TEST(collapse_full_of_compound, madvise_context, shmem_ops);
-
-       TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
-       TEST(collapse_compound_extreme, madvise_context, anon_ops);
-
-       TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
-       TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
-
-       TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
-       TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
-
-       TEST(collapse_fork, khugepaged_context, anon_ops);
-       TEST(collapse_fork, madvise_context, anon_ops);
-
-       TEST(collapse_fork_compound, khugepaged_context, anon_ops);
-       TEST(collapse_fork_compound, madvise_context, anon_ops);
-
-       TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
-       TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
-
-       TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
-       TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
-       TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
-
-       TEST(madvise_retracted_page_tables, madvise_context, file_ops);
-       TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
-
-       restore_settings(0);
-}
diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c
deleted file mode 100644 (file)
index d8b5b49..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * KSM functional tests
- *
- * Copyright 2022, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <linux/userfaultfd.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define KiB 1024u
-#define MiB (1024 * KiB)
-
-static int ksm_fd;
-static int ksm_full_scans_fd;
-static int pagemap_fd;
-static size_t pagesize;
-
-static bool range_maps_duplicates(char *addr, unsigned long size)
-{
-       unsigned long offs_a, offs_b, pfn_a, pfn_b;
-
-       /*
-        * There is no easy way to check if there are KSM pages mapped into
-        * this range. We only check that the range does not map the same PFN
-        * twice by comparing each pair of mapped pages.
-        */
-       for (offs_a = 0; offs_a < size; offs_a += pagesize) {
-               pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a);
-               /* Page not present or PFN not exposed by the kernel. */
-               if (pfn_a == -1ul || !pfn_a)
-                       continue;
-
-               for (offs_b = offs_a + pagesize; offs_b < size;
-                    offs_b += pagesize) {
-                       pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b);
-                       if (pfn_b == -1ul || !pfn_b)
-                               continue;
-                       if (pfn_a == pfn_b)
-                               return true;
-               }
-       }
-       return false;
-}
-
-static long ksm_get_full_scans(void)
-{
-       char buf[10];
-       ssize_t ret;
-
-       ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0);
-       if (ret <= 0)
-               return -errno;
-       buf[ret] = 0;
-
-       return strtol(buf, NULL, 10);
-}
-
-static int ksm_merge(void)
-{
-       long start_scans, end_scans;
-
-       /* Wait for two full scans such that any possible merging happened. */
-       start_scans = ksm_get_full_scans();
-       if (start_scans < 0)
-               return start_scans;
-       if (write(ksm_fd, "1", 1) != 1)
-               return -errno;
-       do {
-               end_scans = ksm_get_full_scans();
-               if (end_scans < 0)
-                       return end_scans;
-       } while (end_scans < start_scans + 2);
-
-       return 0;
-}
-
-static char *mmap_and_merge_range(char val, unsigned long size)
-{
-       char *map;
-
-       map = mmap(NULL, size, PROT_READ|PROT_WRITE,
-                  MAP_PRIVATE|MAP_ANON, -1, 0);
-       if (map == MAP_FAILED) {
-               ksft_test_result_fail("mmap() failed\n");
-               return MAP_FAILED;
-       }
-
-       /* Don't use THP. Ignore if THP are not around on a kernel. */
-       if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) {
-               ksft_test_result_fail("MADV_NOHUGEPAGE failed\n");
-               goto unmap;
-       }
-
-       /* Make sure each page contains the same values to merge them. */
-       memset(map, val, size);
-       if (madvise(map, size, MADV_MERGEABLE)) {
-               ksft_test_result_fail("MADV_MERGEABLE failed\n");
-               goto unmap;
-       }
-
-       /* Run KSM to trigger merging and wait. */
-       if (ksm_merge()) {
-               ksft_test_result_fail("Running KSM failed\n");
-               goto unmap;
-       }
-       return map;
-unmap:
-       munmap(map, size);
-       return MAP_FAILED;
-}
-
-static void test_unmerge(void)
-{
-       const unsigned int size = 2 * MiB;
-       char *map;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       map = mmap_and_merge_range(0xcf, size);
-       if (map == MAP_FAILED)
-               return;
-
-       if (madvise(map, size, MADV_UNMERGEABLE)) {
-               ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-               goto unmap;
-       }
-
-       ksft_test_result(!range_maps_duplicates(map, size),
-                        "Pages were unmerged\n");
-unmap:
-       munmap(map, size);
-}
-
-static void test_unmerge_discarded(void)
-{
-       const unsigned int size = 2 * MiB;
-       char *map;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       map = mmap_and_merge_range(0xcf, size);
-       if (map == MAP_FAILED)
-               return;
-
-       /* Discard half of all mapped pages so we have pte_none() entries. */
-       if (madvise(map, size / 2, MADV_DONTNEED)) {
-               ksft_test_result_fail("MADV_DONTNEED failed\n");
-               goto unmap;
-       }
-
-       if (madvise(map, size, MADV_UNMERGEABLE)) {
-               ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-               goto unmap;
-       }
-
-       ksft_test_result(!range_maps_duplicates(map, size),
-                        "Pages were unmerged\n");
-unmap:
-       munmap(map, size);
-}
-
-#ifdef __NR_userfaultfd
-static void test_unmerge_uffd_wp(void)
-{
-       struct uffdio_writeprotect uffd_writeprotect;
-       struct uffdio_register uffdio_register;
-       const unsigned int size = 2 * MiB;
-       struct uffdio_api uffdio_api;
-       char *map;
-       int uffd;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       map = mmap_and_merge_range(0xcf, size);
-       if (map == MAP_FAILED)
-               return;
-
-       /* See if UFFD is around. */
-       uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-       if (uffd < 0) {
-               ksft_test_result_skip("__NR_userfaultfd failed\n");
-               goto unmap;
-       }
-
-       /* See if UFFD-WP is around. */
-       uffdio_api.api = UFFD_API;
-       uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-       if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
-               ksft_test_result_fail("UFFDIO_API failed\n");
-               goto close_uffd;
-       }
-       if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
-               ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n");
-               goto close_uffd;
-       }
-
-       /* Register UFFD-WP, no need for an actual handler. */
-       uffdio_register.range.start = (unsigned long) map;
-       uffdio_register.range.len = size;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
-               ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n");
-               goto close_uffd;
-       }
-
-       /* Write-protect the range using UFFD-WP. */
-       uffd_writeprotect.range.start = (unsigned long) map;
-       uffd_writeprotect.range.len = size;
-       uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
-       if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
-               ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n");
-               goto close_uffd;
-       }
-
-       if (madvise(map, size, MADV_UNMERGEABLE)) {
-               ksft_test_result_fail("MADV_UNMERGEABLE failed\n");
-               goto close_uffd;
-       }
-
-       ksft_test_result(!range_maps_duplicates(map, size),
-                        "Pages were unmerged\n");
-close_uffd:
-       close(uffd);
-unmap:
-       munmap(map, size);
-}
-#endif
-
-int main(int argc, char **argv)
-{
-       unsigned int tests = 2;
-       int err;
-
-#ifdef __NR_userfaultfd
-       tests++;
-#endif
-
-       ksft_print_header();
-       ksft_set_plan(tests);
-
-       pagesize = getpagesize();
-
-       ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
-       if (ksm_fd < 0)
-               ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n");
-       ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY);
-       if (ksm_full_scans_fd < 0)
-               ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n");
-       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-       if (pagemap_fd < 0)
-               ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n");
-
-       test_unmerge();
-       test_unmerge_discarded();
-#ifdef __NR_userfaultfd
-       test_unmerge_uffd_wp();
-#endif
-
-       err = ksft_get_fail_cnt();
-       if (err)
-               ksft_exit_fail_msg("%d out of %d tests failed\n",
-                                  err, ksft_test_num());
-       return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c
deleted file mode 100644 (file)
index f9eb4d6..0000000
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <sys/mman.h>
-#include <stdbool.h>
-#include <time.h>
-#include <string.h>
-#include <numa.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <err.h>
-
-#include "../kselftest.h"
-#include <include/vdso/time64.h>
-#include "util.h"
-
-#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/"
-#define KSM_FP(s) (KSM_SYSFS_PATH s)
-#define KSM_SCAN_LIMIT_SEC_DEFAULT 120
-#define KSM_PAGE_COUNT_DEFAULT 10l
-#define KSM_PROT_STR_DEFAULT "rw"
-#define KSM_USE_ZERO_PAGES_DEFAULT false
-#define KSM_MERGE_ACROSS_NODES_DEFAULT true
-#define MB (1ul << 20)
-
-struct ksm_sysfs {
-       unsigned long max_page_sharing;
-       unsigned long merge_across_nodes;
-       unsigned long pages_to_scan;
-       unsigned long run;
-       unsigned long sleep_millisecs;
-       unsigned long stable_node_chains_prune_millisecs;
-       unsigned long use_zero_pages;
-};
-
-enum ksm_test_name {
-       CHECK_KSM_MERGE,
-       CHECK_KSM_UNMERGE,
-       CHECK_KSM_ZERO_PAGE_MERGE,
-       CHECK_KSM_NUMA_MERGE,
-       KSM_MERGE_TIME,
-       KSM_MERGE_TIME_HUGE_PAGES,
-       KSM_UNMERGE_TIME,
-       KSM_COW_TIME
-};
-
-static int ksm_write_sysfs(const char *file_path, unsigned long val)
-{
-       FILE *f = fopen(file_path, "w");
-
-       if (!f) {
-               fprintf(stderr, "f %s\n", file_path);
-               perror("fopen");
-               return 1;
-       }
-       if (fprintf(f, "%lu", val) < 0) {
-               perror("fprintf");
-               fclose(f);
-               return 1;
-       }
-       fclose(f);
-
-       return 0;
-}
-
-static int ksm_read_sysfs(const char *file_path, unsigned long *val)
-{
-       FILE *f = fopen(file_path, "r");
-
-       if (!f) {
-               fprintf(stderr, "f %s\n", file_path);
-               perror("fopen");
-               return 1;
-       }
-       if (fscanf(f, "%lu", val) != 1) {
-               perror("fscanf");
-               fclose(f);
-               return 1;
-       }
-       fclose(f);
-
-       return 0;
-}
-
-static int str_to_prot(char *prot_str)
-{
-       int prot = 0;
-
-       if ((strchr(prot_str, 'r')) != NULL)
-               prot |= PROT_READ;
-       if ((strchr(prot_str, 'w')) != NULL)
-               prot |= PROT_WRITE;
-       if ((strchr(prot_str, 'x')) != NULL)
-               prot |= PROT_EXEC;
-
-       return prot;
-}
-
-static void print_help(void)
-{
-       printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n"
-              "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n");
-
-       printf("Supported <test type>:\n"
-              " -M (page merging)\n"
-              " -Z (zero pages merging)\n"
-              " -N (merging of pages in different NUMA nodes)\n"
-              " -U (page unmerging)\n"
-              " -P evaluate merging time and speed.\n"
-              "    For this test, the size of duplicated memory area (in MiB)\n"
-              "    must be provided using -s option\n"
-              " -H evaluate merging time and speed of area allocated mostly with huge pages\n"
-              "    For this test, the size of duplicated memory area (in MiB)\n"
-              "    must be provided using -s option\n"
-              " -D evaluate unmerging time and speed when disabling KSM.\n"
-              "    For this test, the size of duplicated memory area (in MiB)\n"
-              "    must be provided using -s option\n"
-              " -C evaluate the time required to break COW of merged pages.\n\n");
-
-       printf(" -a: specify the access protections of pages.\n"
-              "     <prot> must be of the form [rwx].\n"
-              "     Default: %s\n", KSM_PROT_STR_DEFAULT);
-       printf(" -p: specify the number of pages to test.\n"
-              "     Default: %ld\n", KSM_PAGE_COUNT_DEFAULT);
-       printf(" -l: limit the maximum running time (in seconds) for a test.\n"
-              "     Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT);
-       printf(" -z: change use_zero_pages tunable\n"
-              "     Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT);
-       printf(" -m: change merge_across_nodes tunable\n"
-              "     Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT);
-       printf(" -s: the size of duplicated memory area (in MiB)\n");
-
-       exit(0);
-}
-
-static void  *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size)
-{
-       void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0);
-
-       if (!map_ptr) {
-               perror("mmap");
-               return NULL;
-       }
-       memset(map_ptr, data, map_size);
-       if (mprotect(map_ptr, map_size, prot)) {
-               perror("mprotect");
-               munmap(map_ptr, map_size);
-               return NULL;
-       }
-
-       return map_ptr;
-}
-
-static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout)
-{
-       struct timespec cur_time;
-       unsigned long cur_scan, init_scan;
-
-       if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan))
-               return 1;
-       cur_scan = init_scan;
-
-       while (cur_scan < init_scan + scan_count) {
-               if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan))
-                       return 1;
-               if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) {
-                       perror("clock_gettime");
-                       return 1;
-               }
-               if ((cur_time.tv_sec - start_time.tv_sec) > timeout) {
-                       printf("Scan time limit exceeded\n");
-                       return 1;
-               }
-       }
-
-       return 0;
-}
-
-static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout)
-{
-       if (madvise(addr, size, MADV_MERGEABLE)) {
-               perror("madvise");
-               return 1;
-       }
-       if (ksm_write_sysfs(KSM_FP("run"), 1))
-               return 1;
-
-       /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */
-       if (ksm_do_scan(2, start_time, timeout))
-               return 1;
-
-       return 0;
-}
-
-static int ksm_unmerge_pages(void *addr, size_t size,
-                            struct timespec start_time, int timeout)
-{
-       if (madvise(addr, size, MADV_UNMERGEABLE)) {
-               perror("madvise");
-               return 1;
-       }
-       return 0;
-}
-
-static bool assert_ksm_pages_count(long dupl_page_count)
-{
-       unsigned long max_page_sharing, pages_sharing, pages_shared;
-
-       if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) ||
-           ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) ||
-           ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing))
-               return false;
-
-       /*
-        * Since there must be at least 2 pages for merging and 1 page can be
-        * shared with the limited number of pages (max_page_sharing), sometimes
-        * there are 'leftover' pages that cannot be merged. For example, if there
-        * are 11 pages and max_page_sharing = 10, then only 10 pages will be
-        * merged and the 11th page won't be affected. As a result, when the number
-        * of duplicate pages is divided by max_page_sharing and the remainder is 1,
-        * pages_shared and pages_sharing values will be equal between dupl_page_count
-        * and dupl_page_count - 1.
-        */
-       if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) {
-               if (pages_shared == dupl_page_count / max_page_sharing &&
-                   pages_sharing == pages_shared * (max_page_sharing - 1))
-                       return true;
-       } else {
-               if (pages_shared == (dupl_page_count / max_page_sharing + 1) &&
-                   pages_sharing == dupl_page_count - pages_shared)
-                       return true;
-       }
-
-       return false;
-}
-
-static int ksm_save_def(struct ksm_sysfs *ksm_sysfs)
-{
-       if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) ||
-           numa_available() ? 0 :
-               ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) ||
-           ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) ||
-           ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) ||
-           ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) ||
-           ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
-                          &ksm_sysfs->stable_node_chains_prune_millisecs) ||
-           ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages))
-               return 1;
-
-       return 0;
-}
-
-static int ksm_restore(struct ksm_sysfs *ksm_sysfs)
-{
-       if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) ||
-           numa_available() ? 0 :
-               ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) ||
-           ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) ||
-           ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) ||
-           ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) ||
-           ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"),
-                           ksm_sysfs->stable_node_chains_prune_millisecs) ||
-           ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages))
-               return 1;
-
-       return 0;
-}
-
-static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size)
-{
-       void *map_ptr;
-       struct timespec start_time;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               return KSFT_FAIL;
-       }
-
-       /* fill pages with the same data and merge them */
-       map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-       if (!map_ptr)
-               return KSFT_FAIL;
-
-       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-               goto err_out;
-
-       /* verify that the right number of pages are merged */
-       if (assert_ksm_pages_count(page_count)) {
-               printf("OK\n");
-               munmap(map_ptr, page_size * page_count);
-               return KSFT_PASS;
-       }
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr, page_size * page_count);
-       return KSFT_FAIL;
-}
-
-static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size)
-{
-       void *map_ptr;
-       struct timespec start_time;
-       int page_count = 2;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               return KSFT_FAIL;
-       }
-
-       /* fill pages with the same data and merge them */
-       map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-       if (!map_ptr)
-               return KSFT_FAIL;
-
-       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-               goto err_out;
-
-       /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */
-       memset(map_ptr, '-', 1);
-       memset(map_ptr + page_size, '+', 1);
-
-       /* get at least 1 scan, so KSM can detect that the pages were modified */
-       if (ksm_do_scan(1, start_time, timeout))
-               goto err_out;
-
-       /* check that unmerging was successful and 0 pages are currently merged */
-       if (assert_ksm_pages_count(0)) {
-               printf("OK\n");
-               munmap(map_ptr, page_size * page_count);
-               return KSFT_PASS;
-       }
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr, page_size * page_count);
-       return KSFT_FAIL;
-}
-
-static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout,
-                                    bool use_zero_pages, size_t page_size)
-{
-       void *map_ptr;
-       struct timespec start_time;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               return KSFT_FAIL;
-       }
-
-       if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages))
-               return KSFT_FAIL;
-
-       /* fill pages with zero and try to merge them */
-       map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count);
-       if (!map_ptr)
-               return KSFT_FAIL;
-
-       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-               goto err_out;
-
-       /*
-       * verify that the right number of pages are merged:
-       * 1) if use_zero_pages is set to 1, empty pages are merged
-       *    with the kernel zero page instead of with each other;
-       * 2) if use_zero_pages is set to 0, empty pages are not treated specially
-       *    and merged as usual.
-       */
-       if (use_zero_pages && !assert_ksm_pages_count(0))
-               goto err_out;
-       else if (!use_zero_pages && !assert_ksm_pages_count(page_count))
-               goto err_out;
-
-       printf("OK\n");
-       munmap(map_ptr, page_size * page_count);
-       return KSFT_PASS;
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr, page_size * page_count);
-       return KSFT_FAIL;
-}
-
-static int get_next_mem_node(int node)
-{
-
-       long node_size;
-       int mem_node = 0;
-       int i, max_node = numa_max_node();
-
-       for (i = node + 1; i <= max_node + node; i++) {
-               mem_node = i % (max_node + 1);
-               node_size = numa_node_size(mem_node, NULL);
-               if (node_size > 0)
-                       break;
-       }
-       return mem_node;
-}
-
-static int get_first_mem_node(void)
-{
-       return get_next_mem_node(numa_max_node());
-}
-
-static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes,
-                               size_t page_size)
-{
-       void *numa1_map_ptr, *numa2_map_ptr;
-       struct timespec start_time;
-       int page_count = 2;
-       int first_node;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               return KSFT_FAIL;
-       }
-
-       if (numa_available() < 0) {
-               perror("NUMA support not enabled");
-               return KSFT_SKIP;
-       }
-       if (numa_num_configured_nodes() <= 1) {
-               printf("At least 2 NUMA nodes must be available\n");
-               return KSFT_SKIP;
-       }
-       if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes))
-               return KSFT_FAIL;
-
-       /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */
-       first_node = get_first_mem_node();
-       numa1_map_ptr = numa_alloc_onnode(page_size, first_node);
-       numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node));
-       if (!numa1_map_ptr || !numa2_map_ptr) {
-               perror("numa_alloc_onnode");
-               return KSFT_FAIL;
-       }
-
-       memset(numa1_map_ptr, '*', page_size);
-       memset(numa2_map_ptr, '*', page_size);
-
-       /* try to merge the pages */
-       if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) ||
-           ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout))
-               goto err_out;
-
-       /*
-       * verify that the right number of pages are merged:
-       * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged;
-       * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is
-       *    only 1 unique page in each node and they can't be shared.
-       */
-       if (merge_across_nodes && !assert_ksm_pages_count(page_count))
-               goto err_out;
-       else if (!merge_across_nodes && !assert_ksm_pages_count(0))
-               goto err_out;
-
-       numa_free(numa1_map_ptr, page_size);
-       numa_free(numa2_map_ptr, page_size);
-       printf("OK\n");
-       return KSFT_PASS;
-
-err_out:
-       numa_free(numa1_map_ptr, page_size);
-       numa_free(numa2_map_ptr, page_size);
-       printf("Not OK\n");
-       return KSFT_FAIL;
-}
-
-static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size)
-{
-       void *map_ptr, *map_ptr_orig;
-       struct timespec start_time, end_time;
-       unsigned long scan_time_ns;
-       int pagemap_fd, n_normal_pages, n_huge_pages;
-
-       map_size *= MB;
-       size_t len = map_size;
-
-       len -= len % HPAGE_SIZE;
-       map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
-                       MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
-       map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE;
-
-       if (map_ptr_orig == MAP_FAILED)
-               err(2, "initial mmap");
-
-       if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE))
-               err(2, "MADV_HUGEPAGE");
-
-       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-       if (pagemap_fd < 0)
-               err(2, "open pagemap");
-
-       n_normal_pages = 0;
-       n_huge_pages = 0;
-       for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) {
-               if (allocate_transhuge(p, pagemap_fd) < 0)
-                       n_normal_pages++;
-               else
-                       n_huge_pages++;
-       }
-       printf("Number of normal pages:    %d\n", n_normal_pages);
-       printf("Number of huge pages:    %d\n", n_huge_pages);
-
-       memset(map_ptr, '*', len);
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-               goto err_out;
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-
-       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-                      (end_time.tv_nsec - start_time.tv_nsec);
-
-       printf("Total size:    %lu MiB\n", map_size / MB);
-       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-              scan_time_ns % NSEC_PER_SEC);
-       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
-                                              ((double)scan_time_ns / NSEC_PER_SEC));
-
-       munmap(map_ptr_orig, len + HPAGE_SIZE);
-       return KSFT_PASS;
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr_orig, len + HPAGE_SIZE);
-       return KSFT_FAIL;
-}
-
-static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size)
-{
-       void *map_ptr;
-       struct timespec start_time, end_time;
-       unsigned long scan_time_ns;
-
-       map_size *= MB;
-
-       map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
-       if (!map_ptr)
-               return KSFT_FAIL;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-               goto err_out;
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-
-       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-                      (end_time.tv_nsec - start_time.tv_nsec);
-
-       printf("Total size:    %lu MiB\n", map_size / MB);
-       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-              scan_time_ns % NSEC_PER_SEC);
-       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
-                                              ((double)scan_time_ns / NSEC_PER_SEC));
-
-       munmap(map_ptr, map_size);
-       return KSFT_PASS;
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr, map_size);
-       return KSFT_FAIL;
-}
-
-static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size)
-{
-       void *map_ptr;
-       struct timespec start_time, end_time;
-       unsigned long scan_time_ns;
-
-       map_size *= MB;
-
-       map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size);
-       if (!map_ptr)
-               return KSFT_FAIL;
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-       if (ksm_merge_pages(map_ptr, map_size, start_time, timeout))
-               goto err_out;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-       if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout))
-               goto err_out;
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-
-       scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-                      (end_time.tv_nsec - start_time.tv_nsec);
-
-       printf("Total size:    %lu MiB\n", map_size / MB);
-       printf("Total time:    %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC,
-              scan_time_ns % NSEC_PER_SEC);
-       printf("Average speed:  %.3f MiB/s\n", (map_size / MB) /
-                                              ((double)scan_time_ns / NSEC_PER_SEC));
-
-       munmap(map_ptr, map_size);
-       return KSFT_PASS;
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr, map_size);
-       return KSFT_FAIL;
-}
-
-static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size)
-{
-       void *map_ptr;
-       struct timespec start_time, end_time;
-       unsigned long cow_time_ns;
-
-       /* page_count must be less than 2*page_size */
-       size_t page_count = 4000;
-
-       map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count);
-       if (!map_ptr)
-               return KSFT_FAIL;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               return KSFT_FAIL;
-       }
-       for (size_t i = 0; i < page_count - 1; i = i + 2)
-               memset(map_ptr + page_size * i, '-', 1);
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-               perror("clock_gettime");
-               return KSFT_FAIL;
-       }
-
-       cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-                      (end_time.tv_nsec - start_time.tv_nsec);
-
-       printf("Total size:    %lu MiB\n\n", (page_size * page_count) / MB);
-       printf("Not merged pages:\n");
-       printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
-              cow_time_ns % NSEC_PER_SEC);
-       printf("Average speed:  %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) /
-                                              ((double)cow_time_ns / NSEC_PER_SEC));
-
-       /* Create 2000 pairs of duplicate pages */
-       for (size_t i = 0; i < page_count - 1; i = i + 2) {
-               memset(map_ptr + page_size * i, '+', i / 2 + 1);
-               memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1);
-       }
-       if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout))
-               goto err_out;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-       for (size_t i = 0; i < page_count - 1; i = i + 2)
-               memset(map_ptr + page_size * i, '-', 1);
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) {
-               perror("clock_gettime");
-               goto err_out;
-       }
-
-       cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC +
-                      (end_time.tv_nsec - start_time.tv_nsec);
-
-       printf("Merged pages:\n");
-       printf("Total time:     %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC,
-              cow_time_ns % NSEC_PER_SEC);
-       printf("Average speed:  %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) /
-                                              ((double)cow_time_ns / NSEC_PER_SEC));
-
-       munmap(map_ptr, page_size * page_count);
-       return KSFT_PASS;
-
-err_out:
-       printf("Not OK\n");
-       munmap(map_ptr, page_size * page_count);
-       return KSFT_FAIL;
-}
-
-int main(int argc, char *argv[])
-{
-       int ret, opt;
-       int prot = 0;
-       int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT;
-       long page_count = KSM_PAGE_COUNT_DEFAULT;
-       size_t page_size = sysconf(_SC_PAGESIZE);
-       struct ksm_sysfs ksm_sysfs_old;
-       int test_name = CHECK_KSM_MERGE;
-       bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT;
-       bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT;
-       long size_MB = 0;
-
-       while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) {
-               switch (opt) {
-               case 'a':
-                       prot = str_to_prot(optarg);
-                       break;
-               case 'p':
-                       page_count = atol(optarg);
-                       if (page_count <= 0) {
-                               printf("The number of pages must be greater than 0\n");
-                               return KSFT_FAIL;
-                       }
-                       break;
-               case 'l':
-                       ksm_scan_limit_sec = atoi(optarg);
-                       if (ksm_scan_limit_sec <= 0) {
-                               printf("Timeout value must be greater than 0\n");
-                               return KSFT_FAIL;
-                       }
-                       break;
-               case 'h':
-                       print_help();
-                       break;
-               case 'z':
-                       if (strcmp(optarg, "0") == 0)
-                               use_zero_pages = 0;
-                       else
-                               use_zero_pages = 1;
-                       break;
-               case 'm':
-                       if (strcmp(optarg, "0") == 0)
-                               merge_across_nodes = 0;
-                       else
-                               merge_across_nodes = 1;
-                       break;
-               case 's':
-                       size_MB = atoi(optarg);
-                       if (size_MB <= 0) {
-                               printf("Size must be greater than 0\n");
-                               return KSFT_FAIL;
-                       }
-               case 'M':
-                       break;
-               case 'U':
-                       test_name = CHECK_KSM_UNMERGE;
-                       break;
-               case 'Z':
-                       test_name = CHECK_KSM_ZERO_PAGE_MERGE;
-                       break;
-               case 'N':
-                       test_name = CHECK_KSM_NUMA_MERGE;
-                       break;
-               case 'P':
-                       test_name = KSM_MERGE_TIME;
-                       break;
-               case 'H':
-                       test_name = KSM_MERGE_TIME_HUGE_PAGES;
-                       break;
-               case 'D':
-                       test_name = KSM_UNMERGE_TIME;
-                       break;
-               case 'C':
-                       test_name = KSM_COW_TIME;
-                       break;
-               default:
-                       return KSFT_FAIL;
-               }
-       }
-
-       if (prot == 0)
-               prot = str_to_prot(KSM_PROT_STR_DEFAULT);
-
-       if (access(KSM_SYSFS_PATH, F_OK)) {
-               printf("Config KSM not enabled\n");
-               return KSFT_SKIP;
-       }
-
-       if (ksm_save_def(&ksm_sysfs_old)) {
-               printf("Cannot save default tunables\n");
-               return KSFT_FAIL;
-       }
-
-       if (ksm_write_sysfs(KSM_FP("run"), 2) ||
-           ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) ||
-           numa_available() ? 0 :
-               ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) ||
-           ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count))
-               return KSFT_FAIL;
-
-       switch (test_name) {
-       case CHECK_KSM_MERGE:
-               ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
-                                     ksm_scan_limit_sec, page_size);
-               break;
-       case CHECK_KSM_UNMERGE:
-               ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-                                       page_size);
-               break;
-       case CHECK_KSM_ZERO_PAGE_MERGE:
-               ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count,
-                                               ksm_scan_limit_sec, use_zero_pages, page_size);
-               break;
-       case CHECK_KSM_NUMA_MERGE:
-               ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-                                          merge_across_nodes, page_size);
-               break;
-       case KSM_MERGE_TIME:
-               if (size_MB == 0) {
-                       printf("Option '-s' is required.\n");
-                       return KSFT_FAIL;
-               }
-               ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-                                    size_MB);
-               break;
-       case KSM_MERGE_TIME_HUGE_PAGES:
-               if (size_MB == 0) {
-                       printf("Option '-s' is required.\n");
-                       return KSFT_FAIL;
-               }
-               ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
-                               ksm_scan_limit_sec, size_MB);
-               break;
-       case KSM_UNMERGE_TIME:
-               if (size_MB == 0) {
-                       printf("Option '-s' is required.\n");
-                       return KSFT_FAIL;
-               }
-               ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot,
-                                      ksm_scan_limit_sec, size_MB);
-               break;
-       case KSM_COW_TIME:
-               ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec,
-                                  page_size);
-               break;
-       }
-
-       if (ksm_restore(&ksm_sysfs_old)) {
-               printf("Cannot restore default tunables\n");
-               return KSFT_FAIL;
-       }
-
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c
deleted file mode 100644 (file)
index 262eae6..0000000
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
- *
- * Copyright 2021, Red Hat, Inc.
- *
- * Author(s): David Hildenbrand <david@redhat.com>
- */
-#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-#include <stdbool.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifndef MADV_POPULATE_READ
-#define MADV_POPULATE_READ     22
-#endif /* MADV_POPULATE_READ */
-#ifndef MADV_POPULATE_WRITE
-#define MADV_POPULATE_WRITE    23
-#endif /* MADV_POPULATE_WRITE */
-
-/*
- * For now, we're using 2 MiB of private anonymous memory for all tests.
- */
-#define SIZE (2 * 1024 * 1024)
-
-static size_t pagesize;
-
-static void sense_support(void)
-{
-       char *addr;
-       int ret;
-
-       addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
-                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (!addr)
-               ksft_exit_fail_msg("mmap failed\n");
-
-       ret = madvise(addr, pagesize, MADV_POPULATE_READ);
-       if (ret)
-               ksft_exit_skip("MADV_POPULATE_READ is not available\n");
-
-       ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
-       if (ret)
-               ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
-
-       munmap(addr, pagesize);
-}
-
-static void test_prot_read(void)
-{
-       char *addr;
-       int ret;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (addr == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed\n");
-
-       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-       ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
-
-       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-       ksft_test_result(ret == -1 && errno == EINVAL,
-                        "MADV_POPULATE_WRITE with PROT_READ\n");
-
-       munmap(addr, SIZE);
-}
-
-static void test_prot_write(void)
-{
-       char *addr;
-       int ret;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (addr == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed\n");
-
-       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-       ksft_test_result(ret == -1 && errno == EINVAL,
-                        "MADV_POPULATE_READ with PROT_WRITE\n");
-
-       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-       ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
-
-       munmap(addr, SIZE);
-}
-
-static void test_holes(void)
-{
-       char *addr;
-       int ret;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (addr == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed\n");
-       ret = munmap(addr + pagesize, pagesize);
-       if (ret)
-               ksft_exit_fail_msg("munmap failed\n");
-
-       /* Hole in the middle */
-       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-       ksft_test_result(ret == -1 && errno == ENOMEM,
-                        "MADV_POPULATE_READ with holes in the middle\n");
-       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-       ksft_test_result(ret == -1 && errno == ENOMEM,
-                        "MADV_POPULATE_WRITE with holes in the middle\n");
-
-       /* Hole at end */
-       ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
-       ksft_test_result(ret == -1 && errno == ENOMEM,
-                        "MADV_POPULATE_READ with holes at the end\n");
-       ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
-       ksft_test_result(ret == -1 && errno == ENOMEM,
-                        "MADV_POPULATE_WRITE with holes at the end\n");
-
-       /* Hole at beginning */
-       ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
-       ksft_test_result(ret == -1 && errno == ENOMEM,
-                        "MADV_POPULATE_READ with holes at the beginning\n");
-       ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
-       ksft_test_result(ret == -1 && errno == ENOMEM,
-                        "MADV_POPULATE_WRITE with holes at the beginning\n");
-
-       munmap(addr, SIZE);
-}
-
-static bool range_is_populated(char *start, ssize_t size)
-{
-       int fd = open("/proc/self/pagemap", O_RDONLY);
-       bool ret = true;
-
-       if (fd < 0)
-               ksft_exit_fail_msg("opening pagemap failed\n");
-       for (; size > 0 && ret; size -= pagesize, start += pagesize)
-               if (!pagemap_is_populated(fd, start))
-                       ret = false;
-       close(fd);
-       return ret;
-}
-
-static bool range_is_not_populated(char *start, ssize_t size)
-{
-       int fd = open("/proc/self/pagemap", O_RDONLY);
-       bool ret = true;
-
-       if (fd < 0)
-               ksft_exit_fail_msg("opening pagemap failed\n");
-       for (; size > 0 && ret; size -= pagesize, start += pagesize)
-               if (pagemap_is_populated(fd, start))
-                       ret = false;
-       close(fd);
-       return ret;
-}
-
-static void test_populate_read(void)
-{
-       char *addr;
-       int ret;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (addr == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed\n");
-       ksft_test_result(range_is_not_populated(addr, SIZE),
-                        "range initially not populated\n");
-
-       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-       ksft_test_result(!ret, "MADV_POPULATE_READ\n");
-       ksft_test_result(range_is_populated(addr, SIZE),
-                        "range is populated\n");
-
-       munmap(addr, SIZE);
-}
-
-static void test_populate_write(void)
-{
-       char *addr;
-       int ret;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (addr == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed\n");
-       ksft_test_result(range_is_not_populated(addr, SIZE),
-                        "range initially not populated\n");
-
-       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-       ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
-       ksft_test_result(range_is_populated(addr, SIZE),
-                        "range is populated\n");
-
-       munmap(addr, SIZE);
-}
-
-static bool range_is_softdirty(char *start, ssize_t size)
-{
-       int fd = open("/proc/self/pagemap", O_RDONLY);
-       bool ret = true;
-
-       if (fd < 0)
-               ksft_exit_fail_msg("opening pagemap failed\n");
-       for (; size > 0 && ret; size -= pagesize, start += pagesize)
-               if (!pagemap_is_softdirty(fd, start))
-                       ret = false;
-       close(fd);
-       return ret;
-}
-
-static bool range_is_not_softdirty(char *start, ssize_t size)
-{
-       int fd = open("/proc/self/pagemap", O_RDONLY);
-       bool ret = true;
-
-       if (fd < 0)
-               ksft_exit_fail_msg("opening pagemap failed\n");
-       for (; size > 0 && ret; size -= pagesize, start += pagesize)
-               if (pagemap_is_softdirty(fd, start))
-                       ret = false;
-       close(fd);
-       return ret;
-}
-
-static void test_softdirty(void)
-{
-       char *addr;
-       int ret;
-
-       ksft_print_msg("[RUN] %s\n", __func__);
-
-       addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
-                   MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
-       if (addr == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed\n");
-
-       /* Clear any softdirty bits. */
-       clear_softdirty();
-       ksft_test_result(range_is_not_softdirty(addr, SIZE),
-                        "range is not softdirty\n");
-
-       /* Populating READ should set softdirty. */
-       ret = madvise(addr, SIZE, MADV_POPULATE_READ);
-       ksft_test_result(!ret, "MADV_POPULATE_READ\n");
-       ksft_test_result(range_is_not_softdirty(addr, SIZE),
-                        "range is not softdirty\n");
-
-       /* Populating WRITE should set softdirty. */
-       ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
-       ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
-       ksft_test_result(range_is_softdirty(addr, SIZE),
-                        "range is softdirty\n");
-
-       munmap(addr, SIZE);
-}
-
-int main(int argc, char **argv)
-{
-       int err;
-
-       pagesize = getpagesize();
-
-       ksft_print_header();
-       ksft_set_plan(21);
-
-       sense_support();
-       test_prot_read();
-       test_prot_write();
-       test_holes();
-       test_populate_read();
-       test_populate_write();
-       test_softdirty();
-
-       err = ksft_get_fail_cnt();
-       if (err)
-               ksft_exit_fail_msg("%d out of %d tests failed\n",
-                                  err, ksft_test_num());
-       return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c
deleted file mode 100644 (file)
index eed4432..0000000
+++ /dev/null
@@ -1,231 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Test that MAP_FIXED_NOREPLACE works.
- *
- * Copyright 2018, Jann Horn <jannh@google.com>
- * Copyright 2018, Michael Ellerman, IBM Corporation.
- */
-
-#include <sys/mman.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#ifndef MAP_FIXED_NOREPLACE
-#define MAP_FIXED_NOREPLACE 0x100000
-#endif
-
-static void dump_maps(void)
-{
-       char cmd[32];
-
-       snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
-       system(cmd);
-}
-
-static unsigned long find_base_addr(unsigned long size)
-{
-       void *addr;
-       unsigned long flags;
-
-       flags = MAP_PRIVATE | MAP_ANONYMOUS;
-       addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
-       if (addr == MAP_FAILED) {
-               printf("Error: couldn't map the space we need for the test\n");
-               return 0;
-       }
-
-       if (munmap(addr, size) != 0) {
-               printf("Error: couldn't map the space we need for the test\n");
-               return 0;
-       }
-       return (unsigned long)addr;
-}
-
-int main(void)
-{
-       unsigned long base_addr;
-       unsigned long flags, addr, size, page_size;
-       char *p;
-
-       page_size = sysconf(_SC_PAGE_SIZE);
-
-       //let's find a base addr that is free before we start the tests
-       size = 5 * page_size;
-       base_addr = find_base_addr(size);
-       if (!base_addr) {
-               printf("Error: couldn't map the space we need for the test\n");
-               return 1;
-       }
-
-       flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
-
-       // Check we can map all the areas we need below
-       errno = 0;
-       addr = base_addr;
-       size = 5 * page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p == MAP_FAILED) {
-               dump_maps();
-               printf("Error: couldn't map the space we need for the test\n");
-               return 1;
-       }
-
-       errno = 0;
-       if (munmap((void *)addr, 5 * page_size) != 0) {
-               dump_maps();
-               printf("Error: munmap failed!?\n");
-               return 1;
-       }
-       printf("unmap() successful\n");
-
-       errno = 0;
-       addr = base_addr + page_size;
-       size = 3 * page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p == MAP_FAILED) {
-               dump_maps();
-               printf("Error: first mmap() failed unexpectedly\n");
-               return 1;
-       }
-
-       /*
-        * Exact same mapping again:
-        *   base |  free  | new
-        *     +1 | mapped | new
-        *     +2 | mapped | new
-        *     +3 | mapped | new
-        *     +4 |  free  | new
-        */
-       errno = 0;
-       addr = base_addr;
-       size = 5 * page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p != MAP_FAILED) {
-               dump_maps();
-               printf("Error:1: mmap() succeeded when it shouldn't have\n");
-               return 1;
-       }
-
-       /*
-        * Second mapping contained within first:
-        *
-        *   base |  free  |
-        *     +1 | mapped |
-        *     +2 | mapped | new
-        *     +3 | mapped |
-        *     +4 |  free  |
-        */
-       errno = 0;
-       addr = base_addr + (2 * page_size);
-       size = page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p != MAP_FAILED) {
-               dump_maps();
-               printf("Error:2: mmap() succeeded when it shouldn't have\n");
-               return 1;
-       }
-
-       /*
-        * Overlap end of existing mapping:
-        *   base |  free  |
-        *     +1 | mapped |
-        *     +2 | mapped |
-        *     +3 | mapped | new
-        *     +4 |  free  | new
-        */
-       errno = 0;
-       addr = base_addr + (3 * page_size);
-       size = 2 * page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p != MAP_FAILED) {
-               dump_maps();
-               printf("Error:3: mmap() succeeded when it shouldn't have\n");
-               return 1;
-       }
-
-       /*
-        * Overlap start of existing mapping:
-        *   base |  free  | new
-        *     +1 | mapped | new
-        *     +2 | mapped |
-        *     +3 | mapped |
-        *     +4 |  free  |
-        */
-       errno = 0;
-       addr = base_addr;
-       size = 2 * page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p != MAP_FAILED) {
-               dump_maps();
-               printf("Error:4: mmap() succeeded when it shouldn't have\n");
-               return 1;
-       }
-
-       /*
-        * Adjacent to start of existing mapping:
-        *   base |  free  | new
-        *     +1 | mapped |
-        *     +2 | mapped |
-        *     +3 | mapped |
-        *     +4 |  free  |
-        */
-       errno = 0;
-       addr = base_addr;
-       size = page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p == MAP_FAILED) {
-               dump_maps();
-               printf("Error:5: mmap() failed when it shouldn't have\n");
-               return 1;
-       }
-
-       /*
-        * Adjacent to end of existing mapping:
-        *   base |  free  |
-        *     +1 | mapped |
-        *     +2 | mapped |
-        *     +3 | mapped |
-        *     +4 |  free  |  new
-        */
-       errno = 0;
-       addr = base_addr + (4 * page_size);
-       size = page_size;
-       p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
-       printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
-
-       if (p == MAP_FAILED) {
-               dump_maps();
-               printf("Error:6: mmap() failed when it shouldn't have\n");
-               return 1;
-       }
-
-       addr = base_addr;
-       size = 5 * page_size;
-       if (munmap((void *)addr, size) != 0) {
-               dump_maps();
-               printf("Error: munmap failed!?\n");
-               return 1;
-       }
-       printf("unmap() successful\n");
-
-       printf("OK\n");
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
deleted file mode 100644 (file)
index 312889e..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Example of using hugepage memory in a user application using the mmap
- * system call with MAP_HUGETLB flag.  Before running this program make
- * sure the administrator has allocated enough default sized huge pages
- * to cover the 256 MB allocation.
- *
- * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
- * That means the addresses starting with 0x800000... will need to be
- * specified.  Specifying a fixed address is not required on ppc64, i386
- * or x86_64.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB 0x40000 /* arch specific */
-#endif
-
-#ifndef MAP_HUGE_SHIFT
-#define MAP_HUGE_SHIFT 26
-#endif
-
-#ifndef MAP_HUGE_MASK
-#define MAP_HUGE_MASK 0x3f
-#endif
-
-/* Only ia64 requires this */
-#ifdef __ia64__
-#define ADDR (void *)(0x8000000000000000UL)
-#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
-#else
-#define ADDR (void *)(0x0UL)
-#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
-#endif
-
/* Print the first word of the mapping, proving it is readable. */
static void check_bytes(char *addr)
{
	unsigned int first_word = *(unsigned int *)addr;

	printf("First hex is %x\n", first_word);
}
-
/* Fill the buffer with a repeating 0..255 byte pattern (index mod 256). */
static void write_bytes(char *addr, size_t length)
{
	size_t off;

	for (off = 0; off < length; off++)
		addr[off] = (char)off;
}
-
/*
 * Verify the pattern written by write_bytes().
 * Returns 0 when every byte matches, 1 on the first mismatch.
 */
static int read_bytes(char *addr, size_t length)
{
	size_t off;

	check_bytes(addr);
	for (off = 0; off < length; off++) {
		if (addr[off] != (char)off) {
			printf("Mismatch at %lu\n", (unsigned long)off);
			return 1;
		}
	}
	return 0;
}
-
-int main(int argc, char **argv)
-{
-       void *addr;
-       int ret;
-       size_t length = LENGTH;
-       int flags = FLAGS;
-       int shift = 0;
-
-       if (argc > 1)
-               length = atol(argv[1]) << 20;
-       if (argc > 2) {
-               shift = atoi(argv[2]);
-               if (shift)
-                       flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
-       }
-
-       if (shift)
-               printf("%u kB hugepages\n", 1 << (shift - 10));
-       else
-               printf("Default size hugepages\n");
-       printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
-
-       addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
-       if (addr == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
-       }
-
-       printf("Returned address is %p\n", addr);
-       check_bytes(addr);
-       write_bytes(addr, length);
-       ret = read_bytes(addr, length);
-
-       /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
-       if (munmap(addr, length)) {
-               perror("munmap");
-               exit(1);
-       }
-
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
deleted file mode 100644 (file)
index 6b8aeaa..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2018 Dmitry Safonov, Arista Networks
- *
- * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
- */
-
-#define _GNU_SOURCE
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#ifndef MMAP_SZ
-#define MMAP_SZ                4096
-#endif
-
-#define BUG_ON(condition, description)                                 \
-       do {                                                            \
-               if (condition) {                                        \
-                       fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
-                               __LINE__, (description), strerror(errno)); \
-                       exit(1);                                        \
-               }                                                       \
-       } while (0)
-
-static int parent_f(int sock, unsigned long *smap, int child)
-{
-       int status, ret;
-
-       ret = read(sock, &status, sizeof(int));
-       BUG_ON(ret <= 0, "read(sock)");
-
-       *smap = 0x22222BAD;
-       ret = msync(smap, MMAP_SZ, MS_SYNC);
-       BUG_ON(ret, "msync()");
-
-       ret = write(sock, &status, sizeof(int));
-       BUG_ON(ret <= 0, "write(sock)");
-
-       waitpid(child, &status, 0);
-       BUG_ON(!WIFEXITED(status), "child in unexpected state");
-
-       return WEXITSTATUS(status);
-}
-
-static int child_f(int sock, unsigned long *smap, int fd)
-{
-       int ret, buf = 0;
-
-       smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
-                       MAP_PRIVATE | MAP_POPULATE, fd, 0);
-       BUG_ON(smap == MAP_FAILED, "mmap()");
-
-       BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
-
-       ret = write(sock, &buf, sizeof(int));
-       BUG_ON(ret <= 0, "write(sock)");
-
-       ret = read(sock, &buf, sizeof(int));
-       BUG_ON(ret <= 0, "read(sock)");
-
-       BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
-       BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
-
-       return 0;
-}
-
/*
 * Test that MAP_POPULATE | MAP_PRIVATE gives the mapper its own COW copy:
 * set up a shared file page, fork, let the child map the file privately
 * with MAP_POPULATE, then modify the file from the parent and have the
 * child verify its copy is unchanged.  Returns the child's exit status.
 */
int main(int argc, char **argv)
{
	int sock[2], child, ret;
	FILE *ftmp;
	unsigned long *smap;

	/* Backing file shared by the parent and child mappings. */
	ftmp = tmpfile();
	BUG_ON(ftmp == 0, "tmpfile()");

	ret = ftruncate(fileno(ftmp), MMAP_SZ);
	BUG_ON(ret, "ftruncate()");

	smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
			MAP_SHARED, fileno(ftmp), 0);
	BUG_ON(smap == MAP_FAILED, "mmap()");

	/* Marker value the child expects to find in its private copy. */
	*smap = 0xdeadbabe;
	/* Probably unnecessary, but let it be. */
	ret = msync(smap, MMAP_SZ, MS_SYNC);
	BUG_ON(ret, "msync()");

	/* Seqpacket pair used to step parent and child in lockstep. */
	ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
	BUG_ON(ret, "socketpair()");

	child = fork();
	BUG_ON(child == -1, "fork()");

	if (child) {
		ret = close(sock[0]);
		BUG_ON(ret, "close()");

		return parent_f(sock[1], smap, child);
	}

	ret = close(sock[1]);
	BUG_ON(ret, "close()");

	/* child_f() immediately remaps fd itself; the smap arg is replaced. */
	return child_f(sock[0], smap, fileno(ftmp));
}
diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c
deleted file mode 100644 (file)
index 957b9e1..0000000
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright IBM Corporation, 2021
- *
- * Author: Mike Rapoport <rppt@linux.ibm.com>
- */
-
-#define _GNU_SOURCE
-#include <sys/uio.h>
-#include <sys/mman.h>
-#include <sys/wait.h>
-#include <sys/types.h>
-#include <sys/ptrace.h>
-#include <sys/syscall.h>
-#include <sys/resource.h>
-#include <sys/capability.h>
-
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <stdio.h>
-
-#include "../kselftest.h"
-
-#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__)
-#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__)
-#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__)
-
-#ifdef __NR_memfd_secret
-
-#define PATTERN        0x55
-
-static const int prot = PROT_READ | PROT_WRITE;
-static const int mode = MAP_SHARED;
-
-static unsigned long page_size;
-static unsigned long mlock_limit_cur;
-static unsigned long mlock_limit_max;
-
/* No libc wrapper exists for memfd_secret(2); invoke it via syscall(2). */
static int memfd_secret(unsigned int flags)
{
	long rc = syscall(__NR_memfd_secret, flags);

	return (int)rc;
}
-
/*
 * Secret memory must not be readable or writable through normal file
 * I/O on the memfd: read/write/pread/pwrite should all fail.
 */
static void test_file_apis(int fd)
{
	char buf[64];
	int blocked;

	/* Same short-circuit order as before, expressed via De Morgan. */
	blocked = read(fd, buf, sizeof(buf)) < 0 &&
		  write(fd, buf, sizeof(buf)) < 0 &&
		  pread(fd, buf, sizeof(buf), 0) < 0 &&
		  pwrite(fd, buf, sizeof(buf), 0) < 0;

	if (blocked)
		pass("file IO is blocked as expected\n");
	else
		fail("unexpected file IO\n");
}
-
-static void test_mlock_limit(int fd)
-{
-       size_t len;
-       char *mem;
-
-       len = mlock_limit_cur;
-       mem = mmap(NULL, len, prot, mode, fd, 0);
-       if (mem == MAP_FAILED) {
-               fail("unable to mmap secret memory\n");
-               return;
-       }
-       munmap(mem, len);
-
-       len = mlock_limit_max * 2;
-       mem = mmap(NULL, len, prot, mode, fd, 0);
-       if (mem != MAP_FAILED) {
-               fail("unexpected mlock limit violation\n");
-               munmap(mem, len);
-               return;
-       }
-
-       pass("mlock limit is respected\n");
-}
-
-static void try_process_vm_read(int fd, int pipefd[2])
-{
-       struct iovec liov, riov;
-       char buf[64];
-       char *mem;
-
-       if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
-               fail("pipe write: %s\n", strerror(errno));
-               exit(KSFT_FAIL);
-       }
-
-       liov.iov_len = riov.iov_len = sizeof(buf);
-       liov.iov_base = buf;
-       riov.iov_base = mem;
-
-       if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) {
-               if (errno == ENOSYS)
-                       exit(KSFT_SKIP);
-               exit(KSFT_PASS);
-       }
-
-       exit(KSFT_FAIL);
-}
-
-static void try_ptrace(int fd, int pipefd[2])
-{
-       pid_t ppid = getppid();
-       int status;
-       char *mem;
-       long ret;
-
-       if (read(pipefd[0], &mem, sizeof(mem)) < 0) {
-               perror("pipe write");
-               exit(KSFT_FAIL);
-       }
-
-       ret = ptrace(PTRACE_ATTACH, ppid, 0, 0);
-       if (ret) {
-               perror("ptrace_attach");
-               exit(KSFT_FAIL);
-       }
-
-       ret = waitpid(ppid, &status, WUNTRACED);
-       if ((ret != ppid) || !(WIFSTOPPED(status))) {
-               fprintf(stderr, "weird waitppid result %ld stat %x\n",
-                       ret, status);
-               exit(KSFT_FAIL);
-       }
-
-       if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0))
-               exit(KSFT_PASS);
-
-       exit(KSFT_FAIL);
-}
-
-static void check_child_status(pid_t pid, const char *name)
-{
-       int status;
-
-       waitpid(pid, &status, 0);
-
-       if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) {
-               skip("%s is not supported\n", name);
-               return;
-       }
-
-       if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) ||
-           WIFSIGNALED(status)) {
-               pass("%s is blocked as expected\n", name);
-               return;
-       }
-
-       fail("%s: unexpected memory access\n", name);
-}
-
/*
 * Fork a child running @func, then map one secret page, fill it with
 * PATTERN, and hand the child its address through a pipe so the child
 * can attempt a remote access.  The child's exit status decides
 * pass/skip/fail via check_child_status().
 */
static void test_remote_access(int fd, const char *name,
			       void (*func)(int fd, int pipefd[2]))
{
	int pipefd[2];
	pid_t pid;
	char *mem;

	if (pipe(pipefd)) {
		fail("pipe failed: %s\n", strerror(errno));
		return;
	}

	pid = fork();
	if (pid < 0) {
		fail("fork failed: %s\n", strerror(errno));
		return;
	}

	if (pid == 0) {
		/* Child: every try_* func ends in exit(), so no return. */
		func(fd, pipefd);
		return;
	}

	mem = mmap(NULL, page_size, prot, mode, fd, 0);
	if (mem == MAP_FAILED) {
		/* NOTE(review): on this path the forked child is left
		 * blocked reading the pipe — confirm intended. */
		fail("Unable to mmap secret memory\n");
		return;
	}

	/* NOTE(review): ftruncate() result is ignored; presumably sizing
	 * the memfd before the memset — verify. */
	ftruncate(fd, page_size);
	memset(mem, PATTERN, page_size);

	/* Publish the secret page's address to the child. */
	if (write(pipefd[1], &mem, sizeof(mem)) < 0) {
		fail("pipe write: %s\n", strerror(errno));
		return;
	}

	check_child_status(pid, name);
}
-
/* Check that process_vm_readv() cannot read secret memory remotely. */
static void test_process_vm_read(int fd)
{
	test_remote_access(fd, "process_vm_read", try_process_vm_read);
}
-
/* Check that ptrace(PTRACE_PEEKDATA) cannot read secret memory. */
static void test_ptrace(int fd)
{
	test_remote_access(fd, "ptrace", try_ptrace);
}
-
-static int set_cap_limits(rlim_t max)
-{
-       struct rlimit new;
-       cap_t cap = cap_init();
-
-       new.rlim_cur = max;
-       new.rlim_max = max;
-       if (setrlimit(RLIMIT_MEMLOCK, &new)) {
-               perror("setrlimit() returns error");
-               return -1;
-       }
-
-       /* drop capabilities including CAP_IPC_LOCK */
-       if (cap_set_proc(cap)) {
-               perror("cap_set_proc() returns error");
-               return -2;
-       }
-
-       return 0;
-}
-
-static void prepare(void)
-{
-       struct rlimit rlim;
-
-       page_size = sysconf(_SC_PAGE_SIZE);
-       if (!page_size)
-               ksft_exit_fail_msg("Failed to get page size %s\n",
-                                  strerror(errno));
-
-       if (getrlimit(RLIMIT_MEMLOCK, &rlim))
-               ksft_exit_fail_msg("Unable to detect mlock limit: %s\n",
-                                  strerror(errno));
-
-       mlock_limit_cur = rlim.rlim_cur;
-       mlock_limit_max = rlim.rlim_max;
-
-       printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n",
-              page_size, mlock_limit_cur, mlock_limit_max);
-
-       if (page_size > mlock_limit_cur)
-               mlock_limit_cur = page_size;
-       if (page_size > mlock_limit_max)
-               mlock_limit_max = page_size;
-
-       if (set_cap_limits(mlock_limit_max))
-               ksft_exit_fail_msg("Unable to set mlock limit: %s\n",
-                                  strerror(errno));
-}
-
-#define NUM_TESTS 4
-
/*
 * Entry point: create a secret memory fd and run the four checks
 * (mlock limit, file I/O blocking, process_vm_readv, ptrace).
 * Skips the whole test when the kernel lacks memfd_secret(2).
 */
int main(int argc, char *argv[])
{
	int fd;

	prepare();

	ksft_print_header();
	ksft_set_plan(NUM_TESTS);

	fd = memfd_secret(0);
	if (fd < 0) {
		/* ENOSYS: kernel built without CONFIG_SECRETMEM or too old */
		if (errno == ENOSYS)
			ksft_exit_skip("memfd_secret is not supported\n");
		else
			ksft_exit_fail_msg("memfd_secret failed: %s\n",
					   strerror(errno));
	}

	test_mlock_limit(fd);
	test_file_apis(fd);
	test_process_vm_read(fd);
	test_ptrace(fd);

	close(fd);

	ksft_finished();
}
-
-#else /* __NR_memfd_secret */
-
-int main(int argc, char *argv[])
-{
-       printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n");
-       return KSFT_SKIP;
-}
-
-#endif /* __NR_memfd_secret */
diff --git a/tools/testing/selftests/vm/migration.c b/tools/testing/selftests/vm/migration.c
deleted file mode 100644 (file)
index 1cec842..0000000
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * The main purpose of the tests here is to exercise the migration entry code
- * paths in the kernel.
- */
-
-#include "../kselftest_harness.h"
-#include <strings.h>
-#include <pthread.h>
-#include <numa.h>
-#include <numaif.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <time.h>
-
-#define TWOMEG (2<<20)
-#define RUNTIME (60)
-
-#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
-
/* Per-test state shared by all migration tests. */
FIXTURE(migration)
{
	pthread_t *threads;	/* accessor threads (private/thp tests) */
	pid_t *pids;		/* accessor child pids (shared_anon test) */
	int nthreads;		/* number of accessors to spawn */
	int n1;			/* first NUMA node to migrate between */
	int n2;			/* second NUMA node */
};
-
/* Discover NUMA topology and size the accessor arrays. */
FIXTURE_SETUP(migration)
{
	int n;

	ASSERT_EQ(numa_available(), 0);
	/* Leave one CPU for the thread doing the migration itself. */
	self->nthreads = numa_num_task_cpus() - 1;
	self->n1 = -1;
	self->n2 = -1;

	/* Pick the first two NUMA nodes that are actually present. */
	for (n = 0; n < numa_max_possible_node(); n++)
		if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) {
			if (self->n1 == -1) {
				self->n1 = n;
			} else {
				self->n2 = n;
				break;
			}
		}

	self->threads = malloc(self->nthreads * sizeof(*self->threads));
	ASSERT_NE(self->threads, NULL);
	self->pids = malloc(self->nthreads * sizeof(*self->pids));
	ASSERT_NE(self->pids, NULL);
};
-
/* Release the per-test accessor arrays. */
FIXTURE_TEARDOWN(migration)
{
	free(self->threads);
	free(self->pids);
}
-
-int migrate(uint64_t *ptr, int n1, int n2)
-{
-       int ret, tmp;
-       int status = 0;
-       struct timespec ts1, ts2;
-
-       if (clock_gettime(CLOCK_MONOTONIC, &ts1))
-               return -1;
-
-       while (1) {
-               if (clock_gettime(CLOCK_MONOTONIC, &ts2))
-                       return -1;
-
-               if (ts2.tv_sec - ts1.tv_sec >= RUNTIME)
-                       return 0;
-
-               ret = move_pages(0, 1, (void **) &ptr, &n2, &status,
-                               MPOL_MF_MOVE_ALL);
-               if (ret) {
-                       if (ret > 0)
-                               printf("Didn't migrate %d pages\n", ret);
-                       else
-                               perror("Couldn't migrate pages");
-                       return -2;
-               }
-
-               tmp = n2;
-               n2 = n1;
-               n1 = tmp;
-       }
-
-       return 0;
-}
-
/*
 * Accessor thread body: spin reading *ptr forever so the thread keeps
 * hitting migration entries; pthread_testcancel() is the cancellation
 * point the tests use to stop it.
 */
void *access_mem(void *ptr)
{
	volatile uint64_t *cell = ptr;
	uint64_t sink = 0;

	for (;;) {
		pthread_testcancel();
		sink += *cell;
	}

	return NULL;	/* unreachable; kept to satisfy the prototype */
}
-
-/*
- * Basic migration entry testing. One thread will move pages back and forth
- * between nodes whilst other threads try and access them triggering the
- * migration entry wait paths in the kernel.
- */
TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME)
{
	uint64_t *ptr;
	int i;

	/* Need at least two NUMA nodes and two CPUs to be meaningful. */
	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
		SKIP(return, "Not enough threads or NUMA nodes available");

	ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(ptr, MAP_FAILED);

	/* Fault the pages in before starting the accessor threads. */
	memset(ptr, 0xde, TWOMEG);
	for (i = 0; i < self->nthreads - 1; i++)
		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
			perror("Couldn't create thread");

	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
	/* Accessors spin forever; cancel them once migration is done. */
	for (i = 0; i < self->nthreads - 1; i++)
		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
}
-
-/*
- * Same as the previous test but with shared memory.
- */
-TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME)
-{
-       pid_t pid;
-       uint64_t *ptr;
-       int i;
-
-       if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
-               SKIP(return, "Not enough threads or NUMA nodes available");
-
-       ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE,
-               MAP_SHARED | MAP_ANONYMOUS, -1, 0);
-       ASSERT_NE(ptr, MAP_FAILED);
-
-       memset(ptr, 0xde, TWOMEG);
-       for (i = 0; i < self->nthreads - 1; i++) {
-               pid = fork();
-               if (!pid)
-                       access_mem(ptr);
-               else
-                       self->pids[i] = pid;
-       }
-
-       ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
-       for (i = 0; i < self->nthreads - 1; i++)
-               ASSERT_EQ(kill(self->pids[i], SIGTERM), 0);
-}
-
-/*
- * Tests the pmd migration entry paths.
- */
TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
{
	uint64_t *ptr;
	int i;

	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
		SKIP(return, "Not enough threads or NUMA nodes available");

	/* Over-allocate so a 2MB-aligned sub-range is guaranteed to fit. */
	ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(ptr, MAP_FAILED);

	/* THP needs pmd alignment: round up to the next 2MB boundary. */
	ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
	ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
	memset(ptr, 0xde, TWOMEG);
	for (i = 0; i < self->nthreads - 1; i++)
		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
			perror("Couldn't create thread");

	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
	for (i = 0; i < self->nthreads - 1; i++)
		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
}
-
-TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
deleted file mode 100644 (file)
index 782ea94..0000000
+++ /dev/null
@@ -1,294 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * It tests the mlock/mlock2() when they are invoked
- * on randomly memory region.
- */
-#include <unistd.h>
-#include <sys/resource.h>
-#include <sys/capability.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <time.h>
-#include "mlock2.h"
-
-#define CHUNK_UNIT (128 * 1024)
-#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
-#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
-#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
-
-#define TEST_LOOP 100
-#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
-
-int set_cap_limits(rlim_t max)
-{
-       struct rlimit new;
-       cap_t cap = cap_init();
-
-       new.rlim_cur = max;
-       new.rlim_max = max;
-       if (setrlimit(RLIMIT_MEMLOCK, &new)) {
-               perror("setrlimit() returns error\n");
-               return -1;
-       }
-
-       /* drop capabilities including CAP_IPC_LOCK */
-       if (cap_set_proc(cap)) {
-               perror("cap_set_proc() returns error\n");
-               return -2;
-       }
-
-       return 0;
-}
-
/*
 * Parse the VmLck field of /proc/self/status.
 *
 * Returns the currently locked size in bytes, or -1 on error.  Note the
 * int return caps the usable range at INT_MAX (~2 GB of locked memory).
 */
int get_proc_locked_vm_size(void)
{
	FILE *f;
	int ret = -1;
	char line[1024] = {0};
	unsigned long lock_size = 0;

	f = fopen("/proc/self/status", "r");
	if (!f) {
		perror("fopen");
		return -1;
	}

	while (fgets(line, 1024, f)) {
		if (strstr(line, "VmLck")) {
			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
			if (ret <= 0) {
				printf("sscanf() on VmLck error: %s: %d\n",
						line, ret);
				fclose(f);
				return -1;
			}
			fclose(f);
			return (int)(lock_size << 10);	/* kB -> bytes */
		}
	}

	/* a parse miss sets no errno, so perror() would print noise */
	fprintf(stderr, "cannot parse VmLck in /proc/self/status\n");
	fclose(f);
	return -1;
}
-
-/*
- * Get the MMUPageSize of the memory region including input
- * address from proc file.
- *
- * return value: on error case, 0 will be returned.
- * Otherwise the page size(in bytes) is returned.
- */
/*
 * Get the MMUPageSize of the memory region including input
 * address from proc file.
 *
 * return value: on error case, 0 will be returned.
 * Otherwise the page size(in bytes) is returned.
 */
int get_proc_page_size(unsigned long addr)
{
	FILE *smaps;
	char *line = NULL;	/* getline() requires NULL/0 initial state */
	size_t size = 0;
	unsigned long mmupage_size = 0;

	smaps = seek_to_smaps_entry(addr);
	if (!smaps) {
		printf("Unable to parse /proc/self/smaps\n");
		return 0;
	}

	while (getline(&line, &size, smaps) > 0) {
		/* getline() reuses and grows the buffer; no free needed */
		if (!strstr(line, "MMUPageSize"))
			continue;

		/* found the MMUPageSize of this section */
		if (sscanf(line, "MMUPageSize:    %8lu kB",
					&mmupage_size) < 1)
			printf("Unable to parse smaps entry for Size:%s\n",
					line);
		/* stop here: further lines belong to later mappings */
		break;
	}

	free(line);
	fclose(smaps);		/* known non-NULL at this point */
	return mmupage_size << 10;	/* kB -> bytes */
}
-
-/*
- * Test mlock/mlock2() on provided memory chunk.
- * It expects the mlock/mlock2() to be successful (within rlimit)
- *
- * With allocated memory chunk [p, p + alloc_size), this
- * test will choose start/len randomly to perform mlock/mlock2
- * [start, start +  len] memory range. The range is within range
- * of the allocated chunk.
- *
- * The memory region size alloc_size is within the rlimit.
- * So we always expect a success of mlock/mlock2.
- *
- * VmLck is assumed to be 0 before this test.
- *
- *    return value: 0 - success
- *    else: failure
- */
int test_mlock_within_limit(char *p, int alloc_size)
{
	int i;
	int ret = 0;
	int locked_vm_size = 0;
	struct rlimit cur;
	int page_size = 0;

	/* Precondition: the whole chunk must fit under the rlimit. */
	getrlimit(RLIMIT_MEMLOCK, &cur);
	if (cur.rlim_cur < alloc_size) {
		printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n",
				alloc_size, (unsigned int)cur.rlim_cur);
		return -1;
	}

	srand(time(NULL));
	for (i = 0; i < TEST_LOOP; i++) {
		/*
		 * - choose mlock/mlock2 randomly
		 * - choose lock_size randomly but lock_size < alloc_size
		 * - choose start_offset randomly but p+start_offset+lock_size
		 *   < p+alloc_size
		 */
		int is_mlock = !!(rand() % 2);
		int lock_size = rand() % alloc_size;
		int start_offset = rand() % (alloc_size - lock_size);

		if (is_mlock)
			ret = mlock(p + start_offset, lock_size);
		else
			ret = mlock2_(p + start_offset, lock_size,
				       MLOCK_ONFAULT);

		if (ret) {
			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
					is_mlock ? "mlock" : "mlock2",
					p, alloc_size,
					p + start_offset, lock_size);
			return ret;
		}
	}

	/*
	 * Check VmLck left by the tests.
	 */
	locked_vm_size = get_proc_locked_vm_size();
	page_size = get_proc_page_size((unsigned long)p);
	if (page_size == 0) {
		printf("cannot get proc MMUPageSize\n");
		return -1;
	}

	/* Allow one extra page of slack for unaligned chunk edges. */
	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
				locked_vm_size, alloc_size);
		return -1;
	}

	return 0;
}
-
-
-/*
- * We expect the mlock/mlock2() to be fail (outof limitation)
- *
- * With allocated memory chunk [p, p + alloc_size), this
- * test will randomly choose start/len and perform mlock/mlock2
- * on [start, start+len] range.
- *
- * The memory region size alloc_size is above the rlimit.
- * And the len to be locked is higher than rlimit.
- * So we always expect a failure of mlock/mlock2.
- * No locked page number should be increased as a side effect.
- *
- *    return value: 0 - success
- *    else: failure
- */
int test_mlock_outof_limit(char *p, int alloc_size)
{
	int i;
	int ret = 0;
	int locked_vm_size = 0, old_locked_vm_size = 0;
	struct rlimit cur;

	/* Precondition: the chunk must be larger than the rlimit. */
	getrlimit(RLIMIT_MEMLOCK, &cur);
	if (cur.rlim_cur >= alloc_size) {
		printf("alloc_size[%d] >%u rlimit, violates test condition\n",
				alloc_size, (unsigned int)cur.rlim_cur);
		return -1;
	}

	/* Baseline VmLck: it must be unchanged after the failed locks. */
	old_locked_vm_size = get_proc_locked_vm_size();
	srand(time(NULL));
	for (i = 0; i < TEST_LOOP; i++) {
		int is_mlock = !!(rand() % 2);
		/* lock_size >= rlim_cur by construction, so it must fail */
		int lock_size = (rand() % (alloc_size - cur.rlim_cur))
			+ cur.rlim_cur;
		int start_offset = rand() % (alloc_size - lock_size);

		if (is_mlock)
			ret = mlock(p + start_offset, lock_size);
		else
			ret = mlock2_(p + start_offset, lock_size,
					MLOCK_ONFAULT);
		if (ret == 0) {
			printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
					is_mlock ? "mlock" : "mlock2",
					p, alloc_size,
					p + start_offset, lock_size);
			return -1;
		}
	}

	locked_vm_size = get_proc_locked_vm_size();
	if (locked_vm_size != old_locked_vm_size) {
		printf("tests leads to new mlocked page: old[%d], new[%d]\n",
				old_locked_vm_size,
				locked_vm_size);
		return -1;
	}

	return 0;
}
-
-int main(int argc, char **argv)
-{
-       char *p = NULL;
-       int ret = 0;
-
-       if (set_cap_limits(MLOCK_RLIMIT_SIZE))
-               return -1;
-
-       p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
-       if (p == NULL) {
-               perror("malloc() failure\n");
-               return -1;
-       }
-       ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
-       if (ret)
-               return ret;
-       munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
-       free(p);
-
-
-       p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
-       if (p == NULL) {
-               perror("malloc() failure\n");
-               return -1;
-       }
-       ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
-       if (ret)
-               return ret;
-       munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
-       free(p);
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
deleted file mode 100644 (file)
index 11b2301..0000000
+++ /dev/null
@@ -1,520 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sys/mman.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <stdbool.h>
-#include "mlock2.h"
-
-#include "../kselftest.h"
-
-struct vm_boundaries {
-       unsigned long start;
-       unsigned long end;
-};
-
-static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
-{
-       FILE *file;
-       int ret = 1;
-       char line[1024] = {0};
-       char *end_addr;
-       char *stop;
-       unsigned long start;
-       unsigned long end;
-
-       if (!area)
-               return ret;
-
-       file = fopen("/proc/self/maps", "r");
-       if (!file) {
-               perror("fopen");
-               return ret;
-       }
-
-       memset(area, 0, sizeof(struct vm_boundaries));
-
-       while(fgets(line, 1024, file)) {
-               end_addr = strchr(line, '-');
-               if (!end_addr) {
-                       printf("cannot parse /proc/self/maps\n");
-                       goto out;
-               }
-               *end_addr = '\0';
-               end_addr++;
-               stop = strchr(end_addr, ' ');
-               if (!stop) {
-                       printf("cannot parse /proc/self/maps\n");
-                       goto out;
-               }
-               stop = '\0';
-
-               sscanf(line, "%lx", &start);
-               sscanf(end_addr, "%lx", &end);
-
-               if (start <= addr && end > addr) {
-                       area->start = start;
-                       area->end = end;
-                       ret = 0;
-                       goto out;
-               }
-       }
-out:
-       fclose(file);
-       return ret;
-}
-
-#define VMFLAGS "VmFlags:"
-
-static bool is_vmflag_set(unsigned long addr, const char *vmflag)
-{
-       char *line = NULL;
-       char *flags;
-       size_t size = 0;
-       bool ret = false;
-       FILE *smaps;
-
-       smaps = seek_to_smaps_entry(addr);
-       if (!smaps) {
-               printf("Unable to parse /proc/self/smaps\n");
-               goto out;
-       }
-
-       while (getline(&line, &size, smaps) > 0) {
-               if (!strstr(line, VMFLAGS)) {
-                       free(line);
-                       line = NULL;
-                       size = 0;
-                       continue;
-               }
-
-               flags = line + strlen(VMFLAGS);
-               ret = (strstr(flags, vmflag) != NULL);
-               goto out;
-       }
-
-out:
-       free(line);
-       fclose(smaps);
-       return ret;
-}
-
-#define SIZE "Size:"
-#define RSS  "Rss:"
-#define LOCKED "lo"
-
-static unsigned long get_value_for_name(unsigned long addr, const char *name)
-{
-       char *line = NULL;
-       size_t size = 0;
-       char *value_ptr;
-       FILE *smaps = NULL;
-       unsigned long value = -1UL;
-
-       smaps = seek_to_smaps_entry(addr);
-       if (!smaps) {
-               printf("Unable to parse /proc/self/smaps\n");
-               goto out;
-       }
-
-       while (getline(&line, &size, smaps) > 0) {
-               if (!strstr(line, name)) {
-                       free(line);
-                       line = NULL;
-                       size = 0;
-                       continue;
-               }
-
-               value_ptr = line + strlen(name);
-               if (sscanf(value_ptr, "%lu kB", &value) < 1) {
-                       printf("Unable to parse smaps entry for Size\n");
-                       goto out;
-               }
-               break;
-       }
-
-out:
-       if (smaps)
-               fclose(smaps);
-       free(line);
-       return value;
-}
-
-static bool is_vma_lock_on_fault(unsigned long addr)
-{
-       bool locked;
-       unsigned long vma_size, vma_rss;
-
-       locked = is_vmflag_set(addr, LOCKED);
-       if (!locked)
-               return false;
-
-       vma_size = get_value_for_name(addr, SIZE);
-       vma_rss = get_value_for_name(addr, RSS);
-
-       /* only one page is faulted in */
-       return (vma_rss < vma_size);
-}
-
-#define PRESENT_BIT     0x8000000000000000ULL
-#define PFN_MASK        0x007FFFFFFFFFFFFFULL
-#define UNEVICTABLE_BIT (1UL << 18)
-
-static int lock_check(unsigned long addr)
-{
-       bool locked;
-       unsigned long vma_size, vma_rss;
-
-       locked = is_vmflag_set(addr, LOCKED);
-       if (!locked)
-               return false;
-
-       vma_size = get_value_for_name(addr, SIZE);
-       vma_rss = get_value_for_name(addr, RSS);
-
-       return (vma_rss == vma_size);
-}
-
-static int unlock_lock_check(char *map)
-{
-       if (is_vmflag_set((unsigned long)map, LOCKED)) {
-               printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
-               return 1;
-       }
-
-       return 0;
-}
-
-static int test_mlock_lock()
-{
-       char *map;
-       int ret = 1;
-       unsigned long page_size = getpagesize();
-
-       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-       if (map == MAP_FAILED) {
-               perror("test_mlock_locked mmap");
-               goto out;
-       }
-
-       if (mlock2_(map, 2 * page_size, 0)) {
-               if (errno == ENOSYS) {
-                       printf("Cannot call new mlock family, skipping test\n");
-                       _exit(KSFT_SKIP);
-               }
-               perror("mlock2(0)");
-               goto unmap;
-       }
-
-       if (!lock_check((unsigned long)map))
-               goto unmap;
-
-       /* Now unlock and recheck attributes */
-       if (munlock(map, 2 * page_size)) {
-               perror("munlock()");
-               goto unmap;
-       }
-
-       ret = unlock_lock_check(map);
-
-unmap:
-       munmap(map, 2 * page_size);
-out:
-       return ret;
-}
-
-static int onfault_check(char *map)
-{
-       *map = 'a';
-       if (!is_vma_lock_on_fault((unsigned long)map)) {
-               printf("VMA is not marked for lock on fault\n");
-               return 1;
-       }
-
-       return 0;
-}
-
-static int unlock_onfault_check(char *map)
-{
-       unsigned long page_size = getpagesize();
-
-       if (is_vma_lock_on_fault((unsigned long)map) ||
-           is_vma_lock_on_fault((unsigned long)map + page_size)) {
-               printf("VMA is still lock on fault after unlock\n");
-               return 1;
-       }
-
-       return 0;
-}
-
-static int test_mlock_onfault()
-{
-       char *map;
-       int ret = 1;
-       unsigned long page_size = getpagesize();
-
-       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-       if (map == MAP_FAILED) {
-               perror("test_mlock_locked mmap");
-               goto out;
-       }
-
-       if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
-               if (errno == ENOSYS) {
-                       printf("Cannot call new mlock family, skipping test\n");
-                       _exit(KSFT_SKIP);
-               }
-               perror("mlock2(MLOCK_ONFAULT)");
-               goto unmap;
-       }
-
-       if (onfault_check(map))
-               goto unmap;
-
-       /* Now unlock and recheck attributes */
-       if (munlock(map, 2 * page_size)) {
-               if (errno == ENOSYS) {
-                       printf("Cannot call new mlock family, skipping test\n");
-                       _exit(KSFT_SKIP);
-               }
-               perror("munlock()");
-               goto unmap;
-       }
-
-       ret = unlock_onfault_check(map);
-unmap:
-       munmap(map, 2 * page_size);
-out:
-       return ret;
-}
-
-static int test_lock_onfault_of_present()
-{
-       char *map;
-       int ret = 1;
-       unsigned long page_size = getpagesize();
-
-       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-       if (map == MAP_FAILED) {
-               perror("test_mlock_locked mmap");
-               goto out;
-       }
-
-       *map = 'a';
-
-       if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
-               if (errno == ENOSYS) {
-                       printf("Cannot call new mlock family, skipping test\n");
-                       _exit(KSFT_SKIP);
-               }
-               perror("mlock2(MLOCK_ONFAULT)");
-               goto unmap;
-       }
-
-       if (!is_vma_lock_on_fault((unsigned long)map) ||
-           !is_vma_lock_on_fault((unsigned long)map + page_size)) {
-               printf("VMA with present pages is not marked lock on fault\n");
-               goto unmap;
-       }
-       ret = 0;
-unmap:
-       munmap(map, 2 * page_size);
-out:
-       return ret;
-}
-
-static int test_munlockall()
-{
-       char *map;
-       int ret = 1;
-       unsigned long page_size = getpagesize();
-
-       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-       if (map == MAP_FAILED) {
-               perror("test_munlockall mmap");
-               goto out;
-       }
-
-       if (mlockall(MCL_CURRENT)) {
-               perror("mlockall(MCL_CURRENT)");
-               goto out;
-       }
-
-       if (!lock_check((unsigned long)map))
-               goto unmap;
-
-       if (munlockall()) {
-               perror("munlockall()");
-               goto unmap;
-       }
-
-       if (unlock_lock_check(map))
-               goto unmap;
-
-       munmap(map, 2 * page_size);
-
-       map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
-                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-       if (map == MAP_FAILED) {
-               perror("test_munlockall second mmap");
-               goto out;
-       }
-
-       if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
-               perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
-               goto unmap;
-       }
-
-       if (onfault_check(map))
-               goto unmap;
-
-       if (munlockall()) {
-               perror("munlockall()");
-               goto unmap;
-       }
-
-       if (unlock_onfault_check(map))
-               goto unmap;
-
-       if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
-               perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
-               goto out;
-       }
-
-       if (!lock_check((unsigned long)map))
-               goto unmap;
-
-       if (munlockall()) {
-               perror("munlockall()");
-               goto unmap;
-       }
-
-       ret = unlock_lock_check(map);
-
-unmap:
-       munmap(map, 2 * page_size);
-out:
-       munlockall();
-       return ret;
-}
-
-static int test_vma_management(bool call_mlock)
-{
-       int ret = 1;
-       void *map;
-       unsigned long page_size = getpagesize();
-       struct vm_boundaries page1;
-       struct vm_boundaries page2;
-       struct vm_boundaries page3;
-
-       map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
-                  MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-       if (map == MAP_FAILED) {
-               perror("mmap()");
-               return ret;
-       }
-
-       if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
-               if (errno == ENOSYS) {
-                       printf("Cannot call new mlock family, skipping test\n");
-                       _exit(KSFT_SKIP);
-               }
-               perror("mlock(ONFAULT)\n");
-               goto out;
-       }
-
-       if (get_vm_area((unsigned long)map, &page1) ||
-           get_vm_area((unsigned long)map + page_size, &page2) ||
-           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-               printf("couldn't find mapping in /proc/self/maps\n");
-               goto out;
-       }
-
-       /*
-        * Before we unlock a portion, we need to that all three pages are in
-        * the same VMA.  If they are not we abort this test (Note that this is
-        * not a failure)
-        */
-       if (page1.start != page2.start || page2.start != page3.start) {
-               printf("VMAs are not merged to start, aborting test\n");
-               ret = 0;
-               goto out;
-       }
-
-       if (munlock(map + page_size, page_size)) {
-               perror("munlock()");
-               goto out;
-       }
-
-       if (get_vm_area((unsigned long)map, &page1) ||
-           get_vm_area((unsigned long)map + page_size, &page2) ||
-           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-               printf("couldn't find mapping in /proc/self/maps\n");
-               goto out;
-       }
-
-       /* All three VMAs should be different */
-       if (page1.start == page2.start || page2.start == page3.start) {
-               printf("failed to split VMA for munlock\n");
-               goto out;
-       }
-
-       /* Now unlock the first and third page and check the VMAs again */
-       if (munlock(map, page_size * 3)) {
-               perror("munlock()");
-               goto out;
-       }
-
-       if (get_vm_area((unsigned long)map, &page1) ||
-           get_vm_area((unsigned long)map + page_size, &page2) ||
-           get_vm_area((unsigned long)map + page_size * 2, &page3)) {
-               printf("couldn't find mapping in /proc/self/maps\n");
-               goto out;
-       }
-
-       /* Now all three VMAs should be the same */
-       if (page1.start != page2.start || page2.start != page3.start) {
-               printf("failed to merge VMAs after munlock\n");
-               goto out;
-       }
-
-       ret = 0;
-out:
-       munmap(map, 3 * page_size);
-       return ret;
-}
-
-static int test_mlockall(int (test_function)(bool call_mlock))
-{
-       int ret = 1;
-
-       if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) {
-               perror("mlockall");
-               return ret;
-       }
-
-       ret = test_function(false);
-       munlockall();
-       return ret;
-}
-
-int main(int argc, char **argv)
-{
-       int ret = 0;
-       ret += test_mlock_lock();
-       ret += test_mlock_onfault();
-       ret += test_munlockall();
-       ret += test_lock_onfault_of_present();
-       ret += test_vma_management(true);
-       ret += test_mlockall(test_vma_management);
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h
deleted file mode 100644 (file)
index 2a6e76c..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <syscall.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifndef MLOCK_ONFAULT
-#define MLOCK_ONFAULT 1
-#endif
-
-#ifndef MCL_ONFAULT
-#define MCL_ONFAULT (MCL_FUTURE << 1)
-#endif
-
-static int mlock2_(void *start, size_t len, int flags)
-{
-#ifdef __NR_mlock2
-       return syscall(__NR_mlock2, start, len, flags);
-#else
-       errno = ENOSYS;
-       return -1;
-#endif
-}
-
-static FILE *seek_to_smaps_entry(unsigned long addr)
-{
-       FILE *file;
-       char *line = NULL;
-       size_t size = 0;
-       unsigned long start, end;
-       char perms[5];
-       unsigned long offset;
-       char dev[32];
-       unsigned long inode;
-       char path[BUFSIZ];
-
-       file = fopen("/proc/self/smaps", "r");
-       if (!file) {
-               perror("fopen smaps");
-               _exit(1);
-       }
-
-       while (getline(&line, &size, file) > 0) {
-               if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
-                          &start, &end, perms, &offset, dev, &inode, path) < 6)
-                       goto next;
-
-               if (start <= addr && addr < end)
-                       goto out;
-
-next:
-               free(line);
-               line = NULL;
-               size = 0;
-       }
-
-       fclose(file);
-       file = NULL;
-
-out:
-       free(line);
-       return file;
-}
diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c
deleted file mode 100644 (file)
index 6c62966..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2022 Google LLC
- */
-#define _GNU_SOURCE
-#include <errno.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-#include "util.h"
-
-#include "../kselftest.h"
-
-#ifndef __NR_pidfd_open
-#define __NR_pidfd_open -1
-#endif
-
-#ifndef __NR_process_mrelease
-#define __NR_process_mrelease -1
-#endif
-
-#define MB(x) (x << 20)
-#define MAX_SIZE_MB 1024
-
-static int alloc_noexit(unsigned long nr_pages, int pipefd)
-{
-       int ppid = getppid();
-       int timeout = 10; /* 10sec timeout to get killed */
-       unsigned long i;
-       char *buf;
-
-       buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANON, 0, 0);
-       if (buf == MAP_FAILED) {
-               perror("mmap failed, halting the test");
-               return KSFT_FAIL;
-       }
-
-       for (i = 0; i < nr_pages; i++)
-               *((unsigned long *)(buf + (i * PAGE_SIZE))) = i;
-
-       /* Signal the parent that the child is ready */
-       if (write(pipefd, "", 1) < 0) {
-               perror("write");
-               return KSFT_FAIL;
-       }
-
-       /* Wait to be killed (when reparenting happens) */
-       while (getppid() == ppid && timeout > 0) {
-               sleep(1);
-               timeout--;
-       }
-
-       munmap(buf, nr_pages * PAGE_SIZE);
-
-       return (timeout > 0) ? KSFT_PASS : KSFT_FAIL;
-}
-
-/* The process_mrelease calls in this test are expected to fail */
-static void run_negative_tests(int pidfd)
-{
-       int res;
-       /* Test invalid flags. Expect to fail with EINVAL error code. */
-       if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) ||
-                       errno != EINVAL) {
-               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-               perror("process_mrelease with wrong flags");
-               exit(res);
-       }
-       /*
-        * Test reaping while process is alive with no pending SIGKILL.
-        * Expect to fail with EINVAL error code.
-        */
-       if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) {
-               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-               perror("process_mrelease on a live process");
-               exit(res);
-       }
-}
-
-static int child_main(int pipefd[], size_t size)
-{
-       int res;
-
-       /* Allocate and fault-in memory and wait to be killed */
-       close(pipefd[0]);
-       res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]);
-       close(pipefd[1]);
-       return res;
-}
-
-int main(void)
-{
-       int pipefd[2], pidfd;
-       bool success, retry;
-       size_t size;
-       pid_t pid;
-       char byte;
-       int res;
-
-       /* Test a wrong pidfd */
-       if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) {
-               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-               perror("process_mrelease with wrong pidfd");
-               exit(res);
-       }
-
-       /* Start the test with 1MB child memory allocation */
-       size = 1;
-retry:
-       /*
-        * Pipe for the child to signal when it's done allocating
-        * memory
-        */
-       if (pipe(pipefd)) {
-               perror("pipe");
-               exit(KSFT_FAIL);
-       }
-       pid = fork();
-       if (pid < 0) {
-               perror("fork");
-               close(pipefd[0]);
-               close(pipefd[1]);
-               exit(KSFT_FAIL);
-       }
-
-       if (pid == 0) {
-               /* Child main routine */
-               res = child_main(pipefd, size);
-               exit(res);
-       }
-
-       /*
-        * Parent main routine:
-        * Wait for the child to finish allocations, then kill and reap
-        */
-       close(pipefd[1]);
-       /* Block until the child is ready */
-       res = read(pipefd[0], &byte, 1);
-       close(pipefd[0]);
-       if (res < 0) {
-               perror("read");
-               if (!kill(pid, SIGKILL))
-                       waitpid(pid, NULL, 0);
-               exit(KSFT_FAIL);
-       }
-
-       pidfd = syscall(__NR_pidfd_open, pid, 0);
-       if (pidfd < 0) {
-               perror("pidfd_open");
-               if (!kill(pid, SIGKILL))
-                       waitpid(pid, NULL, 0);
-               exit(KSFT_FAIL);
-       }
-
-       /* Run negative tests which require a live child */
-       run_negative_tests(pidfd);
-
-       if (kill(pid, SIGKILL)) {
-               res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-               perror("kill");
-               exit(res);
-       }
-
-       success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
-       if (!success) {
-               /*
-                * If we failed to reap because the child exited too soon,
-                * before we could call process_mrelease. Double child's memory
-                * which causes it to spend more time on cleanup and increases
-                * our chances of reaping its memory before it exits.
-                * Retry until we succeed or reach MAX_SIZE_MB.
-                */
-               if (errno == ESRCH) {
-                       retry = (size <= MAX_SIZE_MB);
-               } else {
-                       res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
-                       perror("process_mrelease");
-                       waitpid(pid, NULL, 0);
-                       exit(res);
-               }
-       }
-
-       /* Cleanup to prevent zombies */
-       if (waitpid(pid, NULL, 0) < 0) {
-               perror("waitpid");
-               exit(KSFT_FAIL);
-       }
-       close(pidfd);
-
-       if (!success) {
-               if (retry) {
-                       size *= 2;
-                       goto retry;
-               }
-               printf("All process_mrelease attempts failed!\n");
-               exit(KSFT_FAIL);
-       }
-
-       printf("Success reaping a child with %zuMB of memory allocations\n",
-              size);
-       return KSFT_PASS;
-}
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
deleted file mode 100644 (file)
index f01dc4a..0000000
+++ /dev/null
@@ -1,364 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * Tests for mremap w/ MREMAP_DONTUNMAP.
- *
- * Copyright 2020, Brian Geffon <bgeffon@google.com>
- */
-#define _GNU_SOURCE
-#include <sys/mman.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "../kselftest.h"
-
-#ifndef MREMAP_DONTUNMAP
-#define MREMAP_DONTUNMAP 4
-#endif
-
-unsigned long page_size;
-char *page_buffer;
-
-static void dump_maps(void)
-{
-       char cmd[32];
-
-       snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
-       system(cmd);
-}
-
-#define BUG_ON(condition, description)                                       \
-       do {                                                                  \
-               if (condition) {                                              \
-                       fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
-                               __LINE__, (description), strerror(errno));    \
-                       dump_maps();                                      \
-                       exit(1);                                              \
-               }                                                             \
-       } while (0)
-
-// Try a simple operation for to "test" for kernel support this prevents
-// reporting tests as failed when it's run on an older kernel.
-static int kernel_support_for_mremap_dontunmap()
-{
-       int ret = 0;
-       unsigned long num_pages = 1;
-       void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
-                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-       // This simple remap should only fail if MREMAP_DONTUNMAP isn't
-       // supported.
-       void *dest_mapping =
-           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
-       if (dest_mapping == MAP_FAILED) {
-               ret = errno;
-       } else {
-               BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-                      "unable to unmap destination mapping");
-       }
-
-       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-              "unable to unmap source mapping");
-       return ret;
-}
-
-// This helper will just validate that an entire mapping contains the expected
-// byte.
-static int check_region_contains_byte(void *addr, unsigned long size, char byte)
-{
-       BUG_ON(size & (page_size - 1),
-              "check_region_contains_byte expects page multiples");
-       BUG_ON((unsigned long)addr & (page_size - 1),
-              "check_region_contains_byte expects page alignment");
-
-       memset(page_buffer, byte, page_size);
-
-       unsigned long num_pages = size / page_size;
-       unsigned long i;
-
-       // Compare each page checking that it contains our expected byte.
-       for (i = 0; i < num_pages; ++i) {
-               int ret =
-                   memcmp(addr + (i * page_size), page_buffer, page_size);
-               if (ret) {
-                       return ret;
-               }
-       }
-
-       return 0;
-}
-
-// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving
-// the source mapping mapped.
-static void mremap_dontunmap_simple()
-{
-       unsigned long num_pages = 5;
-
-       void *source_mapping =
-           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-       memset(source_mapping, 'a', num_pages * page_size);
-
-       // Try to just move the whole mapping anywhere (not fixed).
-       void *dest_mapping =
-           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
-       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-
-       // Validate that the pages have been moved, we know they were moved if
-       // the dest_mapping contains a's.
-       BUG_ON(check_region_contains_byte
-              (dest_mapping, num_pages * page_size, 'a') != 0,
-              "pages did not migrate");
-       BUG_ON(check_region_contains_byte
-              (source_mapping, num_pages * page_size, 0) != 0,
-              "source should have no ptes");
-
-       BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-              "unable to unmap destination mapping");
-       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-              "unable to unmap source mapping");
-}
-
-// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected.
-static void mremap_dontunmap_simple_shmem()
-{
-       unsigned long num_pages = 5;
-
-       int mem_fd = memfd_create("memfd", MFD_CLOEXEC);
-       BUG_ON(mem_fd < 0, "memfd_create");
-
-       BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0,
-                       "ftruncate");
-
-       void *source_mapping =
-           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-                MAP_FILE | MAP_SHARED, mem_fd, 0);
-       BUG_ON(source_mapping == MAP_FAILED, "mmap");
-
-       BUG_ON(close(mem_fd) < 0, "close");
-
-       memset(source_mapping, 'a', num_pages * page_size);
-
-       // Try to just move the whole mapping anywhere (not fixed).
-       void *dest_mapping =
-           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
-       if (dest_mapping == MAP_FAILED && errno == EINVAL) {
-               // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem.
-               BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-                       "unable to unmap source mapping");
-               return;
-       }
-
-       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-
-       // Validate that the pages have been moved, we know they were moved if
-       // the dest_mapping contains a's.
-       BUG_ON(check_region_contains_byte
-              (dest_mapping, num_pages * page_size, 'a') != 0,
-              "pages did not migrate");
-
-       // Because the region is backed by shmem, we will actually see the same
-       // memory at the source location still.
-       BUG_ON(check_region_contains_byte
-              (source_mapping, num_pages * page_size, 'a') != 0,
-              "source should have no ptes");
-
-       BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-              "unable to unmap destination mapping");
-       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-              "unable to unmap source mapping");
-}
-
-// This test validates MREMAP_DONTUNMAP will move page tables to a specific
-// destination using MREMAP_FIXED, also while validating that the source
-// remains intact.
-static void mremap_dontunmap_simple_fixed()
-{
-       unsigned long num_pages = 5;
-
-       // Since we want to guarantee that we can remap to a point, we will
-       // create a mapping up front.
-       void *dest_mapping =
-           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(dest_mapping == MAP_FAILED, "mmap");
-       memset(dest_mapping, 'X', num_pages * page_size);
-
-       void *source_mapping =
-           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(source_mapping == MAP_FAILED, "mmap");
-       memset(source_mapping, 'a', num_pages * page_size);
-
-       void *remapped_mapping =
-           mremap(source_mapping, num_pages * page_size, num_pages * page_size,
-                  MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
-                  dest_mapping);
-       BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
-       BUG_ON(remapped_mapping != dest_mapping,
-              "mremap should have placed the remapped mapping at dest_mapping");
-
-       // The dest mapping will have been unmap by mremap so we expect the Xs
-       // to be gone and replaced with a's.
-       BUG_ON(check_region_contains_byte
-              (dest_mapping, num_pages * page_size, 'a') != 0,
-              "pages did not migrate");
-
-       // And the source mapping will have had its ptes dropped.
-       BUG_ON(check_region_contains_byte
-              (source_mapping, num_pages * page_size, 0) != 0,
-              "source should have no ptes");
-
-       BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
-              "unable to unmap destination mapping");
-       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-              "unable to unmap source mapping");
-}
-
-// This test validates that we can MREMAP_DONTUNMAP for a portion of an
-// existing mapping.
-static void mremap_dontunmap_partial_mapping()
-{
-       /*
-        *  source mapping:
-        *  --------------
-        *  | aaaaaaaaaa |
-        *  --------------
-        *  to become:
-        *  --------------
-        *  | aaaaa00000 |
-        *  --------------
-        *  With the destination mapping containing 5 pages of As.
-        *  ---------
-        *  | aaaaa |
-        *  ---------
-        */
-       unsigned long num_pages = 10;
-       void *source_mapping =
-           mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(source_mapping == MAP_FAILED, "mmap");
-       memset(source_mapping, 'a', num_pages * page_size);
-
-       // We will grab the last 5 pages of the source and move them.
-       void *dest_mapping =
-           mremap(source_mapping + (5 * page_size), 5 * page_size,
-                  5 * page_size,
-                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
-       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-
-       // We expect the first 5 pages of the source to contain a's and the
-       // final 5 pages to contain zeros.
-       BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
-              0, "first 5 pages of source should have original pages");
-       BUG_ON(check_region_contains_byte
-              (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
-              "final 5 pages of source should have no ptes");
-
-       // Finally we expect the destination to have 5 pages worth of a's.
-       BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
-              0, "dest mapping should contain ptes from the source");
-
-       BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
-              "unable to unmap destination mapping");
-       BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
-              "unable to unmap source mapping");
-}
-
-// This test validates that we can remap over only a portion of a mapping.
-static void mremap_dontunmap_partial_mapping_overwrite(void)
-{
-       /*
-        *  source mapping:
-        *  ---------
-        *  |aaaaa|
-        *  ---------
-        *  dest mapping initially:
-        *  -----------
-        *  |XXXXXXXXXX|
-        *  ------------
-        *  Source to become:
-        *  ---------
-        *  |00000|
-        *  ---------
-        *  With the destination mapping containing 5 pages of As.
-        *  ------------
-        *  |aaaaaXXXXX|
-        *  ------------
-        */
-       void *source_mapping =
-           mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(source_mapping == MAP_FAILED, "mmap");
-       memset(source_mapping, 'a', 5 * page_size);
-
-       void *dest_mapping =
-           mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(dest_mapping == MAP_FAILED, "mmap");
-       memset(dest_mapping, 'X', 10 * page_size);
-
-       // We will grab the last 5 pages of the source and move them.
-       void *remapped_mapping =
-           mremap(source_mapping, 5 * page_size,
-                  5 * page_size,
-                  MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
-       BUG_ON(dest_mapping == MAP_FAILED, "mremap");
-       BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
-
-       BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
-              0, "first 5 pages of source should have no ptes");
-
-       // Finally we expect the destination to have 5 pages worth of a's.
-       BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
-                       "dest mapping should contain ptes from the source");
-
-       // Finally the last 5 pages shouldn't have been touched.
-       BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
-                               5 * page_size, 'X') != 0,
-                       "dest mapping should have retained the last 5 pages");
-
-       BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
-              "unable to unmap destination mapping");
-       BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
-              "unable to unmap source mapping");
-}
-
-int main(void)
-{
-       page_size = sysconf(_SC_PAGE_SIZE);
-
-       // test for kernel support for MREMAP_DONTUNMAP skipping the test if
-       // not.
-       if (kernel_support_for_mremap_dontunmap() != 0) {
-               printf("No kernel support for MREMAP_DONTUNMAP\n");
-               return KSFT_SKIP;
-       }
-
-       // Keep a page sized buffer around for when we need it.
-       page_buffer =
-           mmap(NULL, page_size, PROT_READ | PROT_WRITE,
-                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-       BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
-
-       mremap_dontunmap_simple();
-       mremap_dontunmap_simple_shmem();
-       mremap_dontunmap_simple_fixed();
-       mremap_dontunmap_partial_mapping();
-       mremap_dontunmap_partial_mapping_overwrite();
-
-       BUG_ON(munmap(page_buffer, page_size) == -1,
-              "unable to unmap page buffer");
-
-       printf("OK\n");
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c
deleted file mode 100644 (file)
index 9496346..0000000
+++ /dev/null
@@ -1,475 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2020 Google LLC
- */
-#define _GNU_SOURCE
-
-#include <errno.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <time.h>
-#include <stdbool.h>
-
-#include "../kselftest.h"
-
-#define EXPECT_SUCCESS 0
-#define EXPECT_FAILURE 1
-#define NON_OVERLAPPING 0
-#define OVERLAPPING 1
-#define NS_PER_SEC 1000000000ULL
-#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */
-#define VALIDATION_NO_THRESHOLD 0      /* Verify the entire region */
-
-#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
-
-struct config {
-       unsigned long long src_alignment;
-       unsigned long long dest_alignment;
-       unsigned long long region_size;
-       int overlapping;
-};
-
-struct test {
-       const char *name;
-       struct config config;
-       int expect_failure;
-};
-
-enum {
-       _1KB = 1ULL << 10,      /* 1KB -> not page aligned */
-       _4KB = 4ULL << 10,
-       _8KB = 8ULL << 10,
-       _1MB = 1ULL << 20,
-       _2MB = 2ULL << 20,
-       _4MB = 4ULL << 20,
-       _1GB = 1ULL << 30,
-       _2GB = 2ULL << 30,
-       PMD = _2MB,
-       PUD = _1GB,
-};
-
-#define PTE page_size
-
-#define MAKE_TEST(source_align, destination_align, size,       \
-                 overlaps, should_fail, test_name)             \
-(struct test){                                                 \
-       .name = test_name,                                      \
-       .config = {                                             \
-               .src_alignment = source_align,                  \
-               .dest_alignment = destination_align,            \
-               .region_size = size,                            \
-               .overlapping = overlaps,                        \
-       },                                                      \
-       .expect_failure = should_fail                           \
-}
-
-/*
- * Returns false if the requested remap region overlaps with an
- * existing mapping (e.g text, stack) else returns true.
- */
-static bool is_remap_region_valid(void *addr, unsigned long long size)
-{
-       void *remap_addr = NULL;
-       bool ret = true;
-
-       /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
-       remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
-                                        MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
-                                        -1, 0);
-
-       if (remap_addr == MAP_FAILED) {
-               if (errno == EEXIST)
-                       ret = false;
-       } else {
-               munmap(remap_addr, size);
-       }
-
-       return ret;
-}
-
-/* Returns mmap_min_addr sysctl tunable from procfs */
-static unsigned long long get_mmap_min_addr(void)
-{
-       FILE *fp;
-       int n_matched;
-       static unsigned long long addr;
-
-       if (addr)
-               return addr;
-
-       fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
-       if (fp == NULL) {
-               ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
-                       strerror(errno));
-               exit(KSFT_SKIP);
-       }
-
-       n_matched = fscanf(fp, "%llu", &addr);
-       if (n_matched != 1) {
-               ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
-                       strerror(errno));
-               fclose(fp);
-               exit(KSFT_SKIP);
-       }
-
-       fclose(fp);
-       return addr;
-}
-
-/*
- * This test validates that merge is called when expanding a mapping.
- * Mapping containing three pages is created, middle page is unmapped
- * and then the mapping containing the first page is expanded so that
- * it fills the created hole. The two parts should merge creating
- * single mapping with three pages.
- */
-static void mremap_expand_merge(unsigned long page_size)
-{
-       char *test_name = "mremap expand merge";
-       FILE *fp;
-       char *line = NULL;
-       size_t len = 0;
-       bool success = false;
-       char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
-                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-       munmap(start + page_size, page_size);
-       mremap(start, page_size, 2 * page_size, 0);
-
-       fp = fopen("/proc/self/maps", "r");
-       if (fp == NULL) {
-               ksft_test_result_fail("%s\n", test_name);
-               return;
-       }
-
-       while (getline(&line, &len, fp) != -1) {
-               char *first = strtok(line, "- ");
-               void *first_val = (void *)strtol(first, NULL, 16);
-               char *second = strtok(NULL, "- ");
-               void *second_val = (void *) strtol(second, NULL, 16);
-
-               if (first_val == start && second_val == start + 3 * page_size) {
-                       success = true;
-                       break;
-               }
-       }
-       if (success)
-               ksft_test_result_pass("%s\n", test_name);
-       else
-               ksft_test_result_fail("%s\n", test_name);
-       fclose(fp);
-}
-
-/*
- * Returns the start address of the mapping on success, else returns
- * NULL on failure.
- */
-static void *get_source_mapping(struct config c)
-{
-       unsigned long long addr = 0ULL;
-       void *src_addr = NULL;
-       unsigned long long mmap_min_addr;
-
-       mmap_min_addr = get_mmap_min_addr();
-
-retry:
-       addr += c.src_alignment;
-       if (addr < mmap_min_addr)
-               goto retry;
-
-       src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
-                                       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
-                                       -1, 0);
-       if (src_addr == MAP_FAILED) {
-               if (errno == EPERM || errno == EEXIST)
-                       goto retry;
-               goto error;
-       }
-       /*
-        * Check that the address is aligned to the specified alignment.
-        * Addresses which have alignments that are multiples of that
-        * specified are not considered valid. For instance, 1GB address is
-        * 2MB-aligned, however it will not be considered valid for a
-        * requested alignment of 2MB. This is done to reduce coincidental
-        * alignment in the tests.
-        */
-       if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
-                       !((unsigned long long) src_addr & c.src_alignment)) {
-               munmap(src_addr, c.region_size);
-               goto retry;
-       }
-
-       if (!src_addr)
-               goto error;
-
-       return src_addr;
-error:
-       ksft_print_msg("Failed to map source region: %s\n",
-                       strerror(errno));
-       return NULL;
-}
-
-/* Returns the time taken for the remap on success else returns -1. */
-static long long remap_region(struct config c, unsigned int threshold_mb,
-                             char pattern_seed)
-{
-       void *addr, *src_addr, *dest_addr;
-       unsigned long long i;
-       struct timespec t_start = {0, 0}, t_end = {0, 0};
-       long long  start_ns, end_ns, align_mask, ret, offset;
-       unsigned long long threshold;
-
-       if (threshold_mb == VALIDATION_NO_THRESHOLD)
-               threshold = c.region_size;
-       else
-               threshold = MIN(threshold_mb * _1MB, c.region_size);
-
-       src_addr = get_source_mapping(c);
-       if (!src_addr) {
-               ret = -1;
-               goto out;
-       }
-
-       /* Set byte pattern */
-       srand(pattern_seed);
-       for (i = 0; i < threshold; i++)
-               memset((char *) src_addr + i, (char) rand(), 1);
-
-       /* Mask to zero out lower bits of address for alignment */
-       align_mask = ~(c.dest_alignment - 1);
-       /* Offset of destination address from the end of the source region */
-       offset = (c.overlapping) ? -c.dest_alignment : c.dest_alignment;
-       addr = (void *) (((unsigned long long) src_addr + c.region_size
-                         + offset) & align_mask);
-
-       /* See comment in get_source_mapping() */
-       if (!((unsigned long long) addr & c.dest_alignment))
-               addr = (void *) ((unsigned long long) addr | c.dest_alignment);
-
-       /* Don't destroy existing mappings unless expected to overlap */
-       while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
-               /* Check for unsigned overflow */
-               if (addr + c.dest_alignment < addr) {
-                       ksft_print_msg("Couldn't find a valid region to remap to\n");
-                       ret = -1;
-                       goto out;
-               }
-               addr += c.dest_alignment;
-       }
-
-       clock_gettime(CLOCK_MONOTONIC, &t_start);
-       dest_addr = mremap(src_addr, c.region_size, c.region_size,
-                                         MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
-       clock_gettime(CLOCK_MONOTONIC, &t_end);
-
-       if (dest_addr == MAP_FAILED) {
-               ksft_print_msg("mremap failed: %s\n", strerror(errno));
-               ret = -1;
-               goto clean_up_src;
-       }
-
-       /* Verify byte pattern after remapping */
-       srand(pattern_seed);
-       for (i = 0; i < threshold; i++) {
-               char c = (char) rand();
-
-               if (((char *) dest_addr)[i] != c) {
-                       ksft_print_msg("Data after remap doesn't match at offset %d\n",
-                                      i);
-                       ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff,
-                                       ((char *) dest_addr)[i] & 0xff);
-                       ret = -1;
-                       goto clean_up_dest;
-               }
-       }
-
-       start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
-       end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
-       ret = end_ns - start_ns;
-
-/*
- * Since the destination address is specified using MREMAP_FIXED, subsequent
- * mremap will unmap any previous mapping at the address range specified by
- * dest_addr and region_size. This significantly affects the remap time of
- * subsequent tests. So we clean up mappings after each test.
- */
-clean_up_dest:
-       munmap(dest_addr, c.region_size);
-clean_up_src:
-       munmap(src_addr, c.region_size);
-out:
-       return ret;
-}
-
-static void run_mremap_test_case(struct test test_case, int *failures,
-                                unsigned int threshold_mb,
-                                unsigned int pattern_seed)
-{
-       long long remap_time = remap_region(test_case.config, threshold_mb,
-                                           pattern_seed);
-
-       if (remap_time < 0) {
-               if (test_case.expect_failure)
-                       ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
-                                             test_case.name);
-               else {
-                       ksft_test_result_fail("%s\n", test_case.name);
-                       *failures += 1;
-               }
-       } else {
-               /*
-                * Comparing mremap time is only applicable if entire region
-                * was faulted in.
-                */
-               if (threshold_mb == VALIDATION_NO_THRESHOLD ||
-                   test_case.config.region_size <= threshold_mb * _1MB)
-                       ksft_test_result_pass("%s\n\tmremap time: %12lldns\n",
-                                             test_case.name, remap_time);
-               else
-                       ksft_test_result_pass("%s\n", test_case.name);
-       }
-}
-
-static void usage(const char *cmd)
-{
-       fprintf(stderr,
-               "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n"
-               "-t\t only validate threshold_mb of the remapped region\n"
-               "  \t if 0 is supplied no threshold is used; all tests\n"
-               "  \t are run and remapped regions validated fully.\n"
-               "  \t The default threshold used is 4MB.\n"
-               "-p\t provide a seed to generate the random pattern for\n"
-               "  \t validating the remapped region.\n", cmd);
-}
-
-static int parse_args(int argc, char **argv, unsigned int *threshold_mb,
-                     unsigned int *pattern_seed)
-{
-       const char *optstr = "t:p:";
-       int opt;
-
-       while ((opt = getopt(argc, argv, optstr)) != -1) {
-               switch (opt) {
-               case 't':
-                       *threshold_mb = atoi(optarg);
-                       break;
-               case 'p':
-                       *pattern_seed = atoi(optarg);
-                       break;
-               default:
-                       usage(argv[0]);
-                       return -1;
-               }
-       }
-
-       if (optind < argc) {
-               usage(argv[0]);
-               return -1;
-       }
-
-       return 0;
-}
-
-#define MAX_TEST 13
-#define MAX_PERF_TEST 3
-int main(int argc, char **argv)
-{
-       int failures = 0;
-       int i, run_perf_tests;
-       unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
-       unsigned int pattern_seed;
-       int num_expand_tests = 1;
-       struct test test_cases[MAX_TEST];
-       struct test perf_test_cases[MAX_PERF_TEST];
-       int page_size;
-       time_t t;
-
-       pattern_seed = (unsigned int) time(&t);
-
-       if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0)
-               exit(EXIT_FAILURE);
-
-       ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
-                      threshold_mb, pattern_seed);
-
-       page_size = sysconf(_SC_PAGESIZE);
-
-       /* Expected mremap failures */
-       test_cases[0] = MAKE_TEST(page_size, page_size, page_size,
-                                 OVERLAPPING, EXPECT_FAILURE,
-                                 "mremap - Source and Destination Regions Overlapping");
-
-       test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size,
-                                 NON_OVERLAPPING, EXPECT_FAILURE,
-                                 "mremap - Destination Address Misaligned (1KB-aligned)");
-       test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size,
-                                 NON_OVERLAPPING, EXPECT_FAILURE,
-                                 "mremap - Source Address Misaligned (1KB-aligned)");
-
-       /* Src addr PTE aligned */
-       test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2,
-                                 NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "8KB mremap - Source PTE-aligned, Destination PTE-aligned");
-
-       /* Src addr 1MB aligned */
-       test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "2MB mremap - Source 1MB-aligned, Destination PTE-aligned");
-       test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned");
-
-       /* Src addr PMD aligned */
-       test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "4MB mremap - Source PMD-aligned, Destination PTE-aligned");
-       test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "4MB mremap - Source PMD-aligned, Destination 1MB-aligned");
-       test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "4MB mremap - Source PMD-aligned, Destination PMD-aligned");
-
-       /* Src addr PUD aligned */
-       test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                 "2GB mremap - Source PUD-aligned, Destination PTE-aligned");
-       test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                  "2GB mremap - Source PUD-aligned, Destination 1MB-aligned");
-       test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                  "2GB mremap - Source PUD-aligned, Destination PMD-aligned");
-       test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                  "2GB mremap - Source PUD-aligned, Destination PUD-aligned");
-
-       perf_test_cases[0] =  MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                       "1GB mremap - Source PTE-aligned, Destination PTE-aligned");
-       /*
-        * mremap 1GB region - Page table level aligned time
-        * comparison.
-        */
-       perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                      "1GB mremap - Source PMD-aligned, Destination PMD-aligned");
-       perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
-                                      "1GB mremap - Source PUD-aligned, Destination PUD-aligned");
-
-       run_perf_tests =  (threshold_mb == VALIDATION_NO_THRESHOLD) ||
-                               (threshold_mb * _1MB >= _1GB);
-
-       ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ?
-                     ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests);
-
-       for (i = 0; i < ARRAY_SIZE(test_cases); i++)
-               run_mremap_test_case(test_cases[i], &failures, threshold_mb,
-                                    pattern_seed);
-
-       mremap_expand_merge(page_size);
-
-       if (run_perf_tests) {
-               ksft_print_msg("\n%s\n",
-                "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:");
-               for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++)
-                       run_mremap_test_case(perf_test_cases[i], &failures,
-                                            threshold_mb, pattern_seed);
-       }
-
-       if (failures > 0)
-               ksft_exit_fail();
-       else
-               ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
deleted file mode 100644 (file)
index 634d87d..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <sys/mman.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#ifndef MCL_ONFAULT
-#define MCL_ONFAULT (MCL_FUTURE << 1)
-#endif
-
-static int test_limit(void)
-{
-       int ret = 1;
-       struct rlimit lims;
-       void *map;
-
-       if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
-               perror("getrlimit");
-               return ret;
-       }
-
-       if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
-               perror("mlockall");
-               return ret;
-       }
-
-       map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
-                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
-       if (map != MAP_FAILED)
-               printf("mmap should have failed, but didn't\n");
-       else {
-               ret = 0;
-               munmap(map, 2 * lims.rlim_max);
-       }
-
-       munlockall();
-       return ret;
-}
-
-int main(int argc, char **argv)
-{
-       int ret = 0;
-
-       ret += test_limit();
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
deleted file mode 100644 (file)
index 92f3be3..0000000
+++ /dev/null
@@ -1,226 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _PKEYS_HELPER_H
-#define _PKEYS_HELPER_H
-#define _GNU_SOURCE
-#include <string.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <ucontext.h>
-#include <sys/mman.h>
-
-#include "../kselftest.h"
-
-/* Define some kernel-like types */
-#define  u8 __u8
-#define u16 __u16
-#define u32 __u32
-#define u64 __u64
-
-#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
-
-#ifndef DEBUG_LEVEL
-#define DEBUG_LEVEL 0
-#endif
-#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
-extern int dprint_in_signal;
-extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
-
-extern int test_nr;
-extern int iteration_nr;
-
-#ifdef __GNUC__
-__attribute__((format(printf, 1, 2)))
-#endif
-static inline void sigsafe_printf(const char *format, ...)
-{
-       va_list ap;
-
-       if (!dprint_in_signal) {
-               va_start(ap, format);
-               vprintf(format, ap);
-               va_end(ap);
-       } else {
-               int ret;
-               /*
-                * No printf() functions are signal-safe.
-                * They deadlock easily. Write the format
-                * string to get some output, even if
-                * incomplete.
-                */
-               ret = write(1, format, strlen(format));
-               if (ret < 0)
-                       exit(1);
-       }
-}
-#define dprintf_level(level, args...) do {     \
-       if (level <= DEBUG_LEVEL)               \
-               sigsafe_printf(args);           \
-} while (0)
-#define dprintf0(args...) dprintf_level(0, args)
-#define dprintf1(args...) dprintf_level(1, args)
-#define dprintf2(args...) dprintf_level(2, args)
-#define dprintf3(args...) dprintf_level(3, args)
-#define dprintf4(args...) dprintf_level(4, args)
-
-extern void abort_hooks(void);
-#define pkey_assert(condition) do {            \
-       if (!(condition)) {                     \
-               dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
-                               __FILE__, __LINE__,     \
-                               test_nr, iteration_nr); \
-               dprintf0("errno at assert: %d", errno); \
-               abort_hooks();                  \
-               exit(__LINE__);                 \
-       }                                       \
-} while (0)
-
-__attribute__((noinline)) int read_ptr(int *ptr);
-void expected_pkey_fault(int pkey);
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
-int sys_pkey_free(unsigned long pkey);
-int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
-               unsigned long pkey);
-void record_pkey_malloc(void *ptr, long size, int prot);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-#include "pkey-x86.h"
-#elif defined(__powerpc64__) /* arch */
-#include "pkey-powerpc.h"
-#else /* arch */
-#error Architecture not supported
-#endif /* arch */
-
-#define PKEY_MASK      (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
-
-static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
-{
-       u32 shift = pkey_bit_position(pkey);
-       /* mask out bits from pkey in old value */
-       reg &= ~((u64)PKEY_MASK << shift);
-       /* OR in new bits for pkey */
-       reg |= (flags & PKEY_MASK) << shift;
-       return reg;
-}
-
-static inline u64 get_pkey_bits(u64 reg, int pkey)
-{
-       u32 shift = pkey_bit_position(pkey);
-       /*
-        * shift down the relevant bits to the lowest two, then
-        * mask off all the other higher bits
-        */
-       return ((reg >> shift) & PKEY_MASK);
-}
-
-extern u64 shadow_pkey_reg;
-
-static inline u64 _read_pkey_reg(int line)
-{
-       u64 pkey_reg = __read_pkey_reg();
-
-       dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
-                       " shadow: %016llx\n",
-                       line, pkey_reg, shadow_pkey_reg);
-       assert(pkey_reg == shadow_pkey_reg);
-
-       return pkey_reg;
-}
-
-#define read_pkey_reg() _read_pkey_reg(__LINE__)
-
-static inline void write_pkey_reg(u64 pkey_reg)
-{
-       dprintf4("%s() changing %016llx to %016llx\n", __func__,
-                       __read_pkey_reg(), pkey_reg);
-       /* will do the shadow check for us: */
-       read_pkey_reg();
-       __write_pkey_reg(pkey_reg);
-       shadow_pkey_reg = pkey_reg;
-       dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
-                       pkey_reg, __read_pkey_reg());
-}
-
-/*
- * These are technically racy. since something could
- * change PKEY register between the read and the write.
- */
-static inline void __pkey_access_allow(int pkey, int do_allow)
-{
-       u64 pkey_reg = read_pkey_reg();
-       int bit = pkey * 2;
-
-       if (do_allow)
-               pkey_reg &= (1<<bit);
-       else
-               pkey_reg |= (1<<bit);
-
-       dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
-       write_pkey_reg(pkey_reg);
-}
-
-static inline void __pkey_write_allow(int pkey, int do_allow_write)
-{
-       u64 pkey_reg = read_pkey_reg();
-       int bit = pkey * 2 + 1;
-
-       if (do_allow_write)
-               pkey_reg &= (1<<bit);
-       else
-               pkey_reg |= (1<<bit);
-
-       write_pkey_reg(pkey_reg);
-       dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
-}
-
-#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to)  \
-       ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to)        \
-       ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
-#define __stringify_1(x...)     #x
-#define __stringify(x...)       __stringify_1(x)
-
-static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
-{
-#ifdef si_pkey
-       return &si->si_pkey;
-#else
-       return (u32 *)(((u8 *)si) + si_pkey_offset);
-#endif
-}
-
-static inline int kernel_has_pkeys(void)
-{
-       /* try allocating a key and see if it succeeds */
-       int ret = sys_pkey_alloc(0, 0);
-       if (ret <= 0) {
-               return 0;
-       }
-       sys_pkey_free(ret);
-       return 1;
-}
-
-static inline int is_pkeys_supported(void)
-{
-       /* check if the cpu supports pkeys */
-       if (!cpu_has_pkeys()) {
-               dprintf1("SKIP: %s: no CPU support\n", __func__);
-               return 0;
-       }
-
-       /* check if the kernel supports pkeys */
-       if (!kernel_has_pkeys()) {
-               dprintf1("SKIP: %s: no kernel support\n", __func__);
-               return 0;
-       }
-
-       return 1;
-}
-
-#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h
deleted file mode 100644 (file)
index 1ebb586..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _PKEYS_POWERPC_H
-#define _PKEYS_POWERPC_H
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key      386
-#endif
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc                384
-# define SYS_pkey_free         385
-#endif
-#define REG_IP_IDX             PT_NIP
-#define REG_TRAPNO             PT_TRAP
-#define gregs                  gp_regs
-#define fpregs                 fp_regs
-#define si_pkey_offset         0x20
-
-#undef PKEY_DISABLE_ACCESS
-#define PKEY_DISABLE_ACCESS    0x3  /* disable read and write */
-
-#undef PKEY_DISABLE_WRITE
-#define PKEY_DISABLE_WRITE     0x2
-
-#define NR_PKEYS               32
-#define NR_RESERVED_PKEYS_4K   27 /* pkey-0, pkey-1, exec-only-pkey
-                                     and 24 other keys that cannot be
-                                     represented in the PTE */
-#define NR_RESERVED_PKEYS_64K_3KEYS    3 /* PowerNV and KVM: pkey-0,
-                                            pkey-1 and exec-only key */
-#define NR_RESERVED_PKEYS_64K_4KEYS    4 /* PowerVM: pkey-0, pkey-1,
-                                            pkey-31 and exec-only key */
-#define PKEY_BITS_PER_PKEY     2
-#define HPAGE_SIZE             (1UL << 24)
-#define PAGE_SIZE              sysconf(_SC_PAGESIZE)
-
-static inline u32 pkey_bit_position(int pkey)
-{
-       return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
-}
-
-static inline u64 __read_pkey_reg(void)
-{
-       u64 pkey_reg;
-
-       asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
-
-       return pkey_reg;
-}
-
-static inline void __write_pkey_reg(u64 pkey_reg)
-{
-       u64 amr = pkey_reg;
-
-       dprintf4("%s() changing %016llx to %016llx\n",
-                        __func__, __read_pkey_reg(), pkey_reg);
-
-       asm volatile("isync; mtspr 0xd, %0; isync"
-                    : : "r" ((unsigned long)(amr)) : "memory");
-
-       dprintf4("%s() pkey register after changing %016llx to %016llx\n",
-                       __func__, __read_pkey_reg(), pkey_reg);
-}
-
-static inline int cpu_has_pkeys(void)
-{
-       /* No simple way to determine this */
-       return 1;
-}
-
-static inline bool arch_is_powervm()
-{
-       struct stat buf;
-
-       if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) &&
-           (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) &&
-           (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) )
-               return true;
-
-       return false;
-}
-
-static inline int get_arch_reserved_keys(void)
-{
-       if (sysconf(_SC_PAGESIZE) == 4096)
-               return NR_RESERVED_PKEYS_4K;
-       else
-               if (arch_is_powervm())
-                       return NR_RESERVED_PKEYS_64K_4KEYS;
-               else
-                       return NR_RESERVED_PKEYS_64K_3KEYS;
-}
-
-void expect_fault_on_read_execonly_key(void *p1, int pkey)
-{
-       /*
-        * powerpc does not allow userspace to change permissions of exec-only
-        * keys since those keys are not allocated by userspace. The signal
-        * handler wont be able to reset the permissions, which means the code
-        * will infinitely continue to segfault here.
-        */
-       return;
-}
-
-/* 4-byte instructions * 16384 = 64K page */
-#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
-
-void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
-{
-       void *ptr;
-       int ret;
-
-       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-                       size, prot, pkey);
-       pkey_assert(pkey < NR_PKEYS);
-       ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-       pkey_assert(ptr != (void *)-1);
-
-       ret = syscall(__NR_subpage_prot, ptr, size, NULL);
-       if (ret) {
-               perror("subpage_perm");
-               return PTR_ERR_ENOTSUP;
-       }
-
-       ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
-       pkey_assert(!ret);
-       record_pkey_malloc(ptr, size, prot);
-
-       dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
-       return ptr;
-}
-
-#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
deleted file mode 100644 (file)
index 72c14cd..0000000
+++ /dev/null
@@ -1,177 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _PKEYS_X86_H
-#define _PKEYS_X86_H
-
-#ifdef __i386__
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key      380
-#endif
-
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc                381
-# define SYS_pkey_free         382
-#endif
-
-#define REG_IP_IDX             REG_EIP
-#define si_pkey_offset         0x14
-
-#else
-
-#ifndef SYS_mprotect_key
-# define SYS_mprotect_key      329
-#endif
-
-#ifndef SYS_pkey_alloc
-# define SYS_pkey_alloc                330
-# define SYS_pkey_free         331
-#endif
-
-#define REG_IP_IDX             REG_RIP
-#define si_pkey_offset         0x20
-
-#endif
-
-#ifndef PKEY_DISABLE_ACCESS
-# define PKEY_DISABLE_ACCESS   0x1
-#endif
-
-#ifndef PKEY_DISABLE_WRITE
-# define PKEY_DISABLE_WRITE    0x2
-#endif
-
-#define NR_PKEYS               16
-#define NR_RESERVED_PKEYS      2 /* pkey-0 and exec-only-pkey */
-#define PKEY_BITS_PER_PKEY     2
-#define HPAGE_SIZE             (1UL<<21)
-#define PAGE_SIZE              4096
-#define MB                     (1<<20)
-
-static inline void __page_o_noops(void)
-{
-       /* 8-bytes of instruction * 512 bytes = 1 page */
-       asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
-}
-
-static inline u64 __read_pkey_reg(void)
-{
-       unsigned int eax, edx;
-       unsigned int ecx = 0;
-       unsigned pkey_reg;
-
-       asm volatile(".byte 0x0f,0x01,0xee\n\t"
-                    : "=a" (eax), "=d" (edx)
-                    : "c" (ecx));
-       pkey_reg = eax;
-       return pkey_reg;
-}
-
-static inline void __write_pkey_reg(u64 pkey_reg)
-{
-       unsigned int eax = pkey_reg;
-       unsigned int ecx = 0;
-       unsigned int edx = 0;
-
-       dprintf4("%s() changing %016llx to %016llx\n", __func__,
-                       __read_pkey_reg(), pkey_reg);
-       asm volatile(".byte 0x0f,0x01,0xef\n\t"
-                    : : "a" (eax), "c" (ecx), "d" (edx));
-       assert(pkey_reg == __read_pkey_reg());
-}
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
-#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */
-
-static inline int cpu_has_pkeys(void)
-{
-       unsigned int eax;
-       unsigned int ebx;
-       unsigned int ecx;
-       unsigned int edx;
-
-       __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
-
-       if (!(ecx & X86_FEATURE_PKU)) {
-               dprintf2("cpu does not have PKU\n");
-               return 0;
-       }
-       if (!(ecx & X86_FEATURE_OSPKE)) {
-               dprintf2("cpu does not have OSPKE\n");
-               return 0;
-       }
-       return 1;
-}
-
-static inline int cpu_max_xsave_size(void)
-{
-       unsigned long XSTATE_CPUID = 0xd;
-       unsigned int eax;
-       unsigned int ebx;
-       unsigned int ecx;
-       unsigned int edx;
-
-       __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx);
-       return ecx;
-}
-
-static inline u32 pkey_bit_position(int pkey)
-{
-       return pkey * PKEY_BITS_PER_PKEY;
-}
-
-#define XSTATE_PKEY_BIT        (9)
-#define XSTATE_PKEY    0x200
-#define XSTATE_BV_OFFSET       512
-
-int pkey_reg_xstate_offset(void)
-{
-       unsigned int eax;
-       unsigned int ebx;
-       unsigned int ecx;
-       unsigned int edx;
-       int xstate_offset;
-       int xstate_size;
-       unsigned long XSTATE_CPUID = 0xd;
-       int leaf;
-
-       /* assume that XSTATE_PKEY is set in XCR0 */
-       leaf = XSTATE_PKEY_BIT;
-       {
-               __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx);
-
-               if (leaf == XSTATE_PKEY_BIT) {
-                       xstate_offset = ebx;
-                       xstate_size = eax;
-               }
-       }
-
-       if (xstate_size == 0) {
-               printf("could not find size/offset of PKEY in xsave state\n");
-               return 0;
-       }
-
-       return xstate_offset;
-}
-
-static inline int get_arch_reserved_keys(void)
-{
-       return NR_RESERVED_PKEYS;
-}
-
-void expect_fault_on_read_execonly_key(void *p1, int pkey)
-{
-       int ptr_contents;
-
-       ptr_contents = read_ptr(p1);
-       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-       expected_pkey_fault(pkey);
-}
-
-void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
-{
-       return PTR_ERR_ENOTSUP;
-}
-
-#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
deleted file mode 100644 (file)
index 95f403a..0000000
+++ /dev/null
@@ -1,1788 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
- *
- * There are examples in here of:
- *  * how to set protection keys on memory
- *  * how to set/clear bits in pkey registers (the rights register)
- *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
- *    information from the siginfo
- *
- * Things to add:
- *     make sure KSM and KSM COW breaking works
- *     prefault pages in at malloc, or not
- *     protect MPX bounds tables with protection keys?
- *     make sure VMA splitting/merging is working correctly
- *     OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
- *     look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
- *     do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
- *
- * Compile like this:
- *     gcc -mxsave      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
- *     gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
- */
-#define _GNU_SOURCE
-#define __SANE_USERSPACE_TYPES__
-#include <errno.h>
-#include <linux/elf.h>
-#include <linux/futex.h>
-#include <time.h>
-#include <sys/time.h>
-#include <sys/syscall.h>
-#include <string.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <ucontext.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/ptrace.h>
-#include <setjmp.h>
-
-#include "pkey-helpers.h"
-
-int iteration_nr = 1;
-int test_nr;
-
-u64 shadow_pkey_reg;
-int dprint_in_signal;
-char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
-
-void cat_into_file(char *str, char *file)
-{
-       int fd = open(file, O_RDWR);
-       int ret;
-
-       dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
-       /*
-        * these need to be raw because they are called under
-        * pkey_assert()
-        */
-       if (fd < 0) {
-               fprintf(stderr, "error opening '%s'\n", str);
-               perror("error: ");
-               exit(__LINE__);
-       }
-
-       ret = write(fd, str, strlen(str));
-       if (ret != strlen(str)) {
-               perror("write to file failed");
-               fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
-               exit(__LINE__);
-       }
-       close(fd);
-}
-
-#if CONTROL_TRACING > 0
-static int warned_tracing;
-int tracing_root_ok(void)
-{
-       if (geteuid() != 0) {
-               if (!warned_tracing)
-                       fprintf(stderr, "WARNING: not run as root, "
-                                       "can not do tracing control\n");
-               warned_tracing = 1;
-               return 0;
-       }
-       return 1;
-}
-#endif
-
-void tracing_on(void)
-{
-#if CONTROL_TRACING > 0
-#define TRACEDIR "/sys/kernel/debug/tracing"
-       char pidstr[32];
-
-       if (!tracing_root_ok())
-               return;
-
-       sprintf(pidstr, "%d", getpid());
-       cat_into_file("0", TRACEDIR "/tracing_on");
-       cat_into_file("\n", TRACEDIR "/trace");
-       if (1) {
-               cat_into_file("function_graph", TRACEDIR "/current_tracer");
-               cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
-       } else {
-               cat_into_file("nop", TRACEDIR "/current_tracer");
-       }
-       cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
-       cat_into_file("1", TRACEDIR "/tracing_on");
-       dprintf1("enabled tracing\n");
-#endif
-}
-
-void tracing_off(void)
-{
-#if CONTROL_TRACING > 0
-       if (!tracing_root_ok())
-               return;
-       cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
-#endif
-}
-
-void abort_hooks(void)
-{
-       fprintf(stderr, "running %s()...\n", __func__);
-       tracing_off();
-#ifdef SLEEP_ON_ABORT
-       sleep(SLEEP_ON_ABORT);
-#endif
-}
-
-/*
- * This attempts to have roughly a page of instructions followed by a few
- * instructions that do a write, and another page of instructions.  That
- * way, we are pretty sure that the write is in the second page of
- * instructions and has at least a page of padding behind it.
- *
- * *That* lets us be sure to madvise() away the write instruction, which
- * will then fault, which makes sure that the fault code handles
- * execute-only memory properly.
- */
-#ifdef __powerpc64__
-/* This way, both 4K and 64K alignment are maintained */
-__attribute__((__aligned__(65536)))
-#else
-__attribute__((__aligned__(PAGE_SIZE)))
-#endif
-void lots_o_noops_around_write(int *write_to_me)
-{
-       dprintf3("running %s()\n", __func__);
-       __page_o_noops();
-       /* Assume this happens in the second page of instructions: */
-       *write_to_me = __LINE__;
-       /* pad out by another page: */
-       __page_o_noops();
-       dprintf3("%s() done\n", __func__);
-}
-
-void dump_mem(void *dumpme, int len_bytes)
-{
-       char *c = (void *)dumpme;
-       int i;
-
-       for (i = 0; i < len_bytes; i += sizeof(u64)) {
-               u64 *ptr = (u64 *)(c + i);
-               dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
-       }
-}
-
-static u32 hw_pkey_get(int pkey, unsigned long flags)
-{
-       u64 pkey_reg = __read_pkey_reg();
-
-       dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
-                       __func__, pkey, flags, 0, 0);
-       dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
-
-       return (u32) get_pkey_bits(pkey_reg, pkey);
-}
-
-static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
-{
-       u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
-       u64 old_pkey_reg = __read_pkey_reg();
-       u64 new_pkey_reg;
-
-       /* make sure that 'rights' only contains the bits we expect: */
-       assert(!(rights & ~mask));
-
-       /* modify bits accordingly in old pkey_reg and assign it */
-       new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
-
-       __write_pkey_reg(new_pkey_reg);
-
-       dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
-               " pkey_reg now: %016llx old_pkey_reg: %016llx\n",
-               __func__, pkey, rights, flags, 0, __read_pkey_reg(),
-               old_pkey_reg);
-       return 0;
-}
-
-void pkey_disable_set(int pkey, int flags)
-{
-       unsigned long syscall_flags = 0;
-       int ret;
-       int pkey_rights;
-       u64 orig_pkey_reg = read_pkey_reg();
-
-       dprintf1("START->%s(%d, 0x%x)\n", __func__,
-               pkey, flags);
-       pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
-       pkey_rights = hw_pkey_get(pkey, syscall_flags);
-
-       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-                       pkey, pkey, pkey_rights);
-
-       pkey_assert(pkey_rights >= 0);
-
-       pkey_rights |= flags;
-
-       ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
-       assert(!ret);
-       /* pkey_reg and flags have the same format */
-       shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
-       dprintf1("%s(%d) shadow: 0x%016llx\n",
-               __func__, pkey, shadow_pkey_reg);
-
-       pkey_assert(ret >= 0);
-
-       pkey_rights = hw_pkey_get(pkey, syscall_flags);
-       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-                       pkey, pkey, pkey_rights);
-
-       dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
-               __func__, pkey, read_pkey_reg());
-       if (flags)
-               pkey_assert(read_pkey_reg() >= orig_pkey_reg);
-       dprintf1("END<---%s(%d, 0x%x)\n", __func__,
-               pkey, flags);
-}
-
-void pkey_disable_clear(int pkey, int flags)
-{
-       unsigned long syscall_flags = 0;
-       int ret;
-       int pkey_rights = hw_pkey_get(pkey, syscall_flags);
-       u64 orig_pkey_reg = read_pkey_reg();
-
-       pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
-
-       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-                       pkey, pkey, pkey_rights);
-       pkey_assert(pkey_rights >= 0);
-
-       pkey_rights &= ~flags;
-
-       ret = hw_pkey_set(pkey, pkey_rights, 0);
-       shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
-       pkey_assert(ret >= 0);
-
-       pkey_rights = hw_pkey_get(pkey, syscall_flags);
-       dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
-                       pkey, pkey, pkey_rights);
-
-       dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
-                       pkey, read_pkey_reg());
-       if (flags)
-               assert(read_pkey_reg() <= orig_pkey_reg);
-}
-
-void pkey_write_allow(int pkey)
-{
-       pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
-}
-void pkey_write_deny(int pkey)
-{
-       pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
-}
-void pkey_access_allow(int pkey)
-{
-       pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
-}
-void pkey_access_deny(int pkey)
-{
-       pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
-}
-
-/* Failed address bound checks: */
-#ifndef SEGV_BNDERR
-# define SEGV_BNDERR           3
-#endif
-
-#ifndef SEGV_PKUERR
-# define SEGV_PKUERR           4
-#endif
-
-static char *si_code_str(int si_code)
-{
-       if (si_code == SEGV_MAPERR)
-               return "SEGV_MAPERR";
-       if (si_code == SEGV_ACCERR)
-               return "SEGV_ACCERR";
-       if (si_code == SEGV_BNDERR)
-               return "SEGV_BNDERR";
-       if (si_code == SEGV_PKUERR)
-               return "SEGV_PKUERR";
-       return "UNKNOWN";
-}
-
-int pkey_faults;
-int last_si_pkey = -1;
-void signal_handler(int signum, siginfo_t *si, void *vucontext)
-{
-       ucontext_t *uctxt = vucontext;
-       int trapno;
-       unsigned long ip;
-       char *fpregs;
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-       u32 *pkey_reg_ptr;
-       int pkey_reg_offset;
-#endif /* arch */
-       u64 siginfo_pkey;
-       u32 *si_pkey_ptr;
-
-       dprint_in_signal = 1;
-       dprintf1(">>>>===============SIGSEGV============================\n");
-       dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
-                       __func__, __LINE__,
-                       __read_pkey_reg(), shadow_pkey_reg);
-
-       trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
-       ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
-       fpregs = (char *) uctxt->uc_mcontext.fpregs;
-
-       dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
-                       __func__, trapno, ip, si_code_str(si->si_code),
-                       si->si_code);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-#ifdef __i386__
-       /*
-        * 32-bit has some extra padding so that userspace can tell whether
-        * the XSTATE header is present in addition to the "legacy" FPU
-        * state.  We just assume that it is here.
-        */
-       fpregs += 0x70;
-#endif /* i386 */
-       pkey_reg_offset = pkey_reg_xstate_offset();
-       pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
-
-       /*
-        * If we got a PKEY fault, we *HAVE* to have at least one bit set in
-        * here.
-        */
-       dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
-       if (DEBUG_LEVEL > 4)
-               dump_mem(pkey_reg_ptr - 128, 256);
-       pkey_assert(*pkey_reg_ptr);
-#endif /* arch */
-
-       dprintf1("siginfo: %p\n", si);
-       dprintf1(" fpregs: %p\n", fpregs);
-
-       if ((si->si_code == SEGV_MAPERR) ||
-           (si->si_code == SEGV_ACCERR) ||
-           (si->si_code == SEGV_BNDERR)) {
-               printf("non-PK si_code, exiting...\n");
-               exit(4);
-       }
-
-       si_pkey_ptr = siginfo_get_pkey_ptr(si);
-       dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
-       dump_mem((u8 *)si_pkey_ptr - 8, 24);
-       siginfo_pkey = *si_pkey_ptr;
-       pkey_assert(siginfo_pkey < NR_PKEYS);
-       last_si_pkey = siginfo_pkey;
-
-       /*
-        * need __read_pkey_reg() version so we do not do shadow_pkey_reg
-        * checking
-        */
-       dprintf1("signal pkey_reg from  pkey_reg: %016llx\n",
-                       __read_pkey_reg());
-       dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-       dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
-       *(u64 *)pkey_reg_ptr = 0x00000000;
-       dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
-#elif defined(__powerpc64__) /* arch */
-       /* restore access and let the faulting instruction continue */
-       pkey_access_allow(siginfo_pkey);
-#endif /* arch */
-       pkey_faults++;
-       dprintf1("<<<<==================================================\n");
-       dprint_in_signal = 0;
-}
-
-int wait_all_children(void)
-{
-       int status;
-       return waitpid(-1, &status, 0);
-}
-
-void sig_chld(int x)
-{
-       dprint_in_signal = 1;
-       dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
-       dprint_in_signal = 0;
-}
-
-void setup_sigsegv_handler(void)
-{
-       int r, rs;
-       struct sigaction newact;
-       struct sigaction oldact;
-
-       /* #PF is mapped to sigsegv */
-       int signum  = SIGSEGV;
-
-       newact.sa_handler = 0;
-       newact.sa_sigaction = signal_handler;
-
-       /*sigset_t - signals to block while in the handler */
-       /* get the old signal mask. */
-       rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
-       pkey_assert(rs == 0);
-
-       /* call sa_sigaction, not sa_handler*/
-       newact.sa_flags = SA_SIGINFO;
-
-       newact.sa_restorer = 0;  /* void(*)(), obsolete */
-       r = sigaction(signum, &newact, &oldact);
-       r = sigaction(SIGALRM, &newact, &oldact);
-       pkey_assert(r == 0);
-}
-
-void setup_handlers(void)
-{
-       signal(SIGCHLD, &sig_chld);
-       setup_sigsegv_handler();
-}
-
-pid_t fork_lazy_child(void)
-{
-       pid_t forkret;
-
-       forkret = fork();
-       pkey_assert(forkret >= 0);
-       dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
-
-       if (!forkret) {
-               /* in the child */
-               while (1) {
-                       dprintf1("child sleeping...\n");
-                       sleep(30);
-               }
-       }
-       return forkret;
-}
-
-int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
-               unsigned long pkey)
-{
-       int sret;
-
-       dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
-                       ptr, size, orig_prot, pkey);
-
-       errno = 0;
-       sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
-       if (errno) {
-               dprintf2("SYS_mprotect_key sret: %d\n", sret);
-               dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
-               dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
-               if (DEBUG_LEVEL >= 2)
-                       perror("SYS_mprotect_pkey");
-       }
-       return sret;
-}
-
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
-{
-       int ret = syscall(SYS_pkey_alloc, flags, init_val);
-       dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
-                       __func__, flags, init_val, ret, errno);
-       return ret;
-}
-
-int alloc_pkey(void)
-{
-       int ret;
-       unsigned long init_val = 0x0;
-
-       dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
-                       __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
-       ret = sys_pkey_alloc(0, init_val);
-       /*
-        * pkey_alloc() sets PKEY register, so we need to reflect it in
-        * shadow_pkey_reg:
-        */
-       dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                       " shadow: 0x%016llx\n",
-                       __func__, __LINE__, ret, __read_pkey_reg(),
-                       shadow_pkey_reg);
-       if (ret > 0) {
-               /* clear both the bits: */
-               shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
-                                               ~PKEY_MASK);
-               dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                               " shadow: 0x%016llx\n",
-                               __func__,
-                               __LINE__, ret, __read_pkey_reg(),
-                               shadow_pkey_reg);
-               /*
-                * move the new state in from init_val
-                * (remember, we cheated and init_val == pkey_reg format)
-                */
-               shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
-                                               init_val);
-       }
-       dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                       " shadow: 0x%016llx\n",
-                       __func__, __LINE__, ret, __read_pkey_reg(),
-                       shadow_pkey_reg);
-       dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
-       /* for shadow checking: */
-       read_pkey_reg();
-       dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                " shadow: 0x%016llx\n",
-               __func__, __LINE__, ret, __read_pkey_reg(),
-               shadow_pkey_reg);
-       return ret;
-}
-
-int sys_pkey_free(unsigned long pkey)
-{
-       int ret = syscall(SYS_pkey_free, pkey);
-       dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
-       return ret;
-}
-
-/*
- * I had a bug where pkey bits could be set by mprotect() but
- * not cleared.  This ensures we get lots of random bit sets
- * and clears on the vma and pte pkey bits.
- */
-int alloc_random_pkey(void)
-{
-       int max_nr_pkey_allocs;
-       int ret;
-       int i;
-       int alloced_pkeys[NR_PKEYS];
-       int nr_alloced = 0;
-       int random_index;
-       memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
-
-       /* allocate every possible key and make a note of which ones we got */
-       max_nr_pkey_allocs = NR_PKEYS;
-       for (i = 0; i < max_nr_pkey_allocs; i++) {
-               int new_pkey = alloc_pkey();
-               if (new_pkey < 0)
-                       break;
-               alloced_pkeys[nr_alloced++] = new_pkey;
-       }
-
-       pkey_assert(nr_alloced > 0);
-       /* select a random one out of the allocated ones */
-       random_index = rand() % nr_alloced;
-       ret = alloced_pkeys[random_index];
-       /* now zero it out so we don't free it next */
-       alloced_pkeys[random_index] = 0;
-
-       /* go through the allocated ones that we did not want and free them */
-       for (i = 0; i < nr_alloced; i++) {
-               int free_ret;
-               if (!alloced_pkeys[i])
-                       continue;
-               free_ret = sys_pkey_free(alloced_pkeys[i]);
-               pkey_assert(!free_ret);
-       }
-       dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                        " shadow: 0x%016llx\n", __func__,
-                       __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
-       return ret;
-}
-
-int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
-               unsigned long pkey)
-{
-       int nr_iterations = random() % 100;
-       int ret;
-
-       while (0) {
-               int rpkey = alloc_random_pkey();
-               ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
-               dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
-                               ptr, size, orig_prot, pkey, ret);
-               if (nr_iterations-- < 0)
-                       break;
-
-               dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                       " shadow: 0x%016llx\n",
-                       __func__, __LINE__, ret, __read_pkey_reg(),
-                       shadow_pkey_reg);
-               sys_pkey_free(rpkey);
-               dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                       " shadow: 0x%016llx\n",
-                       __func__, __LINE__, ret, __read_pkey_reg(),
-                       shadow_pkey_reg);
-       }
-       pkey_assert(pkey < NR_PKEYS);
-
-       ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
-       dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
-                       ptr, size, orig_prot, pkey, ret);
-       pkey_assert(!ret);
-       dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
-                       " shadow: 0x%016llx\n", __func__,
-                       __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
-       return ret;
-}
-
-struct pkey_malloc_record {
-       void *ptr;
-       long size;
-       int prot;
-};
-struct pkey_malloc_record *pkey_malloc_records;
-struct pkey_malloc_record *pkey_last_malloc_record;
-long nr_pkey_malloc_records;
-void record_pkey_malloc(void *ptr, long size, int prot)
-{
-       long i;
-       struct pkey_malloc_record *rec = NULL;
-
-       for (i = 0; i < nr_pkey_malloc_records; i++) {
-               rec = &pkey_malloc_records[i];
-               /* find a free record */
-               if (rec)
-                       break;
-       }
-       if (!rec) {
-               /* every record is full */
-               size_t old_nr_records = nr_pkey_malloc_records;
-               size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
-               size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
-               dprintf2("new_nr_records: %zd\n", new_nr_records);
-               dprintf2("new_size: %zd\n", new_size);
-               pkey_malloc_records = realloc(pkey_malloc_records, new_size);
-               pkey_assert(pkey_malloc_records != NULL);
-               rec = &pkey_malloc_records[nr_pkey_malloc_records];
-               /*
-                * realloc() does not initialize memory, so zero it from
-                * the first new record all the way to the end.
-                */
-               for (i = 0; i < new_nr_records - old_nr_records; i++)
-                       memset(rec + i, 0, sizeof(*rec));
-       }
-       dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
-               (int)(rec - pkey_malloc_records), rec, ptr, size);
-       rec->ptr = ptr;
-       rec->size = size;
-       rec->prot = prot;
-       pkey_last_malloc_record = rec;
-       nr_pkey_malloc_records++;
-}
-
-void free_pkey_malloc(void *ptr)
-{
-       long i;
-       int ret;
-       dprintf3("%s(%p)\n", __func__, ptr);
-       for (i = 0; i < nr_pkey_malloc_records; i++) {
-               struct pkey_malloc_record *rec = &pkey_malloc_records[i];
-               dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
-                               ptr, i, rec, rec->ptr, rec->size);
-               if ((ptr <  rec->ptr) ||
-                   (ptr >= rec->ptr + rec->size))
-                       continue;
-
-               dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
-                               ptr, i, rec, rec->ptr, rec->size);
-               nr_pkey_malloc_records--;
-               ret = munmap(rec->ptr, rec->size);
-               dprintf3("munmap ret: %d\n", ret);
-               pkey_assert(!ret);
-               dprintf3("clearing rec->ptr, rec: %p\n", rec);
-               rec->ptr = NULL;
-               dprintf3("done clearing rec->ptr, rec: %p\n", rec);
-               return;
-       }
-       pkey_assert(false);
-}
-
-
-void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
-{
-       void *ptr;
-       int ret;
-
-       read_pkey_reg();
-       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-                       size, prot, pkey);
-       pkey_assert(pkey < NR_PKEYS);
-       ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-       pkey_assert(ptr != (void *)-1);
-       ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
-       pkey_assert(!ret);
-       record_pkey_malloc(ptr, size, prot);
-       read_pkey_reg();
-
-       dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
-       return ptr;
-}
-
-void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
-{
-       int ret;
-       void *ptr;
-
-       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-                       size, prot, pkey);
-       /*
-        * Guarantee we can fit at least one huge page in the resulting
-        * allocation by allocating space for 2:
-        */
-       size = ALIGN_UP(size, HPAGE_SIZE * 2);
-       ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-       pkey_assert(ptr != (void *)-1);
-       record_pkey_malloc(ptr, size, prot);
-       mprotect_pkey(ptr, size, prot, pkey);
-
-       dprintf1("unaligned ptr: %p\n", ptr);
-       ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
-       dprintf1("  aligned ptr: %p\n", ptr);
-       ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
-       dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
-       ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
-       dprintf1("MADV_WILLNEED ret: %d\n", ret);
-       memset(ptr, 0, HPAGE_SIZE);
-
-       dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
-       return ptr;
-}
-
-int hugetlb_setup_ok;
-#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
-#define GET_NR_HUGE_PAGES 10
-void setup_hugetlbfs(void)
-{
-       int err;
-       int fd;
-       char buf[256];
-       long hpagesz_kb;
-       long hpagesz_mb;
-
-       if (geteuid() != 0) {
-               fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
-               return;
-       }
-
-       cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
-
-       /*
-        * Now go make sure that we got the pages and that they
-        * are PMD-level pages. Someone might have made PUD-level
-        * pages the default.
-        */
-       hpagesz_kb = HPAGE_SIZE / 1024;
-       hpagesz_mb = hpagesz_kb / 1024;
-       sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
-       fd = open(buf, O_RDONLY);
-       if (fd < 0) {
-               fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
-                       hpagesz_mb, strerror(errno));
-               return;
-       }
-
-       /* -1 to guarantee leaving the trailing \0 */
-       err = read(fd, buf, sizeof(buf)-1);
-       close(fd);
-       if (err <= 0) {
-               fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
-                       hpagesz_mb, strerror(errno));
-               return;
-       }
-
-       if (atoi(buf) != GET_NR_HUGE_PAGES) {
-               fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
-                       hpagesz_mb, buf, GET_NR_HUGE_PAGES);
-               return;
-       }
-
-       hugetlb_setup_ok = 1;
-}
-
-void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
-{
-       void *ptr;
-       int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
-
-       if (!hugetlb_setup_ok)
-               return PTR_ERR_ENOTSUP;
-
-       dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
-       size = ALIGN_UP(size, HPAGE_SIZE * 2);
-       pkey_assert(pkey < NR_PKEYS);
-       ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
-       pkey_assert(ptr != (void *)-1);
-       mprotect_pkey(ptr, size, prot, pkey);
-
-       record_pkey_malloc(ptr, size, prot);
-
-       dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
-       return ptr;
-}
-
-void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
-{
-       void *ptr;
-       int fd;
-
-       dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
-                       size, prot, pkey);
-       pkey_assert(pkey < NR_PKEYS);
-       fd = open("/dax/foo", O_RDWR);
-       pkey_assert(fd >= 0);
-
-       ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
-       pkey_assert(ptr != (void *)-1);
-
-       mprotect_pkey(ptr, size, prot, pkey);
-
-       record_pkey_malloc(ptr, size, prot);
-
-       dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
-       close(fd);
-       return ptr;
-}
-
-void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
-
-       malloc_pkey_with_mprotect,
-       malloc_pkey_with_mprotect_subpage,
-       malloc_pkey_anon_huge,
-       malloc_pkey_hugetlb
-/* can not do direct with the pkey_mprotect() API:
-       malloc_pkey_mmap_direct,
-       malloc_pkey_mmap_dax,
-*/
-};
-
-void *malloc_pkey(long size, int prot, u16 pkey)
-{
-       void *ret;
-       static int malloc_type;
-       int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
-
-       pkey_assert(pkey < NR_PKEYS);
-
-       while (1) {
-               pkey_assert(malloc_type < nr_malloc_types);
-
-               ret = pkey_malloc[malloc_type](size, prot, pkey);
-               pkey_assert(ret != (void *)-1);
-
-               malloc_type++;
-               if (malloc_type >= nr_malloc_types)
-                       malloc_type = (random()%nr_malloc_types);
-
-               /* try again if the malloc_type we tried is unsupported */
-               if (ret == PTR_ERR_ENOTSUP)
-                       continue;
-
-               break;
-       }
-
-       dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
-                       size, prot, pkey, ret);
-       return ret;
-}
-
-int last_pkey_faults;
-#define UNKNOWN_PKEY -2
-void expected_pkey_fault(int pkey)
-{
-       dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
-                       __func__, last_pkey_faults, pkey_faults);
-       dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
-       pkey_assert(last_pkey_faults + 1 == pkey_faults);
-
-       /*
-       * For exec-only memory, we do not know the pkey in
-       * advance, so skip this check.
-       */
-       if (pkey != UNKNOWN_PKEY)
-               pkey_assert(last_si_pkey == pkey);
-
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-       /*
-        * The signal handler shold have cleared out PKEY register to let the
-        * test program continue.  We now have to restore it.
-        */
-       if (__read_pkey_reg() != 0)
-#else /* arch */
-       if (__read_pkey_reg() != shadow_pkey_reg)
-#endif /* arch */
-               pkey_assert(0);
-
-       __write_pkey_reg(shadow_pkey_reg);
-       dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
-                      "nuked it\n", __func__, shadow_pkey_reg);
-       last_pkey_faults = pkey_faults;
-       last_si_pkey = -1;
-}
-
-#define do_not_expect_pkey_fault(msg)  do {                    \
-       if (last_pkey_faults != pkey_faults)                    \
-               dprintf0("unexpected PKey fault: %s\n", msg);   \
-       pkey_assert(last_pkey_faults == pkey_faults);           \
-} while (0)
-
-int test_fds[10] = { -1 };
-int nr_test_fds;
-void __save_test_fd(int fd)
-{
-       pkey_assert(fd >= 0);
-       pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
-       test_fds[nr_test_fds] = fd;
-       nr_test_fds++;
-}
-
-int get_test_read_fd(void)
-{
-       int test_fd = open("/etc/passwd", O_RDONLY);
-       __save_test_fd(test_fd);
-       return test_fd;
-}
-
-void close_test_fds(void)
-{
-       int i;
-
-       for (i = 0; i < nr_test_fds; i++) {
-               if (test_fds[i] < 0)
-                       continue;
-               close(test_fds[i]);
-               test_fds[i] = -1;
-       }
-       nr_test_fds = 0;
-}
-
-#define barrier() __asm__ __volatile__("": : :"memory")
-__attribute__((noinline)) int read_ptr(int *ptr)
-{
-       /*
-        * Keep GCC from optimizing this away somehow
-        */
-       barrier();
-       return *ptr;
-}
-
-void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
-{
-       int i, err;
-       int max_nr_pkey_allocs;
-       int alloced_pkeys[NR_PKEYS];
-       int nr_alloced = 0;
-       long size;
-
-       pkey_assert(pkey_last_malloc_record);
-       size = pkey_last_malloc_record->size;
-       /*
-        * This is a bit of a hack.  But mprotect() requires
-        * huge-page-aligned sizes when operating on hugetlbfs.
-        * So, make sure that we use something that's a multiple
-        * of a huge page when we can.
-        */
-       if (size >= HPAGE_SIZE)
-               size = HPAGE_SIZE;
-
-       /* allocate every possible key and make sure key-0 never got allocated */
-       max_nr_pkey_allocs = NR_PKEYS;
-       for (i = 0; i < max_nr_pkey_allocs; i++) {
-               int new_pkey = alloc_pkey();
-               pkey_assert(new_pkey != 0);
-
-               if (new_pkey < 0)
-                       break;
-               alloced_pkeys[nr_alloced++] = new_pkey;
-       }
-       /* free all the allocated keys */
-       for (i = 0; i < nr_alloced; i++) {
-               int free_ret;
-
-               if (!alloced_pkeys[i])
-                       continue;
-               free_ret = sys_pkey_free(alloced_pkeys[i]);
-               pkey_assert(!free_ret);
-       }
-
-       /* attach key-0 in various modes */
-       err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
-       pkey_assert(!err);
-       err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
-       pkey_assert(!err);
-       err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
-       pkey_assert(!err);
-       err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
-       pkey_assert(!err);
-       err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
-       pkey_assert(!err);
-}
-
-void test_read_of_write_disabled_region(int *ptr, u16 pkey)
-{
-       int ptr_contents;
-
-       dprintf1("disabling write access to PKEY[1], doing read\n");
-       pkey_write_deny(pkey);
-       ptr_contents = read_ptr(ptr);
-       dprintf1("*ptr: %d\n", ptr_contents);
-       dprintf1("\n");
-}
-void test_read_of_access_disabled_region(int *ptr, u16 pkey)
-{
-       int ptr_contents;
-
-       dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
-       read_pkey_reg();
-       pkey_access_deny(pkey);
-       ptr_contents = read_ptr(ptr);
-       dprintf1("*ptr: %d\n", ptr_contents);
-       expected_pkey_fault(pkey);
-}
-
-void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
-               u16 pkey)
-{
-       int ptr_contents;
-
-       dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
-                               pkey, ptr);
-       ptr_contents = read_ptr(ptr);
-       dprintf1("reading ptr before disabling the read : %d\n",
-                       ptr_contents);
-       read_pkey_reg();
-       pkey_access_deny(pkey);
-       ptr_contents = read_ptr(ptr);
-       dprintf1("*ptr: %d\n", ptr_contents);
-       expected_pkey_fault(pkey);
-}
-
-void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
-               u16 pkey)
-{
-       *ptr = __LINE__;
-       dprintf1("disabling write access; after accessing the page, "
-               "to PKEY[%02d], doing write\n", pkey);
-       pkey_write_deny(pkey);
-       *ptr = __LINE__;
-       expected_pkey_fault(pkey);
-}
-
-void test_write_of_write_disabled_region(int *ptr, u16 pkey)
-{
-       dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
-       pkey_write_deny(pkey);
-       *ptr = __LINE__;
-       expected_pkey_fault(pkey);
-}
-void test_write_of_access_disabled_region(int *ptr, u16 pkey)
-{
-       dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
-       pkey_access_deny(pkey);
-       *ptr = __LINE__;
-       expected_pkey_fault(pkey);
-}
-
-void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
-                       u16 pkey)
-{
-       *ptr = __LINE__;
-       dprintf1("disabling access; after accessing the page, "
-               " to PKEY[%02d], doing write\n", pkey);
-       pkey_access_deny(pkey);
-       *ptr = __LINE__;
-       expected_pkey_fault(pkey);
-}
-
-void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
-{
-       int ret;
-       int test_fd = get_test_read_fd();
-
-       dprintf1("disabling access to PKEY[%02d], "
-                "having kernel read() to buffer\n", pkey);
-       pkey_access_deny(pkey);
-       ret = read(test_fd, ptr, 1);
-       dprintf1("read ret: %d\n", ret);
-       pkey_assert(ret);
-}
-void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
-{
-       int ret;
-       int test_fd = get_test_read_fd();
-
-       pkey_write_deny(pkey);
-       ret = read(test_fd, ptr, 100);
-       dprintf1("read ret: %d\n", ret);
-       if (ret < 0 && (DEBUG_LEVEL > 0))
-               perror("verbose read result (OK for this to be bad)");
-       pkey_assert(ret);
-}
-
-void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
-{
-       int pipe_ret, vmsplice_ret;
-       struct iovec iov;
-       int pipe_fds[2];
-
-       pipe_ret = pipe(pipe_fds);
-
-       pkey_assert(pipe_ret == 0);
-       dprintf1("disabling access to PKEY[%02d], "
-                "having kernel vmsplice from buffer\n", pkey);
-       pkey_access_deny(pkey);
-       iov.iov_base = ptr;
-       iov.iov_len = PAGE_SIZE;
-       vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
-       dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
-       pkey_assert(vmsplice_ret == -1);
-
-       close(pipe_fds[0]);
-       close(pipe_fds[1]);
-}
-
-void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
-{
-       int ignored = 0xdada;
-       int futex_ret;
-       int some_int = __LINE__;
-
-       dprintf1("disabling write to PKEY[%02d], "
-                "doing futex gunk in buffer\n", pkey);
-       *ptr = some_int;
-       pkey_write_deny(pkey);
-       futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
-                       &ignored, ignored);
-       if (DEBUG_LEVEL > 0)
-               perror("futex");
-       dprintf1("futex() ret: %d\n", futex_ret);
-}
-
-/* Assumes that all pkeys other than 'pkey' are unallocated */
-void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
-{
-       int err;
-       int i;
-
-       /* Note: 0 is the default pkey, so don't mess with it */
-       for (i = 1; i < NR_PKEYS; i++) {
-               if (pkey == i)
-                       continue;
-
-               dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
-               err = sys_pkey_free(i);
-               pkey_assert(err);
-
-               err = sys_pkey_free(i);
-               pkey_assert(err);
-
-               err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
-               pkey_assert(err);
-       }
-}
-
-/* Assumes that all pkeys other than 'pkey' are unallocated */
-void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
-{
-       int err;
-       int bad_pkey = NR_PKEYS+99;
-
-       /* pass a known-invalid pkey in: */
-       err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
-       pkey_assert(err);
-}
-
-void become_child(void)
-{
-       pid_t forkret;
-
-       forkret = fork();
-       pkey_assert(forkret >= 0);
-       dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
-
-       if (!forkret) {
-               /* in the child */
-               return;
-       }
-       exit(0);
-}
-
-/* Assumes that all pkeys other than 'pkey' are unallocated */
-void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
-{
-       int err;
-       int allocated_pkeys[NR_PKEYS] = {0};
-       int nr_allocated_pkeys = 0;
-       int i;
-
-       for (i = 0; i < NR_PKEYS*3; i++) {
-               int new_pkey;
-               dprintf1("%s() alloc loop: %d\n", __func__, i);
-               new_pkey = alloc_pkey();
-               dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
-                               " shadow: 0x%016llx\n",
-                               __func__, __LINE__, err, __read_pkey_reg(),
-                               shadow_pkey_reg);
-               read_pkey_reg(); /* for shadow checking */
-               dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
-               if ((new_pkey == -1) && (errno == ENOSPC)) {
-                       dprintf2("%s() failed to allocate pkey after %d tries\n",
-                               __func__, nr_allocated_pkeys);
-               } else {
-                       /*
-                        * Ensure the number of successes never
-                        * exceeds the number of keys supported
-                        * in the hardware.
-                        */
-                       pkey_assert(nr_allocated_pkeys < NR_PKEYS);
-                       allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
-               }
-
-               /*
-                * Make sure that allocation state is properly
-                * preserved across fork().
-                */
-               if (i == NR_PKEYS*2)
-                       become_child();
-       }
-
-       dprintf3("%s()::%d\n", __func__, __LINE__);
-
-       /*
-        * On x86:
-        * There are 16 pkeys supported in hardware.  Three are
-        * allocated by the time we get here:
-        *   1. The default key (0)
-        *   2. One possibly consumed by an execute-only mapping.
-        *   3. One allocated by the test code and passed in via
-        *      'pkey' to this function.
-        * Ensure that we can allocate at least another 13 (16-3).
-        *
-        * On powerpc:
-        * There are either 5, 28, 29 or 32 pkeys supported in
-        * hardware depending on the page size (4K or 64K) and
-        * platform (powernv or powervm). Four are allocated by
-        * the time we get here. These include pkey-0, pkey-1,
-        * exec-only pkey and the one allocated by the test code.
-        * Ensure that we can allocate the remaining.
-        */
-       pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
-
-       for (i = 0; i < nr_allocated_pkeys; i++) {
-               err = sys_pkey_free(allocated_pkeys[i]);
-               pkey_assert(!err);
-               read_pkey_reg(); /* for shadow checking */
-       }
-}
-
-void arch_force_pkey_reg_init(void)
-{
-#if defined(__i386__) || defined(__x86_64__) /* arch */
-       u64 *buf;
-
-       /*
-        * All keys should be allocated and set to allow reads and
-        * writes, so the register should be all 0.  If not, just
-        * skip the test.
-        */
-       if (read_pkey_reg())
-               return;
-
-       /*
-        * Just allocate an absurd about of memory rather than
-        * doing the XSAVE size enumeration dance.
-        */
-       buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-
-       /* These __builtins require compiling with -mxsave */
-
-       /* XSAVE to build a valid buffer: */
-       __builtin_ia32_xsave(buf, XSTATE_PKEY);
-       /* Clear XSTATE_BV[PKRU]: */
-       buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
-       /* XRSTOR will likely get PKRU back to the init state: */
-       __builtin_ia32_xrstor(buf, XSTATE_PKEY);
-
-       munmap(buf, 1*MB);
-#endif
-}
-
-
-/*
- * This is mostly useless on ppc for now.  But it will not
- * hurt anything and should give some better coverage as
- * a long-running test that continually checks the pkey
- * register.
- */
-void test_pkey_init_state(int *ptr, u16 pkey)
-{
-       int err;
-       int allocated_pkeys[NR_PKEYS] = {0};
-       int nr_allocated_pkeys = 0;
-       int i;
-
-       for (i = 0; i < NR_PKEYS; i++) {
-               int new_pkey = alloc_pkey();
-
-               if (new_pkey < 0)
-                       continue;
-               allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
-       }
-
-       dprintf3("%s()::%d\n", __func__, __LINE__);
-
-       arch_force_pkey_reg_init();
-
-       /*
-        * Loop for a bit, hoping to get exercise the kernel
-        * context switch code.
-        */
-       for (i = 0; i < 1000000; i++)
-               read_pkey_reg();
-
-       for (i = 0; i < nr_allocated_pkeys; i++) {
-               err = sys_pkey_free(allocated_pkeys[i]);
-               pkey_assert(!err);
-               read_pkey_reg(); /* for shadow checking */
-       }
-}
-
-/*
- * pkey 0 is special.  It is allocated by default, so you do not
- * have to call pkey_alloc() to use it first.  Make sure that it
- * is usable.
- */
-void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
-{
-       long size;
-       int prot;
-
-       assert(pkey_last_malloc_record);
-       size = pkey_last_malloc_record->size;
-       /*
-        * This is a bit of a hack.  But mprotect() requires
-        * huge-page-aligned sizes when operating on hugetlbfs.
-        * So, make sure that we use something that's a multiple
-        * of a huge page when we can.
-        */
-       if (size >= HPAGE_SIZE)
-               size = HPAGE_SIZE;
-       prot = pkey_last_malloc_record->prot;
-
-       /* Use pkey 0 */
-       mprotect_pkey(ptr, size, prot, 0);
-
-       /* Make sure that we can set it back to the original pkey. */
-       mprotect_pkey(ptr, size, prot, pkey);
-}
-
-void test_ptrace_of_child(int *ptr, u16 pkey)
-{
-       __attribute__((__unused__)) int peek_result;
-       pid_t child_pid;
-       void *ignored = 0;
-       long ret;
-       int status;
-       /*
-        * This is the "control" for our little expermient.  Make sure
-        * we can always access it when ptracing.
-        */
-       int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
-       int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
-
-       /*
-        * Fork a child which is an exact copy of this process, of course.
-        * That means we can do all of our tests via ptrace() and then plain
-        * memory access and ensure they work differently.
-        */
-       child_pid = fork_lazy_child();
-       dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
-
-       ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
-       if (ret)
-               perror("attach");
-       dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
-       pkey_assert(ret != -1);
-       ret = waitpid(child_pid, &status, WUNTRACED);
-       if ((ret != child_pid) || !(WIFSTOPPED(status))) {
-               fprintf(stderr, "weird waitpid result %ld stat %x\n",
-                               ret, status);
-               pkey_assert(0);
-       }
-       dprintf2("waitpid ret: %ld\n", ret);
-       dprintf2("waitpid status: %d\n", status);
-
-       pkey_access_deny(pkey);
-       pkey_write_deny(pkey);
-
-       /* Write access, untested for now:
-       ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
-       pkey_assert(ret != -1);
-       dprintf1("poke at %p: %ld\n", peek_at, ret);
-       */
-
-       /*
-        * Try to access the pkey-protected "ptr" via ptrace:
-        */
-       ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
-       /* expect it to work, without an error: */
-       pkey_assert(ret != -1);
-       /* Now access from the current task, and expect an exception: */
-       peek_result = read_ptr(ptr);
-       expected_pkey_fault(pkey);
-
-       /*
-        * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
-        */
-       ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
-       /* expect it to work, without an error: */
-       pkey_assert(ret != -1);
-       /* Now access from the current task, and expect NO exception: */
-       peek_result = read_ptr(plain_ptr);
-       do_not_expect_pkey_fault("read plain pointer after ptrace");
-
-       ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
-       pkey_assert(ret != -1);
-
-       ret = kill(child_pid, SIGKILL);
-       pkey_assert(ret != -1);
-
-       wait(&status);
-
-       free(plain_ptr_unaligned);
-}
-
-void *get_pointer_to_instructions(void)
-{
-       void *p1;
-
-       p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
-       dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
-       /* lots_o_noops_around_write should be page-aligned already */
-       assert(p1 == &lots_o_noops_around_write);
-
-       /* Point 'p1' at the *second* page of the function: */
-       p1 += PAGE_SIZE;
-
-       /*
-        * Try to ensure we fault this in on next touch to ensure
-        * we get an instruction fault as opposed to a data one
-        */
-       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
-
-       return p1;
-}
-
-void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
-{
-       void *p1;
-       int scratch;
-       int ptr_contents;
-       int ret;
-
-       p1 = get_pointer_to_instructions();
-       lots_o_noops_around_write(&scratch);
-       ptr_contents = read_ptr(p1);
-       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-
-       ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
-       pkey_assert(!ret);
-       pkey_access_deny(pkey);
-
-       dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
-
-       /*
-        * Make sure this is an *instruction* fault
-        */
-       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
-       lots_o_noops_around_write(&scratch);
-       do_not_expect_pkey_fault("executing on PROT_EXEC memory");
-       expect_fault_on_read_execonly_key(p1, pkey);
-}
-
-void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
-{
-       void *p1;
-       int scratch;
-       int ptr_contents;
-       int ret;
-
-       dprintf1("%s() start\n", __func__);
-
-       p1 = get_pointer_to_instructions();
-       lots_o_noops_around_write(&scratch);
-       ptr_contents = read_ptr(p1);
-       dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-
-       /* Use a *normal* mprotect(), not mprotect_pkey(): */
-       ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
-       pkey_assert(!ret);
-
-       /*
-        * Reset the shadow, assuming that the above mprotect()
-        * correctly changed PKRU, but to an unknown value since
-        * the actual allocated pkey is unknown.
-        */
-       shadow_pkey_reg = __read_pkey_reg();
-
-       dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
-
-       /* Make sure this is an *instruction* fault */
-       madvise(p1, PAGE_SIZE, MADV_DONTNEED);
-       lots_o_noops_around_write(&scratch);
-       do_not_expect_pkey_fault("executing on PROT_EXEC memory");
-       expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
-
-       /*
-        * Put the memory back to non-PROT_EXEC.  Should clear the
-        * exec-only pkey off the VMA and allow it to be readable
-        * again.  Go to PROT_NONE first to check for a kernel bug
-        * that did not clear the pkey when doing PROT_NONE.
-        */
-       ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
-       pkey_assert(!ret);
-
-       ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
-       pkey_assert(!ret);
-       ptr_contents = read_ptr(p1);
-       do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
-}
-
-#if defined(__i386__) || defined(__x86_64__)
-void test_ptrace_modifies_pkru(int *ptr, u16 pkey)
-{
-       u32 new_pkru;
-       pid_t child;
-       int status, ret;
-       int pkey_offset = pkey_reg_xstate_offset();
-       size_t xsave_size = cpu_max_xsave_size();
-       void *xsave;
-       u32 *pkey_register;
-       u64 *xstate_bv;
-       struct iovec iov;
-
-       new_pkru = ~read_pkey_reg();
-       /* Don't make PROT_EXEC mappings inaccessible */
-       new_pkru &= ~3;
-
-       child = fork();
-       pkey_assert(child >= 0);
-       dprintf3("[%d] fork() ret: %d\n", getpid(), child);
-       if (!child) {
-               ptrace(PTRACE_TRACEME, 0, 0, 0);
-               /* Stop and allow the tracer to modify PKRU directly */
-               raise(SIGSTOP);
-
-               /*
-                * need __read_pkey_reg() version so we do not do shadow_pkey_reg
-                * checking
-                */
-               if (__read_pkey_reg() != new_pkru)
-                       exit(1);
-
-               /* Stop and allow the tracer to clear XSTATE_BV for PKRU */
-               raise(SIGSTOP);
-
-               if (__read_pkey_reg() != 0)
-                       exit(1);
-
-               /* Stop and allow the tracer to examine PKRU */
-               raise(SIGSTOP);
-
-               exit(0);
-       }
-
-       pkey_assert(child == waitpid(child, &status, 0));
-       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-       pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-
-       xsave = (void *)malloc(xsave_size);
-       pkey_assert(xsave > 0);
-
-       /* Modify the PKRU register directly */
-       iov.iov_base = xsave;
-       iov.iov_len = xsave_size;
-       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-
-       pkey_register = (u32 *)(xsave + pkey_offset);
-       pkey_assert(*pkey_register == read_pkey_reg());
-
-       *pkey_register = new_pkru;
-
-       ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-
-       /* Test that the modification is visible in ptrace before any execution */
-       memset(xsave, 0xCC, xsave_size);
-       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-       pkey_assert(*pkey_register == new_pkru);
-
-       /* Execute the tracee */
-       ret = ptrace(PTRACE_CONT, child, 0, 0);
-       pkey_assert(ret == 0);
-
-       /* Test that the tracee saw the PKRU value change */
-       pkey_assert(child == waitpid(child, &status, 0));
-       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-       pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-
-       /* Test that the modification is visible in ptrace after execution */
-       memset(xsave, 0xCC, xsave_size);
-       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-       pkey_assert(*pkey_register == new_pkru);
-
-       /* Clear the PKRU bit from XSTATE_BV */
-       xstate_bv = (u64 *)(xsave + 512);
-       *xstate_bv &= ~(1 << 9);
-
-       ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-
-       /* Test that the modification is visible in ptrace before any execution */
-       memset(xsave, 0xCC, xsave_size);
-       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-       pkey_assert(*pkey_register == 0);
-
-       ret = ptrace(PTRACE_CONT, child, 0, 0);
-       pkey_assert(ret == 0);
-
-       /* Test that the tracee saw the PKRU value go to 0 */
-       pkey_assert(child == waitpid(child, &status, 0));
-       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-       pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
-
-       /* Test that the modification is visible in ptrace after execution */
-       memset(xsave, 0xCC, xsave_size);
-       ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov);
-       pkey_assert(ret == 0);
-       pkey_assert(*pkey_register == 0);
-
-       ret = ptrace(PTRACE_CONT, child, 0, 0);
-       pkey_assert(ret == 0);
-       pkey_assert(child == waitpid(child, &status, 0));
-       dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status);
-       pkey_assert(WIFEXITED(status));
-       pkey_assert(WEXITSTATUS(status) == 0);
-       free(xsave);
-}
-#endif
-
-void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
-{
-       int size = PAGE_SIZE;
-       int sret;
-
-       if (cpu_has_pkeys()) {
-               dprintf1("SKIP: %s: no CPU support\n", __func__);
-               return;
-       }
-
-       sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
-       pkey_assert(sret < 0);
-}
-
-void (*pkey_tests[])(int *ptr, u16 pkey) = {
-       test_read_of_write_disabled_region,
-       test_read_of_access_disabled_region,
-       test_read_of_access_disabled_region_with_page_already_mapped,
-       test_write_of_write_disabled_region,
-       test_write_of_write_disabled_region_with_page_already_mapped,
-       test_write_of_access_disabled_region,
-       test_write_of_access_disabled_region_with_page_already_mapped,
-       test_kernel_write_of_access_disabled_region,
-       test_kernel_write_of_write_disabled_region,
-       test_kernel_gup_of_access_disabled_region,
-       test_kernel_gup_write_to_write_disabled_region,
-       test_executing_on_unreadable_memory,
-       test_implicit_mprotect_exec_only_memory,
-       test_mprotect_with_pkey_0,
-       test_ptrace_of_child,
-       test_pkey_init_state,
-       test_pkey_syscalls_on_non_allocated_pkey,
-       test_pkey_syscalls_bad_args,
-       test_pkey_alloc_exhaust,
-       test_pkey_alloc_free_attach_pkey0,
-#if defined(__i386__) || defined(__x86_64__)
-       test_ptrace_modifies_pkru,
-#endif
-};
-
-void run_tests_once(void)
-{
-       int *ptr;
-       int prot = PROT_READ|PROT_WRITE;
-
-       for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
-               int pkey;
-               int orig_pkey_faults = pkey_faults;
-
-               dprintf1("======================\n");
-               dprintf1("test %d preparing...\n", test_nr);
-
-               tracing_on();
-               pkey = alloc_random_pkey();
-               dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
-               ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
-               dprintf1("test %d starting...\n", test_nr);
-               pkey_tests[test_nr](ptr, pkey);
-               dprintf1("freeing test memory: %p\n", ptr);
-               free_pkey_malloc(ptr);
-               sys_pkey_free(pkey);
-
-               dprintf1("pkey_faults: %d\n", pkey_faults);
-               dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
-
-               tracing_off();
-               close_test_fds();
-
-               printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
-               dprintf1("======================\n\n");
-       }
-       iteration_nr++;
-}
-
-void pkey_setup_shadow(void)
-{
-       shadow_pkey_reg = __read_pkey_reg();
-}
-
-int main(void)
-{
-       int nr_iterations = 22;
-       int pkeys_supported = is_pkeys_supported();
-
-       srand((unsigned int)time(NULL));
-
-       setup_handlers();
-
-       printf("has pkeys: %d\n", pkeys_supported);
-
-       if (!pkeys_supported) {
-               int size = PAGE_SIZE;
-               int *ptr;
-
-               printf("running PKEY tests for unsupported CPU/OS\n");
-
-               ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-               assert(ptr != (void *)-1);
-               test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
-               exit(0);
-       }
-
-       pkey_setup_shadow();
-       printf("startup pkey_reg: %016llx\n", read_pkey_reg());
-       setup_hugetlbfs();
-
-       while (nr_iterations-- > 0)
-               run_tests_once();
-
-       printf("done (all tests OK)\n");
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh
deleted file mode 100755 (executable)
index 8984e0b..0000000
+++ /dev/null
@@ -1,274 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-# Please run as root
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-exitcode=0
-
-usage() {
-       cat <<EOF
-usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"]
-  -t: specify specific categories to tests to run
-  -h: display this message
-
-The default behavior is to run all tests.
-
-Alternatively, specific groups tests can be run by passing a string
-to the -t argument containing one or more of the following categories
-separated by spaces:
-- mmap
-       tests for mmap(2)
-- gup_test
-       tests for gup using gup_test interface
-- userfaultfd
-       tests for  userfaultfd(2)
-- compaction
-       a test for the patch "Allow compaction of unevictable pages"
-- mlock
-       tests for mlock(2)
-- mremap
-       tests for mremap(2)
-- hugevm
-       tests for very large virtual address space
-- vmalloc
-       vmalloc smoke tests
-- hmm
-       hmm smoke tests
-- madv_populate
-       test memadvise(2) MADV_POPULATE_{READ,WRITE} options
-- memfd_secret
-       test memfd_secret(2)
-- process_mrelease
-       test process_mrelease(2)
-- ksm
-       ksm tests that do not require >=2 NUMA nodes
-- ksm_numa
-       ksm tests that require >=2 NUMA nodes
-- pkey
-       memory protection key tests
-- soft_dirty
-       test soft dirty page bit semantics
-- cow
-       test copy-on-write semantics
-example: ./run_vmtests.sh -t "hmm mmap ksm"
-EOF
-       exit 0
-}
-
-
-while getopts "ht:" OPT; do
-       case ${OPT} in
-               "h") usage ;;
-               "t") VM_SELFTEST_ITEMS=${OPTARG} ;;
-       esac
-done
-shift $((OPTIND -1))
-
-# default behavior: run all tests
-VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default}
-
-test_selected() {
-       if [ "$VM_SELFTEST_ITEMS" == "default" ]; then
-               # If no VM_SELFTEST_ITEMS are specified, run all tests
-               return 0
-       fi
-       # If test selected argument is one of the test items
-       if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then
-               return 0
-       else
-               return 1
-       fi
-}
-
-# get huge pagesize and freepages from /proc/meminfo
-while read -r name size unit; do
-       if [ "$name" = "HugePages_Free:" ]; then
-               freepgs="$size"
-       fi
-       if [ "$name" = "Hugepagesize:" ]; then
-               hpgsize_KB="$size"
-       fi
-done < /proc/meminfo
-
-# Simple hugetlbfs tests have a hardcoded minimum requirement of
-# huge pages totaling 256MB (262144KB) in size.  The userfaultfd
-# hugetlb test requires a minimum of 2 * nr_cpus huge pages.  Take
-# both of these requirements into account and attempt to increase
-# number of huge pages available.
-nr_cpus=$(nproc)
-hpgsize_MB=$((hpgsize_KB / 1024))
-half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
-needmem_KB=$((half_ufd_size_MB * 2 * 1024))
-
-# set proper nr_hugepages
-if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
-       nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
-       needpgs=$((needmem_KB / hpgsize_KB))
-       tries=2
-       while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do
-               lackpgs=$((needpgs - freepgs))
-               echo 3 > /proc/sys/vm/drop_caches
-               if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
-                       echo "Please run this test as root"
-                       exit $ksft_skip
-               fi
-               while read -r name size unit; do
-                       if [ "$name" = "HugePages_Free:" ]; then
-                               freepgs=$size
-                       fi
-               done < /proc/meminfo
-               tries=$((tries - 1))
-       done
-       if [ "$freepgs" -lt "$needpgs" ]; then
-               printf "Not enough huge pages available (%d < %d)\n" \
-                      "$freepgs" "$needpgs"
-               exit 1
-       fi
-else
-       echo "no hugetlbfs support in kernel?"
-       exit 1
-fi
-
-# filter 64bit architectures
-ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
-if [ -z "$ARCH" ]; then
-       ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/')
-fi
-VADDR64=0
-echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1
-
-# Usage: run_test [test binary] [arbitrary test arguments...]
-run_test() {
-       if test_selected ${CATEGORY}; then
-               local title="running $*"
-               local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -)
-               printf "%s\n%s\n%s\n" "$sep" "$title" "$sep"
-
-               "$@"
-               local ret=$?
-               if [ $ret -eq 0 ]; then
-                       echo "[PASS]"
-               elif [ $ret -eq $ksft_skip ]; then
-                       echo "[SKIP]"
-                       exitcode=$ksft_skip
-               else
-                       echo "[FAIL]"
-                       exitcode=1
-               fi
-       fi # test_selected
-}
-
-CATEGORY="hugetlb" run_test ./hugepage-mmap
-
-shmmax=$(cat /proc/sys/kernel/shmmax)
-shmall=$(cat /proc/sys/kernel/shmall)
-echo 268435456 > /proc/sys/kernel/shmmax
-echo 4194304 > /proc/sys/kernel/shmall
-CATEGORY="hugetlb" run_test ./hugepage-shm
-echo "$shmmax" > /proc/sys/kernel/shmmax
-echo "$shmall" > /proc/sys/kernel/shmall
-
-CATEGORY="hugetlb" run_test ./map_hugetlb
-CATEGORY="hugetlb" run_test ./hugepage-mremap
-CATEGORY="hugetlb" run_test ./hugepage-vmemmap
-CATEGORY="hugetlb" run_test ./hugetlb-madvise
-
-if test_selected "hugetlb"; then
-       echo "NOTE: These hugetlb tests provide minimal coverage.  Use"
-       echo "      https://github.com/libhugetlbfs/libhugetlbfs.git for"
-       echo "      hugetlb regression testing."
-fi
-
-CATEGORY="mmap" run_test ./map_fixed_noreplace
-
-# get_user_pages_fast() benchmark
-CATEGORY="gup_test" run_test ./gup_test -u
-# pin_user_pages_fast() benchmark
-CATEGORY="gup_test" run_test ./gup_test -a
-# Dump pages 0, 19, and 4096, using pin_user_pages:
-CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000
-
-uffd_mods=("" ":dev")
-for mod in "${uffd_mods[@]}"; do
-       CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16
-       # Hugetlb tests require source and destination huge pages. Pass in half
-       # the size ($half_ufd_size_MB), which is used for *each*.
-       CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
-       CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32
-       CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16
-done
-
-#cleanup
-echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
-
-CATEGORY="compaction" run_test ./compaction_test
-
-CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit
-
-CATEGORY="mmap" run_test ./map_populate
-
-CATEGORY="mlock" run_test ./mlock-random-test
-
-CATEGORY="mlock" run_test ./mlock2-tests
-
-CATEGORY="process_mrelease" run_test ./mrelease_test
-
-CATEGORY="mremap" run_test ./mremap_test
-
-CATEGORY="hugetlb" run_test ./thuge-gen
-
-if [ $VADDR64 -ne 0 ]; then
-       CATEGORY="hugevm" run_test ./virtual_address_range
-
-       # virtual address 128TB switch test
-       CATEGORY="hugevm" run_test ./va_128TBswitch.sh
-fi # VADDR64
-
-# vmalloc stability smoke test
-CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke
-
-CATEGORY="mremap" run_test ./mremap_dontunmap
-
-CATEGORY="hmm" run_test ./test_hmm.sh smoke
-
-# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
-CATEGORY="madv_populate" run_test ./madv_populate
-
-CATEGORY="memfd_secret" run_test ./memfd_secret
-
-# KSM MADV_MERGEABLE test with 10 identical pages
-CATEGORY="ksm" run_test ./ksm_tests -M -p 10
-# KSM unmerge test
-CATEGORY="ksm" run_test ./ksm_tests -U
-# KSM test with 10 zero pages and use_zero_pages = 0
-CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0
-# KSM test with 10 zero pages and use_zero_pages = 1
-CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1
-# KSM test with 2 NUMA nodes and merge_across_nodes = 1
-CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1
-# KSM test with 2 NUMA nodes and merge_across_nodes = 0
-CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
-
-CATEGORY="ksm" run_test ./ksm_functional_tests
-
-run_test ./ksm_functional_tests
-
-# protection_keys tests
-if [ -x ./protection_keys_32 ]
-then
-       CATEGORY="pkey" run_test ./protection_keys_32
-fi
-
-if [ -x ./protection_keys_64 ]
-then
-       CATEGORY="pkey" run_test ./protection_keys_64
-fi
-
-CATEGORY="soft_dirty" run_test ./soft-dirty
-
-# COW tests
-CATEGORY="cow" run_test ./cow
-
-exit $exitcode
diff --git a/tools/testing/selftests/vm/settings b/tools/testing/selftests/vm/settings
deleted file mode 100644 (file)
index 9abfc60..0000000
+++ /dev/null
@@ -1 +0,0 @@
-timeout=45
diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c
deleted file mode 100644 (file)
index 21d8830..0000000
+++ /dev/null
@@ -1,210 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <stdio.h>
-#include <string.h>
-#include <stdbool.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <malloc.h>
-#include <sys/mman.h>
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define PAGEMAP_FILE_PATH "/proc/self/pagemap"
-#define TEST_ITERATIONS 10000
-
-static void test_simple(int pagemap_fd, int pagesize)
-{
-       int i;
-       char *map;
-
-       map = aligned_alloc(pagesize, pagesize);
-       if (!map)
-               ksft_exit_fail_msg("mmap failed\n");
-
-       clear_softdirty();
-
-       for (i = 0 ; i < TEST_ITERATIONS; i++) {
-               if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
-                       ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
-                       break;
-               }
-
-               clear_softdirty();
-               // Write something to the page to get the dirty bit enabled on the page
-               map[0]++;
-
-               if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
-                       ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
-                       break;
-               }
-
-               clear_softdirty();
-       }
-       free(map);
-
-       ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__);
-}
-
-static void test_vma_reuse(int pagemap_fd, int pagesize)
-{
-       char *map, *map2;
-
-       map = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
-       if (map == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed");
-
-       // The kernel always marks new regions as soft dirty
-       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-                        "Test %s dirty bit of allocated page\n", __func__);
-
-       clear_softdirty();
-       munmap(map, pagesize);
-
-       map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0);
-       if (map2 == MAP_FAILED)
-               ksft_exit_fail_msg("mmap failed");
-
-       // Dirty bit is set for new regions even if they are reused
-       if (map == map2)
-               ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1,
-                                "Test %s dirty bit of reused address page\n", __func__);
-       else
-               ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__);
-
-       munmap(map2, pagesize);
-}
-
-static void test_hugepage(int pagemap_fd, int pagesize)
-{
-       char *map;
-       int i, ret;
-       size_t hpage_len = read_pmd_pagesize();
-
-       map = memalign(hpage_len, hpage_len);
-       if (!map)
-               ksft_exit_fail_msg("memalign failed\n");
-
-       ret = madvise(map, hpage_len, MADV_HUGEPAGE);
-       if (ret)
-               ksft_exit_fail_msg("madvise failed %d\n", ret);
-
-       for (i = 0; i < hpage_len; i++)
-               map[i] = (char)i;
-
-       if (check_huge_anon(map, 1, hpage_len)) {
-               ksft_test_result_pass("Test %s huge page allocation\n", __func__);
-
-               clear_softdirty();
-               for (i = 0 ; i < TEST_ITERATIONS ; i++) {
-                       if (pagemap_is_softdirty(pagemap_fd, map) == 1) {
-                               ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i);
-                               break;
-                       }
-
-                       clear_softdirty();
-                       // Write something to the page to get the dirty bit enabled on the page
-                       map[0]++;
-
-                       if (pagemap_is_softdirty(pagemap_fd, map) == 0) {
-                               ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i);
-                               break;
-                       }
-                       clear_softdirty();
-               }
-
-               ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__);
-       } else {
-               // hugepage allocation failed. skip these tests
-               ksft_test_result_skip("Test %s huge page allocation\n", __func__);
-               ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
-       }
-       free(map);
-}
-
-static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
-{
-       const char *type[] = {"file", "anon"};
-       const char *fname = "./soft-dirty-test-file";
-       int test_fd;
-       char *map;
-
-       if (anon) {
-               map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
-                          MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-               if (!map)
-                       ksft_exit_fail_msg("anon mmap failed\n");
-       } else {
-               test_fd = open(fname, O_RDWR | O_CREAT);
-               if (test_fd < 0) {
-                       ksft_test_result_skip("Test %s open() file failed\n", __func__);
-                       return;
-               }
-               unlink(fname);
-               ftruncate(test_fd, pagesize);
-               map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
-                          MAP_SHARED, test_fd, 0);
-               if (!map)
-                       ksft_exit_fail_msg("file mmap failed\n");
-       }
-
-       *map = 1;
-       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-                        "Test %s-%s dirty bit of new written page\n",
-                        __func__, type[anon]);
-       clear_softdirty();
-       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-                        "Test %s-%s soft-dirty clear after clear_refs\n",
-                        __func__, type[anon]);
-       mprotect(map, pagesize, PROT_READ);
-       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-                        "Test %s-%s soft-dirty clear after marking RO\n",
-                        __func__, type[anon]);
-       mprotect(map, pagesize, PROT_READ|PROT_WRITE);
-       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-                        "Test %s-%s soft-dirty clear after marking RW\n",
-                        __func__, type[anon]);
-       *map = 2;
-       ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-                        "Test %s-%s soft-dirty after rewritten\n",
-                        __func__, type[anon]);
-
-       munmap(map, pagesize);
-
-       if (!anon)
-               close(test_fd);
-}
-
-static void test_mprotect_anon(int pagemap_fd, int pagesize)
-{
-       test_mprotect(pagemap_fd, pagesize, true);
-}
-
-static void test_mprotect_file(int pagemap_fd, int pagesize)
-{
-       test_mprotect(pagemap_fd, pagesize, false);
-}
-
-int main(int argc, char **argv)
-{
-       int pagemap_fd;
-       int pagesize;
-
-       ksft_print_header();
-       ksft_set_plan(15);
-
-       pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
-       if (pagemap_fd < 0)
-               ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
-
-       pagesize = getpagesize();
-
-       test_simple(pagemap_fd, pagesize);
-       test_vma_reuse(pagemap_fd, pagesize);
-       test_hugepage(pagemap_fd, pagesize);
-       test_mprotect_anon(pagemap_fd, pagesize);
-       test_mprotect_file(pagemap_fd, pagesize);
-
-       close(pagemap_fd);
-
-       return ksft_exit_pass();
-}
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
deleted file mode 100644 (file)
index 76e1c36..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
- * address range in a process via <debugfs>/split_huge_pages interface.
- */
-
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <inttypes.h>
-#include <string.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <malloc.h>
-#include <stdbool.h>
-#include "vm_util.h"
-
-uint64_t pagesize;
-unsigned int pageshift;
-uint64_t pmd_pagesize;
-
-#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages"
-#define INPUT_MAX 80
-
-#define PID_FMT "%d,0x%lx,0x%lx"
-#define PATH_FMT "%s,0x%lx,0x%lx"
-
-#define PFN_MASK     ((1UL<<55)-1)
-#define KPF_THP      (1UL<<22)
-
-int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
-{
-       uint64_t paddr;
-       uint64_t page_flags;
-
-       if (pagemap_file) {
-               pread(pagemap_file, &paddr, sizeof(paddr),
-                       ((long)vaddr >> pageshift) * sizeof(paddr));
-
-               if (kpageflags_file) {
-                       pread(kpageflags_file, &page_flags, sizeof(page_flags),
-                               (paddr & PFN_MASK) * sizeof(page_flags));
-
-                       return !!(page_flags & KPF_THP);
-               }
-       }
-       return 0;
-}
-
-static int write_file(const char *path, const char *buf, size_t buflen)
-{
-       int fd;
-       ssize_t numwritten;
-
-       fd = open(path, O_WRONLY);
-       if (fd == -1)
-               return 0;
-
-       numwritten = write(fd, buf, buflen - 1);
-       close(fd);
-       if (numwritten < 1)
-               return 0;
-
-       return (unsigned int) numwritten;
-}
-
-static void write_debugfs(const char *fmt, ...)
-{
-       char input[INPUT_MAX];
-       int ret;
-       va_list argp;
-
-       va_start(argp, fmt);
-       ret = vsnprintf(input, INPUT_MAX, fmt, argp);
-       va_end(argp);
-
-       if (ret >= INPUT_MAX) {
-               printf("%s: Debugfs input is too long\n", __func__);
-               exit(EXIT_FAILURE);
-       }
-
-       if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) {
-               perror(SPLIT_DEBUGFS);
-               exit(EXIT_FAILURE);
-       }
-}
-
-void split_pmd_thp(void)
-{
-       char *one_page;
-       size_t len = 4 * pmd_pagesize;
-       size_t i;
-
-       one_page = memalign(pmd_pagesize, len);
-
-       if (!one_page) {
-               printf("Fail to allocate memory\n");
-               exit(EXIT_FAILURE);
-       }
-
-       madvise(one_page, len, MADV_HUGEPAGE);
-
-       for (i = 0; i < len; i++)
-               one_page[i] = (char)i;
-
-       if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
-               printf("No THP is allocated\n");
-               exit(EXIT_FAILURE);
-       }
-
-       /* split all THPs */
-       write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
-               (uint64_t)one_page + len);
-
-       for (i = 0; i < len; i++)
-               if (one_page[i] != (char)i) {
-                       printf("%ld byte corrupted\n", i);
-                       exit(EXIT_FAILURE);
-               }
-
-
-       if (check_huge_anon(one_page, 0, pmd_pagesize)) {
-               printf("Still AnonHugePages not split\n");
-               exit(EXIT_FAILURE);
-       }
-
-       printf("Split huge pages successful\n");
-       free(one_page);
-}
-
-void split_pte_mapped_thp(void)
-{
-       char *one_page, *pte_mapped, *pte_mapped2;
-       size_t len = 4 * pmd_pagesize;
-       uint64_t thp_size;
-       size_t i;
-       const char *pagemap_template = "/proc/%d/pagemap";
-       const char *kpageflags_proc = "/proc/kpageflags";
-       char pagemap_proc[255];
-       int pagemap_fd;
-       int kpageflags_fd;
-
-       if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) {
-               perror("get pagemap proc error");
-               exit(EXIT_FAILURE);
-       }
-       pagemap_fd = open(pagemap_proc, O_RDONLY);
-
-       if (pagemap_fd == -1) {
-               perror("read pagemap:");
-               exit(EXIT_FAILURE);
-       }
-
-       kpageflags_fd = open(kpageflags_proc, O_RDONLY);
-
-       if (kpageflags_fd == -1) {
-               perror("read kpageflags:");
-               exit(EXIT_FAILURE);
-       }
-
-       one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
-                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-
-       madvise(one_page, len, MADV_HUGEPAGE);
-
-       for (i = 0; i < len; i++)
-               one_page[i] = (char)i;
-
-       if (!check_huge_anon(one_page, 1, pmd_pagesize)) {
-               printf("No THP is allocated\n");
-               exit(EXIT_FAILURE);
-       }
-
-       /* remap the first pagesize of first THP */
-       pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
-
-       /* remap the Nth pagesize of Nth THP */
-       for (i = 1; i < 4; i++) {
-               pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
-                                    pagesize, pagesize,
-                                    MREMAP_MAYMOVE|MREMAP_FIXED,
-                                    pte_mapped + pagesize * i);
-               if (pte_mapped2 == (char *)-1) {
-                       perror("mremap failed");
-                       exit(EXIT_FAILURE);
-               }
-       }
-
-       /* smap does not show THPs after mremap, use kpageflags instead */
-       thp_size = 0;
-       for (i = 0; i < pagesize * 4; i++)
-               if (i % pagesize == 0 &&
-                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
-                       thp_size++;
-
-       if (thp_size != 4) {
-               printf("Some THPs are missing during mremap\n");
-               exit(EXIT_FAILURE);
-       }
-
-       /* split all remapped THPs */
-       write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
-                     (uint64_t)pte_mapped + pagesize * 4);
-
-       /* smap does not show THPs after mremap, use kpageflags instead */
-       thp_size = 0;
-       for (i = 0; i < pagesize * 4; i++) {
-               if (pte_mapped[i] != (char)i) {
-                       printf("%ld byte corrupted\n", i);
-                       exit(EXIT_FAILURE);
-               }
-               if (i % pagesize == 0 &&
-                   is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd))
-                       thp_size++;
-       }
-
-       if (thp_size) {
-               printf("Still %ld THPs not split\n", thp_size);
-               exit(EXIT_FAILURE);
-       }
-
-       printf("Split PTE-mapped huge pages successful\n");
-       munmap(one_page, len);
-       close(pagemap_fd);
-       close(kpageflags_fd);
-}
-
-void split_file_backed_thp(void)
-{
-       int status;
-       int fd;
-       ssize_t num_written;
-       char tmpfs_template[] = "/tmp/thp_split_XXXXXX";
-       const char *tmpfs_loc = mkdtemp(tmpfs_template);
-       char testfile[INPUT_MAX];
-       uint64_t pgoff_start = 0, pgoff_end = 1024;
-
-       printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n");
-
-       status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m");
-
-       if (status) {
-               printf("Unable to create a tmpfs for testing\n");
-               exit(EXIT_FAILURE);
-       }
-
-       status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc);
-       if (status >= INPUT_MAX) {
-               printf("Fail to create file-backed THP split testing file\n");
-               goto cleanup;
-       }
-
-       fd = open(testfile, O_CREAT|O_WRONLY);
-       if (fd == -1) {
-               perror("Cannot open testing file\n");
-               goto cleanup;
-       }
-
-       /* write something to the file, so a file-backed THP can be allocated */
-       num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1);
-       close(fd);
-
-       if (num_written < 1) {
-               printf("Fail to write data to testing file\n");
-               goto cleanup;
-       }
-
-       /* split the file-backed THP */
-       write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end);
-
-       status = unlink(testfile);
-       if (status)
-               perror("Cannot remove testing file\n");
-
-cleanup:
-       status = umount(tmpfs_loc);
-       if (status) {
-               printf("Unable to umount %s\n", tmpfs_loc);
-               exit(EXIT_FAILURE);
-       }
-       status = rmdir(tmpfs_loc);
-       if (status) {
-               perror("cannot remove tmp dir");
-               exit(EXIT_FAILURE);
-       }
-
-       printf("file-backed THP split test done, please check dmesg for more information\n");
-}
-
-int main(int argc, char **argv)
-{
-       if (geteuid() != 0) {
-               printf("Please run the benchmark as root\n");
-               exit(EXIT_FAILURE);
-       }
-
-       pagesize = getpagesize();
-       pageshift = ffs(pagesize) - 1;
-       pmd_pagesize = read_pmd_pagesize();
-
-       split_pmd_thp();
-       split_pte_mapped_thp();
-       split_file_backed_thp();
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh
deleted file mode 100755 (executable)
index 46e19b5..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
-#
-# This is a test script for the kernel test driver to analyse vmalloc
-# allocator. Therefore it is just a kernel module loader. You can specify
-# and pass different parameters in order to:
-#     a) analyse performance of vmalloc allocations;
-#     b) stressing and stability check of vmalloc subsystem.
-
-TEST_NAME="test_hmm"
-DRIVER="test_hmm"
-
-# 1 if fails
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-check_test_requirements()
-{
-       uid=$(id -u)
-       if [ $uid -ne 0 ]; then
-               echo "$0: Must be run as root"
-               exit $ksft_skip
-       fi
-
-       if ! which modprobe > /dev/null 2>&1; then
-               echo "$0: You need modprobe installed"
-               exit $ksft_skip
-       fi
-
-       if ! modinfo $DRIVER > /dev/null 2>&1; then
-               echo "$0: You must have the following enabled in your kernel:"
-               echo "CONFIG_TEST_HMM=m"
-               exit $ksft_skip
-       fi
-}
-
-load_driver()
-{
-       if [ $# -eq 0 ]; then
-               modprobe $DRIVER > /dev/null 2>&1
-       else
-               if [ $# -eq 2 ]; then
-                       modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2
-                               > /dev/null 2>&1
-               else
-                       echo "Missing module parameters. Make sure pass"\
-                       "spm_addr_dev0 and spm_addr_dev1"
-                       usage
-               fi
-       fi
-}
-
-unload_driver()
-{
-       modprobe -r $DRIVER > /dev/null 2>&1
-}
-
-run_smoke()
-{
-       echo "Running smoke test. Note, this test provides basic coverage."
-
-       load_driver $1 $2
-       $(dirname "${BASH_SOURCE[0]}")/hmm-tests
-       unload_driver
-}
-
-usage()
-{
-       echo -n "Usage: $0"
-       echo
-       echo "Example usage:"
-       echo
-       echo "# Shows help message"
-       echo "./${TEST_NAME}.sh"
-       echo
-       echo "# Smoke testing"
-       echo "./${TEST_NAME}.sh smoke"
-       echo
-       echo "# Smoke testing with SPM enabled"
-       echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
-       echo
-       exit 0
-}
-
-function run_test()
-{
-       if [ $# -eq 0 ]; then
-               usage
-       else
-               if [ "$1" = "smoke" ]; then
-                       run_smoke $2 $3
-               else
-                       usage
-               fi
-       fi
-}
-
-check_test_requirements
-run_test $@
-
-exit 0
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh
deleted file mode 100755 (executable)
index d73b846..0000000
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
-#
-# This is a test script for the kernel test driver to analyse vmalloc
-# allocator. Therefore it is just a kernel module loader. You can specify
-# and pass different parameters in order to:
-#     a) analyse performance of vmalloc allocations;
-#     b) stressing and stability check of vmalloc subsystem.
-
-TEST_NAME="vmalloc"
-DRIVER="test_${TEST_NAME}"
-NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
-
-# 1 if fails
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-#
-# Static templates for performance, stressing and smoke tests.
-# Also it is possible to pass any supported parameters manualy.
-#
-PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
-SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
-STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
-
-check_test_requirements()
-{
-       uid=$(id -u)
-       if [ $uid -ne 0 ]; then
-               echo "$0: Must be run as root"
-               exit $ksft_skip
-       fi
-
-       if ! which modprobe > /dev/null 2>&1; then
-               echo "$0: You need modprobe installed"
-               exit $ksft_skip
-       fi
-
-       if ! modinfo $DRIVER > /dev/null 2>&1; then
-               echo "$0: You must have the following enabled in your kernel:"
-               echo "CONFIG_TEST_VMALLOC=m"
-               exit $ksft_skip
-       fi
-}
-
-run_perfformance_check()
-{
-       echo "Run performance tests to evaluate how fast vmalloc allocation is."
-       echo "It runs all test cases on one single CPU with sequential order."
-
-       modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
-       echo "Done."
-       echo "Ccheck the kernel message buffer to see the summary."
-}
-
-run_stability_check()
-{
-       echo "Run stability tests. In order to stress vmalloc subsystem all"
-       echo "available test cases are run by NUM_CPUS workers simultaneously."
-       echo "It will take time, so be patient."
-
-       modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
-       echo "Done."
-       echo "Check the kernel ring buffer to see the summary."
-}
-
-run_smoke_check()
-{
-       echo "Run smoke test. Note, this test provides basic coverage."
-       echo "Please check $0 output how it can be used"
-       echo "for deep performance analysis as well as stress testing."
-
-       modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
-       echo "Done."
-       echo "Check the kernel ring buffer to see the summary."
-}
-
-usage()
-{
-       echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
-       echo "manual parameters"
-       echo
-       echo "Valid tests and parameters:"
-       echo
-       modinfo $DRIVER
-       echo
-       echo "Example usage:"
-       echo
-       echo "# Shows help message"
-       echo "./${DRIVER}.sh"
-       echo
-       echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers"
-       echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
-       echo
-       echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
-       echo "sequential order"
-       echo -n "./${DRIVER}.sh sequential_test_order=1 "
-       echo "run_test_mask=23"
-       echo
-       echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
-       echo "20 times"
-       echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
-       echo
-       echo "# Performance analysis"
-       echo "./${DRIVER}.sh performance"
-       echo
-       echo "# Stress testing"
-       echo "./${DRIVER}.sh stress"
-       echo
-       exit 0
-}
-
-function validate_passed_args()
-{
-       VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
-
-       #
-       # Something has been passed, check it.
-       #
-       for passed_arg in $@; do
-               key=${passed_arg//=*/}
-               val="${passed_arg:$((${#key}+1))}"
-               valid=0
-
-               for valid_arg in $VALID_ARGS; do
-                       if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
-                               valid=1
-                               break
-                       fi
-               done
-
-               if [[ $valid -ne 1 ]]; then
-                       echo "Error: key or value is not correct: ${key} $val"
-                       exit $exitcode
-               fi
-       done
-}
-
-function run_manual_check()
-{
-       #
-       # Validate passed parameters. If there is wrong one,
-       # the script exists and does not execute further.
-       #
-       validate_passed_args $@
-
-       echo "Run the test with following parameters: $@"
-       modprobe $DRIVER $@ > /dev/null 2>&1
-       echo "Done."
-       echo "Check the kernel ring buffer to see the summary."
-}
-
-function run_test()
-{
-       if [ $# -eq 0 ]; then
-               usage
-       else
-               if [[ "$1" = "performance" ]]; then
-                       run_perfformance_check
-               elif [[ "$1" = "stress" ]]; then
-                       run_stability_check
-               elif [[ "$1" = "smoke" ]]; then
-                       run_smoke_check
-               else
-                       run_manual_check $@
-               fi
-       fi
-}
-
-check_test_requirements
-run_test $@
-
-exit 0
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c
deleted file mode 100644 (file)
index 361ef71..0000000
+++ /dev/null
@@ -1,257 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Test selecting other page sizes for mmap/shmget.
-
-   Before running this huge pages for each huge page size must have been
-   reserved.
-   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
-   Also shmmax must be increased.
-   And you need to run as root to work around some weird permissions in shm.
-   And nothing using huge pages should run in parallel.
-   When the program aborts you may need to clean up the shm segments with
-   ipcrm -m by hand, like this
-   sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
-   (warning this will remove all if someone else uses them) */
-
-#define _GNU_SOURCE 1
-#include <sys/mman.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/stat.h>
-#include <glob.h>
-#include <assert.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <string.h>
-
-#define err(x) perror(x), exit(1)
-
-#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
-#define MAP_HUGE_SHIFT  26
-#define MAP_HUGE_MASK   0x3f
-#if !defined(MAP_HUGETLB)
-#define MAP_HUGETLB    0x40000
-#endif
-
-#define SHM_HUGETLB     04000   /* segment will use huge TLB pages */
-#define SHM_HUGE_SHIFT  26
-#define SHM_HUGE_MASK   0x3f
-#define SHM_HUGE_2MB    (21 << SHM_HUGE_SHIFT)
-#define SHM_HUGE_1GB    (30 << SHM_HUGE_SHIFT)
-
-#define NUM_PAGESIZES   5
-
-#define NUM_PAGES 4
-
-#define Dprintf(fmt...) // printf(fmt)
-
-unsigned long page_sizes[NUM_PAGESIZES];
-int num_page_sizes;
-
-int ilog2(unsigned long v)
-{
-       int l = 0;
-       while ((1UL << l) < v)
-               l++;
-       return l;
-}
-
-void find_pagesizes(void)
-{
-       glob_t g;
-       int i;
-       glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
-       assert(g.gl_pathc <= NUM_PAGESIZES);
-       for (i = 0; i < g.gl_pathc; i++) {
-               sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
-                               &page_sizes[i]);
-               page_sizes[i] <<= 10;
-               printf("Found %luMB\n", page_sizes[i] >> 20);
-       }
-       num_page_sizes = g.gl_pathc;
-       globfree(&g);
-}
-
-unsigned long default_huge_page_size(void)
-{
-       unsigned long hps = 0;
-       char *line = NULL;
-       size_t linelen = 0;
-       FILE *f = fopen("/proc/meminfo", "r");
-       if (!f)
-               return 0;
-       while (getline(&line, &linelen, f) > 0) {
-               if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-                       hps <<= 10;
-                       break;
-               }
-       }
-       free(line);
-       return hps;
-}
-
-void show(unsigned long ps)
-{
-       char buf[100];
-       if (ps == getpagesize())
-               return;
-       printf("%luMB: ", ps >> 20);
-       fflush(stdout);
-       snprintf(buf, sizeof buf,
-               "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
-               ps >> 10);
-       system(buf);
-}
-
-unsigned long read_sysfs(int warn, char *fmt, ...)
-{
-       char *line = NULL;
-       size_t linelen = 0;
-       char buf[100];
-       FILE *f;
-       va_list ap;
-       unsigned long val = 0;
-
-       va_start(ap, fmt);
-       vsnprintf(buf, sizeof buf, fmt, ap);
-       va_end(ap);
-
-       f = fopen(buf, "r");
-       if (!f) {
-               if (warn)
-                       printf("missing %s\n", buf);
-               return 0;
-       }
-       if (getline(&line, &linelen, f) > 0) {
-               sscanf(line, "%lu", &val);
-       }
-       fclose(f);
-       free(line);
-       return val;
-}
-
-unsigned long read_free(unsigned long ps)
-{
-       return read_sysfs(ps != getpagesize(),
-                       "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
-                       ps >> 10);
-}
-
-void test_mmap(unsigned long size, unsigned flags)
-{
-       char *map;
-       unsigned long before, after;
-       int err;
-
-       before = read_free(size);
-       map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
-                       MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
-
-       if (map == (char *)-1) err("mmap");
-       memset(map, 0xff, size*NUM_PAGES);
-       after = read_free(size);
-       Dprintf("before %lu after %lu diff %ld size %lu\n",
-               before, after, before - after, size);
-       assert(size == getpagesize() || (before - after) == NUM_PAGES);
-       show(size);
-       err = munmap(map, size);
-       assert(!err);
-}
-
-void test_shmget(unsigned long size, unsigned flags)
-{
-       int id;
-       unsigned long before, after;
-       int err;
-
-       before = read_free(size);
-       id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
-       if (id < 0) err("shmget");
-
-       struct shm_info i;
-       if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
-       Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
-
-
-       Dprintf("id %d\n", id);
-       char *map = shmat(id, NULL, 0600);
-       if (map == (char*)-1) err("shmat");
-
-       shmctl(id, IPC_RMID, NULL);
-
-       memset(map, 0xff, size*NUM_PAGES);
-       after = read_free(size);
-
-       Dprintf("before %lu after %lu diff %ld size %lu\n",
-               before, after, before - after, size);
-       assert(size == getpagesize() || (before - after) == NUM_PAGES);
-       show(size);
-       err = shmdt(map);
-       assert(!err);
-}
-
-void sanity_checks(void)
-{
-       int i;
-       unsigned long largest = getpagesize();
-
-       for (i = 0; i < num_page_sizes; i++) {
-               if (page_sizes[i] > largest)
-                       largest = page_sizes[i];
-
-               if (read_free(page_sizes[i]) < NUM_PAGES) {
-                       printf("Not enough huge pages for page size %lu MB, need %u\n",
-                               page_sizes[i] >> 20,
-                               NUM_PAGES);
-                       exit(0);
-               }
-       }
-
-       if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
-               printf("Please do echo %lu > /proc/sys/kernel/shmmax", largest * NUM_PAGES);
-               exit(0);
-       }
-
-#if defined(__x86_64__)
-       if (largest != 1U<<30) {
-               printf("No GB pages available on x86-64\n"
-                      "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
-               exit(0);
-       }
-#endif
-}
-
-int main(void)
-{
-       int i;
-       unsigned default_hps = default_huge_page_size();
-
-       find_pagesizes();
-
-       sanity_checks();
-
-       for (i = 0; i < num_page_sizes; i++) {
-               unsigned long ps = page_sizes[i];
-               int arg = ilog2(ps) << MAP_HUGE_SHIFT;
-               printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
-               test_mmap(ps, MAP_HUGETLB | arg);
-       }
-       printf("Testing default huge mmap\n");
-       test_mmap(default_hps, SHM_HUGETLB);
-
-       puts("Testing non-huge shmget");
-       test_shmget(getpagesize(), 0);
-
-       for (i = 0; i < num_page_sizes; i++) {
-               unsigned long ps = page_sizes[i];
-               int arg = ilog2(ps) << SHM_HUGE_SHIFT;
-               printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
-               test_shmget(ps, SHM_HUGETLB | arg);
-       }
-       puts("default huge shmget");
-       test_shmget(default_hps, SHM_HUGETLB);
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
deleted file mode 100644 (file)
index e3f00ad..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Stress test for transparent huge pages, memory compaction and migration.
- *
- * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
- *
- * This is free and unencumbered software released into the public domain.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <err.h>
-#include <time.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/mman.h>
-#include "util.h"
-
-int backing_fd = -1;
-int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE;
-#define PROT_RW (PROT_READ | PROT_WRITE)
-
-int main(int argc, char **argv)
-{
-       size_t ram, len;
-       void *ptr, *p;
-       struct timespec a, b;
-       int i = 0;
-       char *name = NULL;
-       double s;
-       uint8_t *map;
-       size_t map_len;
-       int pagemap_fd;
-
-       ram = sysconf(_SC_PHYS_PAGES);
-       if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
-               ram = SIZE_MAX / 4;
-       else
-               ram *= sysconf(_SC_PAGESIZE);
-       len = ram;
-
-       while (++i < argc) {
-               if (!strcmp(argv[i], "-h"))
-                       errx(1, "usage: %s [size in MiB]", argv[0]);
-               else if (!strcmp(argv[i], "-f"))
-                       name = argv[++i];
-               else
-                       len = atoll(argv[i]) << 20;
-       }
-
-       if (name) {
-               backing_fd = open(name, O_RDWR);
-               if (backing_fd == -1)
-                       errx(2, "open %s", name);
-               mmap_flags = MAP_SHARED;
-       }
-
-       warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
-             " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
-             ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
-
-       pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-       if (pagemap_fd < 0)
-               err(2, "open pagemap");
-
-       len -= len % HPAGE_SIZE;
-       ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0);
-       if (ptr == MAP_FAILED)
-               err(2, "initial mmap");
-       ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
-
-       if (madvise(ptr, len, MADV_HUGEPAGE))
-               err(2, "MADV_HUGEPAGE");
-
-       map_len = ram >> (HPAGE_SHIFT - 1);
-       map = malloc(map_len);
-       if (!map)
-               errx(2, "map malloc");
-
-       while (1) {
-               int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
-
-               memset(map, 0, map_len);
-
-               clock_gettime(CLOCK_MONOTONIC, &a);
-               for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
-                       int64_t pfn;
-
-                       pfn = allocate_transhuge(p, pagemap_fd);
-
-                       if (pfn < 0) {
-                               nr_failed++;
-                       } else {
-                               size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
-
-                               nr_succeed++;
-                               if (idx >= map_len) {
-                                       map = realloc(map, idx + 1);
-                                       if (!map)
-                                               errx(2, "map realloc");
-                                       memset(map + map_len, 0, idx + 1 - map_len);
-                                       map_len = idx + 1;
-                               }
-                               if (!map[idx])
-                                       nr_pages++;
-                               map[idx] = 1;
-                       }
-
-                       /* split transhuge page, keep last page */
-                       if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
-                               err(2, "MADV_DONTNEED");
-               }
-               clock_gettime(CLOCK_MONOTONIC, &b);
-               s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
-
-               warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
-                     "%4d succeed, %4d failed, %4d different pages",
-                     s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
-                     nr_succeed, nr_failed, nr_pages);
-       }
-}
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
deleted file mode 100644 (file)
index 7f22844..0000000
+++ /dev/null
@@ -1,1858 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Stress userfaultfd syscall.
- *
- *  Copyright (C) 2015  Red Hat, Inc.
- *
- * This test allocates two virtual areas and bounces the physical
- * memory across the two virtual areas (from area_src to area_dst)
- * using userfaultfd.
- *
- * There are three threads running per CPU:
- *
- * 1) one per-CPU thread takes a per-page pthread_mutex in a random
- *    page of the area_dst (while the physical page may still be in
- *    area_src), and increments a per-page counter in the same page,
- *    and checks its value against a verification region.
- *
- * 2) another per-CPU thread handles the userfaults generated by
- *    thread 1 above. userfaultfd blocking reads or poll() modes are
- *    exercised interleaved.
- *
- * 3) one last per-CPU thread transfers the memory in the background
- *    at maximum bandwidth (if not already transferred by thread
- *    2). Each cpu thread takes cares of transferring a portion of the
- *    area.
- *
- * When all threads of type 3 completed the transfer, one bounce is
- * complete. area_src and area_dst are then swapped. All threads are
- * respawned and so the bounce is immediately restarted in the
- * opposite direction.
- *
- * per-CPU threads 1 by triggering userfaults inside
- * pthread_mutex_lock will also verify the atomicity of the memory
- * transfer (UFFDIO_COPY).
- */
-
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <time.h>
-#include <signal.h>
-#include <poll.h>
-#include <string.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/wait.h>
-#include <pthread.h>
-#include <linux/userfaultfd.h>
-#include <setjmp.h>
-#include <stdbool.h>
-#include <assert.h>
-#include <inttypes.h>
-#include <stdint.h>
-#include <sys/random.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#ifdef __NR_userfaultfd
-
-static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
-
-#define BOUNCE_RANDOM          (1<<0)
-#define BOUNCE_RACINGFAULTS    (1<<1)
-#define BOUNCE_VERIFY          (1<<2)
-#define BOUNCE_POLL            (1<<3)
-static int bounces;
-
-#define TEST_ANON      1
-#define TEST_HUGETLB   2
-#define TEST_SHMEM     3
-static int test_type;
-
-#define UFFD_FLAGS     (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
-
-#define BASE_PMD_ADDR ((void *)(1UL << 30))
-
-/* test using /dev/userfaultfd, instead of userfaultfd(2) */
-static bool test_dev_userfaultfd;
-
-/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
-#define ALARM_INTERVAL_SECS 10
-static volatile bool test_uffdio_copy_eexist = true;
-static volatile bool test_uffdio_zeropage_eexist = true;
-/* Whether to test uffd write-protection */
-static bool test_uffdio_wp = true;
-/* Whether to test uffd minor faults */
-static bool test_uffdio_minor = false;
-static bool map_shared;
-static int mem_fd;
-static unsigned long long *count_verify;
-static int uffd = -1;
-static int uffd_flags, finished, *pipefd;
-static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
-static char *zeropage;
-pthread_attr_t attr;
-static bool test_collapse;
-
-/* Userfaultfd test statistics */
-struct uffd_stats {
-       int cpu;
-       unsigned long missing_faults;
-       unsigned long wp_faults;
-       unsigned long minor_faults;
-};
-
-/* pthread_mutex_t starts at page offset 0 */
-#define area_mutex(___area, ___nr)                                     \
-       ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
-/*
- * count is placed in the page after pthread_mutex_t naturally aligned
- * to avoid non alignment faults on non-x86 archs.
- */
-#define area_count(___area, ___nr)                                     \
-       ((volatile unsigned long long *) ((unsigned long)               \
-                                ((___area) + (___nr)*page_size +       \
-                                 sizeof(pthread_mutex_t) +             \
-                                 sizeof(unsigned long long) - 1) &     \
-                                ~(unsigned long)(sizeof(unsigned long long) \
-                                                 -  1)))
-
-#define swap(a, b) \
-       do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-
-#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1)))
-
-const char *examples =
-    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
-    "./userfaultfd anon 100 99999\n\n"
-    "# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
-    "./userfaultfd anon:dev 100 99999\n\n"
-    "# Run share memory test on 1GiB region with 99 bounces:\n"
-    "./userfaultfd shmem 1000 99\n\n"
-    "# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
-    "./userfaultfd hugetlb 256 50\n\n"
-    "# Run the same hugetlb test but using shared file:\n"
-    "./userfaultfd hugetlb_shared 256 50\n\n"
-    "# 10MiB-~6GiB 999 bounces anonymous test, "
-    "continue forever unless an error triggers\n"
-    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
-
-static void usage(void)
-{
-       fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
-               "[hugetlbfs_file]\n\n");
-       fprintf(stderr, "Supported <test type>: anon, hugetlb, "
-               "hugetlb_shared, shmem\n\n");
-       fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
-               "Supported mods:\n");
-       fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
-       fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
-       fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n"
-               "memory\n");
-       fprintf(stderr, "\nExample test mod usage:\n");
-       fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
-       fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");
-
-       fprintf(stderr, "Examples:\n\n");
-       fprintf(stderr, "%s", examples);
-       exit(1);
-}
-
-#define _err(fmt, ...)                                         \
-       do {                                                    \
-               int ret = errno;                                \
-               fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);  \
-               fprintf(stderr, " (errno=%d, line=%d)\n",       \
-                       ret, __LINE__);                         \
-       } while (0)
-
-#define errexit(exitcode, fmt, ...)            \
-       do {                                    \
-               _err(fmt, ##__VA_ARGS__);       \
-               exit(exitcode);                 \
-       } while (0)
-
-#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
-
-static void uffd_stats_reset(struct uffd_stats *uffd_stats,
-                            unsigned long n_cpus)
-{
-       int i;
-
-       for (i = 0; i < n_cpus; i++) {
-               uffd_stats[i].cpu = i;
-               uffd_stats[i].missing_faults = 0;
-               uffd_stats[i].wp_faults = 0;
-               uffd_stats[i].minor_faults = 0;
-       }
-}
-
-static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
-{
-       int i;
-       unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
-
-       for (i = 0; i < n_cpus; i++) {
-               miss_total += stats[i].missing_faults;
-               wp_total += stats[i].wp_faults;
-               minor_total += stats[i].minor_faults;
-       }
-
-       printf("userfaults: ");
-       if (miss_total) {
-               printf("%llu missing (", miss_total);
-               for (i = 0; i < n_cpus; i++)
-                       printf("%lu+", stats[i].missing_faults);
-               printf("\b) ");
-       }
-       if (wp_total) {
-               printf("%llu wp (", wp_total);
-               for (i = 0; i < n_cpus; i++)
-                       printf("%lu+", stats[i].wp_faults);
-               printf("\b) ");
-       }
-       if (minor_total) {
-               printf("%llu minor (", minor_total);
-               for (i = 0; i < n_cpus; i++)
-                       printf("%lu+", stats[i].minor_faults);
-               printf("\b)");
-       }
-       printf("\n");
-}
-
-static void anon_release_pages(char *rel_area)
-{
-       if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
-               err("madvise(MADV_DONTNEED) failed");
-}
-
-static void anon_allocate_area(void **alloc_area, bool is_src)
-{
-       *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
-                          MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-}
-
-static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-}
-
-static void hugetlb_release_pages(char *rel_area)
-{
-       if (!map_shared) {
-               if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
-                       err("madvise(MADV_DONTNEED) failed");
-       } else {
-               if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
-                       err("madvise(MADV_REMOVE) failed");
-       }
-}
-
-static void hugetlb_allocate_area(void **alloc_area, bool is_src)
-{
-       off_t size = nr_pages * page_size;
-       off_t offset = is_src ? 0 : size;
-       void *area_alias = NULL;
-       char **alloc_area_alias;
-
-       *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
-                          (map_shared ? MAP_SHARED : MAP_PRIVATE) |
-                          (is_src ? 0 : MAP_NORESERVE),
-                          mem_fd, offset);
-       if (*alloc_area == MAP_FAILED)
-               err("mmap of hugetlbfs file failed");
-
-       if (map_shared) {
-               area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
-                                 MAP_SHARED, mem_fd, offset);
-               if (area_alias == MAP_FAILED)
-                       err("mmap of hugetlb file alias failed");
-       }
-
-       if (is_src) {
-               alloc_area_alias = &area_src_alias;
-       } else {
-               alloc_area_alias = &area_dst_alias;
-       }
-       if (area_alias)
-               *alloc_area_alias = area_alias;
-}
-
-static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-       if (!map_shared)
-               return;
-
-       *start = (unsigned long) area_dst_alias + offset;
-}
-
-static void shmem_release_pages(char *rel_area)
-{
-       if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
-               err("madvise(MADV_REMOVE) failed");
-}
-
-static void shmem_allocate_area(void **alloc_area, bool is_src)
-{
-       void *area_alias = NULL;
-       size_t bytes = nr_pages * page_size;
-       unsigned long offset = is_src ? 0 : bytes;
-       char *p = NULL, *p_alias = NULL;
-
-       if (test_collapse) {
-               p = BASE_PMD_ADDR;
-               if (!is_src)
-                       /* src map + alias + interleaved hpages */
-                       p += 2 * (bytes + hpage_size);
-               p_alias = p;
-               p_alias += bytes;
-               p_alias += hpage_size;  /* Prevent src/dst VMA merge */
-       }
-
-       *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
-                          mem_fd, offset);
-       if (*alloc_area == MAP_FAILED)
-               err("mmap of memfd failed");
-       if (test_collapse && *alloc_area != p)
-               err("mmap of memfd failed at %p", p);
-
-       area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
-                         mem_fd, offset);
-       if (area_alias == MAP_FAILED)
-               err("mmap of memfd alias failed");
-       if (test_collapse && area_alias != p_alias)
-               err("mmap of anonymous memory failed at %p", p_alias);
-
-       if (is_src)
-               area_src_alias = area_alias;
-       else
-               area_dst_alias = area_alias;
-}
-
-static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-       *start = (unsigned long)area_dst_alias + offset;
-}
-
-static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
-{
-       if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
-               err("Did not find expected %d number of hugepages",
-                   expect_nr_hpages);
-}
-
-struct uffd_test_ops {
-       void (*allocate_area)(void **alloc_area, bool is_src);
-       void (*release_pages)(char *rel_area);
-       void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
-       void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
-};
-
-static struct uffd_test_ops anon_uffd_test_ops = {
-       .allocate_area  = anon_allocate_area,
-       .release_pages  = anon_release_pages,
-       .alias_mapping = noop_alias_mapping,
-       .check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops shmem_uffd_test_ops = {
-       .allocate_area  = shmem_allocate_area,
-       .release_pages  = shmem_release_pages,
-       .alias_mapping = shmem_alias_mapping,
-       .check_pmd_mapping = shmem_check_pmd_mapping,
-};
-
-static struct uffd_test_ops hugetlb_uffd_test_ops = {
-       .allocate_area  = hugetlb_allocate_area,
-       .release_pages  = hugetlb_release_pages,
-       .alias_mapping = hugetlb_alias_mapping,
-       .check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops *uffd_test_ops;
-
-static inline uint64_t uffd_minor_feature(void)
-{
-       if (test_type == TEST_HUGETLB && map_shared)
-               return UFFD_FEATURE_MINOR_HUGETLBFS;
-       else if (test_type == TEST_SHMEM)
-               return UFFD_FEATURE_MINOR_SHMEM;
-       else
-               return 0;
-}
-
-static uint64_t get_expected_ioctls(uint64_t mode)
-{
-       uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
-
-       if (test_type == TEST_HUGETLB)
-               ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
-
-       if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
-               ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
-
-       if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
-               ioctls &= ~(1 << _UFFDIO_CONTINUE);
-
-       return ioctls;
-}
-
-/*
- * Fail the test if any ioctl we expect for @mode is missing from the
- * @ioctls bitmask returned by UFFDIO_REGISTER. Extra bits are allowed;
- * only missing expected bits are an error.
- */
-static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
-{
-       uint64_t expected = get_expected_ioctls(mode);
-       uint64_t actual = ioctls & expected;
-
-       if (actual != expected) {
-               err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
-                   expected, actual);
-       }
-}
-
-/*
- * Create a userfaultfd via the /dev/userfaultfd char device rather than
- * the syscall. Skips the test (KSFT_SKIP) when the device is absent or
- * the ioctl is unsupported (ENOTTY). Returns the new uffd; the device
- * fd itself is closed before returning.
- */
-static int __userfaultfd_open_dev(void)
-{
-       int fd, _uffd;
-
-       fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
-       if (fd < 0)
-               errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
-
-       _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
-       if (_uffd < 0)
-               errexit(errno == ENOTTY ? KSFT_SKIP : 1,
-                       "creating userfaultfd failed");
-       close(fd);
-       return _uffd;
-}
-
-/*
- * Open the global uffd (via /dev/userfaultfd or the syscall, depending
- * on test_dev_userfaultfd) and perform the UFFDIO_API handshake.
- * @features is in/out: it carries the requested feature bits in and is
- * overwritten with the features the kernel actually granted.
- */
-static void userfaultfd_open(uint64_t *features)
-{
-       struct uffdio_api uffdio_api;
-
-       if (test_dev_userfaultfd)
-               uffd = __userfaultfd_open_dev();
-       else {
-               uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
-               if (uffd < 0)
-                       errexit(errno == ENOSYS ? KSFT_SKIP : 1,
-                               "creating userfaultfd failed");
-       }
-       /* saved so tests can restore/modify the fd flags later */
-       uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
-       uffdio_api.api = UFFD_API;
-       uffdio_api.features = *features;
-       if (ioctl(uffd, UFFDIO_API, &uffdio_api))
-               err("UFFDIO_API failed.\nPlease make sure to "
-                   "run with either root or ptrace capability.");
-       if (uffdio_api.api != UFFD_API)
-               err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
-
-       *features = uffdio_api.features;
-}
-
-/*
- * munmap *area (sized nr_pages * page_size) if non-NULL, then clear the
- * pointer so repeated calls are safe.
- */
-static inline void munmap_area(void **area)
-{
-       if (*area)
-               if (munmap(*area, nr_pages * page_size))
-                       err("munmap");
-
-       *area = NULL;
-}
-
-/*
- * Tear down all per-test global state: the per-cpu pipe pairs, the
- * count_verify array, the uffd itself, and every mapped test area.
- * Idempotent — each resource is NULLed / reset after release so the
- * function can be called again safely (it runs at the start of every
- * uffd_test_ctx_init()).
- */
-static void uffd_test_ctx_clear(void)
-{
-       size_t i;
-
-       if (pipefd) {
-               for (i = 0; i < nr_cpus * 2; ++i) {
-                       if (close(pipefd[i]))
-                               err("close pipefd");
-               }
-               free(pipefd);
-               pipefd = NULL;
-       }
-
-       if (count_verify) {
-               free(count_verify);
-               count_verify = NULL;
-       }
-
-       if (uffd != -1) {
-               if (close(uffd))
-                       err("close uffd");
-               uffd = -1;
-       }
-
-       munmap_area((void **)&area_src);
-       munmap_area((void **)&area_src_alias);
-       munmap_area((void **)&area_dst);
-       munmap_area((void **)&area_dst_alias);
-       munmap_area((void **)&area_remap);
-}
-
-/*
- * (Re)initialize the whole test context: clear any previous state,
- * allocate area_src/area_dst through the backend ops, open the uffd
- * with @features, seed every source page with a counter + mutex, and
- * create one pipe pair per cpu for signalling poll threads.
- */
-static void uffd_test_ctx_init(uint64_t features)
-{
-       unsigned long nr, cpu;
-
-       uffd_test_ctx_clear();
-
-       uffd_test_ops->allocate_area((void **)&area_src, true);
-       uffd_test_ops->allocate_area((void **)&area_dst, false);
-
-       userfaultfd_open(&features);
-
-       count_verify = malloc(nr_pages * sizeof(unsigned long long));
-       if (!count_verify)
-               err("count_verify");
-
-       for (nr = 0; nr < nr_pages; nr++) {
-               *area_mutex(area_src, nr) =
-                       (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
-               count_verify[nr] = *area_count(area_src, nr) = 1;
-               /*
-                * In the transition between 255 to 256, powerpc will
-                * read out of order in my_bcmp and see both bytes as
-                * zero, so leave a placeholder below always non-zero
-                * after the count, to avoid my_bcmp to trigger false
-                * positives.
-                */
-               *(area_count(area_src, nr) + 1) = 1;
-       }
-
-       /*
-        * After initialization of area_src, we must explicitly release pages
-        * for area_dst to make sure it's fully empty.  Otherwise we could have
-        * some area_dst pages be errornously initialized with zero pages,
-        * hence we could hit memory corruption later in the test.
-        *
-        * One example is when THP is globally enabled, above allocate_area()
-        * calls could have the two areas merged into a single VMA (as they
-        * will have the same VMA flags so they're mergeable).  When we
-        * initialize the area_src above, it's possible that some part of
-        * area_dst could have been faulted in via one huge THP that will be
-        * shared between area_src and area_dst.  It could cause some of the
-        * area_dst won't be trapped by missing userfaults.
-        *
-        * This release_pages() will guarantee even if that happened, we'll
-        * proactively split the thp and drop any accidentally initialized
-        * pages within area_dst.
-        */
-       uffd_test_ops->release_pages(area_dst);
-
-       /* two fds per cpu: [cpu*2] read end, [cpu*2+1] write end */
-       pipefd = malloc(sizeof(int) * nr_cpus * 2);
-       if (!pipefd)
-               err("pipefd");
-       for (cpu = 0; cpu < nr_cpus; cpu++)
-               if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
-                       err("pipe");
-}
-
-/*
- * Byte-wise compare of two buffers; returns 0 when equal, 1 otherwise.
- * Hand-rolled (instead of memcmp) so the access pattern is strictly one
- * byte at a time — see the powerpc ordering note in uffd_test_ctx_init().
- */
-static int my_bcmp(char *str1, char *str2, size_t n)
-{
-       unsigned long i;
-       for (i = 0; i < n; i++)
-               if (str1[i] != str2[i])
-                       return 1;
-       return 0;
-}
-
-/*
- * Apply (@wp == true) or clear (@wp == false) write protection on
- * [start, start+len) via UFFDIO_WRITEPROTECT. Clearing also wakes any
- * waiter blocked on the range (mode 0 implies wakeup).
- */
-static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
-{
-       struct uffdio_writeprotect prms;
-
-       /* Write protection page faults */
-       prms.range.start = start;
-       prms.range.len = len;
-       /* Undo write-protect, do wakeup after that */
-       prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
-
-       if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
-               err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
-}
-
-/*
- * Resolve a minor fault on [start, start+len) with UFFDIO_CONTINUE,
- * then deliberately repeat the ioctl on the now-mapped range to
- * exercise the kernel's -EEXIST error path.
- */
-static void continue_range(int ufd, __u64 start, __u64 len)
-{
-       struct uffdio_continue req;
-       int ret;
-
-       req.range.start = start;
-       req.range.len = len;
-       req.mode = 0;
-
-       if (ioctl(ufd, UFFDIO_CONTINUE, &req))
-               err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
-                   (uint64_t)start);
-
-       /*
-        * Error handling within the kernel for continue is subtly different
-        * from copy or zeropage, so it may be a source of bugs. Trigger an
-        * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
-        */
-       req.mapped = 0;
-       ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
-       if (ret >= 0 || req.mapped != -EEXIST)
-               err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
-                   ret, (int64_t) req.mapped);
-}
-
-/*
- * Stress worker: repeatedly lock a page's embedded mutex, verify and
- * bump its counter, and unlock — touching pages either at random
- * (BOUNCE_RANDOM) or sequentially, until the global 'finished' flag is
- * set. The mutex/counter accesses are what trigger the userfaults.
- * @arg is the cpu index cast to a pointer.
- */
-static void *locking_thread(void *arg)
-{
-       unsigned long cpu = (unsigned long) arg;
-       unsigned long page_nr;
-       unsigned long long count;
-
-       if (!(bounces & BOUNCE_RANDOM)) {
-               page_nr = -bounces;
-               if (!(bounces & BOUNCE_RACINGFAULTS))
-                       page_nr += cpu * nr_pages_per_cpu;
-       }
-
-       while (!finished) {
-               if (bounces & BOUNCE_RANDOM) {
-                       if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr))
-                               err("getrandom failed");
-               } else
-                       page_nr += 1;
-               page_nr %= nr_pages;
-               pthread_mutex_lock(area_mutex(area_dst, page_nr));
-               count = *area_count(area_dst, page_nr);
-               if (count != count_verify[page_nr])
-                       err("page_nr %lu memory corruption %llu %llu",
-                           page_nr, count, count_verify[page_nr]);
-               count++;
-               *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
-               pthread_mutex_unlock(area_mutex(area_dst, page_nr));
-       }
-
-       return NULL;
-}
-
-/*
- * Re-issue an UFFDIO_COPY through the backend's alias mapping for a
- * page that was already copied: the kernel must reject it with -EEXIST.
- * Anything else (success, or a different error) fails the test.
- */
-static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
-                           unsigned long offset)
-{
-       uffd_test_ops->alias_mapping(&uffdio_copy->dst,
-                                    uffdio_copy->len,
-                                    offset);
-       if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
-               /* real retval in ufdio_copy.copy */
-               if (uffdio_copy->copy != -EEXIST)
-                       err("UFFDIO_COPY retry error: %"PRId64,
-                           (int64_t)uffdio_copy->copy);
-       } else {
-               err("UFFDIO_COPY retry unexpected: %"PRId64,
-                   (int64_t)uffdio_copy->copy);
-       }
-}
-
-/*
- * Explicitly wake any fault waiters on [addr, addr+len) with
- * UFFDIO_WAKE; used after a copy raced and returned -EEXIST.
- *
- * NOTE(review): this is the only error path in the file using
- * fprintf+exit via the comma operator instead of the err() helper —
- * behavior is the same, but it is inconsistent with the rest.
- */
-static void wake_range(int ufd, unsigned long addr, unsigned long len)
-{
-       struct uffdio_range uffdio_wake;
-
-       uffdio_wake.start = addr;
-       uffdio_wake.len = len;
-
-       if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
-               fprintf(stderr, "error waking %lu\n",
-                       addr), exit(1);
-}
-
-/*
- * Resolve one missing fault at @offset with UFFDIO_COPY (WP-armed when
- * uffd-wp is under test). Returns 1 if this call performed the copy,
- * 0 if another racing copier got there first (-EEXIST — in which case
- * we only wake the waiters). When @retry is set and the one-shot
- * test_uffdio_copy_eexist flag is armed, also exercises the alias-retry
- * -EEXIST path once.
- */
-static int __copy_page(int ufd, unsigned long offset, bool retry)
-{
-       struct uffdio_copy uffdio_copy;
-
-       if (offset >= nr_pages * page_size)
-               err("unexpected offset %lu\n", offset);
-       uffdio_copy.dst = (unsigned long) area_dst + offset;
-       uffdio_copy.src = (unsigned long) area_src + offset;
-       uffdio_copy.len = page_size;
-       if (test_uffdio_wp)
-               uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
-       else
-               uffdio_copy.mode = 0;
-       uffdio_copy.copy = 0;
-       if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
-               /* real retval in ufdio_copy.copy */
-               if (uffdio_copy.copy != -EEXIST)
-                       err("UFFDIO_COPY error: %"PRId64,
-                           (int64_t)uffdio_copy.copy);
-               wake_range(ufd, uffdio_copy.dst, page_size);
-       } else if (uffdio_copy.copy != page_size) {
-               err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
-       } else {
-               if (test_uffdio_copy_eexist && retry) {
-                       test_uffdio_copy_eexist = false;
-                       retry_copy_page(ufd, &uffdio_copy, offset);
-               }
-               return 1;
-       }
-       return 0;
-}
-
-/* Copy variant that may also exercise the -EEXIST retry path once. */
-static int copy_page_retry(int ufd, unsigned long offset)
-{
-       return __copy_page(ufd, offset, true);
-}
-
-/* Plain copy: never triggers the retry path. */
-static int copy_page(int ufd, unsigned long offset)
-{
-       return __copy_page(ufd, offset, false);
-}
-
-/*
- * Read one uffd_msg from the userfaultfd. Returns 0 on success, 1 when
- * the read should simply be retried (EAGAIN/EINTR on a non-blocking
- * fd); any other failure or short read aborts the test.
- *
- * NOTE(review): the read() uses the global 'uffd', not the 'ufd'
- * parameter. All callers visible in this hunk pass the global, so
- * behavior matches, but the parameter is effectively ignored.
- */
-static int uffd_read_msg(int ufd, struct uffd_msg *msg)
-{
-       int ret = read(uffd, msg, sizeof(*msg));
-
-       if (ret != sizeof(*msg)) {
-               if (ret < 0) {
-                       if (errno == EAGAIN || errno == EINTR)
-                               return 1;
-                       err("blocking read error");
-               } else {
-                       err("short read");
-               }
-       }
-
-       return 0;
-}
-
-/*
- * Dispatch one UFFD_EVENT_PAGEFAULT message: clear write-protection for
- * WP faults, bit-flip + UFFDIO_CONTINUE for minor faults, and
- * UFFDIO_COPY for missing faults. Updates the matching counter in
- * @stats for each fault class.
- */
-static void uffd_handle_page_fault(struct uffd_msg *msg,
-                                  struct uffd_stats *stats)
-{
-       unsigned long offset;
-
-       if (msg->event != UFFD_EVENT_PAGEFAULT)
-               err("unexpected msg event %u", msg->event);
-
-       if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
-               /* Write protect page faults */
-               wp_range(uffd, msg->arg.pagefault.address, page_size, false);
-               stats->wp_faults++;
-       } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
-               uint8_t *area;
-               int b;
-
-               /*
-                * Minor page faults
-                *
-                * To prove we can modify the original range for testing
-                * purposes, we're going to bit flip this range before
-                * continuing.
-                *
-                * Note that this requires all minor page fault tests operate on
-                * area_dst (non-UFFD-registered) and area_dst_alias
-                * (UFFD-registered).
-                */
-
-               /* translate the faulting alias address back into area_dst */
-               area = (uint8_t *)(area_dst +
-                                  ((char *)msg->arg.pagefault.address -
-                                   area_dst_alias));
-               for (b = 0; b < page_size; ++b)
-                       area[b] = ~area[b];
-               continue_range(uffd, msg->arg.pagefault.address, page_size);
-               stats->minor_faults++;
-       } else {
-               /*
-                * Missing page faults.
-                *
-                * Here we force a write check for each of the missing mode
-                * faults.  It's guaranteed because the only threads that
-                * will trigger uffd faults are the locking threads, and
-                * their first instruction to touch the missing page will
-                * always be pthread_mutex_lock().
-                *
-                * Note that here we relied on an NPTL glibc impl detail to
-                * always read the lock type at the entry of the lock op
-                * (pthread_mutex_t.__data.__type, offset 0x10) before
-                * doing any locking operations to guarantee that.  It's
-                * actually not good to rely on this impl detail because
-                * logically a pthread-compatible lib can implement the
-                * locks without types and we can fail when linking with
-                * them.  However since we used to find bugs with this
-                * strict check we still keep it around.  Hopefully this
-                * could be a good hint when it fails again.  If one day
-                * it'll break on some other impl of glibc we'll revisit.
-                */
-               if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
-                       err("unexpected write fault");
-
-               /* round the fault address down to its page offset */
-               offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
-               offset &= ~(page_size-1);
-
-               if (copy_page(uffd, offset))
-                       stats->missing_faults++;
-       }
-}
-
-/*
- * Poll-based fault handler thread: multiplexes the uffd and the per-cpu
- * quit pipe with poll(). Handles page faults plus the non-cooperative
- * events (FORK: switch to the child's forwarded uffd; REMOVE:
- * unregister the removed range; REMAP: track the moved destination
- * area). Exits when a byte arrives on the quit pipe. @arg is the
- * thread's uffd_stats, whose ->cpu selects the pipe pair.
- */
-static void *uffd_poll_thread(void *arg)
-{
-       struct uffd_stats *stats = (struct uffd_stats *)arg;
-       unsigned long cpu = stats->cpu;
-       struct pollfd pollfd[2];
-       struct uffd_msg msg;
-       struct uffdio_register uffd_reg;
-       int ret;
-       char tmp_chr;
-
-       pollfd[0].fd = uffd;
-       pollfd[0].events = POLLIN;
-       pollfd[1].fd = pipefd[cpu*2];
-       pollfd[1].events = POLLIN;
-
-       for (;;) {
-               ret = poll(pollfd, 2, -1);
-               if (ret <= 0) {
-                       if (errno == EINTR || errno == EAGAIN)
-                               continue;
-                       err("poll error: %d", ret);
-               }
-               if (pollfd[1].revents & POLLIN) {
-                       /* main thread signalled shutdown via the pipe */
-                       if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
-                               err("read pipefd error");
-                       break;
-               }
-               if (!(pollfd[0].revents & POLLIN))
-                       err("pollfd[0].revents %d", pollfd[0].revents);
-               if (uffd_read_msg(uffd, &msg))
-                       continue;
-               switch (msg.event) {
-               default:
-                       err("unexpected msg event %u\n", msg.event);
-                       break;
-               case UFFD_EVENT_PAGEFAULT:
-                       uffd_handle_page_fault(&msg, stats);
-                       break;
-               case UFFD_EVENT_FORK:
-                       close(uffd);
-                       uffd = msg.arg.fork.ufd;
-                       pollfd[0].fd = uffd;
-                       break;
-               case UFFD_EVENT_REMOVE:
-                       uffd_reg.range.start = msg.arg.remove.start;
-                       uffd_reg.range.len = msg.arg.remove.end -
-                               msg.arg.remove.start;
-                       if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
-                               err("remove failure");
-                       break;
-               case UFFD_EVENT_REMAP:
-                       area_remap = area_dst;  /* save for later unmap */
-                       area_dst = (char *)(unsigned long)msg.arg.remap.to;
-                       break;
-               }
-       }
-
-       return NULL;
-}
-
-/*
- * Handshake mutex: stress() locks it after creating a read thread; the
- * thread unlocks it once started, so faults are only serviced from the
- * point where cancellation is acceptable.
- */
-pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-/*
- * Blocking-read fault handler thread (the non-poll counterpart of
- * uffd_poll_thread). Loops forever servicing page faults; it is
- * terminated via pthread_cancel() from stress().
- */
-static void *uffd_read_thread(void *arg)
-{
-       struct uffd_stats *stats = (struct uffd_stats *)arg;
-       struct uffd_msg msg;
-
-       pthread_mutex_unlock(&uffd_read_mutex);
-       /* from here cancellation is ok */
-
-       for (;;) {
-               if (uffd_read_msg(uffd, &msg))
-                       continue;
-               uffd_handle_page_fault(&msg, stats);
-       }
-
-       return NULL;
-}
-
-/*
- * Per-cpu background copier: resolves this cpu's share of pages via
- * UFFDIO_COPY in two halves, optionally write-protecting the whole
- * share after the first half so the second half runs with uffd-wp
- * faults in flight. @arg is the cpu index cast to a pointer.
- */
-static void *background_thread(void *arg)
-{
-       unsigned long cpu = (unsigned long) arg;
-       unsigned long page_nr, start_nr, mid_nr, end_nr;
-
-       start_nr = cpu * nr_pages_per_cpu;
-       end_nr = (cpu+1) * nr_pages_per_cpu;
-       mid_nr = (start_nr + end_nr) / 2;
-
-       /* Copy the first half of the pages */
-       for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
-               copy_page_retry(uffd, page_nr * page_size);
-
-       /*
-        * If we need to test uffd-wp, set it up now.  Then we'll have
-        * at least the first half of the pages mapped already which
-        * can be write-protected for testing
-        */
-       if (test_uffdio_wp)
-               wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
-                       nr_pages_per_cpu * page_size, true);
-
-       /*
-        * Continue the 2nd half of the page copying, handling write
-        * protection faults if any
-        */
-       for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
-               copy_page_retry(uffd, page_nr * page_size);
-
-       return NULL;
-}
-
-/*
- * Run one stress bounce: per cpu, spawn a locking thread (fault
- * generator), a fault-handler thread (poll- or read-based depending on
- * BOUNCE_POLL), and a background copier. Waits for the copiers, zaps
- * area_src, stops the lockers, then shuts the handlers down (pipe write
- * for poll threads, pthread_cancel for read threads). Returns 0 on
- * success, 1 on any thread-management failure.
- */
-static int stress(struct uffd_stats *uffd_stats)
-{
-       unsigned long cpu;
-       pthread_t locking_threads[nr_cpus];
-       pthread_t uffd_threads[nr_cpus];
-       pthread_t background_threads[nr_cpus];
-
-       finished = 0;
-       for (cpu = 0; cpu < nr_cpus; cpu++) {
-               if (pthread_create(&locking_threads[cpu], &attr,
-                                  locking_thread, (void *)cpu))
-                       return 1;
-               if (bounces & BOUNCE_POLL) {
-                       if (pthread_create(&uffd_threads[cpu], &attr,
-                                          uffd_poll_thread,
-                                          (void *)&uffd_stats[cpu]))
-                               return 1;
-               } else {
-                       if (pthread_create(&uffd_threads[cpu], &attr,
-                                          uffd_read_thread,
-                                          (void *)&uffd_stats[cpu]))
-                               return 1;
-                       /* wait for the read thread to start (handshake) */
-                       pthread_mutex_lock(&uffd_read_mutex);
-               }
-               if (pthread_create(&background_threads[cpu], &attr,
-                                  background_thread, (void *)cpu))
-                       return 1;
-       }
-       for (cpu = 0; cpu < nr_cpus; cpu++)
-               if (pthread_join(background_threads[cpu], NULL))
-                       return 1;
-
-       /*
-        * Be strict and immediately zap area_src, the whole area has
-        * been transferred already by the background treads. The
-        * area_src could then be faulted in a racy way by still
-        * running uffdio_threads reading zeropages after we zapped
-        * area_src (but they're guaranteed to get -EEXIST from
-        * UFFDIO_COPY without writing zero pages into area_dst
-        * because the background threads already completed).
-        */
-       uffd_test_ops->release_pages(area_src);
-
-       finished = 1;
-       for (cpu = 0; cpu < nr_cpus; cpu++)
-               if (pthread_join(locking_threads[cpu], NULL))
-                       return 1;
-
-       for (cpu = 0; cpu < nr_cpus; cpu++) {
-               char c;
-               if (bounces & BOUNCE_POLL) {
-                       if (write(pipefd[cpu*2+1], &c, 1) != 1)
-                               err("pipefd write error");
-                       if (pthread_join(uffd_threads[cpu],
-                                        (void *)&uffd_stats[cpu]))
-                               return 1;
-               } else {
-                       if (pthread_cancel(uffd_threads[cpu]))
-                               return 1;
-                       if (pthread_join(uffd_threads[cpu], NULL))
-                               return 1;
-               }
-       }
-
-       return 0;
-}
-
-/* Jump buffer for SIGBUS recovery; sigbuf is armed by faulting_process(). */
-sigjmp_buf jbuf, *sigbuf;
-
-/*
- * SIGBUS handler for the UFFD_FEATURE_SIGBUS tests: long-jump back to
- * the armed sigsetjmp point, or abort if the signal arrived unexpectedly
- * (no buffer armed).
- */
-static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
-{
-       if (sig == SIGBUS) {
-               if (sigbuf)
-                       siglongjmp(*sigbuf, 1);
-               abort();
-       }
-}
-
-/*
- * For non-cooperative userfaultfd test we fork() a process that will
- * generate pagefaults, will mremap the area monitored by the
- * userfaultfd and at last this process will release the monitored
- * area.
- * For the anonymous and shared memory the area is divided into two
- * parts, the first part is accessed before mremap, and the second
- * part is accessed after mremap. Since hugetlbfs does not support
- * mremap, the entire monitored area is accessed in a single pass for
- * HUGETLB_TEST.
- * The release of the pages currently generates event for shmem and
- * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
- * for hugetlb.
- * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
- * monitored area, generate pagefaults and test that signal is delivered.
- * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
- * test robustness use case - we release monitored area, fork a process
- * that will generate pagefaults and verify signal is generated.
- * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
- * feature. Using monitor thread, verify no userfault events are generated.
- * Returns 0 on success (for signal_test, nonzero if the expected number
- * of SIGBUS deliveries was not observed).
- */
-static int faulting_process(int signal_test)
-{
-       unsigned long nr;
-       unsigned long long count;
-       unsigned long split_nr_pages;
-       unsigned long lastnr;
-       struct sigaction act;
-       volatile unsigned long signalled = 0;
-
-       split_nr_pages = (nr_pages + 1) / 2;
-
-       if (signal_test) {
-               sigbuf = &jbuf;
-               memset(&act, 0, sizeof(act));
-               act.sa_sigaction = sighndl;
-               act.sa_flags = SA_SIGINFO;
-               if (sigaction(SIGBUS, &act, 0))
-                       err("sigaction");
-               /* sentinel: no page has signalled yet */
-               lastnr = (unsigned long)-1;
-       }
-
-       for (nr = 0; nr < split_nr_pages; nr++) {
-               /* 'steps' survives the siglongjmp thanks to volatile */
-               volatile int steps = 1;
-               unsigned long offset = nr * page_size;
-
-               if (signal_test) {
-                       if (sigsetjmp(*sigbuf, 1) != 0) {
-                               /* same page faulting twice on one step = stuck */
-                               if (steps == 1 && nr == lastnr)
-                                       err("Signal repeated");
-
-                               lastnr = nr;
-                               if (signal_test == 1) {
-                                       if (steps == 1) {
-                                               /* This is a MISSING request */
-                                               steps++;
-                                               if (copy_page(uffd, offset))
-                                                       signalled++;
-                                       } else {
-                                               /* This is a WP request */
-                                               assert(steps == 2);
-                                               wp_range(uffd,
-                                                        (__u64)area_dst +
-                                                        offset,
-                                                        page_size, false);
-                                       }
-                               } else {
-                                       signalled++;
-                                       continue;
-                               }
-                       }
-               }
-
-               count = *area_count(area_dst, nr);
-               if (count != count_verify[nr])
-                       err("nr %lu memory corruption %llu %llu\n",
-                           nr, count, count_verify[nr]);
-               /*
-                * Trigger write protection if there is by writing
-                * the same value back.
-                */
-               *area_count(area_dst, nr) = count;
-       }
-
-       if (signal_test)
-               return signalled != split_nr_pages;
-
-       /* move the monitored area to generate an UFFD_EVENT_REMAP */
-       area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
-                         MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
-       if (area_dst == MAP_FAILED)
-               err("mremap");
-       /* Reset area_src since we just clobbered it */
-       area_src = NULL;
-
-       /* 'nr' intentionally continues from split_nr_pages */
-       for (; nr < nr_pages; nr++) {
-               count = *area_count(area_dst, nr);
-               if (count != count_verify[nr]) {
-                       err("nr %lu memory corruption %llu %llu\n",
-                           nr, count, count_verify[nr]);
-               }
-               /*
-                * Trigger write protection if there is by writing
-                * the same value back.
-                */
-               *area_count(area_dst, nr) = count;
-       }
-
-       uffd_test_ops->release_pages(area_dst);
-
-       /* after release, every page must read back as zeroes */
-       for (nr = 0; nr < nr_pages; nr++)
-               if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
-                       err("nr %lu is not zero", nr);
-
-       return 0;
-}
-
-/*
- * Re-issue an UFFDIO_ZEROPAGE through the alias mapping for a page that
- * was already zero-filled: the kernel must return -EEXIST. Success or
- * any other error fails the test (mirrors retry_copy_page()).
- */
-static void retry_uffdio_zeropage(int ufd,
-                                 struct uffdio_zeropage *uffdio_zeropage,
-                                 unsigned long offset)
-{
-       uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
-                                    uffdio_zeropage->range.len,
-                                    offset);
-       if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
-               if (uffdio_zeropage->zeropage != -EEXIST)
-                       err("UFFDIO_ZEROPAGE error: %"PRId64,
-                           (int64_t)uffdio_zeropage->zeropage);
-       } else {
-               err("UFFDIO_ZEROPAGE error: %"PRId64,
-                   (int64_t)uffdio_zeropage->zeropage);
-       }
-}
-
-/*
- * Zero-fill one page at @offset with UFFDIO_ZEROPAGE and validate the
- * result against whether the backend supports the ioctl at all: an
- * unsupported backend must fail with -EINVAL, a supported one must fill
- * exactly page_size bytes. Returns 1 when a page was actually filled,
- * 0 otherwise. @retry optionally exercises the alias -EEXIST path once.
- */
-static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
-{
-       struct uffdio_zeropage uffdio_zeropage;
-       int ret;
-       bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE);
-       __s64 res;
-
-       if (offset >= nr_pages * page_size)
-               err("unexpected offset %lu", offset);
-       uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
-       uffdio_zeropage.range.len = page_size;
-       uffdio_zeropage.mode = 0;
-       ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
-       res = uffdio_zeropage.zeropage;
-       if (ret) {
-               /* real retval in ufdio_zeropage.zeropage */
-               if (has_zeropage)
-                       err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
-               else if (res != -EINVAL)
-                       err("UFFDIO_ZEROPAGE not -EINVAL");
-       } else if (has_zeropage) {
-               if (res != page_size) {
-                       err("UFFDIO_ZEROPAGE unexpected size");
-               } else {
-                       if (test_uffdio_zeropage_eexist && retry) {
-                               test_uffdio_zeropage_eexist = false;
-                               retry_uffdio_zeropage(ufd, &uffdio_zeropage,
-                                                     offset);
-                       }
-                       return 1;
-               }
-       } else
-               err("UFFDIO_ZEROPAGE succeeded");
-
-       return 0;
-}
-
-/* Non-retry wrapper around __uffdio_zeropage(). */
-static int uffdio_zeropage(int ufd, unsigned long offset)
-{
-       return __uffdio_zeropage(ufd, offset, false);
-}
-
-/*
- * Exercise UFFDIO_ZEROPAGE: register area_dst in missing (and
- * optionally WP) mode, verify the advertised ioctl set, zero-fill the
- * first page, and check it really reads back as zeroes. Returns 0.
- */
-static int userfaultfd_zeropage_test(void)
-{
-       struct uffdio_register uffdio_register;
-
-       printf("testing UFFDIO_ZEROPAGE: ");
-       fflush(stdout);
-
-       uffd_test_ctx_init(0);
-
-       uffdio_register.range.start = (unsigned long) area_dst;
-       uffdio_register.range.len = nr_pages * page_size;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-       if (test_uffdio_wp)
-               uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-               err("register failure");
-
-       assert_expected_ioctls_present(
-               uffdio_register.mode, uffdio_register.ioctls);
-
-       if (uffdio_zeropage(uffd, 0))
-               if (my_bcmp(area_dst, zeropage, page_size))
-                       err("zeropage is not zero");
-
-       printf("done.\n");
-       return 0;
-}
-
-/*
- * Test the non-cooperative events (FORK, REMAP, REMOVE): register
- * area_dst, run a monitor thread (uffd_poll_thread), fork a child that
- * executes faulting_process(0), and verify every page was resolved via
- * a missing fault. Returns nonzero on mismatch.
- *
- * NOTE(review): the local 'int err' coexists with err(...) calls; this
- * compiles only because err() is a function-like macro (definition not
- * visible in this hunk) which a variable cannot shadow — TODO confirm.
- */
-static int userfaultfd_events_test(void)
-{
-       struct uffdio_register uffdio_register;
-       pthread_t uffd_mon;
-       int err, features;
-       pid_t pid;
-       char c;
-       struct uffd_stats stats = { 0 };
-
-       printf("testing events (fork, remap, remove): ");
-       fflush(stdout);
-
-       features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
-               UFFD_FEATURE_EVENT_REMOVE;
-       uffd_test_ctx_init(features);
-
-       /* non-blocking reads: uffd_read_msg treats EAGAIN as "retry" */
-       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-
-       uffdio_register.range.start = (unsigned long) area_dst;
-       uffdio_register.range.len = nr_pages * page_size;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-       if (test_uffdio_wp)
-               uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-               err("register failure");
-
-       assert_expected_ioctls_present(
-               uffdio_register.mode, uffdio_register.ioctls);
-
-       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
-               err("uffd_poll_thread create");
-
-       pid = fork();
-       if (pid < 0)
-               err("fork");
-
-       if (!pid)
-               exit(faulting_process(0));
-
-       waitpid(pid, &err, 0);
-       if (err)
-               err("faulting process failed");
-       /* stop the monitor thread via its quit pipe */
-       if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
-               err("pipe write");
-       if (pthread_join(uffd_mon, NULL))
-               return 1;
-
-       uffd_stats_report(&stats, 1);
-
-       return stats.missing_faults != nr_pages;
-}
-
-/*
- * Test UFFD_FEATURE_SIGBUS: first run faulting_process(1) in-process
- * (SIGBUS per missing page, resolved via UFFDIO_COPY), then release the
- * pages and fork a child running faulting_process(2) while a monitor
- * thread verifies no userfault events are delivered. Returns nonzero if
- * any userfaults were reported.
- *
- * NOTE(review): same 'int err' vs err() macro interplay as in
- * userfaultfd_events_test() — relies on err() being a macro.
- */
-static int userfaultfd_sig_test(void)
-{
-       struct uffdio_register uffdio_register;
-       unsigned long userfaults;
-       pthread_t uffd_mon;
-       int err, features;
-       pid_t pid;
-       char c;
-       struct uffd_stats stats = { 0 };
-
-       printf("testing signal delivery: ");
-       fflush(stdout);
-
-       features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
-       uffd_test_ctx_init(features);
-
-       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-
-       uffdio_register.range.start = (unsigned long) area_dst;
-       uffdio_register.range.len = nr_pages * page_size;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-       if (test_uffdio_wp)
-               uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-               err("register failure");
-
-       assert_expected_ioctls_present(
-               uffdio_register.mode, uffdio_register.ioctls);
-
-       if (faulting_process(1))
-               err("faulting process failed");
-
-       uffd_test_ops->release_pages(area_dst);
-
-       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
-               err("uffd_poll_thread create");
-
-       pid = fork();
-       if (pid < 0)
-               err("fork");
-
-       if (!pid)
-               exit(faulting_process(2));
-
-       waitpid(pid, &err, 0);
-       if (err)
-               err("faulting process failed");
-       if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
-               err("pipe write");
-       /* thread's return value doubles as the userfault count */
-       if (pthread_join(uffd_mon, (void **)&userfaults))
-               return 1;
-
-       printf("done.\n");
-       if (userfaults)
-               err("Signal test failed, userfaults: %ld", userfaults);
-
-       return userfaults != 0;
-}
-
-void check_memory_contents(char *p)
-{
-       unsigned long i;
-       uint8_t expected_byte;
-       void *expected_page;
-
-       if (posix_memalign(&expected_page, page_size, page_size))
-               err("out of memory");
-
-       for (i = 0; i < nr_pages; ++i) {
-               expected_byte = ~((uint8_t)(i % ((uint8_t)-1)));
-               memset(expected_page, expected_byte, page_size);
-               if (my_bcmp(expected_page, p + (i * page_size), page_size))
-                       err("unexpected page contents after minor fault");
-       }
-
-       free(expected_page);
-}
-
-static int userfaultfd_minor_test(void)
-{
-       unsigned long p;
-       struct uffdio_register uffdio_register;
-       pthread_t uffd_mon;
-       char c;
-       struct uffd_stats stats = { 0 };
-
-       if (!test_uffdio_minor)
-               return 0;
-
-       printf("testing minor faults: ");
-       fflush(stdout);
-
-       uffd_test_ctx_init(uffd_minor_feature());
-
-       uffdio_register.range.start = (unsigned long)area_dst_alias;
-       uffdio_register.range.len = nr_pages * page_size;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-               err("register failure");
-
-       assert_expected_ioctls_present(
-               uffdio_register.mode, uffdio_register.ioctls);
-
-       /*
-        * After registering with UFFD, populate the non-UFFD-registered side of
-        * the shared mapping. This should *not* trigger any UFFD minor faults.
-        */
-       for (p = 0; p < nr_pages; ++p) {
-               memset(area_dst + (p * page_size), p % ((uint8_t)-1),
-                      page_size);
-       }
-
-       if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
-               err("uffd_poll_thread create");
-
-       /*
-        * Read each of the pages back using the UFFD-registered mapping. We
-        * expect that the first time we touch a page, it will result in a minor
-        * fault. uffd_poll_thread will resolve the fault by bit-flipping the
-        * page's contents, and then issuing a CONTINUE ioctl.
-        */
-       check_memory_contents(area_dst_alias);
-
-       if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
-               err("pipe write");
-       if (pthread_join(uffd_mon, NULL))
-               return 1;
-
-       uffd_stats_report(&stats, 1);
-
-       if (test_collapse) {
-               printf("testing collapse of uffd memory into PMD-mapped THPs:");
-               if (madvise(area_dst_alias, nr_pages * page_size,
-                           MADV_COLLAPSE))
-                       err("madvise(MADV_COLLAPSE)");
-
-               uffd_test_ops->check_pmd_mapping(area_dst,
-                                                nr_pages * page_size /
-                                                hpage_size);
-               /*
-                * This won't cause uffd-fault - it purely just makes sure there
-                * was no corruption.
-                */
-               check_memory_contents(area_dst_alias);
-               printf(" done.\n");
-       }
-
-       return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
-}
-
-#define BIT_ULL(nr)                   (1ULL << (nr))
-#define PM_SOFT_DIRTY                 BIT_ULL(55)
-#define PM_MMAP_EXCLUSIVE             BIT_ULL(56)
-#define PM_UFFD_WP                    BIT_ULL(57)
-#define PM_FILE                       BIT_ULL(61)
-#define PM_SWAP                       BIT_ULL(62)
-#define PM_PRESENT                    BIT_ULL(63)
-
-static int pagemap_open(void)
-{
-       int fd = open("/proc/self/pagemap", O_RDONLY);
-
-       if (fd < 0)
-               err("open pagemap");
-
-       return fd;
-}
-
-static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
-{
-       uint64_t value;
-       int ret;
-
-       ret = pread(fd, &value, sizeof(uint64_t),
-                   ((uint64_t)vaddr >> 12) * sizeof(uint64_t));
-       if (ret != sizeof(uint64_t))
-               err("pread() on pagemap failed");
-
-       return value;
-}
-
-/* This macro let __LINE__ works in err() */
-#define  pagemap_check_wp(value, wp) do {                              \
-               if (!!(value & PM_UFFD_WP) != wp)                       \
-                       err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \
-       } while (0)
-
-static int pagemap_test_fork(bool present)
-{
-       pid_t child = fork();
-       uint64_t value;
-       int fd, result;
-
-       if (!child) {
-               /* Open the pagemap fd of the child itself */
-               fd = pagemap_open();
-               value = pagemap_read_vaddr(fd, area_dst);
-               /*
-                * After fork() uffd-wp bit should be gone as long as we're
-                * without UFFD_FEATURE_EVENT_FORK
-                */
-               pagemap_check_wp(value, false);
-               /* Succeed */
-               exit(0);
-       }
-       waitpid(child, &result, 0);
-       return result;
-}
-
-static void userfaultfd_pagemap_test(unsigned int test_pgsize)
-{
-       struct uffdio_register uffdio_register;
-       int pagemap_fd;
-       uint64_t value;
-
-       /* Pagemap tests uffd-wp only */
-       if (!test_uffdio_wp)
-               return;
-
-       /* Not enough memory to test this page size */
-       if (test_pgsize > nr_pages * page_size)
-               return;
-
-       printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
-       /* Flush so it doesn't flush twice in parent/child later */
-       fflush(stdout);
-
-       uffd_test_ctx_init(0);
-
-       if (test_pgsize > page_size) {
-               /* This is a thp test */
-               if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
-                       err("madvise(MADV_HUGEPAGE) failed");
-       } else if (test_pgsize == page_size) {
-               /* This is normal page test; force no thp */
-               if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
-                       err("madvise(MADV_NOHUGEPAGE) failed");
-       }
-
-       uffdio_register.range.start = (unsigned long) area_dst;
-       uffdio_register.range.len = nr_pages * page_size;
-       uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
-       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-               err("register failed");
-
-       pagemap_fd = pagemap_open();
-
-       /* Touch the page */
-       *area_dst = 1;
-       wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
-       value = pagemap_read_vaddr(pagemap_fd, area_dst);
-       pagemap_check_wp(value, true);
-       /* Make sure uffd-wp bit dropped when fork */
-       if (pagemap_test_fork(true))
-               err("Detected stall uffd-wp bit in child");
-
-       /* Exclusive required or PAGEOUT won't work */
-       if (!(value & PM_MMAP_EXCLUSIVE))
-               err("multiple mapping detected: 0x%"PRIx64, value);
-
-       if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
-               err("madvise(MADV_PAGEOUT) failed");
-
-       /* Uffd-wp should persist even swapped out */
-       value = pagemap_read_vaddr(pagemap_fd, area_dst);
-       pagemap_check_wp(value, true);
-       /* Make sure uffd-wp bit dropped when fork */
-       if (pagemap_test_fork(false))
-               err("Detected stall uffd-wp bit in child");
-
-       /* Unprotect; this tests swap pte modifications */
-       wp_range(uffd, (uint64_t)area_dst, page_size, false);
-       value = pagemap_read_vaddr(pagemap_fd, area_dst);
-       pagemap_check_wp(value, false);
-
-       /* Fault in the page from disk */
-       *area_dst = 2;
-       value = pagemap_read_vaddr(pagemap_fd, area_dst);
-       pagemap_check_wp(value, false);
-
-       close(pagemap_fd);
-       printf("done\n");
-}
-
-static int userfaultfd_stress(void)
-{
-       void *area;
-       unsigned long nr;
-       struct uffdio_register uffdio_register;
-       struct uffd_stats uffd_stats[nr_cpus];
-
-       uffd_test_ctx_init(0);
-
-       if (posix_memalign(&area, page_size, page_size))
-               err("out of memory");
-       zeropage = area;
-       bzero(zeropage, page_size);
-
-       pthread_mutex_lock(&uffd_read_mutex);
-
-       pthread_attr_init(&attr);
-       pthread_attr_setstacksize(&attr, 16*1024*1024);
-
-       while (bounces--) {
-               printf("bounces: %d, mode:", bounces);
-               if (bounces & BOUNCE_RANDOM)
-                       printf(" rnd");
-               if (bounces & BOUNCE_RACINGFAULTS)
-                       printf(" racing");
-               if (bounces & BOUNCE_VERIFY)
-                       printf(" ver");
-               if (bounces & BOUNCE_POLL)
-                       printf(" poll");
-               else
-                       printf(" read");
-               printf(", ");
-               fflush(stdout);
-
-               if (bounces & BOUNCE_POLL)
-                       fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
-               else
-                       fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
-
-               /* register */
-               uffdio_register.range.start = (unsigned long) area_dst;
-               uffdio_register.range.len = nr_pages * page_size;
-               uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
-               if (test_uffdio_wp)
-                       uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
-               if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-                       err("register failure");
-               assert_expected_ioctls_present(
-                       uffdio_register.mode, uffdio_register.ioctls);
-
-               if (area_dst_alias) {
-                       uffdio_register.range.start = (unsigned long)
-                               area_dst_alias;
-                       if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
-                               err("register failure alias");
-               }
-
-               /*
-                * The madvise done previously isn't enough: some
-                * uffd_thread could have read userfaults (one of
-                * those already resolved by the background thread)
-                * and it may be in the process of calling
-                * UFFDIO_COPY. UFFDIO_COPY will read the zapped
-                * area_src and it would map a zero page in it (of
-                * course such a UFFDIO_COPY is perfectly safe as it'd
-                * return -EEXIST). The problem comes at the next
-                * bounce though: that racing UFFDIO_COPY would
-                * generate zeropages in the area_src, so invalidating
-                * the previous MADV_DONTNEED. Without this additional
-                * MADV_DONTNEED those zeropages leftovers in the
-                * area_src would lead to -EEXIST failure during the
-                * next bounce, effectively leaving a zeropage in the
-                * area_dst.
-                *
-                * Try to comment this out madvise to see the memory
-                * corruption being caught pretty quick.
-                *
-                * khugepaged is also inhibited to collapse THP after
-                * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
-                * required to MADV_DONTNEED here.
-                */
-               uffd_test_ops->release_pages(area_dst);
-
-               uffd_stats_reset(uffd_stats, nr_cpus);
-
-               /* bounce pass */
-               if (stress(uffd_stats))
-                       return 1;
-
-               /* Clear all the write protections if there is any */
-               if (test_uffdio_wp)
-                       wp_range(uffd, (unsigned long)area_dst,
-                                nr_pages * page_size, false);
-
-               /* unregister */
-               if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
-                       err("unregister failure");
-               if (area_dst_alias) {
-                       uffdio_register.range.start = (unsigned long) area_dst;
-                       if (ioctl(uffd, UFFDIO_UNREGISTER,
-                                 &uffdio_register.range))
-                               err("unregister failure alias");
-               }
-
-               /* verification */
-               if (bounces & BOUNCE_VERIFY)
-                       for (nr = 0; nr < nr_pages; nr++)
-                               if (*area_count(area_dst, nr) != count_verify[nr])
-                                       err("error area_count %llu %llu %lu\n",
-                                           *area_count(area_src, nr),
-                                           count_verify[nr], nr);
-
-               /* prepare next bounce */
-               swap(area_src, area_dst);
-
-               swap(area_src_alias, area_dst_alias);
-
-               uffd_stats_report(uffd_stats, nr_cpus);
-       }
-
-       if (test_type == TEST_ANON) {
-               /*
-                * shmem/hugetlb won't be able to run since they have different
-                * behavior on fork() (file-backed memory normally drops ptes
-                * directly when fork), meanwhile the pagemap test will verify
-                * pgtable entry of fork()ed child.
-                */
-               userfaultfd_pagemap_test(page_size);
-               /*
-                * Hard-code for x86_64 for now for 2M THP, as x86_64 is
-                * currently the only one that supports uffd-wp
-                */
-               userfaultfd_pagemap_test(page_size * 512);
-       }
-
-       return userfaultfd_zeropage_test() || userfaultfd_sig_test()
-               || userfaultfd_events_test() || userfaultfd_minor_test();
-}
-
-/*
- * Copied from mlock2-tests.c
- */
-unsigned long default_huge_page_size(void)
-{
-       unsigned long hps = 0;
-       char *line = NULL;
-       size_t linelen = 0;
-       FILE *f = fopen("/proc/meminfo", "r");
-
-       if (!f)
-               return 0;
-       while (getline(&line, &linelen, f) > 0) {
-               if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
-                       hps <<= 10;
-                       break;
-               }
-       }
-
-       free(line);
-       fclose(f);
-       return hps;
-}
-
-static void set_test_type(const char *type)
-{
-       if (!strcmp(type, "anon")) {
-               test_type = TEST_ANON;
-               uffd_test_ops = &anon_uffd_test_ops;
-       } else if (!strcmp(type, "hugetlb")) {
-               test_type = TEST_HUGETLB;
-               uffd_test_ops = &hugetlb_uffd_test_ops;
-       } else if (!strcmp(type, "hugetlb_shared")) {
-               map_shared = true;
-               test_type = TEST_HUGETLB;
-               uffd_test_ops = &hugetlb_uffd_test_ops;
-               /* Minor faults require shared hugetlb; only enable here. */
-               test_uffdio_minor = true;
-       } else if (!strcmp(type, "shmem")) {
-               map_shared = true;
-               test_type = TEST_SHMEM;
-               uffd_test_ops = &shmem_uffd_test_ops;
-               test_uffdio_minor = true;
-       }
-}
-
-static void parse_test_type_arg(const char *raw_type)
-{
-       char *buf = strdup(raw_type);
-       uint64_t features = UFFD_API_FEATURES;
-
-       while (buf) {
-               const char *token = strsep(&buf, ":");
-
-               if (!test_type)
-                       set_test_type(token);
-               else if (!strcmp(token, "dev"))
-                       test_dev_userfaultfd = true;
-               else if (!strcmp(token, "syscall"))
-                       test_dev_userfaultfd = false;
-               else if (!strcmp(token, "collapse"))
-                       test_collapse = true;
-               else
-                       err("unrecognized test mod '%s'", token);
-       }
-
-       if (!test_type)
-               err("failed to parse test type argument: '%s'", raw_type);
-
-       if (test_collapse && test_type != TEST_SHMEM)
-               err("Unsupported test: %s", raw_type);
-
-       if (test_type == TEST_HUGETLB)
-               page_size = hpage_size;
-       else
-               page_size = sysconf(_SC_PAGE_SIZE);
-
-       if (!page_size)
-               err("Unable to determine page size");
-       if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
-           > page_size)
-               err("Impossible to run this test");
-
-       /*
-        * Whether we can test certain features depends not just on test type,
-        * but also on whether or not this particular kernel supports the
-        * feature.
-        */
-
-       userfaultfd_open(&features);
-
-       test_uffdio_wp = test_uffdio_wp &&
-               (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP);
-       test_uffdio_minor = test_uffdio_minor &&
-               (features & uffd_minor_feature());
-
-       close(uffd);
-       uffd = -1;
-}
-
-static void sigalrm(int sig)
-{
-       if (sig != SIGALRM)
-               abort();
-       test_uffdio_copy_eexist = true;
-       test_uffdio_zeropage_eexist = true;
-       alarm(ALARM_INTERVAL_SECS);
-}
-
-int main(int argc, char **argv)
-{
-       size_t bytes;
-
-       if (argc < 4)
-               usage();
-
-       if (signal(SIGALRM, sigalrm) == SIG_ERR)
-               err("failed to arm SIGALRM");
-       alarm(ALARM_INTERVAL_SECS);
-
-       hpage_size = default_huge_page_size();
-       parse_test_type_arg(argv[1]);
-       bytes = atol(argv[2]) * 1024 * 1024;
-
-       if (test_collapse && bytes & (hpage_size - 1))
-               err("MiB must be multiple of %lu if :collapse mod set",
-                   hpage_size >> 20);
-
-       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-
-       if (test_collapse) {
-               /* nr_cpus must divide (bytes / page_size), otherwise,
-                * area allocations of (nr_pages * paze_size) won't be a
-                * multiple of hpage_size, even if bytes is a multiple of
-                * hpage_size.
-                *
-                * This means that nr_cpus must divide (N * (2 << (H-P))
-                * where:
-                *      bytes = hpage_size * N
-                *      hpage_size = 2 << H
-                *      page_size = 2 << P
-                *
-                * And we want to chose nr_cpus to be the largest value
-                * satisfying this constraint, not larger than the number
-                * of online CPUs. Unfortunately, prime factorization of
-                * N and nr_cpus may be arbitrary, so have to search for it.
-                * Instead, just use the highest power of 2 dividing both
-                * nr_cpus and (bytes / page_size).
-                */
-               int x = factor_of_2(nr_cpus);
-               int y = factor_of_2(bytes / page_size);
-
-               nr_cpus = x < y ? x : y;
-       }
-       nr_pages_per_cpu = bytes / page_size / nr_cpus;
-       if (!nr_pages_per_cpu) {
-               _err("invalid MiB");
-               usage();
-       }
-
-       bounces = atoi(argv[3]);
-       if (bounces <= 0) {
-               _err("invalid bounces");
-               usage();
-       }
-       nr_pages = nr_pages_per_cpu * nr_cpus;
-
-       if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) {
-               unsigned int memfd_flags = 0;
-
-               if (test_type == TEST_HUGETLB)
-                       memfd_flags = MFD_HUGETLB;
-               mem_fd = memfd_create(argv[0], memfd_flags);
-               if (mem_fd < 0)
-                       err("memfd_create");
-               if (ftruncate(mem_fd, nr_pages * page_size * 2))
-                       err("ftruncate");
-               if (fallocate(mem_fd,
-                             FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
-                             nr_pages * page_size * 2))
-                       err("fallocate");
-       }
-       printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
-              nr_pages, nr_pages_per_cpu);
-       return userfaultfd_stress();
-}
-
-#else /* __NR_userfaultfd */
-
-#warning "missing __NR_userfaultfd definition"
-
-int main(void)
-{
-       printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
-       return KSFT_SKIP;
-}
-
-#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h
deleted file mode 100644 (file)
index b27d261..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __KSELFTEST_VM_UTIL_H
-#define __KSELFTEST_VM_UTIL_H
-
-#include <stdint.h>
-#include <sys/mman.h>
-#include <err.h>
-#include <string.h> /* ffsl() */
-#include <unistd.h> /* _SC_PAGESIZE */
-
-static unsigned int __page_size;
-static unsigned int __page_shift;
-
-static inline unsigned int page_size(void)
-{
-       if (!__page_size)
-               __page_size = sysconf(_SC_PAGESIZE);
-       return __page_size;
-}
-
-static inline unsigned int page_shift(void)
-{
-       if (!__page_shift)
-               __page_shift = (ffsl(page_size()) - 1);
-       return __page_shift;
-}
-
-#define PAGE_SHIFT     (page_shift())
-#define PAGE_SIZE      (page_size())
-/*
- * On ppc64 this will only work with radix 2M hugepage size
- */
-#define HPAGE_SHIFT 21
-#define HPAGE_SIZE (1 << HPAGE_SHIFT)
-
-#define PAGEMAP_PRESENT(ent)   (((ent) & (1ull << 63)) != 0)
-#define PAGEMAP_PFN(ent)       ((ent) & ((1ull << 55) - 1))
-
-
-static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd)
-{
-       uint64_t ent[2];
-
-       /* drop pmd */
-       if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
-                MAP_FIXED | MAP_ANONYMOUS |
-                MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
-               errx(2, "mmap transhuge");
-
-       if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
-               err(2, "MADV_HUGEPAGE");
-
-       /* allocate transparent huge page */
-       *(volatile void **)ptr = ptr;
-
-       if (pread(pagemap_fd, ent, sizeof(ent),
-                 (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
-               err(2, "read pagemap");
-
-       if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
-           PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
-           !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
-               return PAGEMAP_PFN(ent[0]);
-
-       return -1;
-}
-
-#endif
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
deleted file mode 100644 (file)
index 1d20689..0000000
+++ /dev/null
@@ -1,289 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *
- * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
- * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- */
-
-#include <stdio.h>
-#include <sys/mman.h>
-#include <string.h>
-
-#include "../kselftest.h"
-
-#ifdef __powerpc64__
-#define PAGE_SIZE      (64 << 10)
-/*
- * This will work with 16M and 2M hugepage size
- */
-#define HUGETLB_SIZE   (16 << 20)
-#else
-#define PAGE_SIZE      (4 << 10)
-#define HUGETLB_SIZE   (2 << 20)
-#endif
-
-/*
- * >= 128TB is the hint addr value we used to select
- * large address space.
- */
-#define ADDR_SWITCH_HINT (1UL << 47)
-#define LOW_ADDR       ((void *) (1UL << 30))
-#define HIGH_ADDR      ((void *) (1UL << 48))
-
-struct testcase {
-       void *addr;
-       unsigned long size;
-       unsigned long flags;
-       const char *msg;
-       unsigned int low_addr_required:1;
-       unsigned int keep_mapped:1;
-};
-
-static struct testcase testcases[] = {
-       {
-               /*
-                * If stack is moved, we could possibly allocate
-                * this at the requested address.
-                */
-               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
-               .size = PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
-               .low_addr_required = 1,
-       },
-       {
-               /*
-                * We should never allocate at the requested address or above it
-                * The len cross the 128TB boundary. Without MAP_FIXED
-                * we will always search in the lower address space.
-                */
-               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
-               .low_addr_required = 1,
-       },
-       {
-               /*
-                * Exact mapping at 128TB, the area is free we should get that
-                * even without MAP_FIXED.
-                */
-               .addr = ((void *)(ADDR_SWITCH_HINT)),
-               .size = PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *)(ADDR_SWITCH_HINT),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
-       },
-       {
-               .addr = NULL,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(NULL)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = LOW_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(LOW_ADDR)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR) again",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
-       },
-       {
-               .addr = (void *) -1,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *) -1,
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1) again",
-       },
-       {
-               .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
-               .size = PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
-               .low_addr_required = 1,
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
-               .low_addr_required = 1,
-               .keep_mapped = 1,
-       },
-       {
-               .addr = ((void *)(ADDR_SWITCH_HINT)),
-               .size = PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
-       },
-       {
-               .addr = (void *)(ADDR_SWITCH_HINT),
-               .size = 2 * PAGE_SIZE,
-               .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
-       },
-};
-
-static struct testcase hugetlb_testcases[] = {
-       {
-               .addr = NULL,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(NULL, MAP_HUGETLB)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = LOW_ADDR,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
-               .low_addr_required = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = HIGH_ADDR,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
-       },
-       {
-               .addr = (void *) -1,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1, MAP_HUGETLB)",
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *) -1,
-               .size = HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(-1, MAP_HUGETLB) again",
-       },
-       {
-               .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
-               .size = 2 * HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
-               .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
-               .low_addr_required = 1,
-               .keep_mapped = 1,
-       },
-       {
-               .addr = (void *)(ADDR_SWITCH_HINT),
-               .size = 2 * HUGETLB_SIZE,
-               .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
-               .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
-       },
-};
-
-static int run_test(struct testcase *test, int count)
-{
-       void *p;
-       int i, ret = KSFT_PASS;
-
-       for (i = 0; i < count; i++) {
-               struct testcase *t = test + i;
-
-               p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0);
-
-               printf("%s: %p - ", t->msg, p);
-
-               if (p == MAP_FAILED) {
-                       printf("FAILED\n");
-                       ret = KSFT_FAIL;
-                       continue;
-               }
-
-               if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) {
-                       printf("FAILED\n");
-                       ret = KSFT_FAIL;
-               } else {
-                       /*
-                        * Do a dereference of the address returned so that we catch
-                        * bugs in page fault handling
-                        */
-                       memset(p, 0, t->size);
-                       printf("OK\n");
-               }
-               if (!t->keep_mapped)
-                       munmap(p, t->size);
-       }
-
-       return ret;
-}
-
-static int supported_arch(void)
-{
-#if defined(__powerpc64__)
-       return 1;
-#elif defined(__x86_64__)
-       return 1;
-#else
-       return 0;
-#endif
-}
-
-int main(int argc, char **argv)
-{
-       int ret;
-
-       if (!supported_arch())
-               return KSFT_SKIP;
-
-       ret = run_test(testcases, ARRAY_SIZE(testcases));
-       if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
-               ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
-       return ret;
-}
diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh
deleted file mode 100755 (executable)
index 4158075..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2022 Adam Sindelar (Meta) <adam@wowsignal.io>
-#
-# This is a test for mmap behavior with 5-level paging. This script wraps the
-# real test to check that the kernel is configured to support at least 5
-# pagetable levels.
-
-# 1 means the test failed
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-fail()
-{
-       echo "$1"
-       exit $exitcode
-}
-
-check_supported_x86_64()
-{
-       local config="/proc/config.gz"
-       [[ -f "${config}" ]] || config="/boot/config-$(uname -r)"
-       [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot"
-
-       # gzip -dcfq automatically handles both compressed and plaintext input.
-       # See man 1 gzip under '-f'.
-       local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2)
-
-       if [[ "${pg_table_levels}" -lt 5 ]]; then
-               echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test"
-               exit $ksft_skip
-       fi
-}
-
-check_test_requirements()
-{
-       # The test supports x86_64 and powerpc64. We currently have no useful
-       # eligibility check for powerpc64, and the test itself will reject other
-       # architectures.
-       case `uname -m` in
-               "x86_64")
-                       check_supported_x86_64
-               ;;
-               *)
-                       return 0
-               ;;
-       esac
-}
-
-check_test_requirements
-./va_128TBswitch
diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c
deleted file mode 100644 (file)
index c059264..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2017, Anshuman Khandual, IBM Corp.
- *
- * Works on architectures which support 128TB virtual
- * address range and beyond.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <sys/time.h>
-
-/*
- * Maximum address range mapped with a single mmap()
- * call is little bit more than 16GB. Hence 16GB is
- * chosen as the single chunk size for address space
- * mapping.
- */
-#define MAP_CHUNK_SIZE   17179869184UL /* 16GB */
-
-/*
- * Address space till 128TB is mapped without any hint
- * and is enabled by default. Address space beyond 128TB
- * till 512TB is obtained by passing hint address as the
- * first argument into mmap() system call.
- *
- * The process heap address space is divided into two
- * different areas one below 128TB and one above 128TB
- * till it reaches 512TB. One with size 128TB and the
- * other being 384TB.
- *
- * On Arm64 the address space is 256TB and no high mappings
- * are supported so far.
- */
-
-#define NR_CHUNKS_128TB   8192UL /* Number of 16GB chunks for 128TB */
-#define NR_CHUNKS_256TB   (NR_CHUNKS_128TB * 2UL)
-#define NR_CHUNKS_384TB   (NR_CHUNKS_128TB * 3UL)
-
-#define ADDR_MARK_128TB  (1UL << 47) /* First address beyond 128TB */
-#define ADDR_MARK_256TB  (1UL << 48) /* First address beyond 256TB */
-
-#ifdef __aarch64__
-#define HIGH_ADDR_MARK  ADDR_MARK_256TB
-#define HIGH_ADDR_SHIFT 49
-#define NR_CHUNKS_LOW   NR_CHUNKS_256TB
-#define NR_CHUNKS_HIGH  0
-#else
-#define HIGH_ADDR_MARK  ADDR_MARK_128TB
-#define HIGH_ADDR_SHIFT 48
-#define NR_CHUNKS_LOW   NR_CHUNKS_128TB
-#define NR_CHUNKS_HIGH  NR_CHUNKS_384TB
-#endif
-
-static char *hind_addr(void)
-{
-       int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT);
-
-       return (char *) (1UL << bits);
-}
-
-static int validate_addr(char *ptr, int high_addr)
-{
-       unsigned long addr = (unsigned long) ptr;
-
-       if (high_addr) {
-               if (addr < HIGH_ADDR_MARK) {
-                       printf("Bad address %lx\n", addr);
-                       return 1;
-               }
-               return 0;
-       }
-
-       if (addr > HIGH_ADDR_MARK) {
-               printf("Bad address %lx\n", addr);
-               return 1;
-       }
-       return 0;
-}
-
-static int validate_lower_address_hint(void)
-{
-       char *ptr;
-
-       ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ |
-                       PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-       if (ptr == MAP_FAILED)
-               return 0;
-
-       return 1;
-}
-
-int main(int argc, char *argv[])
-{
-       char *ptr[NR_CHUNKS_LOW];
-       char *hptr[NR_CHUNKS_HIGH];
-       char *hint;
-       unsigned long i, lchunks, hchunks;
-
-       for (i = 0; i < NR_CHUNKS_LOW; i++) {
-               ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
-                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-               if (ptr[i] == MAP_FAILED) {
-                       if (validate_lower_address_hint())
-                               return 1;
-                       break;
-               }
-
-               if (validate_addr(ptr[i], 0))
-                       return 1;
-       }
-       lchunks = i;
-
-       for (i = 0; i < NR_CHUNKS_HIGH; i++) {
-               hint = hind_addr();
-               hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
-                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-
-               if (hptr[i] == MAP_FAILED)
-                       break;
-
-               if (validate_addr(hptr[i], 1))
-                       return 1;
-       }
-       hchunks = i;
-
-       for (i = 0; i < lchunks; i++)
-               munmap(ptr[i], MAP_CHUNK_SIZE);
-
-       for (i = 0; i < hchunks; i++)
-               munmap(hptr[i], MAP_CHUNK_SIZE);
-
-       return 0;
-}
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c
deleted file mode 100644 (file)
index 40e7956..0000000
+++ /dev/null
@@ -1,151 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <string.h>
-#include <fcntl.h>
-#include "../kselftest.h"
-#include "vm_util.h"
-
-#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
-#define SMAP_FILE_PATH "/proc/self/smaps"
-#define MAX_LINE_LENGTH 500
-
-uint64_t pagemap_get_entry(int fd, char *start)
-{
-       const unsigned long pfn = (unsigned long)start / getpagesize();
-       uint64_t entry;
-       int ret;
-
-       ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry));
-       if (ret != sizeof(entry))
-               ksft_exit_fail_msg("reading pagemap failed\n");
-       return entry;
-}
-
-bool pagemap_is_softdirty(int fd, char *start)
-{
-       uint64_t entry = pagemap_get_entry(fd, start);
-
-       // Check if dirty bit (55th bit) is set
-       return entry & 0x0080000000000000ull;
-}
-
-bool pagemap_is_swapped(int fd, char *start)
-{
-       uint64_t entry = pagemap_get_entry(fd, start);
-
-       return entry & 0x4000000000000000ull;
-}
-
-bool pagemap_is_populated(int fd, char *start)
-{
-       uint64_t entry = pagemap_get_entry(fd, start);
-
-       /* Present or swapped. */
-       return entry & 0xc000000000000000ull;
-}
-
-unsigned long pagemap_get_pfn(int fd, char *start)
-{
-       uint64_t entry = pagemap_get_entry(fd, start);
-
-       /* If present (63th bit), PFN is at bit 0 -- 54. */
-       if (entry & 0x8000000000000000ull)
-               return entry & 0x007fffffffffffffull;
-       return -1ul;
-}
-
-void clear_softdirty(void)
-{
-       int ret;
-       const char *ctrl = "4";
-       int fd = open("/proc/self/clear_refs", O_WRONLY);
-
-       if (fd < 0)
-               ksft_exit_fail_msg("opening clear_refs failed\n");
-       ret = write(fd, ctrl, strlen(ctrl));
-       close(fd);
-       if (ret != strlen(ctrl))
-               ksft_exit_fail_msg("writing clear_refs failed\n");
-}
-
-bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len)
-{
-       while (fgets(buf, len, fp)) {
-               if (!strncmp(buf, pattern, strlen(pattern)))
-                       return true;
-       }
-       return false;
-}
-
-uint64_t read_pmd_pagesize(void)
-{
-       int fd;
-       char buf[20];
-       ssize_t num_read;
-
-       fd = open(PMD_SIZE_FILE_PATH, O_RDONLY);
-       if (fd == -1)
-               ksft_exit_fail_msg("Open hpage_pmd_size failed\n");
-
-       num_read = read(fd, buf, 19);
-       if (num_read < 1) {
-               close(fd);
-               ksft_exit_fail_msg("Read hpage_pmd_size failed\n");
-       }
-       buf[num_read] = '\0';
-       close(fd);
-
-       return strtoul(buf, NULL, 10);
-}
-
-bool __check_huge(void *addr, char *pattern, int nr_hpages,
-                 uint64_t hpage_size)
-{
-       uint64_t thp = -1;
-       int ret;
-       FILE *fp;
-       char buffer[MAX_LINE_LENGTH];
-       char addr_pattern[MAX_LINE_LENGTH];
-
-       ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
-                      (unsigned long) addr);
-       if (ret >= MAX_LINE_LENGTH)
-               ksft_exit_fail_msg("%s: Pattern is too long\n", __func__);
-
-       fp = fopen(SMAP_FILE_PATH, "r");
-       if (!fp)
-               ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH);
-
-       if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
-               goto err_out;
-
-       /*
-        * Fetch the pattern in the same block and check the number of
-        * hugepages.
-        */
-       if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer)))
-               goto err_out;
-
-       snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern);
-
-       if (sscanf(buffer, addr_pattern, &thp) != 1)
-               ksft_exit_fail_msg("Reading smap error\n");
-
-err_out:
-       fclose(fp);
-       return thp == (nr_hpages * (hpage_size >> 10));
-}
-
-bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size)
-{
-       return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size);
-}
-
-bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size)
-{
-       return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size);
-}
-
-bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size)
-{
-       return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size);
-}
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h
deleted file mode 100644 (file)
index 1995ee9..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <stdint.h>
-#include <stdbool.h>
-
-uint64_t pagemap_get_entry(int fd, char *start);
-bool pagemap_is_softdirty(int fd, char *start);
-bool pagemap_is_swapped(int fd, char *start);
-bool pagemap_is_populated(int fd, char *start);
-unsigned long pagemap_get_pfn(int fd, char *start);
-void clear_softdirty(void);
-bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len);
-uint64_t read_pmd_pagesize(void);
-bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size);
-bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size);
-bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size);
diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh
deleted file mode 100644 (file)
index 70a0230..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-set -e
-
-size=$1
-populate=$2
-write=$3
-cgroup=$4
-path=$5
-method=$6
-private=$7
-want_sleep=$8
-reserve=$9
-
-echo "Putting task in cgroup '$cgroup'"
-echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs
-
-echo "Method is $method"
-
-set +e
-./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \
-      "$private" "$want_sleep" "$reserve"
diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c
deleted file mode 100644 (file)
index 6a2caba..0000000
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This program reserves and uses hugetlb memory, supporting a bunch of
- * scenarios needed by the charged_reserved_hugetlb.sh test.
- */
-
-#include <err.h>
-#include <errno.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/shm.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-/* Global definitions. */
-enum method {
-       HUGETLBFS,
-       MMAP_MAP_HUGETLB,
-       SHM,
-       MAX_METHOD
-};
-
-
-/* Global variables. */
-static const char *self;
-static char *shmaddr;
-static int shmid;
-
-/*
- * Show usage and exit.
- */
-static void exit_usage(void)
-{
-       printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> "
-              "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] "
-              "[-o] [-w] [-n]\n",
-              self);
-       exit(EXIT_FAILURE);
-}
-
-void sig_handler(int signo)
-{
-       printf("Received %d.\n", signo);
-       if (signo == SIGINT) {
-               printf("Deleting the memory\n");
-               if (shmdt((const void *)shmaddr) != 0) {
-                       perror("Detach failure");
-                       shmctl(shmid, IPC_RMID, NULL);
-                       exit(4);
-               }
-
-               shmctl(shmid, IPC_RMID, NULL);
-               printf("Done deleting the memory\n");
-       }
-       exit(2);
-}
-
-int main(int argc, char **argv)
-{
-       int fd = 0;
-       int key = 0;
-       int *ptr = NULL;
-       int c = 0;
-       int size = 0;
-       char path[256] = "";
-       enum method method = MAX_METHOD;
-       int want_sleep = 0, private = 0;
-       int populate = 0;
-       int write = 0;
-       int reserve = 1;
-
-       if (signal(SIGINT, sig_handler) == SIG_ERR)
-               err(1, "\ncan't catch SIGINT\n");
-
-       /* Parse command-line arguments. */
-       setvbuf(stdout, NULL, _IONBF, 0);
-       self = argv[0];
-
-       while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
-               switch (c) {
-               case 's':
-                       size = atoi(optarg);
-                       break;
-               case 'p':
-                       strncpy(path, optarg, sizeof(path));
-                       break;
-               case 'm':
-                       if (atoi(optarg) >= MAX_METHOD) {
-                               errno = EINVAL;
-                               perror("Invalid -m.");
-                               exit_usage();
-                       }
-                       method = atoi(optarg);
-                       break;
-               case 'o':
-                       populate = 1;
-                       break;
-               case 'w':
-                       write = 1;
-                       break;
-               case 'l':
-                       want_sleep = 1;
-                       break;
-               case 'r':
-                   private
-                       = 1;
-                       break;
-               case 'n':
-                       reserve = 0;
-                       break;
-               default:
-                       errno = EINVAL;
-                       perror("Invalid arg");
-                       exit_usage();
-               }
-       }
-
-       if (strncmp(path, "", sizeof(path)) != 0) {
-               printf("Writing to this path: %s\n", path);
-       } else {
-               errno = EINVAL;
-               perror("path not found");
-               exit_usage();
-       }
-
-       if (size != 0) {
-               printf("Writing this size: %d\n", size);
-       } else {
-               errno = EINVAL;
-               perror("size not found");
-               exit_usage();
-       }
-
-       if (!populate)
-               printf("Not populating.\n");
-       else
-               printf("Populating.\n");
-
-       if (!write)
-               printf("Not writing to memory.\n");
-
-       if (method == MAX_METHOD) {
-               errno = EINVAL;
-               perror("-m Invalid");
-               exit_usage();
-       } else
-               printf("Using method=%d\n", method);
-
-       if (!private)
-               printf("Shared mapping.\n");
-       else
-               printf("Private mapping.\n");
-
-       if (!reserve)
-               printf("NO_RESERVE mapping.\n");
-       else
-               printf("RESERVE mapping.\n");
-
-       switch (method) {
-       case HUGETLBFS:
-               printf("Allocating using HUGETLBFS.\n");
-               fd = open(path, O_CREAT | O_RDWR, 0777);
-               if (fd == -1)
-                       err(1, "Failed to open file.");
-
-               ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-                          (private ? MAP_PRIVATE : MAP_SHARED) |
-                                  (populate ? MAP_POPULATE : 0) |
-                                  (reserve ? 0 : MAP_NORESERVE),
-                          fd, 0);
-
-               if (ptr == MAP_FAILED) {
-                       close(fd);
-                       err(1, "Error mapping the file");
-               }
-               break;
-       case MMAP_MAP_HUGETLB:
-               printf("Allocating using MAP_HUGETLB.\n");
-               ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
-                          (private ? (MAP_PRIVATE | MAP_ANONYMOUS) :
-                                     MAP_SHARED) |
-                                  MAP_HUGETLB | (populate ? MAP_POPULATE : 0) |
-                                  (reserve ? 0 : MAP_NORESERVE),
-                          -1, 0);
-
-               if (ptr == MAP_FAILED)
-                       err(1, "mmap");
-
-               printf("Returned address is %p\n", ptr);
-               break;
-       case SHM:
-               printf("Allocating using SHM.\n");
-               shmid = shmget(key, size,
-                              SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-               if (shmid < 0) {
-                       shmid = shmget(++key, size,
-                                      SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
-                       if (shmid < 0)
-                               err(1, "shmget");
-               }
-               printf("shmid: 0x%x, shmget key:%d\n", shmid, key);
-
-               ptr = shmat(shmid, NULL, 0);
-               if (ptr == (int *)-1) {
-                       perror("Shared memory attach failure");
-                       shmctl(shmid, IPC_RMID, NULL);
-                       exit(2);
-               }
-               printf("shmaddr: %p\n", ptr);
-
-               break;
-       default:
-               errno = EINVAL;
-               err(1, "Invalid method.");
-       }
-
-       if (write) {
-               printf("Writing to memory.\n");
-               memset(ptr, 1, size);
-       }
-
-       if (want_sleep) {
-               /* Signal to caller that we're done. */
-               printf("DONE\n");
-
-               /* Hold memory until external kill signal is delivered. */
-               while (1)
-                       sleep(100);
-       }
-
-       if (method == HUGETLBFS)
-               close(fd);
-
-       return 0;
-}