]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/commitdiff
Merge branch 'akpm' (patches from Andrew)
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 4 May 2017 00:55:59 +0000 (17:55 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 4 May 2017 00:55:59 +0000 (17:55 -0700)
Merge misc updates from Andrew Morton:

 - a few misc things

 - most of MM

 - KASAN updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (102 commits)
  kasan: separate report parts by empty lines
  kasan: improve double-free report format
  kasan: print page description after stacks
  kasan: improve slab object description
  kasan: change report header
  kasan: simplify address description logic
  kasan: change allocation and freeing stack traces headers
  kasan: unify report headers
  kasan: introduce helper functions for determining bug type
  mm: hwpoison: call shake_page() after try_to_unmap() for mlocked page
  mm: hwpoison: call shake_page() unconditionally
  mm/swapfile.c: fix swap space leak in error path of swap_free_entries()
  mm/gup.c: fix access_ok() argument type
  mm/truncate: avoid pointless cleancache_invalidate_inode() calls.
  mm/truncate: bail out early from invalidate_inode_pages2_range() if mapping is empty
  fs/block_dev: always invalidate cleancache in invalidate_bdev()
  fs: fix data invalidation in the cleancache during direct IO
  zram: reduce load operation in page_same_filled
  zram: use zram_free_page instead of open-coded
  zram: introduce zram data accessor
  ...

79 files changed:
Documentation/cgroup-v2.txt
Documentation/filesystems/proc.txt
Documentation/vm/00-INDEX
Documentation/vm/hugetlbfs_reserv.txt [new file with mode: 0644]
arch/blackfin/mach-bf609/clock.c
drivers/block/zram/zram_drv.c
drivers/block/zram/zram_drv.h
drivers/tty/sysrq.c
fs/block_dev.c
fs/iomap.c
fs/jbd2/journal.c
fs/jbd2/transaction.c
fs/ocfs2/cluster/heartbeat.c
fs/ocfs2/cluster/tcp.c
fs/proc/task_mmu.c
fs/xfs/kmem.c
fs/xfs/kmem.h
fs/xfs/libxfs/xfs_btree.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_trans.c
include/linux/gfp.h
include/linux/jbd2.h
include/linux/ksm.h
include/linux/memcontrol.h
include/linux/migrate.h
include/linux/mm.h
include/linux/mmzone.h
include/linux/rmap.h
include/linux/rodata_test.h
include/linux/sched.h
include/linux/sched/mm.h
include/linux/swap.h
include/linux/vm_event_item.h
kernel/locking/lockdep.c
lib/dma-debug.c
lib/radix-tree.c
mm/Kconfig.debug
mm/compaction.c
mm/filemap.c
mm/gup.c
mm/huge_memory.c
mm/hwpoison-inject.c
mm/internal.h
mm/kasan/kasan.c
mm/kasan/kasan.h
mm/kasan/report.c
mm/khugepaged.c
mm/ksm.c
mm/madvise.c
mm/memcontrol.c
mm/memory-failure.c
mm/memory_hotplug.c
mm/migrate.c
mm/mlock.c
mm/mmap.c
mm/oom_kill.c
mm/page-writeback.c
mm/page_alloc.c
mm/page_ext.c
mm/page_idle.c
mm/page_isolation.c
mm/page_poison.c
mm/rmap.c
mm/rodata_test.c
mm/slab.c
mm/sparse.c
mm/swap.c
mm/swap_slots.c
mm/swap_state.c
mm/swapfile.c
mm/truncate.c
mm/vmscan.c
mm/vmstat.c
mm/workingset.c
scripts/spelling.txt
tools/testing/selftests/vm/Makefile
tools/testing/selftests/vm/run_vmtests
tools/testing/selftests/vm/userfaultfd.c

index 49d7c997fa1ee7f759b5ba319bb57be464f0bd47..e50b95c25868f15f101e97176a8665391b9e71da 100644 (file)
@@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back.
 
                Amount of memory used in network transmission buffers
 
+         shmem
+
+               Amount of cached filesystem data that is swap-backed,
+               such as tmpfs, shm segments, shared anonymous mmap()s
+
          file_mapped
 
                Amount of cached filesystem data mapped with mmap()
index 9036dbf16156fa4a26b3ecd4b9d15e52e3af931b..4cddbce85ac9ad175743f5f4384a32937bf87888 100644 (file)
@@ -413,6 +413,7 @@ Private_Clean:         0 kB
 Private_Dirty:         0 kB
 Referenced:          892 kB
 Anonymous:             0 kB
+LazyFree:              0 kB
 AnonHugePages:         0 kB
 ShmemPmdMapped:        0 kB
 Shared_Hugetlb:        0 kB
@@ -442,6 +443,11 @@ accessed.
 "Anonymous" shows the amount of memory that does not belong to any file.  Even
 a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
 and a page is modified, the file page is replaced by a private anonymous copy.
+"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE).
+The memory isn't freed immediately with madvise(). It's freed under memory
+pressure if the memory is clean. Please note that the printed value might
+be lower than the real value due to optimizations used in the current
+implementation. If this is not desirable please file a bug report.
 "AnonHugePages" shows the ammount of memory backed by transparent hugepage.
 "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by
 huge pages.
index 6a5e2a102a451b186344361603b43bff4c4ea15c..11d3d8dcb449804eadcb2c416406ef00a2dfdcf1 100644 (file)
@@ -12,6 +12,8 @@ highmem.txt
        - Outline of highmem and common issues.
 hugetlbpage.txt
        - a brief summary of hugetlbpage support in the Linux kernel.
+hugetlbfs_reserv.txt
+       - A brief overview of hugetlbfs reservation design/implementation.
 hwpoison.txt
        - explains what hwpoison is
 idle_page_tracking.txt
diff --git a/Documentation/vm/hugetlbfs_reserv.txt b/Documentation/vm/hugetlbfs_reserv.txt
new file mode 100644 (file)
index 0000000..9aca09a
--- /dev/null
@@ -0,0 +1,529 @@
+Hugetlbfs Reservation Overview
+------------------------------
+Huge pages as described at 'Documentation/vm/hugetlbpage.txt' are typically
+preallocated for application use.  These huge pages are instantiated in a
+task's address space at page fault time if the VMA indicates huge pages are
+to be used.  If no huge page exists at page fault time, the task is sent
+a SIGBUS and often dies an unhappy death.  Shortly after huge page support
+was added, it was determined that it would be better to detect a shortage
+of huge pages at mmap() time.  The idea is that if there were not enough
+huge pages to cover the mapping, the mmap() would fail.  This was first
+done with a simple check in the code at mmap() time to determine if there
+were enough free huge pages to cover the mapping.  Like most things in the
+kernel, the code has evolved over time.  However, the basic idea was to
+'reserve' huge pages at mmap() time to ensure that huge pages would be
+available for page faults in that mapping.  The description below attempts to
+describe how huge page reserve processing is done in the v4.10 kernel.
+
+
+Audience
+--------
+This description is primarily targeted at kernel developers who are modifying
+hugetlbfs code.
+
+
+The Data Structures
+-------------------
+resv_huge_pages
+       This is a global (per-hstate) count of reserved huge pages.  Reserved
+       huge pages are only available to the task which reserved them.
+       Therefore, the number of huge pages generally available is computed
+       as (free_huge_pages - resv_huge_pages).
+Reserve Map
+       A reserve map is described by the structure:
+       struct resv_map {
+               struct kref refs;
+               spinlock_t lock;
+               struct list_head regions;
+               long adds_in_progress;
+               struct list_head region_cache;
+               long region_cache_count;
+       };
+       There is one reserve map for each huge page mapping in the system.
+       The regions list within the resv_map describes the regions within
+       the mapping.  A region is described as:
+       struct file_region {
+               struct list_head link;
+               long from;
+               long to;
+       };
+       The 'from' and 'to' fields of the file region structure are huge page
+       indices into the mapping.  Depending on the type of mapping, a
+       region in the resv_map may indicate reservations exist for the
+       range, or reservations do not exist.
+Flags for MAP_PRIVATE Reservations
+       These are stored in the bottom bits of the reservation map pointer.
+       #define HPAGE_RESV_OWNER    (1UL << 0) Indicates this task is the
+               owner of the reservations associated with the mapping.
+       #define HPAGE_RESV_UNMAPPED (1UL << 1) Indicates task originally
+               mapping this range (and creating reserves) has unmapped a
+               page from this task (the child) due to a failed COW.
+Page Flags
+       The PagePrivate page flag is used to indicate that a huge page
+       reservation must be restored when the huge page is freed.  More
+       details will be discussed in the "Freeing Huge Pages" section.
+
+
+Reservation Map Location (Private or Shared)
+--------------------------------------------
+A huge page mapping or segment is either private or shared.  If private,
+it is typically only available to a single address space (task).  If shared,
+it can be mapped into multiple address spaces (tasks).  The location and
+semantics of the reservation map is significantly different for two types
+of mappings.  Location differences are:
+- For private mappings, the reservation map hangs off the VMA structure.
+  Specifically, vma->vm_private_data.  This reserve map is created at the
+  time the mapping (mmap(MAP_PRIVATE)) is created.
+- For shared mappings, the reservation map hangs off the inode.  Specifically,
+  inode->i_mapping->private_data.  Since shared mappings are always backed
+  by files in the hugetlbfs filesystem, the hugetlbfs code ensures each inode
+  contains a reservation map.  As a result, the reservation map is allocated
+  when the inode is created.
+
+
+Creating Reservations
+---------------------
+Reservations are created when a huge page backed shared memory segment is
+created (shmget(SHM_HUGETLB)) or a mapping is created via mmap(MAP_HUGETLB).
+These operations result in a call to the routine hugetlb_reserve_pages()
+
+int hugetlb_reserve_pages(struct inode *inode,
+                                       long from, long to,
+                                       struct vm_area_struct *vma,
+                                       vm_flags_t vm_flags)
+
+The first thing hugetlb_reserve_pages() does is check whether the NORESERVE
+flag was specified in either the shmget() or mmap() call.  If NORESERVE
+was specified, then this routine returns immediately as no reservations
+are desired.
+
+The arguments 'from' and 'to' are huge page indices into the mapping or
+underlying file.  For shmget(), 'from' is always 0 and 'to' corresponds to
+the length of the segment/mapping.  For mmap(), the offset argument could
+be used to specify the offset into the underlying file.  In such a case
+the 'from' and 'to' arguments have been adjusted by this offset.
+
+One of the big differences between PRIVATE and SHARED mappings is the way
+in which reservations are represented in the reservation map.
+- For shared mappings, an entry in the reservation map indicates a reservation
+  exists or did exist for the corresponding page.  As reservations are
+  consumed, the reservation map is not modified.
+- For private mappings, the lack of an entry in the reservation map indicates
+  a reservation exists for the corresponding page.  As reservations are
+  consumed, entries are added to the reservation map.  Therefore, the
+  reservation map can also be used to determine which reservations have
+  been consumed.
+
+For private mappings, hugetlb_reserve_pages() creates the reservation map and
+hangs it off the VMA structure.  In addition, the HPAGE_RESV_OWNER flag is set
+to indicate this VMA owns the reservations.
+
+The reservation map is consulted to determine how many huge page reservations
+are needed for the current mapping/segment.  For private mappings, this is
+always the value (to - from).  However, for shared mappings it is possible that some reservations may already exist within the range (to - from).  See the
+section "Reservation Map Modifications" for details on how this is accomplished.
+
+The mapping may be associated with a subpool.  If so, the subpool is consulted
+to ensure there is sufficient space for the mapping.  It is possible that the
+subpool has set aside reservations that can be used for the mapping.  See the
+section "Subpool Reservations" for more details.
+
+After consulting the reservation map and subpool, the number of needed new
+reservations is known.  The routine hugetlb_acct_memory() is called to check
+for and take the requested number of reservations.  hugetlb_acct_memory()
+calls into routines that potentially allocate and adjust surplus page counts.
+However, within those routines the code is simply checking to ensure there
+are enough free huge pages to accommodate the reservation.  If there are,
+the global reservation count resv_huge_pages is adjusted something like the
+following.
+       if (resv_needed <= (free_huge_pages - resv_huge_pages))
+               resv_huge_pages += resv_needed;
+Note that the global lock hugetlb_lock is held when checking and adjusting
+these counters.
+
+If there were enough free huge pages and the global count resv_huge_pages
+was adjusted, then the reservation map associated with the mapping is
+modified to reflect the reservations.  In the case of a shared mapping, a
+file_region will exist that includes the range 'from' - 'to'.  For private
+mappings, no modifications are made to the reservation map as lack of an
+entry indicates a reservation exists.
+
+If hugetlb_reserve_pages() was successful, the global reservation count and
+reservation map associated with the mapping will be modified as required to
+ensure reservations exist for the range 'from' - 'to'.
+
+
+Consuming Reservations/Allocating a Huge Page
+---------------------------------------------
+Reservations are consumed when huge pages associated with the reservations
+are allocated and instantiated in the corresponding mapping.  The allocation
+is performed within the routine alloc_huge_page().
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                    unsigned long addr, int avoid_reserve)
+alloc_huge_page is passed a VMA pointer and a virtual address, so it can
+consult the reservation map to determine if a reservation exists.  In addition,
+alloc_huge_page takes the argument avoid_reserve which indicates reserves
+should not be used even if it appears they have been set aside for the
+specified address.  The avoid_reserve argument is most often used in the case
+of Copy on Write and Page Migration where additional copies of an existing
+page are being allocated.
+
+The helper routine vma_needs_reservation() is called to determine if a
+reservation exists for the address within the mapping(vma).  See the section
+"Reservation Map Helper Routines" for detailed information on what this
+routine does.  The value returned from vma_needs_reservation() is generally
+0 or 1.  0 if a reservation exists for the address, 1 if no reservation exists.
+If a reservation does not exist, and there is a subpool associated with the
+mapping the subpool is consulted to determine if it contains reservations.
+If the subpool contains reservations, one can be used for this allocation.
+However, in every case the avoid_reserve argument overrides the use of
+a reservation for the allocation.  After determining whether a reservation
+exists and can be used for the allocation, the routine dequeue_huge_page_vma()
+is called.  This routine takes two arguments related to reservations:
+- avoid_reserve, this is the same value/argument passed to alloc_huge_page()
+- chg, even though this argument is of type long only the values 0 or 1 are
+  passed to dequeue_huge_page_vma.  If the value is 0, it indicates a
+  reservation exists (see the section "Reservations and Memory Policy" for
+  possible issues).  If the value is 1, it indicates a reservation does not
+  exist and the page must be taken from the global free pool if possible.
+The free lists associated with the memory policy of the VMA are searched for
+a free page.  If a page is found, the value free_huge_pages is decremented
+when the page is removed from the free list.  If there was a reservation
+associated with the page, the following adjustments are made:
+       SetPagePrivate(page);   /* Indicates allocating this page consumed
+                                * a reservation, and if an error is
+                                * encountered such that the page must be
+                                * freed, the reservation will be restored. */
+       resv_huge_pages--;      /* Decrement the global reservation count */
+Note, if no huge page can be found that satisfies the VMA's memory policy
+an attempt will be made to allocate one using the buddy allocator.  This
+brings up the issue of surplus huge pages and overcommit which is beyond
+the scope of reservations.  Even if a surplus page is allocated, the same
+reservation based adjustments as above will be made: SetPagePrivate(page) and
+resv_huge_pages--.
+
+After obtaining a new huge page, (page)->private is set to the value of
+the subpool associated with the page if it exists.  This will be used for
+subpool accounting when the page is freed.
+
+The routine vma_commit_reservation() is then called to adjust the reserve
+map based on the consumption of the reservation.  In general, this involves
+ensuring the page is represented within a file_region structure of the region
+map.  For shared mappings where the reservation was present, an entry
+in the reserve map already existed so no change is made.  However, if there
+was no reservation in a shared mapping or this was a private mapping a new
+entry must be created.
+
+It is possible that the reserve map could have been changed between the call
+to vma_needs_reservation() at the beginning of alloc_huge_page() and the
+call to vma_commit_reservation() after the page was allocated.  This would
+be possible if hugetlb_reserve_pages was called for the same page in a shared
+mapping.  In such cases, the reservation count and subpool free page count
+will be off by one.  This rare condition can be identified by comparing the
+return value from vma_needs_reservation and vma_commit_reservation.  If such
+a race is detected, the subpool and global reserve counts are adjusted to
+compensate.  See the section "Reservation Map Helper Routines" for more
+information on these routines.
+
+
+Instantiate Huge Pages
+----------------------
+After huge page allocation, the page is typically added to the page tables
+of the allocating task.  Before this, pages in a shared mapping are added
+to the page cache and pages in private mappings are added to an anonymous
+reverse mapping.  In both cases, the PagePrivate flag is cleared.  Therefore,
+when a huge page that has been instantiated is freed no adjustment is made
+to the global reservation count (resv_huge_pages).
+
+
+Freeing Huge Pages
+------------------
+Huge page freeing is performed by the routine free_huge_page().  This routine
+is the destructor for hugetlbfs compound pages.  As a result, it is only
+passed a pointer to the page struct.  When a huge page is freed, reservation
+accounting may need to be performed.  This would be the case if the page was
+associated with a subpool that contained reserves, or the page is being freed
+on an error path where a global reserve count must be restored.
+
+The page->private field points to any subpool associated with the page.
+If the PagePrivate flag is set, it indicates the global reserve count should
+be adjusted (see the section "Consuming Reservations/Allocating a Huge Page"
+for information on how these are set).
+
+The routine first calls hugepage_subpool_put_pages() for the page.  If this
+routine returns a value of 0 (which does not equal the value passed 1) it
+indicates reserves are associated with the subpool, and this newly free page
+must be used to keep the number of subpool reserves above the minimum size.
+Therefore, the global resv_huge_pages counter is incremented in this case.
+
+If the PagePrivate flag was set in the page, the global resv_huge_pages counter
+will always be incremented.
+
+
+Subpool Reservations
+--------------------
+There is a struct hstate associated with each huge page size.  The hstate
+tracks all huge pages of the specified size.  A subpool represents a subset
+of pages within a hstate that is associated with a mounted hugetlbfs
+filesystem.
+
+When a hugetlbfs filesystem is mounted a min_size option can be specified
+which indicates the minimum number of huge pages required by the filesystem.
+If this option is specified, the number of huge pages corresponding to
+min_size are reserved for use by the filesystem.  This number is tracked in
+the min_hpages field of a struct hugepage_subpool.  At mount time,
+hugetlb_acct_memory(min_hpages) is called to reserve the specified number of
+huge pages.  If they can not be reserved, the mount fails.
+
+The routines hugepage_subpool_get/put_pages() are called when pages are
+obtained from or released back to a subpool.  They perform all subpool
+accounting, and track any reservations associated with the subpool.
+hugepage_subpool_get/put_pages are passed the number of huge pages by which
+to adjust the subpool 'used page' count (down for get, up for put).  Normally,
+they return the same value that was passed or an error if not enough pages
+exist in the subpool.
+
+However, if reserves are associated with the subpool a return value less
+than the passed value may be returned.  This return value indicates the
+number of additional global pool adjustments which must be made.  For example,
+suppose a subpool contains 3 reserved huge pages and someone asks for 5.
+The 3 reserved pages associated with the subpool can be used to satisfy part
+of the request.  But, 2 pages must be obtained from the global pools.  To
+relay this information to the caller, the value 2 is returned.  The caller
+is then responsible for attempting to obtain the additional two pages from
+the global pools.
+
+
+COW and Reservations
+--------------------
+Since shared mappings all point to and use the same underlying pages, the
+biggest reservation concern for COW is private mappings.  In this case,
+two tasks can be pointing at the same previously allocated page.  One task
+attempts to write to the page, so a new page must be allocated so that each
+task points to its own page.
+
+When the page was originally allocated, the reservation for that page was
+consumed.  When an attempt to allocate a new page is made as a result of
+COW, it is possible that no huge pages are free and the allocation
+will fail.
+
+When the private mapping was originally created, the owner of the mapping
+was noted by setting the HPAGE_RESV_OWNER bit in the pointer to the reservation
+map of the owner.  Since the owner created the mapping, the owner owns all
+the reservations associated with the mapping.  Therefore, when a write fault
+occurs and there is no page available, different action is taken for the owner
+and non-owner of the reservation.
+
+In the case where the faulting task is not the owner, the fault will fail and
+the task will typically receive a SIGBUS.
+
+If the owner is the faulting task, we want it to succeed since it owned the
+original reservation.  To accomplish this, the page is unmapped from the
+non-owning task.  In this way, the only reference is from the owning task.
+In addition, the HPAGE_RESV_UNMAPPED bit is set in the reservation map pointer
+of the non-owning task.  The non-owning task may receive a SIGBUS if it later
+faults on a non-present page.  But, the original owner of the
+mapping/reservation will behave as expected.
+
+
+Reservation Map Modifications
+-----------------------------
+The following low level routines are used to make modifications to a
+reservation map.  Typically, these routines are not called directly.  Rather,
+a reservation map helper routine is called which calls one of these low level
+routines.  These low level routines are fairly well documented in the source
+code (mm/hugetlb.c).  These routines are:
+long region_chg(struct resv_map *resv, long f, long t);
+long region_add(struct resv_map *resv, long f, long t);
+void region_abort(struct resv_map *resv, long f, long t);
+long region_count(struct resv_map *resv, long f, long t);
+
+Operations on the reservation map typically involve two operations:
+1) region_chg() is called to examine the reserve map and determine how
+   many pages in the specified range [f, t) are NOT currently represented.
+
+   The calling code performs global checks and allocations to determine if
+   there are enough huge pages for the operation to succeed.
+
+2a) If the operation can succeed, region_add() is called to actually modify
+    the reservation map for the same range [f, t) previously passed to
+    region_chg().
+2b) If the operation can not succeed, region_abort is called for the same range
+    [f, t) to abort the operation.
+
+Note that this is a two step process where region_add() and region_abort()
+are guaranteed to succeed after a prior call to region_chg() for the same
+range.  region_chg() is responsible for pre-allocating any data structures
+necessary to ensure the subsequent operations (specifically region_add())
+will succeed.
+
+As mentioned above, region_chg() determines the number of pages in the range
+which are NOT currently represented in the map.  This number is returned to
+the caller.  region_add() returns the number of pages in the range added to
+the map.  In most cases, the return value of region_add() is the same as the
+return value of region_chg().  However, in the case of shared mappings it is
+possible for changes to the reservation map to be made between the calls to
+region_chg() and region_add().  In this case, the return value of region_add()
+will not match the return value of region_chg().  It is likely that in such
+cases global counts and subpool accounting will be incorrect and in need of
+adjustment.  It is the responsibility of the caller to check for this condition
+and make the appropriate adjustments.
+
+The routine region_del() is called to remove regions from a reservation map.
+It is typically called in the following situations:
+- When a file in the hugetlbfs filesystem is being removed, the inode will
+  be released and the reservation map freed.  Before freeing the reservation
+  map, all the individual file_region structures must be freed.  In this case
+  region_del is passed the range [0, LONG_MAX).
+- When a hugetlbfs file is being truncated.  In this case, all allocated pages
+  after the new file size must be freed.  In addition, any file_region entries
+  in the reservation map past the new end of file must be deleted.  In this
+  case, region_del is passed the range [new_end_of_file, LONG_MAX).
+- When a hole is being punched in a hugetlbfs file.  In this case, huge pages
+  are removed from the middle of the file one at a time.  As the pages are
+  removed, region_del() is called to remove the corresponding entry from the
+  reservation map.  In this case, region_del is passed the range
+  [page_idx, page_idx + 1).
+In every case, region_del() will return the number of pages removed from the
+reservation map.  In VERY rare cases, region_del() can fail.  This can only
+happen in the hole punch case where it has to split an existing file_region
+entry and can not allocate a new structure.  In this error case, region_del()
+will return -ENOMEM.  The problem here is that the reservation map will
+indicate that there is a reservation for the page.  However, the subpool and
+global reservation counts will not reflect the reservation.  To handle this
+situation, the routine hugetlb_fix_reserve_counts() is called to adjust the
+counters so that they correspond with the reservation map entry that could
+not be deleted.
+
+region_count() is called when unmapping a private huge page mapping.  In
+private mappings, the lack of an entry in the reservation map indicates that
+a reservation exists.  Therefore, by counting the number of entries in the
+reservation map we know how many reservations were consumed and how many are
+outstanding (outstanding = (end - start) - region_count(resv, start, end)).
+Since the mapping is going away, the subpool and global reservation counts
+are decremented by the number of outstanding reservations.
+
+
+Reservation Map Helper Routines
+-------------------------------
+Several helper routines exist to query and modify the reservation maps.
+These routines are only interested in reservations for a specific huge
+page, so they just pass in an address instead of a range.  In addition,
+they pass in the associated VMA.  From the VMA, the type of mapping (private
+or shared) and the location of the reservation map (inode or VMA) can be
+determined.  These routines simply call the underlying routines described
+in the section "Reservation Map Modifications".  However, they do take into
+account the 'opposite' meaning of reservation map entries for private and
+shared mappings and hide this detail from the caller.
+
+long vma_needs_reservation(struct hstate *h,
+                               struct vm_area_struct *vma, unsigned long addr)
+This routine calls region_chg() for the specified page.  If no reservation
+exists, 1 is returned.  If a reservation exists, 0 is returned.
+
+long vma_commit_reservation(struct hstate *h,
+                               struct vm_area_struct *vma, unsigned long addr)
+This calls region_add() for the specified page.  As in the case of region_chg
+and region_add, this routine is to be called after a previous call to
+vma_needs_reservation.  It will add a reservation entry for the page.  It
+returns 1 if the reservation was added and 0 if not.  The return value should
+be compared with the return value of the previous call to
+vma_needs_reservation.  An unexpected difference indicates the reservation
+map was modified between calls.
+
+void vma_end_reservation(struct hstate *h,
+                               struct vm_area_struct *vma, unsigned long addr)
+This calls region_abort() for the specified page.  As in the case of region_chg
+and region_abort, this routine is to be called after a previous call to
+vma_needs_reservation.  It will abort/end the in progress reservation add
+operation.
+
+long vma_add_reservation(struct hstate *h,
+                               struct vm_area_struct *vma, unsigned long addr)
+This is a special wrapper routine to help facilitate reservation cleanup
+on error paths.  It is only called from the routine restore_reserve_on_error().
+This routine is used in conjunction with vma_needs_reservation in an attempt
+to add a reservation to the reservation map.  It takes into account the
+different reservation map semantics for private and shared mappings.  Hence,
+region_add is called for shared mappings (as an entry present in the map
+indicates a reservation), and region_del is called for private mappings (as
+the absence of an entry in the map indicates a reservation).  See the section
+"Reservation Cleanup in Error Paths" for more information on what needs to
+be done on error paths.
+
+
+Reservation Cleanup in Error Paths
+----------------------------------
+As mentioned in the section "Reservation Map Helper Routines", reservation
+map modifications are performed in two steps.  First vma_needs_reservation
+is called before a page is allocated.  If the allocation is successful,
+then vma_commit_reservation is called.  If not, vma_end_reservation is called.
+Global and subpool reservation counts are adjusted based on success or failure
+of the operation and all is well.
+
+Additionally, after a huge page is instantiated the PagePrivate flag is
+cleared so that accounting when the page is ultimately freed is correct.
+
+However, there are several instances where errors are encountered after a huge
+page is allocated but before it is instantiated.  In this case, the page
+allocation has consumed the reservation and made the appropriate subpool,
+reservation map and global count adjustments.  If the page is freed at this
+time (before instantiation and clearing of PagePrivate), then free_huge_page
+will increment the global reservation count.  However, the reservation map
+indicates the reservation was consumed.  This resulting inconsistent state
+will cause the 'leak' of a reserved huge page.  The global reserve count will
+be higher than it should be and prevent allocation of a pre-allocated page.
+
+The routine restore_reserve_on_error() attempts to handle this situation.  It
+is fairly well documented.  The intention of this routine is to restore
+the reservation map to the way it was before the page allocation.   In this
+way, the state of the reservation map will correspond to the global reservation
+count after the page is freed.
+
+The routine restore_reserve_on_error itself may encounter errors while
+attempting to restore the reservation map entry.  In this case, it will
+simply clear the PagePrivate flag of the page.  In this way, the global
+reserve count will not be incremented when the page is freed.  However, the
+reservation map will continue to look as though the reservation was consumed.
+A page can still be allocated for the address, but it will not use a reserved
+page as originally intended.
+
+There is some code (most notably userfaultfd) which can not call
+restore_reserve_on_error.  In this case, it simply modifies the PagePrivate
+flag so that a reservation will not be leaked when the huge page is freed.
+
+
+Reservations and Memory Policy
+------------------------------
+Per-node huge page lists existed in struct hstate when git was first used
+to manage Linux code.  The concept of reservations was added some time later.
+When reservations were added, no attempt was made to take memory policy
+into account.  While cpusets are not exactly the same as memory policy, this
+comment in hugetlb_acct_memory sums up the interaction between reservations
+and cpusets/memory policy.
+       /*
+        * When cpuset is configured, it breaks the strict hugetlb page
+        * reservation as the accounting is done on a global variable. Such
+        * reservation is completely rubbish in the presence of cpuset because
+        * the reservation is not checked against page availability for the
+        * current cpuset. Application can still potentially OOM'ed by kernel
+        * with lack of free htlb page in cpuset that the task is in.
+        * Attempt to enforce strict accounting with cpuset is almost
+        * impossible (or too ugly) because cpuset is too fluid that
+        * task or memory node can be dynamically moved between cpusets.
+        *
+        * The change of semantics for shared hugetlb mapping with cpuset is
+        * undesirable. However, in order to preserve some of the semantics,
+        * we fall back to check against current free page availability as
+        * a best attempt and hopefully to minimize the impact of changing
+        * semantics that cpuset has.
+        */
+
+Huge page reservations were added to prevent unexpected page allocation
+failures (OOM) at page fault time.  However, if an application makes use
+of cpusets or memory policy there is no guarantee that huge pages will be
+available on the required nodes.  This is true even if there are a sufficient
+number of global reservations.
+
+
+Mike Kravetz, 7 April 2017
index 378305844b2c9ec14d2f20197216519eb8249010..392a59b9a504f3d1f56d7ffc311283376b4440f1 100644 (file)
@@ -97,6 +97,9 @@ EXPORT_SYMBOL(clk_enable);
 
 void clk_disable(struct clk *clk)
 {
+       if (!clk)
+               return;
+
        if (clk->ops && clk->ops->disable)
                clk->ops->disable(clk);
 }
index 6fac5fedd6107b8b86bd1cef612c0f34899c5d8e..debee952dcc18a9a8cb4217d728dcdfde29e6b1c 100644 (file)
@@ -45,6 +45,8 @@ static const char *default_compressor = "lzo";
 /* Module params (documentation at end) */
 static unsigned int num_devices = 1;
 
+static void zram_free_page(struct zram *zram, size_t index);
+
 static inline bool init_done(struct zram *zram)
 {
        return zram->disksize;
@@ -55,53 +57,70 @@ static inline struct zram *dev_to_zram(struct device *dev)
        return (struct zram *)dev_to_disk(dev)->private_data;
 }
 
+static unsigned long zram_get_handle(struct zram *zram, u32 index)
+{
+       return zram->table[index].handle;
+}
+
+static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+{
+       zram->table[index].handle = handle;
+}
+
 /* flag operations require table entry bit_spin_lock() being held */
-static int zram_test_flag(struct zram_meta *meta, u32 index,
+static int zram_test_flag(struct zram *zram, u32 index,
                        enum zram_pageflags flag)
 {
-       return meta->table[index].value & BIT(flag);
+       return zram->table[index].value & BIT(flag);
 }
 
-static void zram_set_flag(struct zram_meta *meta, u32 index,
+static void zram_set_flag(struct zram *zram, u32 index,
                        enum zram_pageflags flag)
 {
-       meta->table[index].value |= BIT(flag);
+       zram->table[index].value |= BIT(flag);
 }
 
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
+static void zram_clear_flag(struct zram *zram, u32 index,
                        enum zram_pageflags flag)
 {
-       meta->table[index].value &= ~BIT(flag);
+       zram->table[index].value &= ~BIT(flag);
 }
 
-static inline void zram_set_element(struct zram_meta *meta, u32 index,
+static inline void zram_set_element(struct zram *zram, u32 index,
                        unsigned long element)
 {
-       meta->table[index].element = element;
+       zram->table[index].element = element;
 }
 
-static inline void zram_clear_element(struct zram_meta *meta, u32 index)
+static unsigned long zram_get_element(struct zram *zram, u32 index)
 {
-       meta->table[index].element = 0;
+       return zram->table[index].element;
 }
 
-static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+static size_t zram_get_obj_size(struct zram *zram, u32 index)
 {
-       return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+       return zram->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
 }
 
-static void zram_set_obj_size(struct zram_meta *meta,
+static void zram_set_obj_size(struct zram *zram,
                                        u32 index, size_t size)
 {
-       unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+       unsigned long flags = zram->table[index].value >> ZRAM_FLAG_SHIFT;
 
-       meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+       zram->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
 }
 
+#if PAGE_SIZE != 4096
 static inline bool is_partial_io(struct bio_vec *bvec)
 {
        return bvec->bv_len != PAGE_SIZE;
 }
+#else
+static inline bool is_partial_io(struct bio_vec *bvec)
+{
+       return false;
+}
+#endif
 
 static void zram_revalidate_disk(struct zram *zram)
 {
@@ -137,8 +156,7 @@ static inline bool valid_io_request(struct zram *zram,
 
 static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
 {
-       if (*offset + bvec->bv_len >= PAGE_SIZE)
-               (*index)++;
+       *index  += (*offset + bvec->bv_len) / PAGE_SIZE;
        *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
 }
 
@@ -177,31 +195,21 @@ static bool page_same_filled(void *ptr, unsigned long *element)
 {
        unsigned int pos;
        unsigned long *page;
+       unsigned long val;
 
        page = (unsigned long *)ptr;
+       val = page[0];
 
-       for (pos = 0; pos < PAGE_SIZE / sizeof(*page) - 1; pos++) {
-               if (page[pos] != page[pos + 1])
+       for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
+               if (val != page[pos])
                        return false;
        }
 
-       *element = page[pos];
+       *element = val;
 
        return true;
 }
 
-static void handle_same_page(struct bio_vec *bvec, unsigned long element)
-{
-       struct page *page = bvec->bv_page;
-       void *user_mem;
-
-       user_mem = kmap_atomic(page);
-       zram_fill_page(user_mem + bvec->bv_offset, bvec->bv_len, element);
-       kunmap_atomic(user_mem);
-
-       flush_dcache_page(page);
-}
-
 static ssize_t initstate_show(struct device *dev,
                struct device_attribute *attr, char *buf)
 {
@@ -254,9 +262,8 @@ static ssize_t mem_used_max_store(struct device *dev,
 
        down_read(&zram->init_lock);
        if (init_done(zram)) {
-               struct zram_meta *meta = zram->meta;
                atomic_long_set(&zram->stats.max_used_pages,
-                               zs_get_total_pages(meta->mem_pool));
+                               zs_get_total_pages(zram->mem_pool));
        }
        up_read(&zram->init_lock);
 
@@ -329,7 +336,6 @@ static ssize_t compact_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t len)
 {
        struct zram *zram = dev_to_zram(dev);
-       struct zram_meta *meta;
 
        down_read(&zram->init_lock);
        if (!init_done(zram)) {
@@ -337,8 +343,7 @@ static ssize_t compact_store(struct device *dev,
                return -EINVAL;
        }
 
-       meta = zram->meta;
-       zs_compact(meta->mem_pool);
+       zs_compact(zram->mem_pool);
        up_read(&zram->init_lock);
 
        return len;
@@ -375,8 +380,8 @@ static ssize_t mm_stat_show(struct device *dev,
 
        down_read(&zram->init_lock);
        if (init_done(zram)) {
-               mem_used = zs_get_total_pages(zram->meta->mem_pool);
-               zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+               mem_used = zs_get_total_pages(zram->mem_pool);
+               zs_pool_stats(zram->mem_pool, &pool_stats);
        }
 
        orig_size = atomic64_read(&zram->stats.pages_stored);
@@ -417,56 +422,89 @@ static DEVICE_ATTR_RO(io_stat);
 static DEVICE_ATTR_RO(mm_stat);
 static DEVICE_ATTR_RO(debug_stat);
 
-static void zram_meta_free(struct zram_meta *meta, u64 disksize)
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+       bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+       bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value);
+}
+
+static bool zram_same_page_read(struct zram *zram, u32 index,
+                               struct page *page,
+                               unsigned int offset, unsigned int len)
+{
+       zram_slot_lock(zram, index);
+       if (unlikely(!zram_get_handle(zram, index) ||
+                       zram_test_flag(zram, index, ZRAM_SAME))) {
+               void *mem;
+
+               zram_slot_unlock(zram, index);
+               mem = kmap_atomic(page);
+               zram_fill_page(mem + offset, len,
+                                       zram_get_element(zram, index));
+               kunmap_atomic(mem);
+               return true;
+       }
+       zram_slot_unlock(zram, index);
+
+       return false;
+}
+
+static bool zram_same_page_write(struct zram *zram, u32 index,
+                                       struct page *page)
+{
+       unsigned long element;
+       void *mem = kmap_atomic(page);
+
+       if (page_same_filled(mem, &element)) {
+               kunmap_atomic(mem);
+               /* Free memory associated with this sector now. */
+               zram_slot_lock(zram, index);
+               zram_free_page(zram, index);
+               zram_set_flag(zram, index, ZRAM_SAME);
+               zram_set_element(zram, index, element);
+               zram_slot_unlock(zram, index);
+
+               atomic64_inc(&zram->stats.same_pages);
+               return true;
+       }
+       kunmap_atomic(mem);
+
+       return false;
+}
+
+static void zram_meta_free(struct zram *zram, u64 disksize)
 {
        size_t num_pages = disksize >> PAGE_SHIFT;
        size_t index;
 
        /* Free all pages that are still in this zram device */
-       for (index = 0; index < num_pages; index++) {
-               unsigned long handle = meta->table[index].handle;
-               /*
-                * No memory is allocated for same element filled pages.
-                * Simply clear same page flag.
-                */
-               if (!handle || zram_test_flag(meta, index, ZRAM_SAME))
-                       continue;
-
-               zs_free(meta->mem_pool, handle);
-       }
+       for (index = 0; index < num_pages; index++)
+               zram_free_page(zram, index);
 
-       zs_destroy_pool(meta->mem_pool);
-       vfree(meta->table);
-       kfree(meta);
+       zs_destroy_pool(zram->mem_pool);
+       vfree(zram->table);
 }
 
-static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 {
        size_t num_pages;
-       struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
-
-       if (!meta)
-               return NULL;
 
        num_pages = disksize >> PAGE_SHIFT;
-       meta->table = vzalloc(num_pages * sizeof(*meta->table));
-       if (!meta->table) {
-               pr_err("Error allocating zram address table\n");
-               goto out_error;
-       }
+       zram->table = vzalloc(num_pages * sizeof(*zram->table));
+       if (!zram->table)
+               return false;
 
-       meta->mem_pool = zs_create_pool(pool_name);
-       if (!meta->mem_pool) {
-               pr_err("Error creating memory pool\n");
-               goto out_error;
+       zram->mem_pool = zs_create_pool(zram->disk->disk_name);
+       if (!zram->mem_pool) {
+               vfree(zram->table);
+               return false;
        }
 
-       return meta;
-
-out_error:
-       vfree(meta->table);
-       kfree(meta);
-       return NULL;
+       return true;
 }
 
 /*
@@ -476,16 +514,15 @@ out_error:
  */
 static void zram_free_page(struct zram *zram, size_t index)
 {
-       struct zram_meta *meta = zram->meta;
-       unsigned long handle = meta->table[index].handle;
+       unsigned long handle = zram_get_handle(zram, index);
 
        /*
         * No memory is allocated for same element filled pages.
         * Simply clear same page flag.
         */
-       if (zram_test_flag(meta, index, ZRAM_SAME)) {
-               zram_clear_flag(meta, index, ZRAM_SAME);
-               zram_clear_element(meta, index);
+       if (zram_test_flag(zram, index, ZRAM_SAME)) {
+               zram_clear_flag(zram, index, ZRAM_SAME);
+               zram_set_element(zram, index, 0);
                atomic64_dec(&zram->stats.same_pages);
                return;
        }
@@ -493,179 +530,111 @@ static void zram_free_page(struct zram *zram, size_t index)
        if (!handle)
                return;
 
-       zs_free(meta->mem_pool, handle);
+       zs_free(zram->mem_pool, handle);
 
-       atomic64_sub(zram_get_obj_size(meta, index),
+       atomic64_sub(zram_get_obj_size(zram, index),
                        &zram->stats.compr_data_size);
        atomic64_dec(&zram->stats.pages_stored);
 
-       meta->table[index].handle = 0;
-       zram_set_obj_size(meta, index, 0);
+       zram_set_handle(zram, index, 0);
+       zram_set_obj_size(zram, index, 0);
 }
 
-static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
 {
-       int ret = 0;
-       unsigned char *cmem;
-       struct zram_meta *meta = zram->meta;
+       int ret;
        unsigned long handle;
        unsigned int size;
+       void *src, *dst;
 
-       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-       handle = meta->table[index].handle;
-       size = zram_get_obj_size(meta, index);
-
-       if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) {
-               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-               zram_fill_page(mem, PAGE_SIZE, meta->table[index].element);
+       if (zram_same_page_read(zram, index, page, 0, PAGE_SIZE))
                return 0;
-       }
 
-       cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+       zram_slot_lock(zram, index);
+       handle = zram_get_handle(zram, index);
+       size = zram_get_obj_size(zram, index);
+
+       src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
        if (size == PAGE_SIZE) {
-               memcpy(mem, cmem, PAGE_SIZE);
+               dst = kmap_atomic(page);
+               memcpy(dst, src, PAGE_SIZE);
+               kunmap_atomic(dst);
+               ret = 0;
        } else {
                struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
 
-               ret = zcomp_decompress(zstrm, cmem, size, mem);
+               dst = kmap_atomic(page);
+               ret = zcomp_decompress(zstrm, src, size, dst);
+               kunmap_atomic(dst);
                zcomp_stream_put(zram->comp);
        }
-       zs_unmap_object(meta->mem_pool, handle);
-       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+       zs_unmap_object(zram->mem_pool, handle);
+       zram_slot_unlock(zram, index);
 
        /* Should NEVER happen. Return bio error if it does. */
-       if (unlikely(ret)) {
+       if (unlikely(ret))
                pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-               return ret;
-       }
 
-       return 0;
+       return ret;
 }
 
 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
-                         u32 index, int offset)
+                               u32 index, int offset)
 {
        int ret;
        struct page *page;
-       unsigned char *user_mem, *uncmem = NULL;
-       struct zram_meta *meta = zram->meta;
-       page = bvec->bv_page;
 
-       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-       if (unlikely(!meta->table[index].handle) ||
-                       zram_test_flag(meta, index, ZRAM_SAME)) {
-               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-               handle_same_page(bvec, meta->table[index].element);
-               return 0;
+       page = bvec->bv_page;
+       if (is_partial_io(bvec)) {
+               /* Use a temporary buffer to decompress the page */
+               page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+               if (!page)
+                       return -ENOMEM;
        }
-       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
 
-       if (is_partial_io(bvec))
-               /* Use  a temporary buffer to decompress the page */
-               uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+       ret = zram_decompress_page(zram, page, index);
+       if (unlikely(ret))
+               goto out;
 
-       user_mem = kmap_atomic(page);
-       if (!is_partial_io(bvec))
-               uncmem = user_mem;
+       if (is_partial_io(bvec)) {
+               void *dst = kmap_atomic(bvec->bv_page);
+               void *src = kmap_atomic(page);
 
-       if (!uncmem) {
-               pr_err("Unable to allocate temp memory\n");
-               ret = -ENOMEM;
-               goto out_cleanup;
+               memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len);
+               kunmap_atomic(src);
+               kunmap_atomic(dst);
        }
-
-       ret = zram_decompress_page(zram, uncmem, index);
-       /* Should NEVER happen. Return bio error if it does. */
-       if (unlikely(ret))
-               goto out_cleanup;
-
+out:
        if (is_partial_io(bvec))
-               memcpy(user_mem + bvec->bv_offset, uncmem + offset,
-                               bvec->bv_len);
+               __free_page(page);
 
-       flush_dcache_page(page);
-       ret = 0;
-out_cleanup:
-       kunmap_atomic(user_mem);
-       if (is_partial_io(bvec))
-               kfree(uncmem);
        return ret;
 }
 
-static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
-                          int offset)
+static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
+                       struct page *page,
+                       unsigned long *out_handle, unsigned int *out_comp_len)
 {
-       int ret = 0;
-       unsigned int clen;
-       unsigned long handle = 0;
-       struct page *page;
-       unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
-       struct zram_meta *meta = zram->meta;
-       struct zcomp_strm *zstrm = NULL;
+       int ret;
+       unsigned int comp_len;
+       void *src;
        unsigned long alloced_pages;
-       unsigned long element;
-
-       page = bvec->bv_page;
-       if (is_partial_io(bvec)) {
-               /*
-                * This is a partial IO. We need to read the full page
-                * before to write the changes.
-                */
-               uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-               if (!uncmem) {
-                       ret = -ENOMEM;
-                       goto out;
-               }
-               ret = zram_decompress_page(zram, uncmem, index);
-               if (ret)
-                       goto out;
-       }
+       unsigned long handle = 0;
 
 compress_again:
-       user_mem = kmap_atomic(page);
-       if (is_partial_io(bvec)) {
-               memcpy(uncmem + offset, user_mem + bvec->bv_offset,
-                      bvec->bv_len);
-               kunmap_atomic(user_mem);
-               user_mem = NULL;
-       } else {
-               uncmem = user_mem;
-       }
-
-       if (page_same_filled(uncmem, &element)) {
-               if (user_mem)
-                       kunmap_atomic(user_mem);
-               /* Free memory associated with this sector now. */
-               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
-               zram_free_page(zram, index);
-               zram_set_flag(meta, index, ZRAM_SAME);
-               zram_set_element(meta, index, element);
-               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
-
-               atomic64_inc(&zram->stats.same_pages);
-               ret = 0;
-               goto out;
-       }
-
-       zstrm = zcomp_stream_get(zram->comp);
-       ret = zcomp_compress(zstrm, uncmem, &clen);
-       if (!is_partial_io(bvec)) {
-               kunmap_atomic(user_mem);
-               user_mem = NULL;
-               uncmem = NULL;
-       }
+       src = kmap_atomic(page);
+       ret = zcomp_compress(*zstrm, src, &comp_len);
+       kunmap_atomic(src);
 
        if (unlikely(ret)) {
                pr_err("Compression failed! err=%d\n", ret);
-               goto out;
+               if (handle)
+                       zs_free(zram->mem_pool, handle);
+               return ret;
        }
 
-       src = zstrm->buffer;
-       if (unlikely(clen > max_zpage_size)) {
-               clen = PAGE_SIZE;
-               if (is_partial_io(bvec))
-                       src = uncmem;
-       }
+       if (unlikely(comp_len > max_zpage_size))
+               comp_len = PAGE_SIZE;
 
        /*
         * handle allocation has 2 paths:
@@ -681,71 +650,121 @@ compress_again:
         * from the slow path and handle has already been allocated.
         */
        if (!handle)
-               handle = zs_malloc(meta->mem_pool, clen,
+               handle = zs_malloc(zram->mem_pool, comp_len,
                                __GFP_KSWAPD_RECLAIM |
                                __GFP_NOWARN |
                                __GFP_HIGHMEM |
                                __GFP_MOVABLE);
        if (!handle) {
                zcomp_stream_put(zram->comp);
-               zstrm = NULL;
-
                atomic64_inc(&zram->stats.writestall);
-
-               handle = zs_malloc(meta->mem_pool, clen,
+               handle = zs_malloc(zram->mem_pool, comp_len,
                                GFP_NOIO | __GFP_HIGHMEM |
                                __GFP_MOVABLE);
+               *zstrm = zcomp_stream_get(zram->comp);
                if (handle)
                        goto compress_again;
-
-               pr_err("Error allocating memory for compressed page: %u, size=%u\n",
-                       index, clen);
-               ret = -ENOMEM;
-               goto out;
+               return -ENOMEM;
        }
 
-       alloced_pages = zs_get_total_pages(meta->mem_pool);
+       alloced_pages = zs_get_total_pages(zram->mem_pool);
        update_used_max(zram, alloced_pages);
 
        if (zram->limit_pages && alloced_pages > zram->limit_pages) {
-               zs_free(meta->mem_pool, handle);
-               ret = -ENOMEM;
-               goto out;
+               zs_free(zram->mem_pool, handle);
+               return -ENOMEM;
        }
 
-       cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+       *out_handle = handle;
+       *out_comp_len = comp_len;
+       return 0;
+}
+
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
+{
+       int ret;
+       unsigned long handle;
+       unsigned int comp_len;
+       void *src, *dst;
+       struct zcomp_strm *zstrm;
+       struct page *page = bvec->bv_page;
 
-       if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+       if (zram_same_page_write(zram, index, page))
+               return 0;
+
+       zstrm = zcomp_stream_get(zram->comp);
+       ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
+       if (ret) {
+               zcomp_stream_put(zram->comp);
+               return ret;
+       }
+
+       dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+
+       src = zstrm->buffer;
+       if (comp_len == PAGE_SIZE)
                src = kmap_atomic(page);
-               memcpy(cmem, src, PAGE_SIZE);
+       memcpy(dst, src, comp_len);
+       if (comp_len == PAGE_SIZE)
                kunmap_atomic(src);
-       } else {
-               memcpy(cmem, src, clen);
-       }
 
        zcomp_stream_put(zram->comp);
-       zstrm = NULL;
-       zs_unmap_object(meta->mem_pool, handle);
+       zs_unmap_object(zram->mem_pool, handle);
 
        /*
         * Free memory associated with this sector
         * before overwriting unused sectors.
         */
-       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+       zram_slot_lock(zram, index);
        zram_free_page(zram, index);
-
-       meta->table[index].handle = handle;
-       zram_set_obj_size(meta, index, clen);
-       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+       zram_set_handle(zram, index, handle);
+       zram_set_obj_size(zram, index, comp_len);
+       zram_slot_unlock(zram, index);
 
        /* Update stats */
-       atomic64_add(clen, &zram->stats.compr_data_size);
+       atomic64_add(comp_len, &zram->stats.compr_data_size);
        atomic64_inc(&zram->stats.pages_stored);
+       return 0;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+                               u32 index, int offset)
+{
+       int ret;
+       struct page *page = NULL;
+       void *src;
+       struct bio_vec vec;
+
+       vec = *bvec;
+       if (is_partial_io(bvec)) {
+               void *dst;
+               /*
+                * This is a partial IO. We need to read the full page
+                * before writing the changes.
+                */
+               page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+               if (!page)
+                       return -ENOMEM;
+
+               ret = zram_decompress_page(zram, page, index);
+               if (ret)
+                       goto out;
+
+               src = kmap_atomic(bvec->bv_page);
+               dst = kmap_atomic(page);
+               memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len);
+               kunmap_atomic(dst);
+               kunmap_atomic(src);
+
+               vec.bv_page = page;
+               vec.bv_len = PAGE_SIZE;
+               vec.bv_offset = 0;
+       }
+
+       ret = __zram_bvec_write(zram, &vec, index);
 out:
-       if (zstrm)
-               zcomp_stream_put(zram->comp);
        if (is_partial_io(bvec))
-               kfree(uncmem);
+               __free_page(page);
        return ret;
 }
 
@@ -758,7 +777,6 @@ static void zram_bio_discard(struct zram *zram, u32 index,
                             int offset, struct bio *bio)
 {
        size_t n = bio->bi_iter.bi_size;
-       struct zram_meta *meta = zram->meta;
 
        /*
         * zram manages data in physical block size units. Because logical block
@@ -779,9 +797,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
        }
 
        while (n >= PAGE_SIZE) {
-               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+               zram_slot_lock(zram, index);
                zram_free_page(zram, index);
-               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+               zram_slot_unlock(zram, index);
                atomic64_inc(&zram->stats.notify_free);
                index++;
                n -= PAGE_SIZE;
@@ -801,6 +819,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
        if (!is_write) {
                atomic64_inc(&zram->stats.num_reads);
                ret = zram_bvec_read(zram, bvec, index, offset);
+               flush_dcache_page(bvec->bv_page);
        } else {
                atomic64_inc(&zram->stats.num_writes);
                ret = zram_bvec_write(zram, bvec, index, offset);
@@ -840,34 +859,21 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
        }
 
        bio_for_each_segment(bvec, bio, iter) {
-               int max_transfer_size = PAGE_SIZE - offset;
-
-               if (bvec.bv_len > max_transfer_size) {
-                       /*
-                        * zram_bvec_rw() can only make operation on a single
-                        * zram page. Split the bio vector.
-                        */
-                       struct bio_vec bv;
-
-                       bv.bv_page = bvec.bv_page;
-                       bv.bv_len = max_transfer_size;
-                       bv.bv_offset = bvec.bv_offset;
+               struct bio_vec bv = bvec;
+               unsigned int unwritten = bvec.bv_len;
 
+               do {
+                       bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
+                                                       unwritten);
                        if (zram_bvec_rw(zram, &bv, index, offset,
-                                        op_is_write(bio_op(bio))) < 0)
+                                       op_is_write(bio_op(bio))) < 0)
                                goto out;
 
-                       bv.bv_len = bvec.bv_len - max_transfer_size;
-                       bv.bv_offset += max_transfer_size;
-                       if (zram_bvec_rw(zram, &bv, index + 1, 0,
-                                        op_is_write(bio_op(bio))) < 0)
-                               goto out;
-               } else
-                       if (zram_bvec_rw(zram, &bvec, index, offset,
-                                        op_is_write(bio_op(bio))) < 0)
-                               goto out;
+                       bv.bv_offset += bv.bv_len;
+                       unwritten -= bv.bv_len;
 
-               update_position(&index, &offset, &bvec);
+                       update_position(&index, &offset, &bv);
+               } while (unwritten);
        }
 
        bio_endio(bio);
@@ -884,8 +890,6 @@ static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio)
 {
        struct zram *zram = queue->queuedata;
 
-       blk_queue_split(queue, &bio, queue->bio_split);
-
        if (!valid_io_request(zram, bio->bi_iter.bi_sector,
                                        bio->bi_iter.bi_size)) {
                atomic64_inc(&zram->stats.invalid_io);
@@ -904,14 +908,12 @@ static void zram_slot_free_notify(struct block_device *bdev,
                                unsigned long index)
 {
        struct zram *zram;
-       struct zram_meta *meta;
 
        zram = bdev->bd_disk->private_data;
-       meta = zram->meta;
 
-       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+       zram_slot_lock(zram, index);
        zram_free_page(zram, index);
-       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+       zram_slot_unlock(zram, index);
        atomic64_inc(&zram->stats.notify_free);
 }
 
@@ -955,7 +957,6 @@ out:
 
 static void zram_reset_device(struct zram *zram)
 {
-       struct zram_meta *meta;
        struct zcomp *comp;
        u64 disksize;
 
@@ -968,12 +969,8 @@ static void zram_reset_device(struct zram *zram)
                return;
        }
 
-       meta = zram->meta;
        comp = zram->comp;
        disksize = zram->disksize;
-
-       /* Reset stats */
-       memset(&zram->stats, 0, sizeof(zram->stats));
        zram->disksize = 0;
 
        set_capacity(zram->disk, 0);
@@ -981,7 +978,8 @@ static void zram_reset_device(struct zram *zram)
 
        up_write(&zram->init_lock);
        /* I/O operation under all of CPU are done so let's free */
-       zram_meta_free(meta, disksize);
+       zram_meta_free(zram, disksize);
+       memset(&zram->stats, 0, sizeof(zram->stats));
        zcomp_destroy(comp);
 }
 
@@ -990,7 +988,6 @@ static ssize_t disksize_store(struct device *dev,
 {
        u64 disksize;
        struct zcomp *comp;
-       struct zram_meta *meta;
        struct zram *zram = dev_to_zram(dev);
        int err;
 
@@ -998,10 +995,18 @@ static ssize_t disksize_store(struct device *dev,
        if (!disksize)
                return -EINVAL;
 
+       down_write(&zram->init_lock);
+       if (init_done(zram)) {
+               pr_info("Cannot change disksize for initialized device\n");
+               err = -EBUSY;
+               goto out_unlock;
+       }
+
        disksize = PAGE_ALIGN(disksize);
-       meta = zram_meta_alloc(zram->disk->disk_name, disksize);
-       if (!meta)
-               return -ENOMEM;
+       if (!zram_meta_alloc(zram, disksize)) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
 
        comp = zcomp_create(zram->compressor);
        if (IS_ERR(comp)) {
@@ -1011,14 +1016,6 @@ static ssize_t disksize_store(struct device *dev,
                goto out_free_meta;
        }
 
-       down_write(&zram->init_lock);
-       if (init_done(zram)) {
-               pr_info("Cannot change disksize for initialized device\n");
-               err = -EBUSY;
-               goto out_destroy_comp;
-       }
-
-       zram->meta = meta;
        zram->comp = comp;
        zram->disksize = disksize;
        set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
@@ -1027,11 +1024,10 @@ static ssize_t disksize_store(struct device *dev,
 
        return len;
 
-out_destroy_comp:
-       up_write(&zram->init_lock);
-       zcomp_destroy(comp);
 out_free_meta:
-       zram_meta_free(meta, disksize);
+       zram_meta_free(zram, disksize);
+out_unlock:
+       up_write(&zram->init_lock);
        return err;
 }
 
@@ -1193,8 +1189,6 @@ static int zram_add(void)
        blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
        blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
        zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
-       zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
-       zram->disk->queue->limits.chunk_sectors = 0;
        blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
 
@@ -1219,7 +1213,6 @@ static int zram_add(void)
                goto out_free_disk;
        }
        strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
-       zram->meta = NULL;
 
        pr_info("Added device: %s\n", zram->disk->disk_name);
        return device_id;
index caeff51f1571af0957112bc28fe020f522503f94..e34e44d02e3ec63d58ee7f9c9b6b3694c74482a0 100644 (file)
@@ -92,13 +92,9 @@ struct zram_stats {
        atomic64_t writestall;          /* no. of write slow paths */
 };
 
-struct zram_meta {
+struct zram {
        struct zram_table_entry *table;
        struct zs_pool *mem_pool;
-};
-
-struct zram {
-       struct zram_meta *meta;
        struct zcomp *comp;
        struct gendisk *disk;
        /* Prevent concurrent execution of device init */
index 677f0ddc986c11005dc9469dc0fb3adf96d1f5a5..3ffc1ce29023b5f6476d0006aa94b8b1b858c480 100644 (file)
@@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored)
 
        mutex_lock(&oom_lock);
        if (!out_of_memory(&oc))
-               pr_info("OOM request ignored because killer is disabled\n");
+               pr_info("OOM request ignored. No task eligible\n");
        mutex_unlock(&oom_lock);
 }
 
index 9ccabe3bb7de1635ba3cac78751db68bb5ead93e..0d435c794d760b7530945e3b82cf40f8cfcc7fb7 100644 (file)
@@ -103,12 +103,11 @@ void invalidate_bdev(struct block_device *bdev)
 {
        struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-       if (mapping->nrpages == 0)
-               return;
-
-       invalidate_bh_lrus();
-       lru_add_drain_all();    /* make sure all lru add caches are flushed */
-       invalidate_mapping_pages(mapping, 0, -1);
+       if (mapping->nrpages) {
+               invalidate_bh_lrus();
+               lru_add_drain_all();    /* make sure all lru add caches are flushed */
+               invalidate_mapping_pages(mapping, 0, -1);
+       }
        /* 99% of the time, we don't need to flush the cleancache on the bdev.
         * But, for the strange corners, lets be cautious
         */
index 141c3cd55a8b2d974f431d7710fbe4de58f78355..1c25ae30500e675b0ae69905feff7d0040e0e3cd 100644 (file)
@@ -887,16 +887,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                flags |= IOMAP_WRITE;
        }
 
-       if (mapping->nrpages) {
-               ret = filemap_write_and_wait_range(mapping, start, end);
-               if (ret)
-                       goto out_free_dio;
+       ret = filemap_write_and_wait_range(mapping, start, end);
+       if (ret)
+               goto out_free_dio;
 
-               ret = invalidate_inode_pages2_range(mapping,
-                               start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-               WARN_ON_ONCE(ret);
-               ret = 0;
-       }
+       ret = invalidate_inode_pages2_range(mapping,
+                       start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+       WARN_ON_ONCE(ret);
+       ret = 0;
 
        inode_dio_begin(inode);
 
@@ -951,7 +949,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         * one is a pretty crazy thing to do, so we don't support it 100%.  If
         * this invalidation fails, tough, the write still worked...
         */
-       if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+       if (iov_iter_rw(iter) == WRITE) {
                int err = invalidate_inode_pages2_range(mapping,
                                start >> PAGE_SHIFT, end >> PAGE_SHIFT);
                WARN_ON_ONCE(err);
index 5adc2fb62b0fab89899e5d0acba1e8019a73c766..c43fe83ee708fbc079ab692515a878d872b8df9c 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bitops.h>
 #include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -205,6 +206,14 @@ static int kjournald2(void *arg)
        journal->j_task = current;
        wake_up(&journal->j_wait_done_commit);
 
+       /*
+        * Make sure that no allocations from this kernel thread will ever
+        * recurse to the fs layer because we are responsible for the
+        * transaction commit and any fs involvement might get stuck waiting for
+        * the transaction commit.
+        */
+       memalloc_nofs_save();
+
        /*
         * And now, wait forever for commit wakeup events.
         */
index 5e659ee08d6ae84046b9b8d59f41641e9ef28b22..9ee4832b6f8b3664430e31bcf3775b412e1b81d8 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/jbd2.h>
 
@@ -388,6 +389,11 @@ repeat:
 
        rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
        jbd2_journal_free_transaction(new_transaction);
+       /*
+        * Ensure that no allocations done while the transaction is open are
+        * going to recurse back to the fs layer.
+        */
+       handle->saved_alloc_context = memalloc_nofs_save();
        return 0;
 }
 
@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
        trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
                                handle->h_transaction->t_tid, type,
                                line_no, nblocks);
+
        return handle;
 }
 EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
        if (handle->h_rsv_handle)
                jbd2_journal_free_reserved(handle->h_rsv_handle);
 free_and_exit:
+       /*
+        * Scope of the GFP_NOFS context is over here and so we can restore the
+        * original alloc context.
+        */
+       memalloc_nofs_restore(handle->saved_alloc_context);
        jbd2_free_handle(handle);
        return err;
 }
index f6e871760f8d97265f4974c159dca7a7b6439b7f..0da0332725aafcf253707f7ca59c9385cd6b4fb4 100644 (file)
@@ -2242,13 +2242,13 @@ unlock:
        spin_unlock(&o2hb_live_lock);
 }
 
-static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
                char *page)
 {
        return sprintf(page, "%u\n", o2hb_dead_threshold);
 }
 
-static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
                const char *page, size_t count)
 {
        unsigned long tmp;
@@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
 
 }
 
-CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
 CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
 
 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
-       &o2hb_heartbeat_group_attr_threshold,
+       &o2hb_heartbeat_group_attr_dead_threshold,
        &o2hb_heartbeat_group_attr_mode,
        NULL,
 };
index 5b51c31c892d54e36c74de081fddb0375a26acf5..8d779227370ab1d121fdc2fc6f546f7844e95a5b 100644 (file)
@@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
        INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
        INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
 
-       init_timer(&sc->sc_idle_timeout);
-       sc->sc_idle_timeout.function = o2net_idle_timer;
-       sc->sc_idle_timeout.data = (unsigned long)sc;
+       setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
+                   (unsigned long)sc);
 
        sclog(sc, "alloced\n");
 
@@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
                mutex_lock(&sc->sc_send_lock);
                ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
                                                 virt_to_page(kmalloced_virt),
-                                                (long)kmalloced_virt & ~PAGE_MASK,
+                                                offset_in_page(kmalloced_virt),
                                                 size, MSG_DONTWAIT);
                mutex_unlock(&sc->sc_send_lock);
                if (ret == size)
index 312578089544dbd652aac303d0cbabee8fbcb968..f0c8b33d99b137e0c607448237216c2826df3fe8 100644 (file)
@@ -441,6 +441,7 @@ struct mem_size_stats {
        unsigned long private_dirty;
        unsigned long referenced;
        unsigned long anonymous;
+       unsigned long lazyfree;
        unsigned long anonymous_thp;
        unsigned long shmem_thp;
        unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
        int i, nr = compound ? 1 << compound_order(page) : 1;
        unsigned long size = nr * PAGE_SIZE;
 
-       if (PageAnon(page))
+       if (PageAnon(page)) {
                mss->anonymous += size;
+               if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+                       mss->lazyfree += size;
+       }
 
        mss->resident += size;
        /* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   "Private_Dirty:  %8lu kB\n"
                   "Referenced:     %8lu kB\n"
                   "Anonymous:      %8lu kB\n"
+                  "LazyFree:       %8lu kB\n"
                   "AnonHugePages:  %8lu kB\n"
                   "ShmemPmdMapped: %8lu kB\n"
                   "Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   mss.private_dirty >> 10,
                   mss.referenced >> 10,
                   mss.anonymous >> 10,
+                  mss.lazyfree >> 10,
                   mss.anonymous_thp >> 10,
                   mss.shmem_thp >> 10,
                   mss.shared_hugetlb >> 10,
index 70a5b55e0870a0523c0dd8ce629debf2fccebe25..780fc8986dabd163058747e6b71e3aae90b7e67c 100644 (file)
@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
-       unsigned noio_flag = 0;
+       unsigned nofs_flag = 0;
        void    *ptr;
        gfp_t   lflags;
 
@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
         * __vmalloc() will allocate data pages and auxillary structures (e.g.
         * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
         * here. Hence we need to tell memory reclaim that we are in such a
-        * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+        * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
         * the filesystem here and potentially deadlocking.
         */
-       if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-               noio_flag = memalloc_noio_save();
+       if (flags & KM_NOFS)
+               nofs_flag = memalloc_nofs_save();
 
        lflags = kmem_flags_convert(flags);
        ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
 
-       if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-               memalloc_noio_restore(noio_flag);
+       if (flags & KM_NOFS)
+               memalloc_nofs_restore(nofs_flag);
 
        return ptr;
 }
index f0fc84fcaac2553283f90bc3f157b924bd03d932..d6ea520162b26530bbb560b667d925b6c6690b63 100644 (file)
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
                lflags = GFP_ATOMIC | __GFP_NOWARN;
        } else {
                lflags = GFP_KERNEL | __GFP_NOWARN;
-               if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+               if (flags & KM_NOFS)
                        lflags &= ~__GFP_FS;
        }
 
index c3decedc94557e14c3cb7fa7eee81d09e54f4c9e..3059a3ec7ecbf9b6260ef977366540bbd21c9e70 100644 (file)
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
        struct xfs_btree_split_args     *args = container_of(work,
                                                struct xfs_btree_split_args, work);
        unsigned long           pflags;
-       unsigned long           new_pflags = PF_FSTRANS;
+       unsigned long           new_pflags = PF_MEMALLOC_NOFS;
 
        /*
         * we are in a transaction context here, but may also be doing work
index 61494295d92fe1acb7d343bc3a4e1594f09027ab..05eca126c688cfaa606ada5d161379f73b734fcf 100644 (file)
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
         * We hand off the transaction to the completion thread now, so
         * clear the flag here.
         */
-       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        return 0;
 }
 
@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
         * thus we need to mark ourselves as being in a transaction manually.
         * Similarly for freeze protection.
         */
-       current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
        /* we abort the update if there was an IO error */
@@ -1016,7 +1016,7 @@ xfs_do_writepage(
         * Given that we do not allow direct reclaim to call us, we should
         * never be called while in a filesystem transaction.
         */
-       if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+       if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
                goto redirty;
 
        /*
index b6208728ba39767bbb2898880fbb515abcc5ab4f..ca09061369cb539510ecd6cb7dfdd05a986671e7 100644 (file)
@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
                bp->b_addr = NULL;
        } else {
                int retried = 0;
-               unsigned noio_flag;
+               unsigned nofs_flag;
 
                /*
                 * vm_map_ram() will allocate auxillary structures (e.g.
                 * pagetables) with GFP_KERNEL, yet we are likely to be under
                 * GFP_NOFS context here. Hence we need to tell memory reclaim
-                * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+                * that we are in such a context via PF_MEMALLOC_NOFS to prevent
                 * memory reclaim re-entering the filesystem here and
                 * potentially deadlocking.
                 */
-               noio_flag = memalloc_noio_save();
+               nofs_flag = memalloc_nofs_save();
                do {
                        bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
                                                -1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
                                break;
                        vm_unmap_aliases();
                } while (retried++ <= 1);
-               memalloc_noio_restore(noio_flag);
+               memalloc_nofs_restore(nofs_flag);
 
                if (!bp->b_addr)
                        return -ENOMEM;
index 70f42ea86dfbd4a7de39d0709ee4a90e59f0a758..f5969c8274fc6a76d491779a249d4839ba830dd3 100644 (file)
@@ -134,7 +134,7 @@ xfs_trans_reserve(
        bool            rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
        /* Mark this thread as being in a transaction */
-       current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
        /*
         * Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
        if (blocks > 0) {
                error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
                if (error != 0) {
-                       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+                       current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
                        return -ENOSPC;
                }
                tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
                tp->t_blk_res = 0;
        }
 
-       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
        return error;
 }
@@ -914,7 +914,7 @@ __xfs_trans_commit(
 
        xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
 
-       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        xfs_trans_free(tp);
 
        /*
@@ -944,7 +944,7 @@ out_unreserve:
                if (commit_lsn == -1 && !error)
                        error = -EIO;
        }
-       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
        xfs_trans_free(tp);
 
@@ -998,7 +998,7 @@ xfs_trans_cancel(
                xfs_log_done(mp, tp->t_ticket, NULL, false);
 
        /* mark this thread as no longer being in a transaction */
-       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
        xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
        xfs_trans_free(tp);
index db373b9d322361f7553cfbe36026d918fb9d09ee..2b1a44f5bdb60e6a28d874630f71dec7a0f4c40b 100644 (file)
@@ -40,6 +40,11 @@ struct vm_area_struct;
 #define ___GFP_DIRECT_RECLAIM  0x400000u
 #define ___GFP_WRITE           0x800000u
 #define ___GFP_KSWAPD_RECLAIM  0x1000000u
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP       0x2000000u      /* bit 25: within __GFP_BITS_MASK */
+#else
+#define ___GFP_NOLOCKDEP       0
+#endif
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
@@ -179,8 +184,11 @@ struct vm_area_struct;
 #define __GFP_NOTRACK  ((__force gfp_t)___GFP_NOTRACK)
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 25
+#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /*
@@ -202,8 +210,16 @@ struct vm_area_struct;
  *
  * GFP_NOIO will use direct reclaim to discard clean pages or slab pages
  *   that do not require the starting of any physical IO.
+ *   Please try to avoid using this flag directly and instead use
+ *   memalloc_noio_{save,restore} to mark the whole scope which cannot
+ *   perform any IO with a short explanation why. All allocation requests
+ *   will inherit GFP_NOIO implicitly.
  *
  * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ *   Please try to avoid using this flag directly and instead use
+ *   memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+ *   recurse into the FS layer with a short explanation why. All allocation
+ *   requests will inherit GFP_NOFS implicitly.
  *
  * GFP_USER is for userspace allocations that also need to be directly
  *   accessibly by the kernel or hardware. It is typically used by hardware
@@ -297,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 
 /*
  * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
- * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
- * and there are 16 of them to cover all possible combinations of
+ * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
+ * bits long and there are 16 of them to cover all possible combinations of
  * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
  *
  * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
index dfaa1f4dcb0c54911c16a8683a38dabbfb4dcb0f..606b6bce3a5bb062d12b35d84bf8a022cbe0abe8 100644 (file)
@@ -491,6 +491,8 @@ struct jbd2_journal_handle
 
        unsigned long           h_start_jiffies;
        unsigned int            h_requested_credits;
+
+       unsigned int            saved_alloc_context;
 };
 
 
index e1cfda4bee588d726e2cfe9089ccc20baa031864..78b44a024eaae8e9a9a0e2c448b4b3e2b7f48719 100644 (file)
@@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page,
 struct page *ksm_might_need_to_copy(struct page *page,
                        struct vm_area_struct *vma, unsigned long address);
 
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 
 #else  /* !CONFIG_KSM */
@@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page,
        return 0;
 }
 
-static inline int rmap_walk_ksm(struct page *page,
+static inline void rmap_walk_ksm(struct page *page,
                        struct rmap_walk_control *rwc)
 {
-       return 0;
 }
 
 static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
index bb7250c45cb8356b03df2d4b9f1325dc6e30069b..899949bbb2f9362182e7fd28dc34833cca7de78e 100644 (file)
@@ -35,48 +35,43 @@ struct page;
 struct mm_struct;
 struct kmem_cache;
 
-/*
- * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
- * These two lists should keep in accord with each other.
- */
-enum mem_cgroup_stat_index {
-       /*
-        * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
-        */
-       MEM_CGROUP_STAT_CACHE,          /* # of pages charged as cache */
-       MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
-       MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
-       MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
-       MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
-       MEM_CGROUP_STAT_WRITEBACK,      /* # of pages under writeback */
-       MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
-       MEM_CGROUP_STAT_NSTATS,
-       /* default hierarchy stats */
-       MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
+/* Cgroup-specific page state, on top of universal node page state */
+enum memcg_stat_item {
+       MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
+       MEMCG_RSS,
+       MEMCG_RSS_HUGE,
+       MEMCG_SWAP,
+       MEMCG_SOCK,
+       /* XXX: why are these zone and not node counters? */
+       MEMCG_KERNEL_STACK_KB,
        MEMCG_SLAB_RECLAIMABLE,
        MEMCG_SLAB_UNRECLAIMABLE,
-       MEMCG_SOCK,
        MEMCG_NR_STAT,
 };
 
+/* Cgroup-specific events, on top of universal VM events */
+enum memcg_event_item {
+       MEMCG_LOW = NR_VM_EVENT_ITEMS,
+       MEMCG_HIGH,
+       MEMCG_MAX,
+       MEMCG_OOM,
+       MEMCG_NR_EVENTS,
+};
+
 struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        int priority;
        unsigned int generation;
 };
 
-enum mem_cgroup_events_index {
-       MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
-       MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
-       MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
-       MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
-       MEM_CGROUP_EVENTS_NSTATS,
-       /* default hierarchy events */
-       MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS,
-       MEMCG_HIGH,
-       MEMCG_MAX,
-       MEMCG_OOM,
-       MEMCG_NR_EVENTS,
+#ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT    16
+#define MEM_CGROUP_ID_MAX      USHRT_MAX
+
+struct mem_cgroup_id {
+       int id;
+       atomic_t ref;
 };
 
 /*
@@ -92,16 +87,6 @@ enum mem_cgroup_events_target {
        MEM_CGROUP_NTARGETS,
 };
 
-#ifdef CONFIG_MEMCG
-
-#define MEM_CGROUP_ID_SHIFT    16
-#define MEM_CGROUP_ID_MAX      USHRT_MAX
-
-struct mem_cgroup_id {
-       int id;
-       atomic_t ref;
-};
-
 struct mem_cgroup_stat_cpu {
        long count[MEMCG_NR_STAT];
        unsigned long events[MEMCG_NR_EVENTS];
@@ -283,17 +268,10 @@ static inline bool mem_cgroup_disabled(void)
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-static inline void mem_cgroup_events(struct mem_cgroup *memcg,
-                      enum mem_cgroup_events_index idx,
-                      unsigned int nr)
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+                                   enum memcg_event_item event)
 {
-       this_cpu_add(memcg->stat->events[idx], nr);
+       this_cpu_inc(memcg->stat->events[event]);
        cgroup_file_notify(&memcg->events_file);
 }
 
@@ -494,8 +472,42 @@ extern int do_swap_account;
 void lock_page_memcg(struct page *page);
 void unlock_page_memcg(struct page *page);
 
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+                                            enum memcg_stat_item idx)
+{
+       long val = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               val += per_cpu(memcg->stat->count[idx], cpu);
+
+       if (val < 0)
+               val = 0;
+
+       return val;
+}
+
+static inline void mod_memcg_state(struct mem_cgroup *memcg,
+                                  enum memcg_stat_item idx, int val)
+{
+       if (!mem_cgroup_disabled())
+               this_cpu_add(memcg->stat->count[idx], val);
+}
+
+static inline void inc_memcg_state(struct mem_cgroup *memcg,
+                                  enum memcg_stat_item idx)
+{
+       mod_memcg_state(memcg, idx, 1);
+}
+
+static inline void dec_memcg_state(struct mem_cgroup *memcg,
+                                  enum memcg_stat_item idx)
+{
+       mod_memcg_state(memcg, idx, -1);
+}
+
 /**
- * mem_cgroup_update_page_stat - update page state statistics
+ * mod_memcg_page_state - update page state statistics
  * @page: the page
  * @idx: page state item to account
  * @val: number of pages (positive or negative)
@@ -506,28 +518,28 @@ void unlock_page_memcg(struct page *page);
  *
  *   lock_page(page) or lock_page_memcg(page)
  *   if (TestClearPageState(page))
- *     mem_cgroup_update_page_stat(page, state, -1);
+ *     mod_memcg_page_state(page, state, -1);
  *   unlock_page(page) or unlock_page_memcg(page)
+ *
+ * Kernel pages are an exception to this, since they'll never move.
  */
-static inline void mem_cgroup_update_page_stat(struct page *page,
-                                enum mem_cgroup_stat_index idx, int val)
+static inline void mod_memcg_page_state(struct page *page,
+                                       enum memcg_stat_item idx, int val)
 {
-       VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page)));
-
        if (page->mem_cgroup)
-               this_cpu_add(page->mem_cgroup->stat->count[idx], val);
+               mod_memcg_state(page->mem_cgroup, idx, val);
 }
 
-static inline void mem_cgroup_inc_page_stat(struct page *page,
-                                           enum mem_cgroup_stat_index idx)
+static inline void inc_memcg_page_state(struct page *page,
+                                       enum memcg_stat_item idx)
 {
-       mem_cgroup_update_page_stat(page, idx, 1);
+       mod_memcg_page_state(page, idx, 1);
 }
 
-static inline void mem_cgroup_dec_page_stat(struct page *page,
-                                           enum mem_cgroup_stat_index idx)
+static inline void dec_memcg_page_state(struct page *page,
+                                       enum memcg_stat_item idx)
 {
-       mem_cgroup_update_page_stat(page, idx, -1);
+       mod_memcg_page_state(page, idx, -1);
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
@@ -544,20 +556,8 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
 
        rcu_read_lock();
        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-       if (unlikely(!memcg))
-               goto out;
-
-       switch (idx) {
-       case PGFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-               break;
-       case PGMAJFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-               break;
-       default:
-               BUG();
-       }
-out:
+       if (likely(memcg))
+               this_cpu_inc(memcg->stat->events[idx]);
        rcu_read_unlock();
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -576,9 +576,8 @@ static inline bool mem_cgroup_disabled(void)
        return true;
 }
 
-static inline void mem_cgroup_events(struct mem_cgroup *memcg,
-                                    enum mem_cgroup_events_index idx,
-                                    unsigned int nr)
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+                                   enum memcg_event_item event)
 {
 }
 
@@ -740,19 +739,41 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
        return false;
 }
 
-static inline void mem_cgroup_update_page_stat(struct page *page,
-                                              enum mem_cgroup_stat_index idx,
-                                              int nr)
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
+                                            enum memcg_stat_item idx)
+{
+       return 0;
+}
+
+static inline void mod_memcg_state(struct mem_cgroup *memcg,
+                                  enum memcg_stat_item idx,
+                                  int nr)
+{
+}
+
+static inline void inc_memcg_state(struct mem_cgroup *memcg,
+                                  enum memcg_stat_item idx)
+{
+}
+
+static inline void dec_memcg_state(struct mem_cgroup *memcg,
+                                  enum memcg_stat_item idx)
+{
+}
+
+static inline void mod_memcg_page_state(struct page *page,
+                                       enum memcg_stat_item idx,
+                                       int nr)
 {
 }
 
-static inline void mem_cgroup_inc_page_stat(struct page *page,
-                                           enum mem_cgroup_stat_index idx)
+static inline void inc_memcg_page_state(struct page *page,
+                                       enum memcg_stat_item idx)
 {
 }
 
-static inline void mem_cgroup_dec_page_stat(struct page *page,
-                                           enum mem_cgroup_stat_index idx)
+static inline void dec_memcg_page_state(struct page *page,
+                                       enum memcg_stat_item idx)
 {
 }
 
@@ -872,7 +893,7 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
  * @val: number of pages (positive or negative)
  */
 static inline void memcg_kmem_update_page_stat(struct page *page,
-                               enum mem_cgroup_stat_index idx, int val)
+                               enum memcg_stat_item idx, int val)
 {
        if (memcg_kmem_enabled() && page->mem_cgroup)
                this_cpu_add(page->mem_cgroup->stat->count[idx], val);
@@ -901,7 +922,7 @@ static inline void memcg_put_cache_ids(void)
 }
 
 static inline void memcg_kmem_update_page_stat(struct page *page,
-                               enum mem_cgroup_stat_index idx, int val)
+                               enum memcg_stat_item idx, int val)
 {
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
index fa76b516fa473bdfd803d09e0206923153613d65..48e24844b3c5074c8bf8c26dcf2be11a32c657e0 100644 (file)
@@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES];
 #ifdef CONFIG_MIGRATION
 
 extern void putback_movable_pages(struct list_head *l);
-extern int migrate_page(struct address_space *,
-                       struct page *, struct page *, enum migrate_mode);
+extern int migrate_page(struct address_space *mapping,
+                       struct page *newpage, struct page *page,
+                       enum migrate_mode mode);
 extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
                unsigned long private, enum migrate_mode mode, int reason);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
index 695da2a19b4cbb355810b88df2095b6c3e242801..5d22e69f51ea6ff5f7d3fffef5239fd84bf60d6c 100644 (file)
@@ -2487,7 +2487,6 @@ extern long copy_huge_page_from_user(struct page *dst_page,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
 
 extern struct page_ext_operations debug_guardpage_ops;
-extern struct page_ext_operations page_poisoning_ops;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 extern unsigned int _debug_guardpage_minorder;
index 8e02b3750fe0f6e18afeb8cfc096705f23405728..e0c3c5e3d8a0e507c2703716ec4f5ef9b2bdd6dd 100644 (file)
@@ -35,7 +35,7 @@
  */
 #define PAGE_ALLOC_COSTLY_ORDER 3
 
-enum {
+enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
@@ -149,7 +149,6 @@ enum node_stat_item {
        NR_UNEVICTABLE,         /*  "     "     "   "       "         */
        NR_ISOLATED_ANON,       /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
-       NR_PAGES_SCANNED,       /* pages scanned since last reclaim */
        WORKINGSET_REFAULT,
        WORKINGSET_ACTIVATE,
        WORKINGSET_NODERECLAIM,
@@ -226,6 +225,8 @@ struct lruvec {
        struct zone_reclaim_stat        reclaim_stat;
        /* Evictions & activations on the inactive file list */
        atomic_long_t                   inactive_age;
+       /* Refaults at the time of last reclaim cycle */
+       unsigned long                   refaults;
 #ifdef CONFIG_MEMCG
        struct pglist_data *pgdat;
 #endif
@@ -630,6 +631,8 @@ typedef struct pglist_data {
        int kswapd_order;
        enum zone_type kswapd_classzone_idx;
 
+       int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
+
 #ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_classzone_idx;
index 8c89e902df3e7ad35ecbff3009e2d88d06026b2a..43ef2c30cb0f59f83c05d23eea4a8aad87edefd1 100644 (file)
@@ -83,19 +83,17 @@ struct anon_vma_chain {
 };
 
 enum ttu_flags {
-       TTU_UNMAP = 1,                  /* unmap mode */
-       TTU_MIGRATION = 2,              /* migration mode */
-       TTU_MUNLOCK = 4,                /* munlock mode */
-       TTU_LZFREE = 8,                 /* lazy free mode */
-       TTU_SPLIT_HUGE_PMD = 16,        /* split huge PMD if any */
-
-       TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
-       TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
-       TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
-       TTU_BATCH_FLUSH = (1 << 11),    /* Batch TLB flushes where possible
+       TTU_MIGRATION           = 0x1,  /* migration mode */
+       TTU_MUNLOCK             = 0x2,  /* munlock mode */
+
+       TTU_SPLIT_HUGE_PMD      = 0x4,  /* split huge PMD if any */
+       TTU_IGNORE_MLOCK        = 0x8,  /* ignore mlock */
+       TTU_IGNORE_ACCESS       = 0x10, /* don't age */
+       TTU_IGNORE_HWPOISON     = 0x20, /* corrupted page is recoverable */
+       TTU_BATCH_FLUSH         = 0x40, /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
-       TTU_RMAP_LOCKED = (1 << 12)     /* do not grab rmap lock:
+       TTU_RMAP_LOCKED         = 0x80  /* do not grab rmap lock:
                                         * caller holds it */
 };
 
@@ -193,9 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound)
 int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);
 
-#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
-
-int try_to_unmap(struct page *, enum ttu_flags flags);
+bool try_to_unmap(struct page *, enum ttu_flags flags);
 
 /* Avoid racy checks */
 #define PVMW_SYNC              (1 << 0)
@@ -239,7 +235,7 @@ int page_mkclean(struct page *);
  * called in munlock()/munmap() path to check for other vmas holding
  * the page mlocked.
  */
-int try_to_munlock(struct page *);
+void try_to_munlock(struct page *);
 
 void remove_migration_ptes(struct page *old, struct page *new, bool locked);
 
@@ -261,15 +257,19 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
  */
 struct rmap_walk_control {
        void *arg;
-       int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
+       /*
+        * Return false if page table scanning in rmap_walk should be stopped.
+        * Otherwise, return true.
+        */
+       bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct page *page);
        struct anon_vma *(*anon_lock)(struct page *page);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
 };
 
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
 
 #else  /* !CONFIG_MMU */
 
@@ -285,7 +285,7 @@ static inline int page_referenced(struct page *page, int is_locked,
        return 0;
 }
 
-#define try_to_unmap(page, refs) SWAP_FAIL
+#define try_to_unmap(page, refs) false
 
 static inline int page_mkclean(struct page *page)
 {
@@ -295,13 +295,4 @@ static inline int page_mkclean(struct page *page)
 
 #endif /* CONFIG_MMU */
 
-/*
- * Return values of try_to_unmap
- */
-#define SWAP_SUCCESS   0
-#define SWAP_AGAIN     1
-#define SWAP_FAIL      2
-#define SWAP_MLOCK     3
-#define SWAP_LZFREE    4
-
 #endif /* _LINUX_RMAP_H */
index ea05f6c514139c39e1c52abd7d45c7337d764152..84766bcdd01f934a632af261f998882cb46eeb27 100644 (file)
@@ -14,7 +14,6 @@
 #define _RODATA_TEST_H
 
 #ifdef CONFIG_DEBUG_RODATA_TEST
-extern const int rodata_test_data;
 void rodata_test(void);
 #else
 static inline void rodata_test(void) {}
index 3d4fa448223fd60f0af37f3b6b274483593af468..993e7e25a3a55c283b9fe1bc3b4a2a6796574954 100644 (file)
@@ -1224,9 +1224,9 @@ extern struct pid *cad_pid;
 #define PF_USED_ASYNC          0x00004000      /* Used async_schedule*(), used by module init */
 #define PF_NOFREEZE            0x00008000      /* This thread should not be frozen */
 #define PF_FROZEN              0x00010000      /* Frozen for system suspend */
-#define PF_FSTRANS             0x00020000      /* Inside a filesystem transaction */
-#define PF_KSWAPD              0x00040000      /* I am kswapd */
-#define PF_MEMALLOC_NOIO       0x00080000      /* Allocating memory without IO involved */
+#define PF_KSWAPD              0x00020000      /* I am kswapd */
+#define PF_MEMALLOC_NOFS       0x00040000      /* All allocation requests will inherit GFP_NOFS */
+#define PF_MEMALLOC_NOIO       0x00080000      /* All allocation requests will inherit GFP_NOIO */
 #define PF_LESS_THROTTLE       0x00100000      /* Throttle me less: I clean memory */
 #define PF_KTHREAD             0x00200000      /* I am a kernel thread */
 #define PF_RANDOMIZE           0x00400000      /* Randomize virtual address space */
index 830953ebb391fa80e39af4c4f61f1cdd766a3040..9daabe138c9905fea8d596281afabdd3a206fa78 100644 (file)
@@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk)
        return ret;
 }
 
-/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- * __GFP_FS is also cleared as it implies __GFP_IO.
+/*
+ * Applies per-task gfp context to the given allocation flags.
+ * PF_MEMALLOC_NOIO implies GFP_NOIO
+ * PF_MEMALLOC_NOFS implies GFP_NOFS
  */
-static inline gfp_t memalloc_noio_flags(gfp_t flags)
+static inline gfp_t current_gfp_context(gfp_t flags)
 {
+       /*
+        * NOIO implies both NOIO and NOFS and it is a weaker context
+        * so always make sure it takes precedence
+        */
        if (unlikely(current->flags & PF_MEMALLOC_NOIO))
                flags &= ~(__GFP_IO | __GFP_FS);
+       else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
+               flags &= ~__GFP_FS;
        return flags;
 }
 
@@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags)
        current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
 }
 
+static inline unsigned int memalloc_nofs_save(void)
+{
+       unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
+       current->flags |= PF_MEMALLOC_NOFS;
+       return flags;
+}
+
+static inline void memalloc_nofs_restore(unsigned int flags)
+{
+       current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+}
+
 #endif /* _LINUX_SCHED_MM_H */
index 45e91dd6716d89b0ffa8f2d97481d12732d20173..ba5882419a7dbcebe0cbf7edee276346429d8b79 100644 (file)
@@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
-extern void deactivate_page(struct page *page);
+extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
@@ -411,9 +411,6 @@ struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
 
-extern int get_swap_slots(int n, swp_entry_t *slots);
-extern void swapcache_free_batch(swp_entry_t *entries, int n);
-
 #else /* CONFIG_SWAP */
 
 #define swap_address_space(entry)              (NULL)
index a80b7b59cf33418811217faca1b9c6b041dad814..d84ae90ccd5c40de69de9217aa7c4e23e4c26be8 100644 (file)
@@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                FOR_ALL_ZONES(PGALLOC),
                FOR_ALL_ZONES(ALLOCSTALL),
                FOR_ALL_ZONES(PGSCAN_SKIP),
-               PGFREE, PGACTIVATE, PGDEACTIVATE,
+               PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
                PGFAULT, PGMAJFAULT,
                PGLAZYFREED,
                PGREFILL,
index dc3297895ce38fac7049e4df4aeb67804cca2ea5..0a1b3c748478313dded15665692550a7264516ae 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/task.h>
+#include <linux/sched/mm.h>
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
@@ -2876,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
        if (unlikely(!debug_locks))
                return;
 
+       gfp_mask = current_gfp_context(gfp_mask);
+
        /* no reclaim without waiting on it */
        if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
                return;
@@ -2885,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
                return;
 
        /* We're only interested __GFP_FS allocations for now */
-       if (!(gfp_mask & __GFP_FS))
+       if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS))
                return;
 
        /*
@@ -2894,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
        if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
                return;
 
+       /* Disable lockdep if explicitly requested */
+       if (gfp_mask & __GFP_NOLOCKDEP)
+               return;
+
        mark_held_locks(curr, RECLAIM_FS);
 }
 
@@ -3947,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock);
 
 void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
 {
-       current->lockdep_reclaim_gfp = gfp_mask;
+       current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask);
 }
 EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state);
 
index b157b46cc9a69ca830a2cdbc8abcbc9baab4a63e..fe4d50c992df618a914e4696d7f698e4e8b76c83 100644 (file)
@@ -942,21 +942,17 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o
        unsigned long flags;
        int count = 0, i;
 
-       local_irq_save(flags);
-
        for (i = 0; i < HASH_SIZE; ++i) {
-               spin_lock(&dma_entry_hash[i].lock);
+               spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
                list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
                        if (entry->dev == dev) {
                                count += 1;
                                *out_entry = entry;
                        }
                }
-               spin_unlock(&dma_entry_hash[i].lock);
+               spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
        }
 
-       local_irq_restore(flags);
-
        return count;
 }
 
index 691a9ad48497b02e3b09304d6565165ef2317b16..898e8799841759ff20f1dc3eb25f277ebb3a8119 100644 (file)
@@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu)
 void __init radix_tree_init(void)
 {
        int ret;
+
+       BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
        radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
                        sizeof(struct radix_tree_node), 0,
                        SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
index 79d0fd13b5b3c1a826f472398fd60a1ae1cb5da6..5b0adf1435de7dba61e7d0b34c6a7fd912041fd1 100644 (file)
@@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
 
 config PAGE_POISONING
        bool "Poison pages after freeing"
-       select PAGE_EXTENSION
        select PAGE_POISONING_NO_SANITY if HIBERNATION
        ---help---
          Fill the pages with poison patterns after free_pages() and verify
index 81e1eaa2a2cf1bea89767185e9cb9549f2139ca2..09c5282ebdd2812fbb758255194d9063cf5dcf97 100644 (file)
@@ -992,9 +992,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 static bool suitable_migration_target(struct compact_control *cc,
                                                        struct page *page)
 {
-       if (cc->ignore_block_suitable)
-               return true;
-
        /* If the page is a large free page, then disallow migration */
        if (PageBuddy(page)) {
                /*
@@ -1006,6 +1003,9 @@ static bool suitable_migration_target(struct compact_control *cc,
                        return false;
        }
 
+       if (cc->ignore_block_suitable)
+               return true;
+
        /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
        if (migrate_async_suitable(get_pageblock_migratetype(page)))
                return true;
index dc59c5f35b3796785b8282f46425e2bc58d6ebf3..681da61080bc35fe6a3cee9084f65fd7acd3ecdc 100644 (file)
@@ -2204,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf)
        struct file_ra_state *ra = &file->f_ra;
        struct inode *inode = mapping->host;
        pgoff_t offset = vmf->pgoff;
+       pgoff_t max_off;
        struct page *page;
-       loff_t size;
        int ret = 0;
 
-       size = round_up(i_size_read(inode), PAGE_SIZE);
-       if (offset >= size >> PAGE_SHIFT)
+       max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+       if (unlikely(offset >= max_off))
                return VM_FAULT_SIGBUS;
 
        /*
@@ -2258,8 +2258,8 @@ retry_find:
         * Found the page and have a reference on it.
         * We must recheck i_size under page lock.
         */
-       size = round_up(i_size_read(inode), PAGE_SIZE);
-       if (unlikely(offset >= size >> PAGE_SHIFT)) {
+       max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+       if (unlikely(offset >= max_off)) {
                unlock_page(page);
                put_page(page);
                return VM_FAULT_SIGBUS;
@@ -2325,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf,
        struct file *file = vmf->vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t last_pgoff = start_pgoff;
-       loff_t size;
+       unsigned long max_idx;
        struct page *head, *page;
 
        rcu_read_lock();
@@ -2371,8 +2371,8 @@ repeat:
                if (page->mapping != mapping || !PageUptodate(page))
                        goto unlock;
 
-               size = round_up(i_size_read(mapping->host), PAGE_SIZE);
-               if (page->index >= size >> PAGE_SHIFT)
+               max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+               if (page->index >= max_idx)
                        goto unlock;
 
                if (file->f_ra.mmap_miss > 0)
@@ -2720,18 +2720,16 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
         * about to write.  We do this *before* the write so that we can return
         * without clobbering -EIOCBQUEUED from ->direct_IO().
         */
-       if (mapping->nrpages) {
-               written = invalidate_inode_pages2_range(mapping,
+       written = invalidate_inode_pages2_range(mapping,
                                        pos >> PAGE_SHIFT, end);
-               /*
-                * If a page can not be invalidated, return 0 to fall back
-                * to buffered write.
-                */
-               if (written) {
-                       if (written == -EBUSY)
-                               return 0;
-                       goto out;
-               }
+       /*
+        * If a page can not be invalidated, return 0 to fall back
+        * to buffered write.
+        */
+       if (written) {
+               if (written == -EBUSY)
+                       return 0;
+               goto out;
        }
 
        written = mapping->a_ops->direct_IO(iocb, from);
@@ -2744,10 +2742,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
         * so we don't support it 100%.  If this invalidation
         * fails, tough, the write still worked...
         */
-       if (mapping->nrpages) {
-               invalidate_inode_pages2_range(mapping,
-                                             pos >> PAGE_SHIFT, end);
-       }
+       invalidate_inode_pages2_range(mapping,
+                               pos >> PAGE_SHIFT, end);
 
        if (written > 0) {
                pos += written;
index 527ec2c6cca3b758eeb9a5e353b4075146fe8634..d9e6fddcc51f06a1286c56a24c510c1a3efa8add 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1575,7 +1575,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
        end = start + len;
 
        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
-                                       start, len)))
+                                       (void __user *)start, len)))
                return 0;
 
        /*
index f3c4f9d22821f889104340332eee93c5e124df4d..b787c4cfda0e61debd37f168a3061d4a5dfd4ce5 100644 (file)
@@ -1564,9 +1564,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                ClearPageDirty(page);
        unlock_page(page);
 
-       if (PageActive(page))
-               deactivate_page(page);
-
        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
                pmdp_invalidate(vma, addr, pmd);
                orig_pmd = pmd_mkold(orig_pmd);
@@ -1575,6 +1572,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                set_pmd_at(mm, addr, pmd, orig_pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        }
+
+       mark_page_lazyfree(page);
        ret = true;
 out:
        spin_unlock(ptl);
@@ -2145,15 +2144,15 @@ static void freeze_page(struct page *page)
 {
        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
                TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
-       int ret;
+       bool unmap_success;
 
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
        if (PageAnon(page))
                ttu_flags |= TTU_MIGRATION;
 
-       ret = try_to_unmap(page, ttu_flags);
-       VM_BUG_ON_PAGE(ret, page);
+       unmap_success = try_to_unmap(page, ttu_flags);
+       VM_BUG_ON_PAGE(!unmap_success, page);
 }
 
 static void unfreeze_page(struct page *page)
@@ -2399,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
-       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
        if (PageAnon(head)) {
index 9d26fd9fefe4a1f4ec78279455c3b27f4ddbb875..356df057a2a8d3752b9e058ce74fe7b48142e02b 100644 (file)
@@ -34,8 +34,7 @@ static int hwpoison_inject(void *data, u64 val)
        if (!hwpoison_filter_enable)
                goto inject;
 
-       if (!PageLRU(hpage) && !PageHuge(p))
-               shake_page(hpage, 0);
+       shake_page(hpage, 0);
        /*
         * This implies unable to support non-LRU pages.
         */
index 266efaeaa370a46debcc5b6b614a72e33833ac4d..04d08ef91224bfb8be0a6a2aed456281f7702c44 100644 (file)
@@ -80,12 +80,17 @@ static inline void set_page_refcounted(struct page *page)
 
 extern unsigned long highest_memmap_pfn;
 
+/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is considered the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
 /*
  * in mm/vmscan.c:
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
-extern bool pgdat_reclaimable(struct pglist_data *pgdat);
 
 /*
  * in mm/rmap.c:
@@ -505,4 +510,14 @@ extern const struct trace_print_flags pageflag_names[];
 extern const struct trace_print_flags vmaflag_names[];
 extern const struct trace_print_flags gfpflag_names[];
 
+static inline bool is_migrate_highatomic(enum migratetype migratetype)
+{
+       return migratetype == MIGRATE_HIGHATOMIC;
+}
+
+static inline bool is_migrate_highatomic_page(struct page *page)
+{
+       return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
+}
+
 #endif /* __MM_INTERNAL_H */
index 98b27195e38b07fc1b6e20c1f9c49db9bc112303..9348d27088c1145872495859c8eee26f6dc7f23f 100644 (file)
@@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
 
        shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
        if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
-               kasan_report_double_free(cache, object, shadow_byte);
+               kasan_report_double_free(cache, object,
+                               __builtin_return_address(1));
                return true;
        }
 
index dd2dea8eb0771a506c0b510efc79c3fc5253bda5..1229298cce646ddc725c4f1ea9d823a0c71e3cd1 100644 (file)
@@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
 void kasan_report(unsigned long addr, size_t size,
                bool is_write, unsigned long ip);
 void kasan_report_double_free(struct kmem_cache *cache, void *object,
-                       s8 shadow);
+                                       void *ip);
 
 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
index ab42a0803f161c6834b1362aefd5ded1990eb04f..beee0e980e2dd3385b007fbcb119f599207dacc0 100644 (file)
@@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
        return first_bad_addr;
 }
 
-static void print_error_description(struct kasan_access_info *info)
+static bool addr_has_shadow(struct kasan_access_info *info)
+{
+       return (info->access_addr >=
+               kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
 {
        const char *bug_type = "unknown-crash";
        u8 *shadow_addr;
@@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info)
                break;
        }
 
-       pr_err("BUG: KASAN: %s in %pS at addr %p\n",
-               bug_type, (void *)info->ip,
-               info->access_addr);
-       pr_err("%s of size %zu by task %s/%d\n",
-               info->is_write ? "Write" : "Read",
-               info->access_size, current->comm, task_pid_nr(current));
+       return bug_type;
+}
+
+const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+       const char *bug_type = "unknown-crash";
+
+       if ((unsigned long)info->access_addr < PAGE_SIZE)
+               bug_type = "null-ptr-deref";
+       else if ((unsigned long)info->access_addr < TASK_SIZE)
+               bug_type = "user-memory-access";
+       else
+               bug_type = "wild-memory-access";
+
+       return bug_type;
+}
+
+static const char *get_bug_type(struct kasan_access_info *info)
+{
+       if (addr_has_shadow(info))
+               return get_shadow_bug_type(info);
+       return get_wild_bug_type(info);
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+       const char *bug_type = get_bug_type(info);
+
+       pr_err("BUG: KASAN: %s in %pS\n",
+               bug_type, (void *)info->ip);
+       pr_err("%s of size %zu at addr %p by task %s/%d\n",
+               info->is_write ? "Write" : "Read", info->access_size,
+               info->access_addr, current->comm, task_pid_nr(current));
 }
 
 static inline bool kernel_or_module_addr(const void *addr)
@@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags)
        kasan_enable_current();
 }
 
-static void print_track(struct kasan_track *track)
+static void print_track(struct kasan_track *track, const char *prefix)
 {
-       pr_err("PID = %u\n", track->pid);
+       pr_err("%s by task %u:\n", prefix, track->pid);
        if (track->stack) {
                struct stack_trace trace;
 
@@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track)
        }
 }
 
-static void kasan_object_err(struct kmem_cache *cache, void *object)
+static struct page *addr_to_page(const void *addr)
 {
-       struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+       if ((addr >= (void *)PAGE_OFFSET) &&
+                       (addr < high_memory))
+               return virt_to_head_page(addr);
+       return NULL;
+}
 
-       dump_stack();
-       pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
-               cache->object_size);
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+                               const void *addr)
+{
+       unsigned long access_addr = (unsigned long)addr;
+       unsigned long object_addr = (unsigned long)object;
+       const char *rel_type;
+       int rel_bytes;
 
-       if (!(cache->flags & SLAB_KASAN))
+       pr_err("The buggy address belongs to the object at %p\n"
+              " which belongs to the cache %s of size %d\n",
+               object, cache->name, cache->object_size);
+
+       if (!addr)
                return;
 
-       pr_err("Allocated:\n");
-       print_track(&alloc_info->alloc_track);
-       pr_err("Freed:\n");
-       print_track(&alloc_info->free_track);
+       if (access_addr < object_addr) {
+               rel_type = "to the left";
+               rel_bytes = object_addr - access_addr;
+       } else if (access_addr >= object_addr + cache->object_size) {
+               rel_type = "to the right";
+               rel_bytes = access_addr - (object_addr + cache->object_size);
+       } else {
+               rel_type = "inside";
+               rel_bytes = access_addr - object_addr;
+       }
+
+       pr_err("The buggy address is located %d bytes %s of\n"
+              " %d-byte region [%p, %p)\n",
+               rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+               (void *)(object_addr + cache->object_size));
 }
 
-void kasan_report_double_free(struct kmem_cache *cache, void *object,
-                       s8 shadow)
+static void describe_object(struct kmem_cache *cache, void *object,
+                               const void *addr)
 {
-       unsigned long flags;
+       struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
 
-       kasan_start_report(&flags);
-       pr_err("BUG: Double free or freeing an invalid pointer\n");
-       pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
-       kasan_object_err(cache, object);
-       kasan_end_report(&flags);
+       if (cache->flags & SLAB_KASAN) {
+               print_track(&alloc_info->alloc_track, "Allocated");
+               pr_err("\n");
+               print_track(&alloc_info->free_track, "Freed");
+               pr_err("\n");
+       }
+
+       describe_object_addr(cache, object, addr);
 }
 
-static void print_address_description(struct kasan_access_info *info)
+static void print_address_description(void *addr)
 {
-       const void *addr = info->access_addr;
+       struct page *page = addr_to_page(addr);
 
-       if ((addr >= (void *)PAGE_OFFSET) &&
-               (addr < high_memory)) {
-               struct page *page = virt_to_head_page(addr);
-
-               if (PageSlab(page)) {
-                       void *object;
-                       struct kmem_cache *cache = page->slab_cache;
-                       object = nearest_obj(cache, page,
-                                               (void *)info->access_addr);
-                       kasan_object_err(cache, object);
-                       return;
-               }
-               dump_page(page, "kasan: bad access detected");
+       dump_stack();
+       pr_err("\n");
+
+       if (page && PageSlab(page)) {
+               struct kmem_cache *cache = page->slab_cache;
+               void *object = nearest_obj(cache, page, addr);
+
+               describe_object(cache, object, addr);
        }
 
-       if (kernel_or_module_addr(addr)) {
-               if (!init_task_stack_addr(addr))
-                       pr_err("Address belongs to variable %pS\n", addr);
+       if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+               pr_err("The buggy address belongs to the variable:\n");
+               pr_err(" %pS\n", addr);
+       }
+
+       if (page) {
+               pr_err("The buggy address belongs to the page:\n");
+               dump_page(page, "kasan: bad access detected");
        }
-       dump_stack();
 }
 
 static bool row_is_guilty(const void *row, const void *guilty)
@@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr)
        }
 }
 
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+                               void *ip)
+{
+       unsigned long flags;
+
+       kasan_start_report(&flags);
+       pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+       pr_err("\n");
+       print_address_description(object);
+       pr_err("\n");
+       print_shadow_for_address(object);
+       kasan_end_report(&flags);
+}
+
 static void kasan_report_error(struct kasan_access_info *info)
 {
        unsigned long flags;
-       const char *bug_type;
 
        kasan_start_report(&flags);
 
-       if (info->access_addr <
-                       kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
-               if ((unsigned long)info->access_addr < PAGE_SIZE)
-                       bug_type = "null-ptr-deref";
-               else if ((unsigned long)info->access_addr < TASK_SIZE)
-                       bug_type = "user-memory-access";
-               else
-                       bug_type = "wild-memory-access";
-               pr_err("BUG: KASAN: %s on address %p\n",
-                       bug_type, info->access_addr);
-               pr_err("%s of size %zu by task %s/%d\n",
-                       info->is_write ? "Write" : "Read",
-                       info->access_size, current->comm,
-                       task_pid_nr(current));
+       print_error_description(info);
+       pr_err("\n");
+
+       if (!addr_has_shadow(info)) {
                dump_stack();
        } else {
-               print_error_description(info);
-               print_address_description(info);
+               print_address_description((void *)info->access_addr);
+               pr_err("\n");
                print_shadow_for_address(info->first_bad_addr);
        }
 
index ba40b7f673f4dd44af403c7ed33860c6e2094046..7cb9c88bb4a33ca11c3266f398ffffb53c5afc13 100644 (file)
@@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm)
 
 static void release_pte_page(struct page *page)
 {
-       /* 0 stands for page_is_file_cache(page) == false */
-       dec_node_page_state(page, NR_ISOLATED_ANON + 0);
+       dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
        unlock_page(page);
        putback_lru_page(page);
 }
@@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
                VM_BUG_ON_PAGE(PageCompound(page), page);
                VM_BUG_ON_PAGE(!PageAnon(page), page);
-               VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
                /*
                 * We can do it before isolate_lru_page because the
@@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                 * The page must only be referenced by the scanned process
                 * and page swap cache.
                 */
-               if (page_count(page) != 1 + !!PageSwapCache(page)) {
+               if (page_count(page) != 1 + PageSwapCache(page)) {
                        unlock_page(page);
                        result = SCAN_PAGE_COUNT;
                        goto out;
@@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                        result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
-               /* 0 stands for page_is_file_cache(page) == false */
-               inc_node_page_state(page, NR_ISOLATED_ANON + 0);
+               inc_node_page_state(page,
+                               NR_ISOLATED_ANON + page_is_file_cache(page));
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
                 * The page must only be referenced by the scanned process
                 * and page swap cache.
                 */
-               if (page_count(page) != 1 + !!PageSwapCache(page)) {
+               if (page_count(page) != 1 + PageSwapCache(page)) {
                        result = SCAN_PAGE_COUNT;
                        goto out_unmap;
                }
index 19b4f2dea7a591793ff8e18b6eeeafdc5df1de30..d9fc0e4561283d9a351f6dd7c4cfb74ad26ab566 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
        return new_page;
 }
 
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 {
        struct stable_node *stable_node;
        struct rmap_item *rmap_item;
-       int ret = SWAP_AGAIN;
        int search_new_forks = 0;
 
        VM_BUG_ON_PAGE(!PageKsm(page), page);
@@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 
        stable_node = page_stable_node(page);
        if (!stable_node)
-               return ret;
+               return;
 again:
        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1978,23 +1977,20 @@ again:
                        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                                continue;
 
-                       ret = rwc->rmap_one(page, vma,
-                                       rmap_item->address, rwc->arg);
-                       if (ret != SWAP_AGAIN) {
+                       if (!rwc->rmap_one(page, vma,
+                                       rmap_item->address, rwc->arg)) {
                                anon_vma_unlock_read(anon_vma);
-                               goto out;
+                               return;
                        }
                        if (rwc->done && rwc->done(page)) {
                                anon_vma_unlock_read(anon_vma);
-                               goto out;
+                               return;
                        }
                }
                anon_vma_unlock_read(anon_vma);
        }
        if (!search_new_forks++)
                goto again;
-out:
-       return ret;
 }
 
 #ifdef CONFIG_MIGRATION
index 7a2abf0127aef7a9d4879278293d8cab766133e1..25b78ee4fc2c77addde06a22580e02bf05bb3b51 100644 (file)
@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                        ptent = pte_mkold(ptent);
                        ptent = pte_mkclean(ptent);
                        set_pte_at(mm, addr, pte, ptent);
-                       if (PageActive(page))
-                               deactivate_page(page);
                        tlb_remove_tlb_entry(tlb, pte, addr);
                }
+               mark_page_lazyfree(page);
        }
 out:
        if (nr_swap) {
@@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma,
 /*
  * Error injection support for memory error handling.
  */
-static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
+static int madvise_inject_error(int behavior,
+               unsigned long start, unsigned long end)
 {
-       struct page *p;
+       struct page *page;
+
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
+
        for (; start < end; start += PAGE_SIZE <<
-                               compound_order(compound_head(p))) {
+                               compound_order(compound_head(page))) {
                int ret;
 
-               ret = get_user_pages_fast(start, 1, 0, &p);
+               ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
 
-               if (PageHWPoison(p)) {
-                       put_page(p);
+               if (PageHWPoison(page)) {
+                       put_page(page);
                        continue;
                }
-               if (bhv == MADV_SOFT_OFFLINE) {
-                       pr_info("Soft offlining page %#lx at %#lx\n",
-                               page_to_pfn(p), start);
-                       ret = soft_offline_page(p, MF_COUNT_INCREASED);
+
+               if (behavior == MADV_SOFT_OFFLINE) {
+                       pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
+                                               page_to_pfn(page), start);
+
+                       ret = soft_offline_page(page, MF_COUNT_INCREASED);
                        if (ret)
                                return ret;
                        continue;
                }
-               pr_info("Injecting memory failure for page %#lx at %#lx\n",
-                      page_to_pfn(p), start);
-               ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+               pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+                                               page_to_pfn(page), start);
+
+               ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
                if (ret)
                        return ret;
        }
@@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_FREE:
-               /*
-                * XXX: In this implementation, MADV_FREE works like
-                * MADV_DONTNEED on swapless system or full swap.
-                */
-               if (get_nr_swap_pages() > 0)
-                       return madvise_free(vma, prev, start, end);
-               /* passthrough */
+               return madvise_free(vma, prev, start, end);
        case MADV_DONTNEED:
                return madvise_dontneed(vma, prev, start, end);
        default:
@@ -688,6 +687,10 @@ madvise_behavior_valid(int behavior)
 #endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
+#ifdef CONFIG_MEMORY_FAILURE
+       case MADV_SOFT_OFFLINE:
+       case MADV_HWPOISON:
+#endif
                return true;
 
        default:
@@ -761,10 +764,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        size_t len;
        struct blk_plug plug;
 
-#ifdef CONFIG_MEMORY_FAILURE
-       if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-               return madvise_hwpoison(behavior, start, start+len_in);
-#endif
        if (!madvise_behavior_valid(behavior))
                return error;
 
@@ -784,6 +783,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        if (end == start)
                return error;
 
+#ifdef CONFIG_MEMORY_FAILURE
+       if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+               return madvise_inject_error(behavior, start, start + len_in);
+#endif
+
        write = madvise_need_mmap_write(behavior);
        if (write) {
                if (down_write_killable(&current->mm->mmap_sem))
index 2bd7541d7c11231431c060ca6cfe84a89f096fe3..ff73899af61a2c25582a66f56d473b0da6ce1b9c 100644 (file)
@@ -100,24 +100,7 @@ static bool do_memsw_account(void)
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
 }
 
-static const char * const mem_cgroup_stat_names[] = {
-       "cache",
-       "rss",
-       "rss_huge",
-       "mapped_file",
-       "dirty",
-       "writeback",
-       "swap",
-};
-
-static const char * const mem_cgroup_events_names[] = {
-       "pgpgin",
-       "pgpgout",
-       "pgfault",
-       "pgmajfault",
-};
-
-static const char * const mem_cgroup_lru_names[] = {
+static const char *const mem_cgroup_lru_names[] = {
        "inactive_anon",
        "active_anon",
        "inactive_file",
@@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
  */
-static unsigned long
-mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
-{
-       long val = 0;
-       int cpu;
-
-       /* Per-cpu values can be negative, use a signed accumulator */
-       for_each_possible_cpu(cpu)
-               val += per_cpu(memcg->stat->count[idx], cpu);
-       /*
-        * Summing races with updates, so val may be negative.  Avoid exposing
-        * transient negative values.
-        */
-       if (val < 0)
-               val = 0;
-       return val;
-}
 
-static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
-                                           enum mem_cgroup_events_index idx)
+static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
+                                     enum memcg_event_item event)
 {
        unsigned long val = 0;
        int cpu;
 
        for_each_possible_cpu(cpu)
-               val += per_cpu(memcg->stat->events[idx], cpu);
+               val += per_cpu(memcg->stat->events[event], cpu);
        return val;
 }
 
@@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
         * counted as CACHE even if it's on ANON LRU.
         */
        if (PageAnon(page))
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
-                               nr_pages);
-       else
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
-                               nr_pages);
+               __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+       else {
+               __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+               if (PageSwapBacked(page))
+                       __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+       }
 
        if (compound) {
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-               __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
-                               nr_pages);
+               __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
        }
 
        /* pagein of a big page is an event. So, ignore page size */
        if (nr_pages > 0)
-               __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+               __this_cpu_inc(memcg->stat->events[PGPGIN]);
        else {
-               __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+               __this_cpu_inc(memcg->stat->events[PGPGOUT]);
                nr_pages = -nr_pages; /* for event */
        }
 
@@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
        return false;
 }
 
+unsigned int memcg1_stats[] = {
+       MEMCG_CACHE,
+       MEMCG_RSS,
+       MEMCG_RSS_HUGE,
+       NR_SHMEM,
+       NR_FILE_MAPPED,
+       NR_FILE_DIRTY,
+       NR_WRITEBACK,
+       MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+       "cache",
+       "rss",
+       "rss_huge",
+       "shmem",
+       "mapped_file",
+       "dirty",
+       "writeback",
+       "swap",
+};
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /**
  * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
                pr_cont_cgroup_path(iter->css.cgroup);
                pr_cont(":");
 
-               for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-                       if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+               for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+                       if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
                                continue;
-                       pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
-                               K(mem_cgroup_read_stat(iter, i)));
+                       pr_cont(" %s:%luKB", memcg1_stat_names[i],
+                               K(memcg_page_state(iter, memcg1_stats[i])));
                }
 
                for (i = 0; i < NR_LRU_LISTS; i++)
@@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
        do {
                if (page_counter_read(&memcg->memory) <= memcg->high)
                        continue;
-               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+               mem_cgroup_event(memcg, MEMCG_HIGH);
                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
        } while ((memcg = parent_mem_cgroup(memcg)));
 }
@@ -1928,7 +1916,7 @@ retry:
        if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;
 
-       mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+       mem_cgroup_event(mem_over_limit, MEMCG_MAX);
 
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, may_swap);
@@ -1971,7 +1959,7 @@ retry:
        if (fatal_signal_pending(current))
                goto force;
 
-       mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+       mem_cgroup_event(mem_over_limit, MEMCG_OOM);
 
        mem_cgroup_oom(mem_over_limit, gfp_mask,
                       get_order(nr_pages * PAGE_SIZE));
@@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
        for (i = 1; i < HPAGE_PMD_NR; i++)
                head[i].mem_cgroup = head->mem_cgroup;
 
-       __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+       __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
                       HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
                                         bool charge)
 {
        int val = (charge) ? 1 : -1;
-       this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+       this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
 }
 
 /**
@@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
 
        for_each_mem_cgroup_tree(iter, memcg) {
                for (i = 0; i < MEMCG_NR_STAT; i++)
-                       stat[i] += mem_cgroup_read_stat(iter, i);
+                       stat[i] += memcg_page_state(iter, i);
        }
 }
 
@@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
 
        for_each_mem_cgroup_tree(iter, memcg) {
                for (i = 0; i < MEMCG_NR_EVENTS; i++)
-                       events[i] += mem_cgroup_read_events(iter, i);
+                       events[i] += memcg_sum_events(iter, i);
        }
 }
 
@@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
                struct mem_cgroup *iter;
 
                for_each_mem_cgroup_tree(iter, memcg) {
-                       val += mem_cgroup_read_stat(iter,
-                                       MEM_CGROUP_STAT_CACHE);
-                       val += mem_cgroup_read_stat(iter,
-                                       MEM_CGROUP_STAT_RSS);
+                       val += memcg_page_state(iter, MEMCG_CACHE);
+                       val += memcg_page_state(iter, MEMCG_RSS);
                        if (swap)
-                               val += mem_cgroup_read_stat(iter,
-                                               MEM_CGROUP_STAT_SWAP);
+                               val += memcg_page_state(iter, MEMCG_SWAP);
                }
        } else {
                if (!swap)
@@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
+/* Universal VM events cgroup1 shows, original sort order */
+unsigned int memcg1_events[] = {
+       PGPGIN,
+       PGPGOUT,
+       PGFAULT,
+       PGMAJFAULT,
+};
+
+static const char *const memcg1_event_names[] = {
+       "pgpgin",
+       "pgpgout",
+       "pgfault",
+       "pgmajfault",
+};
+
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v)
        struct mem_cgroup *mi;
        unsigned int i;
 
-       BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
-                    MEM_CGROUP_STAT_NSTATS);
-       BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
-                    MEM_CGROUP_EVENTS_NSTATS);
+       BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+       for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+               if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
                        continue;
-               seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
-                          mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
+               seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+                          memcg_page_state(memcg, memcg1_stats[i]) *
+                          PAGE_SIZE);
        }
 
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
-               seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
-                          mem_cgroup_read_events(memcg, i));
+       for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+               seq_printf(m, "%s %lu\n", memcg1_event_names[i],
+                          memcg_sum_events(memcg, memcg1_events[i]));
 
        for (i = 0; i < NR_LRU_LISTS; i++)
                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
@@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v)
                seq_printf(m, "hierarchical_memsw_limit %llu\n",
                           (u64)memsw * PAGE_SIZE);
 
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+       for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                unsigned long long val = 0;
 
-               if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+               if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
                        continue;
                for_each_mem_cgroup_tree(mi, memcg)
-                       val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
-               seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
+                       val += memcg_page_state(mi, memcg1_stats[i]) *
+                       PAGE_SIZE;
+               seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
        }
 
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+       for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
                unsigned long long val = 0;
 
                for_each_mem_cgroup_tree(mi, memcg)
-                       val += mem_cgroup_read_events(mi, i);
-               seq_printf(m, "total_%s %llu\n",
-                          mem_cgroup_events_names[i], val);
+                       val += memcg_sum_events(mi, memcg1_events[i]);
+               seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
        }
 
        for (i = 0; i < NR_LRU_LISTS; i++) {
@@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
        struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
        struct mem_cgroup *parent;
 
-       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+       *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 
        /* this should eventually include NR_UNSTABLE_NFS */
-       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+       *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
        *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
                                                     (1 << LRU_ACTIVE_FILE));
        *pheadroom = PAGE_COUNTER_MAX;
@@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page,
        spin_lock_irqsave(&from->move_lock, flags);
 
        if (!anon && page_mapped(page)) {
-               __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-                              nr_pages);
-               __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-                              nr_pages);
+               __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
+               __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
        }
 
        /*
         * move_lock grabbed above and caller set from->moving_account, so
-        * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+        * mod_memcg_page_state will serialize updates to PageDirty.
         * So mapping should be stable for dirty pages.
         */
        if (!anon && PageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
 
                if (mapping_cap_account_dirty(mapping)) {
-                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                       __this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
                                       nr_pages);
-                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                       __this_cpu_add(to->stat->count[NR_FILE_DIRTY],
                                       nr_pages);
                }
        }
 
        if (PageWriteback(page)) {
-               __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-                              nr_pages);
-               __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-                              nr_pages);
+               __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
+               __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
        }
 
        /*
@@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                        continue;
                }
 
-               mem_cgroup_events(memcg, MEMCG_OOM, 1);
+               mem_cgroup_event(memcg, MEMCG_OOM);
                if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
                        break;
        }
@@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-       seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
-       seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
-       seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
-       seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
+       seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
+       seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
+       seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
+       seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
 
        return 0;
 }
@@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v)
        tree_events(memcg, events);
 
        seq_printf(m, "anon %llu\n",
-                  (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
+                  (u64)stat[MEMCG_RSS] * PAGE_SIZE);
        seq_printf(m, "file %llu\n",
-                  (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+                  (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
        seq_printf(m, "kernel_stack %llu\n",
                   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
        seq_printf(m, "slab %llu\n",
@@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v)
        seq_printf(m, "sock %llu\n",
                   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
 
+       seq_printf(m, "shmem %llu\n",
+                  (u64)stat[NR_SHMEM] * PAGE_SIZE);
        seq_printf(m, "file_mapped %llu\n",
-                  (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
+                  (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
        seq_printf(m, "file_dirty %llu\n",
-                  (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
+                  (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
        seq_printf(m, "file_writeback %llu\n",
-                  (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
+                  (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
 
        for (i = 0; i < NR_LRU_LISTS; i++) {
                struct mem_cgroup *mi;
@@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
 
        /* Accumulated memory events */
 
-       seq_printf(m, "pgfault %lu\n",
-                  events[MEM_CGROUP_EVENTS_PGFAULT]);
-       seq_printf(m, "pgmajfault %lu\n",
-                  events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+       seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
+       seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+
+       seq_printf(m, "workingset_refault %lu\n",
+                  stat[WORKINGSET_REFAULT]);
+       seq_printf(m, "workingset_activate %lu\n",
+                  stat[WORKINGSET_ACTIVATE]);
+       seq_printf(m, "workingset_nodereclaim %lu\n",
+                  stat[WORKINGSET_NODERECLAIM]);
 
        return 0;
 }
@@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
 
 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
                           unsigned long nr_anon, unsigned long nr_file,
-                          unsigned long nr_huge, unsigned long nr_kmem,
-                          struct page *dummy_page)
+                          unsigned long nr_kmem, unsigned long nr_huge,
+                          unsigned long nr_shmem, struct page *dummy_page)
 {
        unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
        unsigned long flags;
@@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
        }
 
        local_irq_save(flags);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
-       __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
-       __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+       __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
+       __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
+       __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
+       __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
+       __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
        memcg_check_events(memcg, dummy_page);
        local_irq_restore(flags);
@@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 static void uncharge_list(struct list_head *page_list)
 {
        struct mem_cgroup *memcg = NULL;
+       unsigned long nr_shmem = 0;
        unsigned long nr_anon = 0;
        unsigned long nr_file = 0;
        unsigned long nr_huge = 0;
@@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list)
                if (memcg != page->mem_cgroup) {
                        if (memcg) {
                                uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                                              nr_huge, nr_kmem, page);
-                               pgpgout = nr_anon = nr_file =
-                                       nr_huge = nr_kmem = 0;
+                                              nr_kmem, nr_huge, nr_shmem, page);
+                               pgpgout = nr_anon = nr_file = nr_kmem = 0;
+                               nr_huge = nr_shmem = 0;
                        }
                        memcg = page->mem_cgroup;
                }
@@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list)
                        }
                        if (PageAnon(page))
                                nr_anon += nr_pages;
-                       else
+                       else {
                                nr_file += nr_pages;
+                               if (PageSwapBacked(page))
+                                       nr_shmem += nr_pages;
+                       }
                        pgpgout++;
                } else {
                        nr_kmem += 1 << compound_order(page);
@@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list)
 
        if (memcg)
                uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-                              nr_huge, nr_kmem, page);
+                              nr_kmem, nr_huge, nr_shmem, page);
 }
 
 /**
index 27f7210e7fabd1441d699d328213f95302c79378..73066b80d14af70d0fdf12e2228823b84c258903 100644 (file)
@@ -220,6 +220,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
  */
 void shake_page(struct page *p, int access)
 {
+       if (PageHuge(p))
+               return;
+
        if (!PageSlab(p)) {
                lru_add_drain_all();
                if (PageLRU(p))
@@ -322,7 +325,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
  * wrong earlier.
  */
 static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
-                         int fail, struct page *page, unsigned long pfn,
+                         bool fail, struct page *page, unsigned long pfn,
                          int flags)
 {
        struct to_kill *tk, *next;
@@ -904,35 +907,36 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
  * Do all that is necessary to remove user space mappings. Unmap
  * the pages and send SIGBUS to the processes if the data was dirty.
  */
-static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
                                  int trapno, int flags, struct page **hpagep)
 {
-       enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+       enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
        struct address_space *mapping;
        LIST_HEAD(tokill);
-       int ret;
+       bool unmap_success;
        int kill = 1, forcekill;
        struct page *hpage = *hpagep;
+       bool mlocked = PageMlocked(hpage);
 
        /*
         * Here we are interested only in user-mapped pages, so skip any
         * other types of pages.
         */
        if (PageReserved(p) || PageSlab(p))
-               return SWAP_SUCCESS;
+               return true;
        if (!(PageLRU(hpage) || PageHuge(p)))
-               return SWAP_SUCCESS;
+               return true;
 
        /*
         * This check implies we don't kill processes if their pages
         * are in the swap cache early. Those are always late kills.
         */
        if (!page_mapped(hpage))
-               return SWAP_SUCCESS;
+               return true;
 
        if (PageKsm(p)) {
                pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
-               return SWAP_FAIL;
+               return false;
        }
 
        if (PageSwapCache(p)) {
@@ -971,11 +975,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
        if (kill)
                collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-       ret = try_to_unmap(hpage, ttu);
-       if (ret != SWAP_SUCCESS)
+       unmap_success = try_to_unmap(hpage, ttu);
+       if (!unmap_success)
                pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
                       pfn, page_mapcount(hpage));
 
+       /*
+        * try_to_unmap() might put mlocked page in lru cache, so call
+        * shake_page() again to ensure that it's flushed.
+        */
+       if (mlocked)
+               shake_page(hpage, 0);
+
        /*
         * Now that the dirty bit has been propagated to the
         * struct page and all unmaps done we can decide if
@@ -987,10 +998,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         * any accesses to the poisoned memory.
         */
        forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
-       kill_procs(&tokill, forcekill, trapno,
-                     ret != SWAP_SUCCESS, p, pfn, flags);
+       kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
 
-       return ret;
+       return unmap_success;
 }
 
 static void set_page_hwpoison_huge_page(struct page *hpage)
@@ -1138,22 +1148,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * The check (unnecessarily) ignores LRU pages being isolated and
         * walked by the page reclaim code, however that's not a big loss.
         */
-       if (!PageHuge(p)) {
-               if (!PageLRU(p))
-                       shake_page(p, 0);
-               if (!PageLRU(p)) {
-                       /*
-                        * shake_page could have turned it free.
-                        */
-                       if (is_free_buddy_page(p)) {
-                               if (flags & MF_COUNT_INCREASED)
-                                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
-                               else
-                                       action_result(pfn, MF_MSG_BUDDY_2ND,
-                                                     MF_DELAYED);
-                               return 0;
-                       }
-               }
+       shake_page(p, 0);
+       /* shake_page could have turned it free. */
+       if (!PageLRU(p) && is_free_buddy_page(p)) {
+               if (flags & MF_COUNT_INCREASED)
+                       action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+               else
+                       action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
+               return 0;
        }
 
        lock_page(hpage);
@@ -1230,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
         * When the raw error page is thp tail page, hpage points to the raw
         * page after thp split.
         */
-       if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
-           != SWAP_SUCCESS) {
+       if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
                res = -EBUSY;
                goto out;
@@ -1543,8 +1544,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
                if (ret == 1 && !PageLRU(page)) {
                        /* Drop page reference which is from __get_any_page() */
                        put_hwpoison_page(page);
-                       pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-                               pfn, page->flags);
+                       pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
+                               pfn, page->flags, &page->flags);
                        return -EIO;
                }
        }
@@ -1585,8 +1586,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
        ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
                                MIGRATE_SYNC, MR_MEMORY_FAILURE);
        if (ret) {
-               pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-                       pfn, ret, page->flags);
+               pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+                       pfn, ret, page->flags, &page->flags);
                /*
                 * We know that soft_offline_huge_page() tries to migrate
                 * only one hugepage pointed to by hpage, so we need not
@@ -1677,14 +1678,14 @@ static int __soft_offline_page(struct page *page, int flags)
                        if (!list_empty(&pagelist))
                                putback_movable_pages(&pagelist);
 
-                       pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-                               pfn, ret, page->flags);
+                       pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+                               pfn, ret, page->flags, &page->flags);
                        if (ret > 0)
                                ret = -EIO;
                }
        } else {
-               pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
-                       pfn, ret, page_count(page), page->flags);
+               pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
+                       pfn, ret, page_count(page), page->flags, &page->flags);
        }
        return ret;
 }
index 6fa7208bcd564ec8fb6bcf25e206aef9bd724ecb..b63d7d1239df22714da632e191ce4d03049daeb0 100644 (file)
@@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
                arch_refresh_nodedata(nid, pgdat);
        } else {
-               /* Reset the nr_zones, order and classzone_idx before reuse */
+               /*
+                * Reset the nr_zones, order and classzone_idx before reuse.
+                * Note that kswapd will init kswapd_classzone_idx properly
+                * when it starts in the near future.
+                */
                pgdat->nr_zones = 0;
                pgdat->kswapd_order = 0;
                pgdat->kswapd_classzone_idx = 0;
index 738f1d5f83503e546960d005a034abf2dde2c0e7..89a0a1707f4c67deb77dc466cb6f5c2ecfe19051 100644 (file)
@@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
+static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                                 unsigned long addr, void *old)
 {
        struct page_vma_mapped_walk pvmw = {
@@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                update_mmu_cache(vma, pvmw.address, pvmw.pte);
        }
 
-       return SWAP_AGAIN;
+       return true;
 }
 
 /*
@@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 {
        int z;
 
-       if (!pgdat_reclaimable(pgdat))
-               return false;
-
        for (z = pgdat->nr_zones - 1; z >= 0; z--) {
                struct zone *zone = pgdat->node_zones + z;
 
@@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
        /* Prepare a page as a migration target */
        __SetPageLocked(new_page);
-       __SetPageSwapBacked(new_page);
+       if (PageSwapBacked(page))
+               __SetPageSwapBacked(new_page);
 
        /* anon mapping, we can simply copy page->mapping to the new page: */
        new_page->mapping = page->mapping;
index 0dd9ca18e19ed7ddb499a480c5831c312791b10a..c483c5c20b4bd12bcca50972c9f74a0dbd3a713e 100644 (file)
@@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
  */
 static void __munlock_isolated_page(struct page *page)
 {
-       int ret = SWAP_AGAIN;
-
        /*
         * Optimization: if the page was mapped just once, that's our mapping
         * and we don't need to check all the other vmas.
         */
        if (page_mapcount(page) > 1)
-               ret = try_to_munlock(page);
+               try_to_munlock(page);
 
        /* Did try_to_munlock() succeed or punt? */
-       if (ret != SWAP_MLOCK)
+       if (!PageMlocked(page))
                count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 
        putback_lru_page(page);
index bfbe8856d134f367464e3582d9a432260487777c..f82741e199c0b06d971bd30d4503bc3e6e6f9dc5 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                struct user_struct *user = NULL;
                struct hstate *hs;
 
-               hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+               hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (!hs)
                        return -EINVAL;
 
index d083714a2bb924a0303fa7535acf09d7355aedf0..04c9143a86255a179aa40c06c2f36d203608117b 100644 (file)
@@ -685,6 +685,7 @@ void exit_oom_victim(void)
 void oom_killer_enable(void)
 {
        oom_killer_disabled = false;
+       pr_info("OOM killer enabled.\n");
 }
 
 /**
@@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout)
                oom_killer_enable();
                return false;
        }
+       pr_info("OOM killer disabled.\n");
 
        return true;
 }
index d8ac2a7fb9e7b6db9de3755ab7898095f70f8383..2359608d2568ac027dfdbcd901fa01d58e77f17a 100644 (file)
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
 
        spin_lock_init(&dom->lock);
 
-       init_timer_deferrable(&dom->period_timer);
-       dom->period_timer.function = writeout_period;
-       dom->period_timer.data = (unsigned long)dom;
+       setup_deferrable_timer(&dom->period_timer, writeout_period,
+                              (unsigned long)dom);
 
        dom->dirty_limit_tstamp = jiffies;
 
@@ -2428,7 +2427,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
                inode_attach_wb(inode, page);
                wb = inode_to_wb(inode);
 
-               mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+               inc_memcg_page_state(page, NR_FILE_DIRTY);
                __inc_node_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                __inc_node_page_state(page, NR_DIRTIED);
@@ -2450,7 +2449,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
                          struct bdi_writeback *wb)
 {
        if (mapping_cap_account_dirty(mapping)) {
-               mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+               dec_memcg_page_state(page, NR_FILE_DIRTY);
                dec_node_page_state(page, NR_FILE_DIRTY);
                dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2707,7 +2706,7 @@ int clear_page_dirty_for_io(struct page *page)
                 */
                wb = unlocked_inode_to_wb_begin(inode, &locked);
                if (TestClearPageDirty(page)) {
-                       mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+                       dec_memcg_page_state(page, NR_FILE_DIRTY);
                        dec_node_page_state(page, NR_FILE_DIRTY);
                        dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                        dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2754,7 +2753,7 @@ int test_clear_page_writeback(struct page *page)
                ret = TestClearPageWriteback(page);
        }
        if (ret) {
-               mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+               dec_memcg_page_state(page, NR_WRITEBACK);
                dec_node_page_state(page, NR_WRITEBACK);
                dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                inc_node_page_state(page, NR_WRITTEN);
@@ -2809,7 +2808,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
                ret = TestSetPageWriteback(page);
        }
        if (!ret) {
-               mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+               inc_memcg_page_state(page, NR_WRITEBACK);
                inc_node_page_state(page, NR_WRITEBACK);
                inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
        }
index bd01501efab9141f9071dc741abf17719de7b027..1e2af704938dbc628b31a888b4d7f9e04a266bda 100644 (file)
@@ -1090,14 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
        int migratetype = 0;
        int batch_free = 0;
-       unsigned long nr_scanned;
        bool isolated_pageblocks;
 
        spin_lock(&zone->lock);
        isolated_pageblocks = has_isolate_pageblock(zone);
-       nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-       if (nr_scanned)
-               __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
        while (count) {
                struct page *page;
@@ -1150,12 +1146,7 @@ static void free_one_page(struct zone *zone,
                                unsigned int order,
                                int migratetype)
 {
-       unsigned long nr_scanned;
        spin_lock(&zone->lock);
-       nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-       if (nr_scanned)
-               __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
        if (unlikely(has_isolate_pageblock(zone) ||
                is_migrate_isolate(migratetype))) {
                migratetype = get_pfnblock_migratetype(page, pfn);
@@ -1698,10 +1689,10 @@ static inline int check_new_page(struct page *page)
        return 1;
 }
 
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
 {
        return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-               page_poisoning_enabled() && poisoned;
+               page_poisoning_enabled();
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -1755,17 +1746,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
                                                        unsigned int alloc_flags)
 {
        int i;
-       bool poisoned = true;
-
-       for (i = 0; i < (1 << order); i++) {
-               struct page *p = page + i;
-               if (poisoned)
-                       poisoned &= page_is_poisoned(p);
-       }
 
        post_alloc_hook(page, order, gfp_flags);
 
-       if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+       if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
                for (i = 0; i < (1 << order); i++)
                        clear_highpage(page + i);
 
@@ -2045,8 +2029,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 
        /* Yoink! */
        mt = get_pageblock_migratetype(page);
-       if (mt != MIGRATE_HIGHATOMIC &&
-                       !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+       if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+           && !is_migrate_cma(mt)) {
                zone->nr_reserved_highatomic += pageblock_nr_pages;
                set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
                move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
@@ -2103,8 +2087,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
                         * from highatomic to ac->migratetype. So we should
                         * adjust the count once.
                         */
-                       if (get_pageblock_migratetype(page) ==
-                                                       MIGRATE_HIGHATOMIC) {
+                       if (is_migrate_highatomic_page(page)) {
                                /*
                                 * It should never happen but changes to
                                 * locking could inadvertently allow a per-cpu
@@ -2161,8 +2144,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 
                page = list_first_entry(&area->free_list[fallback_mt],
                                                struct page, lru);
-               if (can_steal &&
-                       get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
+               if (can_steal && !is_migrate_highatomic_page(page))
                        steal_suitable_fallback(zone, page, start_migratetype);
 
                /* Remove the page from the freelists */
@@ -2502,7 +2484,7 @@ void free_hot_cold_page(struct page *page, bool cold)
        /*
         * We only track unmovable, reclaimable and movable on pcp lists.
         * Free ISOLATE pages back to the allocator because they are being
-        * offlined but treat RESERVE as movable pages so we can get those
+        * offlined but treat HIGHATOMIC as movable pages so we can get those
         * areas back if necessary. Otherwise, we may have to free
         * excessively into the page allocator
         */
@@ -2612,7 +2594,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
                for (; page < endpage; page += pageblock_nr_pages) {
                        int mt = get_pageblock_migratetype(page);
                        if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
-                               && mt != MIGRATE_HIGHATOMIC)
+                           && !is_migrate_highatomic(mt))
                                set_pageblock_migratetype(page,
                                                          MIGRATE_MOVABLE);
                }
@@ -3110,8 +3092,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
        static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
 
-       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
-           debug_guardpage_minorder() > 0)
+       if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
                return;
 
        pr_warn("%s: ", current->comm);
@@ -3521,20 +3502,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
        return false;
 }
 
-/*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
 /*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
  *
  * Returns true if a retry is viable or false to enter the oom path.
  */
@@ -3579,13 +3553,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                bool wmark;
 
                available = reclaimable = zone_reclaimable_pages(zone);
-               available -= DIV_ROUND_UP((*no_progress_loops) * available,
-                                         MAX_RECLAIM_RETRIES);
                available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
                /*
-                * Would the allocation succeed if we reclaimed the whole
-                * available?
+                * Would the allocation succeed if we reclaimed all
+                * reclaimable pages?
                 */
                wmark = __zone_watermark_ok(zone, order, min_wmark,
                                ac_classzone_idx(ac), alloc_flags, available);
@@ -3771,7 +3743,7 @@ retry:
 
        /* Make sure we know about allocations which stall for too long */
        if (time_after(jiffies, alloc_start + stall_timeout)) {
-               warn_alloc(gfp_mask, ac->nodemask,
+               warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
                        "page allocation stalls for %ums, order:%u",
                        jiffies_to_msecs(jiffies-alloc_start), order);
                stall_timeout += 10 * HZ;
@@ -3971,10 +3943,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                goto out;
 
        /*
-        * Runtime PM, block IO and its error handling path can deadlock
-        * because I/O on the device might not complete.
+        * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+        * resp. GFP_NOIO which has to be inherited for all allocation requests
+        * from a particular context which has been marked by
+        * memalloc_no{fs,io}_{save,restore}.
         */
-       alloc_mask = memalloc_noio_flags(gfp_mask);
+       alloc_mask = current_gfp_context(gfp_mask);
        ac.spread_dirty_pages = false;
 
        /*
@@ -4510,7 +4484,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
                        " writeback_tmp:%lukB"
                        " unstable:%lukB"
-                       " pages_scanned:%lu"
                        " all_unreclaimable? %s"
                        "\n",
                        pgdat->node_id,
@@ -4533,8 +4506,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
                        K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                        K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
-                       node_page_state(pgdat, NR_PAGES_SCANNED),
-                       !pgdat_reclaimable(pgdat) ? "yes" : "no");
+                       pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+                               "yes" : "no");
        }
 
        for_each_populated_zone(zone) {
@@ -7429,7 +7402,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                .zone = page_zone(pfn_to_page(start)),
                .mode = MIGRATE_SYNC,
                .ignore_skip_hint = true,
-               .gfp_mask = memalloc_noio_flags(gfp_mask),
+               .gfp_mask = current_gfp_context(gfp_mask),
        };
        INIT_LIST_HEAD(&cc.migratepages);
 
index 121dcffc4ec1768a6fc71dda8af878aa0d16cb92..88ccc044b09a41504fb212afdb5ca8a45e842998 100644 (file)
@@ -59,9 +59,6 @@
 
 static struct page_ext_operations *page_ext_ops[] = {
        &debug_guardpage_ops,
-#ifdef CONFIG_PAGE_POISONING
-       &page_poisoning_ops,
-#endif
 #ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
 #endif
@@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page)
        struct page_ext *base;
 
        base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
-        *
-        * This check is also necessary for ensuring page poisoning
-        * works as expected when enabled
         */
        if (unlikely(!base))
                return NULL;
@@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page)
 {
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_ext arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
-        *
-        * This check is also necessary for ensuring page poisoning
-        * works as expected when enabled
         */
        if (!section->page_ext)
                return NULL;
index b0ee56c56b5850f32f5b3157f9b142788840f50f..1b0f48c62316b3ebf1e09cc7b0a16f413db7e9ae 100644 (file)
@@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn)
        return page;
 }
 
-static int page_idle_clear_pte_refs_one(struct page *page,
+static bool page_idle_clear_pte_refs_one(struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned long addr, void *arg)
 {
@@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page,
                 */
                set_page_young(page);
        }
-       return SWAP_AGAIN;
+       return true;
 }
 
 static void page_idle_clear_pte_refs(struct page *page)
index f4e17a57926afffa33fceabf81a742ffc254d914..7927bbb54a4e3200e39fc098b787794f21f6c855 100644 (file)
@@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 
        zone = page_zone(page);
        spin_lock_irqsave(&zone->lock, flags);
-       if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+       if (!is_migrate_isolate_page(page))
                goto out;
 
        /*
@@ -205,7 +205,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
             pfn < end_pfn;
             pfn += pageblock_nr_pages) {
                page = __first_valid_page(pfn, pageblock_nr_pages);
-               if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+               if (!page || !is_migrate_isolate_page(page))
                        continue;
                unset_migratetype_isolate(page, migratetype);
        }
@@ -262,7 +262,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
         */
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                page = __first_valid_page(pfn, pageblock_nr_pages);
-               if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+               if (page && !is_migrate_isolate_page(page))
                        break;
        }
        page = __first_valid_page(start_pfn, end_pfn - start_pfn);
index 2e647c65916b91b00177837370e210d71c568f5e..be19e989ccff51f667c9698c9cb8fea3d5303f8e 100644 (file)
@@ -6,7 +6,6 @@
 #include <linux/poison.h>
 #include <linux/ratelimit.h>
 
-static bool __page_poisoning_enabled __read_mostly;
 static bool want_page_poisoning __read_mostly;
 
 static int early_page_poison_param(char *buf)
@@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf)
 early_param("page_poison", early_page_poison_param);
 
 bool page_poisoning_enabled(void)
-{
-       return __page_poisoning_enabled;
-}
-
-static bool need_page_poisoning(void)
-{
-       return want_page_poisoning;
-}
-
-static void init_page_poisoning(void)
 {
        /*
-        * page poisoning is debug page alloc for some arches. If either
-        * of those options are enabled, enable poisoning
+        * Assumes that debug_pagealloc_enabled is set before
+        * free_all_bootmem.
+        * Page poisoning is debug page alloc for some arches. If
+        * either of those options are enabled, enable poisoning.
         */
-       if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
-               if (!want_page_poisoning && !debug_pagealloc_enabled())
-                       return;
-       } else {
-               if (!want_page_poisoning)
-                       return;
-       }
-
-       __page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
-       .need = need_page_poisoning,
-       .init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
-       struct page_ext *page_ext;
-
-       page_ext = lookup_page_ext(page);
-       if (unlikely(!page_ext))
-               return;
-
-       __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
-       struct page_ext *page_ext;
-
-       page_ext = lookup_page_ext(page);
-       if (unlikely(!page_ext))
-               return;
-
-       __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-bool page_is_poisoned(struct page *page)
-{
-       struct page_ext *page_ext;
-
-       page_ext = lookup_page_ext(page);
-       if (unlikely(!page_ext))
-               return false;
-
-       return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+       return (want_page_poisoning ||
+               (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+               debug_pagealloc_enabled()));
 }
 
 static void poison_page(struct page *page)
 {
        void *addr = kmap_atomic(page);
 
-       set_page_poison(page);
        memset(addr, PAGE_POISON, PAGE_SIZE);
        kunmap_atomic(addr);
 }
@@ -140,12 +86,13 @@ static void unpoison_page(struct page *page)
 {
        void *addr;
 
-       if (!page_is_poisoned(page))
-               return;
-
        addr = kmap_atomic(page);
+       /*
+        * Page poisoning when enabled poisons each and every page
+        * that is freed to buddy. Thus no extra check is done to
+        * see if a page was poisoned.
+        */
        check_poison_mem(addr, PAGE_SIZE);
-       clear_page_poison(page);
        kunmap_atomic(addr);
 }
 
index f6838015810f5610abe039daec170aa1da634422..3ff241f714ebc066ce34a4d68381efcb303f4043 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -724,7 +724,7 @@ struct page_referenced_arg {
 /*
  * arg: page_referenced_arg will be passed
  */
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
                        unsigned long address, void *arg)
 {
        struct page_referenced_arg *pra = arg;
@@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                if (vma->vm_flags & VM_LOCKED) {
                        page_vma_mapped_walk_done(&pvmw);
                        pra->vm_flags |= VM_LOCKED;
-                       return SWAP_FAIL; /* To break the loop */
+                       return false; /* To break the loop */
                }
 
                if (pvmw.pte) {
@@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
        }
 
        if (!pra->mapcount)
-               return SWAP_SUCCESS; /* To break the loop */
+               return false; /* To break the loop */
 
-       return SWAP_AGAIN;
+       return true;
 }
 
 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
@@ -812,7 +812,6 @@ int page_referenced(struct page *page,
                    struct mem_cgroup *memcg,
                    unsigned long *vm_flags)
 {
-       int ret;
        int we_locked = 0;
        struct page_referenced_arg pra = {
                .mapcount = total_mapcount(page),
@@ -846,7 +845,7 @@ int page_referenced(struct page *page,
                rwc.invalid_vma = invalid_page_referenced_vma;
        }
 
-       ret = rmap_walk(page, &rwc);
+       rmap_walk(page, &rwc);
        *vm_flags = pra.vm_flags;
 
        if (we_locked)
@@ -855,7 +854,7 @@ int page_referenced(struct page *page,
        return pra.referenced;
 }
 
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                            unsigned long address, void *arg)
 {
        struct page_vma_mapped_walk pvmw = {
@@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
                }
        }
 
-       return SWAP_AGAIN;
+       return true;
 }
 
 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
@@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound)
                        goto out;
        }
        __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
-       mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr);
+       mod_memcg_page_state(page, NR_FILE_MAPPED, nr);
 out:
        unlock_page_memcg(page);
 }
@@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
         * pte lock(a spinlock) is held, which implies preemption disabled.
         */
        __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
-       mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr);
+       mod_memcg_page_state(page, NR_FILE_MAPPED, -nr);
 
        if (unlikely(PageMlocked(page)))
                clear_page_mlock(page);
@@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound)
         */
 }
 
-struct rmap_private {
-       enum ttu_flags flags;
-       int lazyfreed;
-};
-
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
 {
        struct mm_struct *mm = vma->vm_mm;
@@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        };
        pte_t pteval;
        struct page *subpage;
-       int ret = SWAP_AGAIN;
-       struct rmap_private *rp = arg;
-       enum ttu_flags flags = rp->flags;
+       bool ret = true;
+       enum ttu_flags flags = (enum ttu_flags)arg;
 
        /* munlock has nothing to gain from examining un-locked vmas */
        if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
-               return SWAP_AGAIN;
+               return true;
 
        if (flags & TTU_SPLIT_HUGE_PMD) {
                split_huge_pmd_address(vma, address,
@@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                         */
                                        mlock_vma_page(page);
                                }
-                               ret = SWAP_MLOCK;
+                               ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
@@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                if (!(flags & TTU_IGNORE_ACCESS)) {
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte)) {
-                               ret = SWAP_FAIL;
+                               ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
@@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
-                       VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+                       if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+                               WARN_ON_ONCE(1);
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
+                       }
+
+                       /* MADV_FREE page check */
+                       if (!PageSwapBacked(page)) {
+                               if (!PageDirty(page)) {
+                                       dec_mm_counter(mm, MM_ANONPAGES);
+                                       goto discard;
+                               }
 
-                       if (!PageDirty(page) && (flags & TTU_LZFREE)) {
-                               /* It's a freeable page by MADV_FREE */
-                               dec_mm_counter(mm, MM_ANONPAGES);
-                               rp->lazyfreed++;
-                               goto discard;
+                               /*
+                                * If the page was redirtied, it cannot be
+                                * discarded. Remap the page to page table.
+                                */
+                               set_pte_at(mm, address, pvmw.pte, pteval);
+                               SetPageSwapBacked(page);
+                               ret = false;
+                               page_vma_mapped_walk_done(&pvmw);
+                               break;
                        }
 
                        if (swap_duplicate(entry) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
-                               ret = SWAP_FAIL;
+                               ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
@@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page)
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock.
- * Return values are:
  *
- * SWAP_SUCCESS        - we succeeded in removing all mappings
- * SWAP_AGAIN  - we missed a mapping, try again later
- * SWAP_FAIL   - the page is unswappable
- * SWAP_MLOCK  - page is mlocked.
+ * If unmap is successful, return true. Otherwise, false.
  */
-int try_to_unmap(struct page *page, enum ttu_flags flags)
+bool try_to_unmap(struct page *page, enum ttu_flags flags)
 {
-       int ret;
-       struct rmap_private rp = {
-               .flags = flags,
-               .lazyfreed = 0,
-       };
-
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
-               .arg = &rp,
+               .arg = (void *)flags,
                .done = page_mapcount_is_zero,
                .anon_lock = page_lock_anon_vma_read,
        };
@@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
                rwc.invalid_vma = invalid_migration_vma;
 
        if (flags & TTU_RMAP_LOCKED)
-               ret = rmap_walk_locked(page, &rwc);
+               rmap_walk_locked(page, &rwc);
        else
-               ret = rmap_walk(page, &rwc);
+               rmap_walk(page, &rwc);
 
-       if (ret != SWAP_MLOCK && !page_mapcount(page)) {
-               ret = SWAP_SUCCESS;
-               if (rp.lazyfreed && !PageDirty(page))
-                       ret = SWAP_LZFREE;
-       }
-       return ret;
+       return !page_mapcount(page) ? true : false;
 }
 
 static int page_not_mapped(struct page *page)
@@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page)
  * Called from munlock code.  Checks all of the VMAs mapping the page
  * to make sure nobody else has this page mlocked. The page will be
  * returned with PG_mlocked cleared if no other vmas have it mlocked.
- *
- * Return values are:
- *
- * SWAP_AGAIN  - no vma is holding page mlocked, or,
- * SWAP_AGAIN  - page mapped in mlocked vma -- couldn't acquire mmap sem
- * SWAP_FAIL   - page cannot be located at present
- * SWAP_MLOCK  - page is now mlocked.
  */
-int try_to_munlock(struct page *page)
-{
-       int ret;
-       struct rmap_private rp = {
-               .flags = TTU_MUNLOCK,
-               .lazyfreed = 0,
-       };
 
+void try_to_munlock(struct page *page)
+{
        struct rmap_walk_control rwc = {
                .rmap_one = try_to_unmap_one,
-               .arg = &rp,
+               .arg = (void *)TTU_MUNLOCK,
                .done = page_not_mapped,
                .anon_lock = page_lock_anon_vma_read,
 
        };
 
        VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+       VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
 
-       ret = rmap_walk(page, &rwc);
-       return ret;
+       rmap_walk(page, &rwc);
 }
 
 void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * LOCKED.
  */
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
                bool locked)
 {
        struct anon_vma *anon_vma;
        pgoff_t pgoff_start, pgoff_end;
        struct anon_vma_chain *avc;
-       int ret = SWAP_AGAIN;
 
        if (locked) {
                anon_vma = page_anon_vma(page);
@@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
                anon_vma = rmap_walk_anon_lock(page, rwc);
        }
        if (!anon_vma)
-               return ret;
+               return;
 
        pgoff_start = page_to_pgoff(page);
        pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
 
-               ret = rwc->rmap_one(page, vma, address, rwc->arg);
-               if (ret != SWAP_AGAIN)
+               if (!rwc->rmap_one(page, vma, address, rwc->arg))
                        break;
                if (rwc->done && rwc->done(page))
                        break;
@@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
 
        if (!locked)
                anon_vma_unlock_read(anon_vma);
-       return ret;
 }
 
 /*
@@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * LOCKED.
  */
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
                bool locked)
 {
        struct address_space *mapping = page_mapping(page);
        pgoff_t pgoff_start, pgoff_end;
        struct vm_area_struct *vma;
-       int ret = SWAP_AGAIN;
 
        /*
         * The page lock not only makes sure that page->mapping cannot
@@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
        VM_BUG_ON_PAGE(!PageLocked(page), page);
 
        if (!mapping)
-               return ret;
+               return;
 
        pgoff_start = page_to_pgoff(page);
        pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
 
-               ret = rwc->rmap_one(page, vma, address, rwc->arg);
-               if (ret != SWAP_AGAIN)
+               if (!rwc->rmap_one(page, vma, address, rwc->arg))
                        goto done;
                if (rwc->done && rwc->done(page))
                        goto done;
@@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
 done:
        if (!locked)
                i_mmap_unlock_read(mapping);
-       return ret;
 }
 
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
 {
        if (unlikely(PageKsm(page)))
-               return rmap_walk_ksm(page, rwc);
+               rmap_walk_ksm(page, rwc);
        else if (PageAnon(page))
-               return rmap_walk_anon(page, rwc, false);
+               rmap_walk_anon(page, rwc, false);
        else
-               return rmap_walk_file(page, rwc, false);
+               rmap_walk_file(page, rwc, false);
 }
 
 /* Like rmap_walk, but caller holds relevant rmap lock */
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
 {
        /* no ksm support for now */
        VM_BUG_ON_PAGE(PageKsm(page), page);
        if (PageAnon(page))
-               return rmap_walk_anon(page, rwc, true);
+               rmap_walk_anon(page, rwc, true);
        else
-               return rmap_walk_file(page, rwc, true);
+               rmap_walk_file(page, rwc, true);
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
index 0fd21670b513f735e554b340adc0d86938e32d69..6bb4deb12e78b8e3724c40446069e2cd0731b4a4 100644 (file)
@@ -9,11 +9,12 @@
  * as published by the Free Software Foundation; version 2
  * of the License.
  */
+#define pr_fmt(fmt) "rodata_test: " fmt
+
 #include <linux/uaccess.h>
 #include <asm/sections.h>
 
 const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
 
 void rodata_test(void)
 {
@@ -23,20 +24,20 @@ void rodata_test(void)
        /* test 1: read the value */
        /* If this test fails, some previous testrun has clobbered the state */
        if (!rodata_test_data) {
-               pr_err("rodata_test: test 1 fails (start data)\n");
+               pr_err("test 1 fails (start data)\n");
                return;
        }
 
        /* test 2: write to the variable; this should fault */
        if (!probe_kernel_write((void *)&rodata_test_data,
-                                               (void *)&zero, sizeof(zero))) {
-               pr_err("rodata_test: test data was not read only\n");
+                               (void *)&zero, sizeof(zero))) {
+               pr_err("test data was not read only\n");
                return;
        }
 
        /* test 3: check the value hasn't changed */
        if (rodata_test_data == zero) {
-               pr_err("rodata_test: test data was changed\n");
+               pr_err("test data was changed\n");
                return;
        }
 
@@ -44,13 +45,13 @@ void rodata_test(void)
        start = (unsigned long)__start_rodata;
        end = (unsigned long)__end_rodata;
        if (start & (PAGE_SIZE - 1)) {
-               pr_err("rodata_test: start of .rodata is not page size aligned\n");
+               pr_err("start of .rodata is not page size aligned\n");
                return;
        }
        if (end & (PAGE_SIZE - 1)) {
-               pr_err("rodata_test: end of .rodata is not page size aligned\n");
+               pr_err("end of .rodata is not page size aligned\n");
                return;
        }
 
-       pr_info("rodata_test: all tests were successful\n");
+       pr_info("all tests were successful\n");
 }
index 807d86c769088681b47f41c0cc0a721307bd16c1..1880d482a0cbeb0174c9a55d45b842ab2d36685f 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3879,7 +3879,12 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
 
        prev = cachep->cpu_cache;
        cachep->cpu_cache = cpu_cache;
-       kick_all_cpus_sync();
+       /*
+        * Without a previous cpu_cache there's no need to synchronize remote
+        * cpus, so skip the IPIs.
+        */
+       if (prev)
+               kick_all_cpus_sync();
 
        check_irq_on();
        cachep->batchcount = batchcount;
index db6bf3c97ea2cd7e593922ab88840b84d67cdd88..6903c8fc308502eab4ca3570075cd6a881efa724 100644 (file)
@@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
 
 unsigned long usemap_size(void)
 {
-       unsigned long size_bytes;
-       size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
-       size_bytes = roundup(size_bytes, sizeof(unsigned long));
-       return size_bytes;
+       return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
index d8d9ee9e311a6f25520f304a01c77f24a3538584..98d08b4579faa42b8ef43ba4563ed6cb76f19893 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,7 +46,7 @@ int page_cluster;
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 #endif
@@ -571,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 }
 
 
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
                            void *arg)
 {
-       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-               int file = page_is_file_cache(page);
-               int lru = page_lru_base_type(page);
+       if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+           !PageUnevictable(page)) {
+               bool active = PageActive(page);
 
-               del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+               del_page_from_lru_list(page, lruvec,
+                                      LRU_INACTIVE_ANON + active);
                ClearPageActive(page);
                ClearPageReferenced(page);
-               add_page_to_lru_list(page, lruvec, lru);
+               /*
+                * lazyfree pages are clean anonymous pages. They have
+                * SwapBacked flag cleared to distinguish them from normal
+                * anonymous pages.
+                */
+               ClearPageSwapBacked(page);
+               add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
 
-               __count_vm_event(PGDEACTIVATE);
-               update_page_reclaim_stat(lruvec, file, 0);
+               __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+               update_page_reclaim_stat(lruvec, 1, 0);
        }
 }
 
@@ -614,9 +621,9 @@ void lru_add_drain_cpu(int cpu)
        if (pagevec_count(pvec))
                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
-       pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+       pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
        if (pagevec_count(pvec))
-               pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+               pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
 
        activate_page_drain(cpu);
 }
@@ -648,22 +655,22 @@ void deactivate_file_page(struct page *page)
 }
 
 /**
- * deactivate_page - deactivate a page
+ * mark_page_lazyfree - make an anon page lazyfree
  * @page: page to deactivate
  *
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page.  This is done to accelerate the reclaim
- * of @page.
+ * mark_page_lazyfree() moves @page to the inactive file list.
+ * This is done to accelerate the reclaim of @page.
  */
-void deactivate_page(struct page *page)
+void mark_page_lazyfree(struct page *page)
 {
-       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+       if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+           !PageUnevictable(page)) {
+               struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
 
                get_page(page);
                if (!pagevec_add(pvec, page) || PageCompound(page))
-                       pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
-               put_cpu_var(lru_deactivate_pvecs);
+                       pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+               put_cpu_var(lru_lazyfree_pvecs);
        }
 }
 
@@ -703,7 +710,7 @@ void lru_add_drain_all(void)
                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
                    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
-                   pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+                   pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
index b1ccb58ad397403214a220e4a0ac7901a6b6ae1e..aa1c415f4abd6785ee7ee5d6f3f56ab74cd7bc60 100644 (file)
@@ -241,8 +241,10 @@ int enable_swap_slots_cache(void)
 
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
                                alloc_swap_slot_cache, free_slot_cache);
-       if (ret < 0)
+       if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+                              "without swap slots cache.\n", __func__))
                goto out_unlock;
+
        swap_slot_cache_initialized = true;
        __reenable_swap_slots_cache();
 out_unlock:
index 473b71e052a8ed29df7c496af747e2b43491c782..7bfb9bd1ca211ef0682085b99e9723cfeab9b5fe 100644 (file)
@@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                        /*
                         * We might race against get_swap_page() and stumble
                         * across a SWAP_HAS_CACHE swap_map entry whose page
-                        * has not been brought into the swapcache yet, while
-                        * the other end is scheduled away waiting on discard
-                        * I/O completion at scan_swap_map().
-                        *
-                        * In order to avoid turning this transitory state
-                        * into a permanent loop around this -EEXIST case
-                        * if !CONFIG_PREEMPT and the I/O completion happens
-                        * to be waiting on the CPU waitqueue where we are now
-                        * busy looping, we just conditionally invoke the
-                        * scheduler here, if there are some more important
-                        * tasks to run.
+                        * has not been brought into the swapcache yet.
                         */
                        cond_resched();
                        continue;
index 178130880b908515a105eccf9fa428f7cf61719a..b86b2aca3fb9b10321ea8f6770c7d10905724b2d 100644 (file)
@@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
                ci_tail = ci + tail;
                spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
                cluster_set_next(ci_tail, idx);
-               unlock_cluster(ci_tail);
+               spin_unlock(&ci_tail->lock);
                cluster_set_next_flag(&list->tail, idx, 0);
        }
 }
@@ -672,6 +672,9 @@ checks:
                else
                        goto done;
        }
+       si->swap_map[offset] = usage;
+       inc_cluster_info_page(si, si->cluster_info, offset);
+       unlock_cluster(ci);
 
        if (offset == si->lowest_bit)
                si->lowest_bit++;
@@ -685,9 +688,6 @@ checks:
                plist_del(&si->avail_list, &swap_avail_head);
                spin_unlock(&swap_avail_lock);
        }
-       si->swap_map[offset] = usage;
-       inc_cluster_info_page(si, si->cluster_info, offset);
-       unlock_cluster(ci);
        si->cluster_next = offset + 1;
        slots[n_ret++] = swp_entry(si->type, offset);
 
@@ -1079,8 +1079,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
                p = swap_info_get_cont(entries[i], prev);
                if (p)
                        swap_entry_free(p, entries[i]);
-               else
-                       break;
                prev = p;
        }
        if (p)
@@ -1111,6 +1109,18 @@ int page_swapcount(struct page *page)
        return count;
 }
 
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+       int count = 0;
+       pgoff_t offset = swp_offset(entry);
+       struct swap_cluster_info *ci;
+
+       ci = lock_cluster_or_swap_info(si, offset);
+       count = swap_count(si->swap_map[offset]);
+       unlock_cluster_or_swap_info(si, ci);
+       return count;
+}
+
 /*
  * How many references to @entry are currently swapped out?
  * This does not give an exact answer when swap count is continued,
@@ -1119,17 +1129,11 @@ int page_swapcount(struct page *page)
 int __swp_swapcount(swp_entry_t entry)
 {
        int count = 0;
-       pgoff_t offset;
        struct swap_info_struct *si;
-       struct swap_cluster_info *ci;
 
        si = __swap_info_get(entry);
-       if (si) {
-               offset = swp_offset(entry);
-               ci = lock_cluster_or_swap_info(si, offset);
-               count = swap_count(si->swap_map[offset]);
-               unlock_cluster_or_swap_info(si, ci);
-       }
+       if (si)
+               count = swap_swapcount(si, entry);
        return count;
 }
 
@@ -1291,7 +1295,8 @@ int free_swap_and_cache(swp_entry_t entry)
                 * Also recheck PageSwapCache now page is locked (above).
                 */
                if (PageSwapCache(page) && !PageWriteback(page) &&
-                   (!page_mapped(page) || mem_cgroup_swap_full(page))) {
+                   (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
+                   !swap_swapcount(p, entry)) {
                        delete_from_swap_cache(page);
                        SetPageDirty(page);
                }
index 6263affdef8866135f28256b5b1f5e598515d204..83a059e8cd1de2c950dfc4ad790fcf1140df34b6 100644 (file)
@@ -266,9 +266,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
        pgoff_t         index;
        int             i;
 
-       cleancache_invalidate_inode(mapping);
        if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
-               return;
+               goto out;
 
        /* Offsets within partial pages */
        partial_start = lstart & (PAGE_SIZE - 1);
@@ -363,7 +362,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
         * will be released, just zeroed, so we can bail out now.
         */
        if (start >= end)
-               return;
+               goto out;
 
        index = start;
        for ( ; ; ) {
@@ -410,6 +409,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
                pagevec_release(&pvec);
                index++;
        }
+
+out:
        cleancache_invalidate_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -623,7 +624,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
        int ret2 = 0;
        int did_range_unmap = 0;
 
-       cleancache_invalidate_inode(mapping);
+       if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+               goto out;
+
        pagevec_init(&pvec, 0);
        index = start;
        while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
@@ -686,6 +689,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
                cond_resched();
                index++;
        }
+
+out:
        cleancache_invalidate_inode(mapping);
        return ret;
 }
index bc8031ef994d57a1d1622468f8df6d745853562b..4e7ed65842aff72beb90815262dacd29987cbcf8 100644 (file)
@@ -97,8 +97,13 @@ struct scan_control {
        /* Can pages be swapped as part of reclaim? */
        unsigned int may_swap:1;
 
-       /* Can cgroups be reclaimed below their normal consumption range? */
-       unsigned int may_thrash:1;
+       /*
+        * Cgroups are not reclaimed below their configured memory.low,
+        * unless we threaten to OOM. If any cgroups are skipped due to
+        * memory.low and nothing was reclaimed, go back for memory.low.
+        */
+       unsigned int memcg_low_reclaim:1;
+       unsigned int memcg_low_skipped:1;
 
        unsigned int hibernation_mode:1;
 
@@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
        return nr;
 }
 
-bool pgdat_reclaimable(struct pglist_data *pgdat)
-{
-       return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
-               pgdat_reclaimable_pages(pgdat) * 6;
-}
-
 /**
  * lruvec_lru_size -  Returns the number of pages on the given LRU list.
  * @lruvec: lru vector
@@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page,
         * Anonymous pages are not handled by flushers and must be written
         * from reclaim context. Do not stall reclaim based on them
         */
-       if (!page_is_file_cache(page)) {
+       if (!page_is_file_cache(page) ||
+           (PageAnon(page) && !PageSwapBacked(page))) {
                *dirty = false;
                *writeback = false;
                return;
@@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                int may_enter_fs;
                enum page_references references = PAGEREF_RECLAIM_CLEAN;
                bool dirty, writeback;
-               bool lazyfree = false;
-               int ret = SWAP_SUCCESS;
 
                cond_resched();
 
@@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                sc->nr_scanned++;
 
                if (unlikely(!page_evictable(page)))
-                       goto cull_mlocked;
+                       goto activate_locked;
 
                if (!sc->may_unmap && page_mapped(page))
                        goto keep_locked;
 
                /* Double the slab pressure for mapped and swapcache pages */
-               if (page_mapped(page) || PageSwapCache(page))
+               if ((page_mapped(page) || PageSwapCache(page)) &&
+                   !(PageAnon(page) && !PageSwapBacked(page)))
                        sc->nr_scanned++;
 
                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
@@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
+                * Lazyfree page could be freed directly
                 */
-               if (PageAnon(page) && !PageSwapCache(page)) {
+               if (PageAnon(page) && PageSwapBacked(page) &&
+                   !PageSwapCache(page)) {
                        if (!(sc->gfp_mask & __GFP_IO))
                                goto keep_locked;
                        if (!add_to_swap(page, page_list))
                                goto activate_locked;
-                       lazyfree = true;
                        may_enter_fs = 1;
 
                        /* Adding to swap updated mapping */
@@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * The page is mapped into the page tables of one or more
                 * processes. Try to unmap it here.
                 */
-               if (page_mapped(page) && mapping) {
-                       switch (ret = try_to_unmap(page, lazyfree ?
-                               (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-                               (ttu_flags | TTU_BATCH_FLUSH))) {
-                       case SWAP_FAIL:
+               if (page_mapped(page)) {
+                       if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
                                nr_unmap_fail++;
                                goto activate_locked;
-                       case SWAP_AGAIN:
-                               goto keep_locked;
-                       case SWAP_MLOCK:
-                               goto cull_mlocked;
-                       case SWAP_LZFREE:
-                               goto lazyfree;
-                       case SWAP_SUCCESS:
-                               ; /* try to free the page below */
                        }
                }
 
@@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        }
                }
 
-lazyfree:
-               if (!mapping || !__remove_mapping(mapping, page, true))
-                       goto keep_locked;
+               if (PageAnon(page) && !PageSwapBacked(page)) {
+                       /* follow __remove_mapping for reference */
+                       if (!page_ref_freeze(page, 1))
+                               goto keep_locked;
+                       if (PageDirty(page)) {
+                               page_ref_unfreeze(page, 1);
+                               goto keep_locked;
+                       }
 
+                       count_vm_event(PGLAZYFREED);
+               } else if (!mapping || !__remove_mapping(mapping, page, true))
+                       goto keep_locked;
                /*
                 * At this point, we have no other references and there is
                 * no way to pick any more up (removed from LRU, removed
@@ -1280,9 +1277,6 @@ lazyfree:
                 */
                __ClearPageLocked(page);
 free_it:
-               if (ret == SWAP_LZFREE)
-                       count_vm_event(PGLAZYFREED);
-
                nr_reclaimed++;
 
                /*
@@ -1292,20 +1286,16 @@ free_it:
                list_add(&page->lru, &free_pages);
                continue;
 
-cull_mlocked:
-               if (PageSwapCache(page))
-                       try_to_free_swap(page);
-               unlock_page(page);
-               list_add(&page->lru, &ret_pages);
-               continue;
-
 activate_locked:
                /* Not a candidate for swapping, so reclaim swap space. */
-               if (PageSwapCache(page) && mem_cgroup_swap_full(page))
+               if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+                                               PageMlocked(page)))
                        try_to_free_swap(page);
                VM_BUG_ON_PAGE(PageActive(page), page);
-               SetPageActive(page);
-               pgactivate++;
+               if (!PageMlocked(page)) {
+                       SetPageActive(page);
+                       pgactivate++;
+               }
 keep_locked:
                unlock_page(page);
 keep:
@@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
        }
 
        ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
-                       TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+                       TTU_IGNORE_ACCESS, NULL, true);
        list_splice(&clean_pages, page_list);
        mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
        return ret;
@@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        unsigned long nr_taken = 0;
        unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
        unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
-       unsigned long skipped = 0, total_skipped = 0;
+       unsigned long skipped = 0;
        unsigned long scan, nr_pages;
        LIST_HEAD(pages_skipped);
 
        for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
-                                       !list_empty(src);) {
+                                       !list_empty(src); scan++) {
                struct page *page;
 
                page = lru_to_page(src);
@@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        continue;
                }
 
-               /*
-                * Account for scanned and skipped separetly to avoid the pgdat
-                * being prematurely marked unreclaimable by pgdat_reclaimable.
-                */
-               scan++;
-
                switch (__isolate_lru_page(page, mode)) {
                case 0:
                        nr_pages = hpage_nr_pages(page);
@@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        if (!list_empty(&pages_skipped)) {
                int zid;
 
+               list_splice(&pages_skipped, src);
                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                        if (!nr_skipped[zid])
                                continue;
@@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
                        skipped += nr_skipped[zid];
                }
-
-               /*
-                * Account skipped pages as a partial scan as the pgdat may be
-                * close to unreclaimable. If the LRU list is empty, account
-                * skipped pages as a full scan.
-                */
-               total_skipped = list_empty(src) ? skipped : skipped >> 2;
-
-               list_splice(&pages_skipped, src);
        }
-       *nr_scanned = scan + total_skipped;
+       *nr_scanned = scan;
        trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
                                    scan, skipped, nr_taken, mode, lru);
        update_lru_sizes(lruvec, lru, nr_zone_taken);
@@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        reclaim_stat->recent_scanned[file] += nr_taken;
 
        if (global_reclaim(sc)) {
-               __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
                if (current_is_kswapd())
                        __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
                else
@@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        if (nr_taken == 0)
                return 0;
 
-       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
                                &stat, false);
 
        spin_lock_irq(&pgdat->lru_lock);
@@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
        reclaim_stat->recent_scanned[file] += nr_taken;
 
-       if (global_reclaim(sc))
-               __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
        __count_vm_events(PGREFILL, nr_scanned);
 
        spin_unlock_irq(&pgdat->lru_lock);
@@ -2033,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
  * Both inactive lists should also be large enough that each inactive
  * page has a chance to be referenced again before it is reclaimed.
  *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
  * on this LRU, maintained by the pageout code. A zone->inactive_ratio
  * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                               struct scan_control *sc, bool trace)
+                                struct mem_cgroup *memcg,
+                                struct scan_control *sc, bool actual_reclaim)
 {
-       unsigned long inactive_ratio;
-       unsigned long inactive, active;
-       enum lru_list inactive_lru = file * LRU_FILE;
        enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       enum lru_list inactive_lru = file * LRU_FILE;
+       unsigned long inactive, active;
+       unsigned long inactive_ratio;
+       unsigned long refaults;
        unsigned long gb;
 
        /*
@@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
        inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
        active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
+       if (memcg)
+               refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
        else
-               inactive_ratio = 1;
+               refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+       /*
+        * When refaults are being observed, it means a new workingset
+        * is being established. Disable active list protection to get
+        * rid of the stale workingset quickly.
+        */
+       if (file && actual_reclaim && lruvec->refaults != refaults) {
+               inactive_ratio = 0;
+       } else {
+               gb = (inactive + active) >> (30 - PAGE_SHIFT);
+               if (gb)
+                       inactive_ratio = int_sqrt(10 * gb);
+               else
+                       inactive_ratio = 1;
+       }
 
-       if (trace)
-               trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
-                               sc->reclaim_idx,
-                               lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-                               lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-                               inactive_ratio, file);
+       if (actual_reclaim)
+               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+                       inactive_ratio, file);
 
        return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-                                struct lruvec *lruvec, struct scan_control *sc)
+                                struct lruvec *lruvec, struct mem_cgroup *memcg,
+                                struct scan_control *sc)
 {
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru),
+                                        memcg, sc, true))
                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
                return 0;
        }
@@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        unsigned long anon_prio, file_prio;
        enum scan_balance scan_balance;
        unsigned long anon, file;
-       bool force_scan = false;
        unsigned long ap, fp;
        enum lru_list lru;
-       bool some_scanned;
-       int pass;
-
-       /*
-        * If the zone or memcg is small, nr[l] can be 0.  This
-        * results in no scanning on this priority and a potential
-        * priority drop.  Global direct reclaim can go to the next
-        * zone and tends to have no problems. Global kswapd is for
-        * zone balancing and it needs to scan a minimum amount. When
-        * reclaiming for a memcg, a priority drop can cause high
-        * latencies, so it's better to scan a minimum amount there as
-        * well.
-        */
-       if (current_is_kswapd()) {
-               if (!pgdat_reclaimable(pgdat))
-                       force_scan = true;
-               if (!mem_cgroup_online(memcg))
-                       force_scan = true;
-       }
-       if (!global_reclaim(sc))
-               force_scan = true;
 
        /* If we have no swap space, do not bother scanning anon pages. */
        if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * lruvec even if it has plenty of old anonymous pages unless the
         * system is under heavy pressure.
         */
-       if (!inactive_list_is_low(lruvec, true, sc, false) &&
+       if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
            lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
@@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       some_scanned = false;
-       /* Only use force_scan on second pass. */
-       for (pass = 0; !some_scanned && pass < 2; pass++) {
-               *lru_pages = 0;
-               for_each_evictable_lru(lru) {
-                       int file = is_file_lru(lru);
-                       unsigned long size;
-                       unsigned long scan;
-
-                       size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-                       scan = size >> sc->priority;
-
-                       if (!scan && pass && force_scan)
-                               scan = min(size, SWAP_CLUSTER_MAX);
-
-                       switch (scan_balance) {
-                       case SCAN_EQUAL:
-                               /* Scan lists relative to size */
-                               break;
-                       case SCAN_FRACT:
-                               /*
-                                * Scan types proportional to swappiness and
-                                * their relative recent reclaim efficiency.
-                                */
-                               scan = div64_u64(scan * fraction[file],
-                                                       denominator);
-                               break;
-                       case SCAN_FILE:
-                       case SCAN_ANON:
-                               /* Scan one type exclusively */
-                               if ((scan_balance == SCAN_FILE) != file) {
-                                       size = 0;
-                                       scan = 0;
-                               }
-                               break;
-                       default:
-                               /* Look ma, no brain */
-                               BUG();
-                       }
+       *lru_pages = 0;
+       for_each_evictable_lru(lru) {
+               int file = is_file_lru(lru);
+               unsigned long size;
+               unsigned long scan;
 
-                       *lru_pages += size;
-                       nr[lru] = scan;
+               size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+               scan = size >> sc->priority;
+               /*
+                * If the cgroup's already been deleted, make sure to
+                * scrape out the remaining cache.
+                */
+               if (!scan && !mem_cgroup_online(memcg))
+                       scan = min(size, SWAP_CLUSTER_MAX);
 
+               switch (scan_balance) {
+               case SCAN_EQUAL:
+                       /* Scan lists relative to size */
+                       break;
+               case SCAN_FRACT:
                        /*
-                        * Skip the second pass and don't force_scan,
-                        * if we found something to scan.
+                        * Scan types proportional to swappiness and
+                        * their relative recent reclaim efficiency.
                         */
-                       some_scanned |= !!scan;
+                       scan = div64_u64(scan * fraction[file],
+                                        denominator);
+                       break;
+               case SCAN_FILE:
+               case SCAN_ANON:
+                       /* Scan one type exclusively */
+                       if ((scan_balance == SCAN_FILE) != file) {
+                               size = 0;
+                               scan = 0;
+                       }
+                       break;
+               default:
+                       /* Look ma, no brain */
+                       BUG();
                }
+
+               *lru_pages += size;
+               nr[lru] = scan;
        }
 }
 
@@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
                                nr[lru] -= nr_to_scan;
 
                                nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                           lruvec, sc);
+                                                           lruvec, memcg, sc);
                        }
                }
 
@@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false, sc, true))
+       if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                        unsigned long scanned;
 
                        if (mem_cgroup_low(root, memcg)) {
-                               if (!sc->may_thrash)
+                               if (!sc->memcg_low_reclaim) {
+                                       sc->memcg_low_skipped = 1;
                                        continue;
-                               mem_cgroup_events(memcg, MEMCG_LOW, 1);
+                               }
+                               mem_cgroup_event(memcg, MEMCG_LOW);
                        }
 
                        reclaimed = sc->nr_reclaimed;
@@ -2620,6 +2586,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
        } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 
+       /*
+        * Kswapd gives up on balancing particular nodes after too
+        * many failures to reclaim anything from them and goes to
+        * sleep. On reclaim progress, reset the failure counter. A
+        * successful direct reclaim run will revive a dormant kswapd.
+        */
+       if (reclaimable)
+               pgdat->kswapd_failures = 0;
+
        return reclaimable;
 }
 
@@ -2694,10 +2669,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                 GFP_KERNEL | __GFP_HARDWALL))
                                continue;
 
-                       if (sc->priority != DEF_PRIORITY &&
-                           !pgdat_reclaimable(zone->zone_pgdat))
-                               continue;       /* Let kswapd poll it */
-
                        /*
                         * If we already have plenty of memory free for
                         * compaction in this zone, don't free any more.
@@ -2752,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        sc->gfp_mask = orig_mask;
 }
 
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+       do {
+               unsigned long refaults;
+               struct lruvec *lruvec;
+
+               if (memcg)
+                       refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
+               else
+                       refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+               lruvec = mem_cgroup_lruvec(pgdat, memcg);
+               lruvec->refaults = refaults;
+       } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
 /*
  * This is the main entry point to direct page reclaim.
  *
@@ -2772,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                          struct scan_control *sc)
 {
        int initial_priority = sc->priority;
+       pg_data_t *last_pgdat;
+       struct zoneref *z;
+       struct zone *zone;
 retry:
        delayacct_freepages_start();
 
@@ -2798,6 +2791,15 @@ retry:
                        sc->may_writepage = 1;
        } while (--sc->priority >= 0);
 
+       last_pgdat = NULL;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+                                       sc->nodemask) {
+               if (zone->zone_pgdat == last_pgdat)
+                       continue;
+               last_pgdat = zone->zone_pgdat;
+               snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+       }
+
        delayacct_freepages_end();
 
        if (sc->nr_reclaimed)
@@ -2808,16 +2810,17 @@ retry:
                return 1;
 
        /* Untapped cgroup reserves?  Don't OOM, retry. */
-       if (!sc->may_thrash) {
+       if (sc->memcg_low_skipped) {
                sc->priority = initial_priority;
-               sc->may_thrash = 1;
+               sc->memcg_low_reclaim = 1;
+               sc->memcg_low_skipped = 0;
                goto retry;
        }
 
        return 0;
 }
 
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
 {
        struct zone *zone;
        unsigned long pfmemalloc_reserve = 0;
@@ -2825,10 +2828,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
        int i;
        bool wmark_ok;
 
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
+
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
-               if (!managed_zone(zone) ||
-                   pgdat_reclaimable_pages(pgdat) == 0)
+               if (!managed_zone(zone))
+                       continue;
+
+               if (!zone_reclaimable_pages(zone))
                        continue;
 
                pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
                /* Throttle based on the first usable node */
                pgdat = zone->zone_pgdat;
-               if (pfmemalloc_watermark_ok(pgdat))
+               if (allow_direct_reclaim(pgdat))
                        goto out;
                break;
        }
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         */
        if (!(gfp_mask & __GFP_FS)) {
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                       pfmemalloc_watermark_ok(pgdat), HZ);
+                       allow_direct_reclaim(pgdat), HZ);
 
                goto check_pending;
        }
 
        /* Throttle until kswapd wakes the process */
        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               pfmemalloc_watermark_ok(pgdat));
+               allow_direct_reclaim(pgdat));
 
 check_pending:
        if (fatal_signal_pending(current))
@@ -2950,7 +2958,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        unsigned long nr_reclaimed;
        struct scan_control sc = {
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
                .reclaim_idx = gfp_zone(gfp_mask),
                .order = order,
                .nodemask = nodemask,
@@ -3030,7 +3038,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
        int nid;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
-               .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+               .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
                .reclaim_idx = MAX_NR_ZONES - 1,
                .target_mem_cgroup = memcg,
@@ -3076,7 +3084,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-               if (inactive_list_is_low(lruvec, false, sc, true))
+               if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);
 
@@ -3084,22 +3092,44 @@ static void age_active_anon(struct pglist_data *pgdat,
        } while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if there is an eligible zone balanced for the request order
+ * and classzone_idx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-       unsigned long mark = high_wmark_pages(zone);
+       int i;
+       unsigned long mark = -1;
+       struct zone *zone;
 
-       if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
-               return false;
+       for (i = 0; i <= classzone_idx; i++) {
+               zone = pgdat->node_zones + i;
+
+               if (!managed_zone(zone))
+                       continue;
+
+               mark = high_wmark_pages(zone);
+               if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+                       return true;
+       }
 
        /*
-        * If any eligible zone is balanced then the node is not considered
-        * to be congested or dirty
+        * If a node has no populated zone within classzone_idx, it does not
+        * need balancing by definition. This can happen if a zone-restricted
+        * allocation tries to wake a remote kswapd.
         */
-       clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
-       clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
-       clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+       if (mark == -1)
+               return true;
 
-       return true;
+       return false;
+}
+
+/* Clear pgdat state for congested, dirty or under writeback. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+       clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+       clear_bit(PGDAT_DIRTY, &pgdat->flags);
+       clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
 }
 
 /*
@@ -3110,11 +3140,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
  */
 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 {
-       int i;
-
        /*
         * The throttled processes are normally woken up in balance_pgdat() as
-        * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+        * soon as allow_direct_reclaim() is true. But there is a potential
         * race between when kswapd checks the watermarks and a process gets
         * throttled. There is also a potential race if processes get
         * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,17 +3156,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
        if (waitqueue_active(&pgdat->pfmemalloc_wait))
                wake_up_all(&pgdat->pfmemalloc_wait);
 
-       for (i = 0; i <= classzone_idx; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               if (!managed_zone(zone))
-                       continue;
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return true;
 
-               if (!zone_balanced(zone, order, classzone_idx))
-                       return false;
+       if (pgdat_balanced(pgdat, order, classzone_idx)) {
+               clear_pgdat_congested(pgdat);
+               return true;
        }
 
-       return true;
+       return false;
 }
 
 /*
@@ -3214,9 +3241,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
        count_vm_event(PAGEOUTRUN);
 
        do {
+               unsigned long nr_reclaimed = sc.nr_reclaimed;
                bool raise_priority = true;
 
-               sc.nr_reclaimed = 0;
                sc.reclaim_idx = classzone_idx;
 
                /*
@@ -3241,23 +3268,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                }
 
                /*
-                * Only reclaim if there are no eligible zones. Check from
-                * high to low zone as allocations prefer higher zones.
-                * Scanning from low to high zone would allow congestion to be
-                * cleared during a very small window when a small low
-                * zone was balanced even under extreme pressure when the
-                * overall node may be congested. Note that sc.reclaim_idx
-                * is not used as buffer_heads_over_limit may have adjusted
-                * it.
+                * Only reclaim if there are no eligible zones. Note that
+                * sc.reclaim_idx is not used as buffer_heads_over_limit may
+                * have adjusted it.
                 */
-               for (i = classzone_idx; i >= 0; i--) {
-                       zone = pgdat->node_zones + i;
-                       if (!managed_zone(zone))
-                               continue;
-
-                       if (zone_balanced(zone, sc.order, classzone_idx))
-                               goto out;
-               }
+               if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+                       goto out;
 
                /*
                 * Do some background aging of the anon list, to give
@@ -3271,7 +3287,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * If we're getting trouble reclaiming, start doing writepage
                 * even in laptop mode.
                 */
-               if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+               if (sc.priority < DEF_PRIORITY - 2)
                        sc.may_writepage = 1;
 
                /* Call soft limit reclaim before calling shrink_node. */
@@ -3295,7 +3311,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * able to safely make forward progress. Wake them
                 */
                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                               pfmemalloc_watermark_ok(pgdat))
+                               allow_direct_reclaim(pgdat))
                        wake_up_all(&pgdat->pfmemalloc_wait);
 
                /* Check if kswapd should be suspending */
@@ -3306,11 +3322,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 * Raise priority if scanning rate is too low or there was no
                 * progress in reclaiming pages
                 */
-               if (raise_priority || !sc.nr_reclaimed)
+               nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+               if (raise_priority || !nr_reclaimed)
                        sc.priority--;
        } while (sc.priority >= 1);
 
+       if (!sc.nr_reclaimed)
+               pgdat->kswapd_failures++;
+
 out:
+       snapshot_refaults(NULL, pgdat);
        /*
         * Return the order kswapd stopped reclaiming at as
         * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3320,6 +3341,22 @@ out:
        return sc.order;
 }
 
+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not woken recently,
+ * the value is MAX_NR_ZONES which is not a valid index. This compares a
+ * given classzone and returns it or the highest classzone index kswapd
+ * was recently woken for.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+                                          enum zone_type classzone_idx)
+{
+       if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+               return classzone_idx;
+
+       return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
 static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
                                unsigned int classzone_idx)
 {
@@ -3331,7 +3368,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 
        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
-       /* Try to sleep for a short interval */
+       /*
+        * Try to sleep for a short interval. Note that kcompactd will only be
+        * woken if it is possible to sleep for a short interval. This is
+        * deliberate on the assumption that if reclaim cannot keep an
+        * eligible zone balanced, it's also unlikely that compaction will
+        * succeed.
+        */
        if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                /*
                 * Compaction records what page blocks it recently failed to
@@ -3355,7 +3398,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
                 * the previous request that slept prematurely.
                 */
                if (remaining) {
-                       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+                       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
                        pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
                }
 
@@ -3409,7 +3452,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  */
 static int kswapd(void *p)
 {
-       unsigned int alloc_order, reclaim_order, classzone_idx;
+       unsigned int alloc_order, reclaim_order;
+       unsigned int classzone_idx = MAX_NR_ZONES - 1;
        pg_data_t *pgdat = (pg_data_t*)p;
        struct task_struct *tsk = current;
 
@@ -3439,20 +3483,23 @@ static int kswapd(void *p)
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
        set_freezable();
 
-       pgdat->kswapd_order = alloc_order = reclaim_order = 0;
-       pgdat->kswapd_classzone_idx = classzone_idx = 0;
+       pgdat->kswapd_order = 0;
+       pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
        for ( ; ; ) {
                bool ret;
 
+               alloc_order = reclaim_order = pgdat->kswapd_order;
+               classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
 kswapd_try_sleep:
                kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                                        classzone_idx);
 
                /* Read the new order and classzone_idx */
                alloc_order = reclaim_order = pgdat->kswapd_order;
-               classzone_idx = pgdat->kswapd_classzone_idx;
+               classzone_idx = kswapd_classzone_idx(pgdat, 0);
                pgdat->kswapd_order = 0;
-               pgdat->kswapd_classzone_idx = 0;
+               pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
 
                ret = try_to_freeze();
                if (kthread_should_stop())
@@ -3478,9 +3525,6 @@ kswapd_try_sleep:
                reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
                if (reclaim_order < alloc_order)
                        goto kswapd_try_sleep;
-
-               alloc_order = reclaim_order = pgdat->kswapd_order;
-               classzone_idx = pgdat->kswapd_classzone_idx;
        }
 
        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3496,7 +3540,6 @@ kswapd_try_sleep:
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
        pg_data_t *pgdat;
-       int z;
 
        if (!managed_zone(zone))
                return;
@@ -3504,22 +3547,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
                return;
        pgdat = zone->zone_pgdat;
-       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+                                                          classzone_idx);
        pgdat->kswapd_order = max(pgdat->kswapd_order, order);
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
 
-       /* Only wake kswapd if all zones are unbalanced */
-       for (z = 0; z <= classzone_idx; z++) {
-               zone = pgdat->node_zones + z;
-               if (!managed_zone(zone))
-                       continue;
+       /* Hopeless node, leave it to direct reclaim */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+               return;
 
-               if (zone_balanced(zone, order, classzone_idx))
-                       return;
-       }
+       if (pgdat_balanced(pgdat, order, classzone_idx))
+               return;
 
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
        wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
@@ -3725,7 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
        int classzone_idx = gfp_zone(gfp_mask);
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
                .order = order,
                .priority = NODE_RECLAIM_PRIORITY,
                .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
@@ -3779,9 +3820,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
            sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                return NODE_RECLAIM_FULL;
 
-       if (!pgdat_reclaimable(pgdat))
-               return NODE_RECLAIM_FULL;
-
        /*
         * Do not scan if the allocation should not be delayed.
         */
index 5a4f5c5a31e88ee558f536d22f61f05a3fd13c45..f5fa1bd1eb1656cb4e608bd301c83e92f66ac3b2 100644 (file)
@@ -954,7 +954,6 @@ const char * const vmstat_text[] = {
        "nr_unevictable",
        "nr_isolated_anon",
        "nr_isolated_file",
-       "nr_pages_scanned",
        "workingset_refault",
        "workingset_activate",
        "workingset_nodereclaim",
@@ -992,6 +991,7 @@ const char * const vmstat_text[] = {
        "pgfree",
        "pgactivate",
        "pgdeactivate",
+       "pglazyfree",
 
        "pgfault",
        "pgmajfault",
@@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg)
 {
 }
 
-/* Walk all the zones in a node and print using a callback */
+/*
+ * Walk zones in a node and print using a callback.
+ * If @assert_populated is true, only use callback for zones that are populated.
+ */
 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+               bool assert_populated,
                void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
 {
        struct zone *zone;
@@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
        unsigned long flags;
 
        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-               if (!populated_zone(zone))
+               if (assert_populated && !populated_zone(zone))
                        continue;
 
                spin_lock_irqsave(&zone->lock, flags);
@@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
 static int frag_show(struct seq_file *m, void *arg)
 {
        pg_data_t *pgdat = (pg_data_t *)arg;
-       walk_zones_in_node(m, pgdat, frag_show_print);
+       walk_zones_in_node(m, pgdat, true, frag_show_print);
        return 0;
 }
 
@@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
                seq_printf(m, "%6d ", order);
        seq_putc(m, '\n');
 
-       walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+       walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);
 
        return 0;
 }
@@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
        for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
                seq_printf(m, "%12s ", migratetype_names[mtype]);
        seq_putc(m, '\n');
-       walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+       walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);
 
        return 0;
 }
@@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
                seq_printf(m, "%12s ", migratetype_names[mtype]);
        seq_putc(m, '\n');
 
-       walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+       walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
 #endif /* CONFIG_PAGE_OWNER */
 }
 
@@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   "\n        min      %lu"
                   "\n        low      %lu"
                   "\n        high     %lu"
-                  "\n   node_scanned  %lu"
                   "\n        spanned  %lu"
                   "\n        present  %lu"
                   "\n        managed  %lu",
@@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   min_wmark_pages(zone),
                   low_wmark_pages(zone),
                   high_wmark_pages(zone),
-                  node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
                   zone->spanned_pages,
                   zone->present_pages,
                   zone->managed_pages);
 
-       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-               seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
-                               zone_page_state(zone, i));
-
        seq_printf(m,
                   "\n        protection: (%ld",
                   zone->lowmem_reserve[0]);
        for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
                seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
-       seq_printf(m,
-                  ")"
-                  "\n  pagesets");
+       seq_putc(m, ')');
+
+       /* If unpopulated, no other information is useful */
+       if (!populated_zone(zone)) {
+               seq_putc(m, '\n');
+               return;
+       }
+
+       for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+               seq_printf(m, "\n      %-12s %lu", vmstat_text[i],
+                               zone_page_state(zone, i));
+
+       seq_printf(m, "\n  pagesets");
        for_each_online_cpu(i) {
                struct per_cpu_pageset *pageset;
 
@@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                   "\n  node_unreclaimable:  %u"
                   "\n  start_pfn:           %lu"
                   "\n  node_inactive_ratio: %u",
-                  !pgdat_reclaimable(zone->zone_pgdat),
+                  pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
                   zone->zone_start_pfn,
                   zone->zone_pgdat->inactive_ratio);
        seq_putc(m, '\n');
 }
 
 /*
- * Output information about zones in @pgdat.
+ * Output information about zones in @pgdat.  All zones are printed regardless
+ * of whether they are populated or not: lowmem_reserve_ratio operates on the
+ * set of all zones and userspace would not be aware of such zones if they are
+ * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
  */
 static int zoneinfo_show(struct seq_file *m, void *arg)
 {
        pg_data_t *pgdat = (pg_data_t *)arg;
-       walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+       walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
        return 0;
 }
 
@@ -1586,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
                val = atomic_long_read(&vm_zone_stat[i]);
                if (val < 0) {
-                       switch (i) {
-                       case NR_PAGES_SCANNED:
-                               /*
-                                * This is often seen to go negative in
-                                * recent kernels, but not to go permanently
-                                * negative.  Whilst it would be nicer not to
-                                * have exceptions, rooting them out would be
-                                * another task, of rather low priority.
-                                */
-                               break;
-                       default:
-                               pr_warn("%s: %s %ld\n",
-                                       __func__, vmstat_text[i], val);
-                               err = -EINVAL;
-                               break;
-                       }
+                       pr_warn("%s: %s %ld\n",
+                               __func__, vmstat_text[i], val);
+                       err = -EINVAL;
                }
        }
        if (err)
@@ -1856,7 +1854,7 @@ static int unusable_show(struct seq_file *m, void *arg)
        if (!node_state(pgdat->node_id, N_MEMORY))
                return 0;
 
-       walk_zones_in_node(m, pgdat, unusable_show_print);
+       walk_zones_in_node(m, pgdat, true, unusable_show_print);
 
        return 0;
 }
@@ -1908,7 +1906,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
 {
        pg_data_t *pgdat = (pg_data_t *)arg;
 
-       walk_zones_in_node(m, pgdat, extfrag_show_print);
+       walk_zones_in_node(m, pgdat, true, extfrag_show_print);
 
        return 0;
 }
index eda05c71fa49e6e1e4f93a4029ddef04a4f8ab4c..b8c9ab6784795a0e7f8eff82a9a071d481e91a5f 100644 (file)
@@ -269,7 +269,6 @@ bool workingset_refault(void *shadow)
        lruvec = mem_cgroup_lruvec(pgdat, memcg);
        refault = atomic_long_read(&lruvec->inactive_age);
        active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
-       rcu_read_unlock();
 
        /*
         * The unsigned subtraction here gives an accurate distance
@@ -290,11 +289,15 @@ bool workingset_refault(void *shadow)
        refault_distance = (refault - eviction) & EVICTION_MASK;
 
        inc_node_state(pgdat, WORKINGSET_REFAULT);
+       inc_memcg_state(memcg, WORKINGSET_REFAULT);
 
        if (refault_distance <= active_file) {
                inc_node_state(pgdat, WORKINGSET_ACTIVATE);
+               inc_memcg_state(memcg, WORKINGSET_ACTIVATE);
+               rcu_read_unlock();
                return true;
        }
+       rcu_read_unlock();
        return false;
 }
 
@@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
        if (WARN_ON_ONCE(node->exceptional))
                goto out_invalid;
        inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+       inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
        __radix_tree_delete_node(&mapping->page_tree, node,
                                 workingset_update_node, mapping);
 
index 0545f5a8cabed76cb2c49cfd8c2d08f567bc4980..b67e74b22826d5f237d2eeaf16275c357ffcc20c 100644 (file)
@@ -46,6 +46,8 @@ ackowledge||acknowledge
 ackowledged||acknowledged
 acording||according
 activete||activate
+actived||activated
+actualy||actually
 acumulating||accumulating
 acumulator||accumulator
 adapater||adapter
@@ -76,6 +78,8 @@ algorritm||algorithm
 aligment||alignment
 alignement||alignment
 allign||align
+alligned||aligned
+allocatote||allocate
 allocatrd||allocated
 allocte||allocate
 allpication||application
@@ -141,6 +145,7 @@ asycronous||asynchronous
 asynchnous||asynchronous
 atomatically||automatically
 atomicly||atomically
+atempt||attempt
 attachement||attachment
 attched||attached
 attemps||attempts
@@ -270,6 +275,7 @@ comunication||communication
 conbination||combination
 conditionaly||conditionally
 conected||connected
+connecetd||connected
 configuartion||configuration
 configuratoin||configuration
 configuraton||configuration
@@ -291,11 +297,14 @@ continous||continuous
 continously||continuously
 continueing||continuing
 contraints||constraints
+contol||control
+contoller||controller
 controled||controlled
 controler||controller
 controll||control
 contruction||construction
 contry||country
+conuntry||country
 convertion||conversion
 convertor||converter
 convienient||convenient
@@ -310,6 +319,7 @@ coutner||counter
 cryptocraphic||cryptographic
 cunter||counter
 curently||currently
+cylic||cyclic
 dafault||default
 deafult||default
 deamon||daemon
@@ -398,6 +408,7 @@ efective||effective
 efficently||efficiently
 ehther||ether
 eigth||eight
+elementry||elementary
 eletronic||electronic
 embeded||embedded
 enabledi||enabled
@@ -443,6 +454,7 @@ extened||extended
 extensability||extensibility
 extention||extension
 extracter||extractor
+falied||failed
 faild||failed
 faill||fail
 failied||failed
@@ -492,6 +504,7 @@ futhermore||furthermore
 futrue||future
 gaurenteed||guaranteed
 generiously||generously
+genereate||generate
 genric||generic
 globel||global
 grabing||grabbing
@@ -513,8 +526,10 @@ hierachy||hierarchy
 hierarchie||hierarchy
 howver||however
 hsould||should
+hypervior||hypervisor
 hypter||hyper
 identidier||identifier
+iligal||illegal
 illigal||illegal
 imblance||imbalance
 immeadiately||immediately
@@ -600,6 +615,7 @@ intuative||intuitive
 invaid||invalid
 invalde||invalid
 invalide||invalid
+invalud||invalid
 invididual||individual
 invokation||invocation
 invokations||invocations
@@ -663,11 +679,14 @@ messsages||messages
 microprocesspr||microprocessor
 milliseonds||milliseconds
 minium||minimum
+minimam||minimum
 minumum||minimum
+misalinged||misaligned
 miscelleneous||miscellaneous
 misformed||malformed
 mispelled||misspelled
 mispelt||misspelt
+mising||missing
 miximum||maximum
 mmnemonic||mnemonic
 mnay||many
@@ -888,6 +907,7 @@ replys||replies
 reponse||response
 representaion||representation
 reqeust||request
+requestied||requested
 requiere||require
 requirment||requirement
 requred||required
@@ -981,6 +1001,7 @@ spinlcok||spinlock
 spinock||spinlock
 splitted||split
 spreaded||spread
+spurrious||spurious
 sructure||structure
 stablilization||stabilization
 staically||statically
@@ -1013,6 +1034,7 @@ superseeded||superseded
 suplied||supplied
 suported||supported
 suport||support
+supportet||supported
 suppored||supported
 supportin||supporting
 suppoted||supported
@@ -1056,6 +1078,7 @@ throught||through
 thses||these
 tiggered||triggered
 tipically||typically
+timout||timeout
 tmis||this
 torerable||tolerable
 tramsmitted||transmitted
@@ -1081,6 +1104,7 @@ unconditionaly||unconditionally
 underun||underrun
 unecessary||unnecessary
 unexecpted||unexpected
+unexepected||unexpected
 unexpcted||unexpected
 unexpectd||unexpected
 unexpeted||unexpected
@@ -1096,6 +1120,7 @@ unneded||unneeded
 unneedingly||unnecessarily
 unnsupported||unsupported
 unmached||unmatched
+unregester||unregister
 unresgister||unregister
 unrgesiter||unregister
 unsinged||unsigned
index 41642ba5e318a153d805720e47475436817be53e..dba889004ea1298248b70e09fccb4f46141df1c7 100644 (file)
@@ -15,21 +15,14 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
-TEST_GEN_FILES += userfaultfd_hugetlb
-TEST_GEN_FILES += userfaultfd_shmem
 TEST_GEN_FILES += mlock-random-test
 
 TEST_PROGS := run_vmtests
 
 include ../lib.mk
 
-$(OUTPUT)/userfaultfd: LDLIBS += -lpthread ../../../../usr/include/linux/kernel.h
-
-$(OUTPUT)/userfaultfd_hugetlb: userfaultfd.c ../../../../usr/include/linux/kernel.h
-       $(CC) $(CFLAGS) -DHUGETLB_TEST -O2 -o $@ $< -lpthread
-
-$(OUTPUT)/userfaultfd_shmem: userfaultfd.c  ../../../../usr/include/linux/kernel.h
-       $(CC) $(CFLAGS) -DSHMEM_TEST -O2 -o $@ $< -lpthread
+$(OUTPUT)/userfaultfd: ../../../../usr/include/linux/kernel.h
+$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
 
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
 
index c92f6cf31d0a85714ae519601902dd274774de03..3214a6456d1301a9c6158f66b2d8ad04257518fb 100755 (executable)
@@ -95,7 +95,7 @@ echo "      hugetlb regression testing."
 echo "--------------------"
 echo "running userfaultfd"
 echo "--------------------"
-./userfaultfd 128 32
+./userfaultfd anon 128 32
 if [ $? -ne 0 ]; then
        echo "[FAIL]"
        exitcode=1
@@ -107,7 +107,7 @@ echo "----------------------------"
 echo "running userfaultfd_hugetlb"
 echo "----------------------------"
 # 256MB total huge pages == 128MB src and 128MB dst
-./userfaultfd_hugetlb 128 32 $mnt/ufd_test_file
+./userfaultfd hugetlb 128 32 $mnt/ufd_test_file
 if [ $? -ne 0 ]; then
        echo "[FAIL]"
        exitcode=1
@@ -119,7 +119,7 @@ rm -f $mnt/ufd_test_file
 echo "----------------------------"
 echo "running userfaultfd_shmem"
 echo "----------------------------"
-./userfaultfd_shmem 128 32
+./userfaultfd shmem 128 32
 if [ $? -ne 0 ]; then
        echo "[FAIL]"
        exitcode=1
index e9449c8018887785e9ab17219019423244e1b2b0..1eae79ae5b4e93f8862732b13fd2bd57d08d168f 100644 (file)
@@ -77,10 +77,13 @@ static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
 #define BOUNCE_POLL            (1<<3)
 static int bounces;
 
-#ifdef HUGETLB_TEST
+#define TEST_ANON      1
+#define TEST_HUGETLB   2
+#define TEST_SHMEM     3
+static int test_type;
+
 static int huge_fd;
 static char *huge_fd_off0;
-#endif
 static unsigned long long *count_verify;
 static int uffd, uffd_flags, finished, *pipefd;
 static char *area_src, *area_dst;
@@ -102,14 +105,7 @@ pthread_attr_t attr;
                                 ~(unsigned long)(sizeof(unsigned long long) \
                                                  -  1)))
 
-#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)
-
-/* Anonymous memory */
-#define EXPECTED_IOCTLS                ((1 << _UFFDIO_WAKE) | \
-                                (1 << _UFFDIO_COPY) | \
-                                (1 << _UFFDIO_ZEROPAGE))
-
-static int release_pages(char *rel_area)
+static int anon_release_pages(char *rel_area)
 {
        int ret = 0;
 
@@ -121,7 +117,7 @@ static int release_pages(char *rel_area)
        return ret;
 }
 
-static void allocate_area(void **alloc_area)
+static void anon_allocate_area(void **alloc_area)
 {
        if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
                fprintf(stderr, "out of memory\n");
@@ -129,14 +125,9 @@ static void allocate_area(void **alloc_area)
        }
 }
 
-#else /* HUGETLB_TEST or SHMEM_TEST */
-
-#define EXPECTED_IOCTLS                UFFD_API_RANGE_IOCTLS_BASIC
-
-#ifdef HUGETLB_TEST
 
 /* HugeTLB memory */
-static int release_pages(char *rel_area)
+static int hugetlb_release_pages(char *rel_area)
 {
        int ret = 0;
 
@@ -152,7 +143,7 @@ static int release_pages(char *rel_area)
 }
 
 
-static void allocate_area(void **alloc_area)
+static void hugetlb_allocate_area(void **alloc_area)
 {
        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_HUGETLB, huge_fd,
@@ -167,10 +158,8 @@ static void allocate_area(void **alloc_area)
                huge_fd_off0 = *alloc_area;
 }
 
-#elif defined(SHMEM_TEST)
-
 /* Shared memory */
-static int release_pages(char *rel_area)
+static int shmem_release_pages(char *rel_area)
 {
        int ret = 0;
 
@@ -182,7 +171,7 @@ static int release_pages(char *rel_area)
        return ret;
 }
 
-static void allocate_area(void **alloc_area)
+static void shmem_allocate_area(void **alloc_area)
 {
        *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
                           MAP_ANONYMOUS | MAP_SHARED, -1, 0);
@@ -192,11 +181,35 @@ static void allocate_area(void **alloc_area)
        }
 }
 
-#else /* SHMEM_TEST */
-#error "Undefined test type"
-#endif /* HUGETLB_TEST */
-
-#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
+struct uffd_test_ops {
+       unsigned long expected_ioctls;
+       void (*allocate_area)(void **alloc_area);
+       int (*release_pages)(char *rel_area);
+};
+
+#define ANON_EXPECTED_IOCTLS           ((1 << _UFFDIO_WAKE) | \
+                                        (1 << _UFFDIO_COPY) | \
+                                        (1 << _UFFDIO_ZEROPAGE))
+
+static struct uffd_test_ops anon_uffd_test_ops = {
+       .expected_ioctls = ANON_EXPECTED_IOCTLS,
+       .allocate_area  = anon_allocate_area,
+       .release_pages  = anon_release_pages,
+};
+
+static struct uffd_test_ops shmem_uffd_test_ops = {
+       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+       .allocate_area  = shmem_allocate_area,
+       .release_pages  = shmem_release_pages,
+};
+
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+       .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+       .allocate_area  = hugetlb_allocate_area,
+       .release_pages  = hugetlb_release_pages,
+};
+
+static struct uffd_test_ops *uffd_test_ops;
 
 static int my_bcmp(char *str1, char *str2, size_t n)
 {
@@ -505,7 +518,7 @@ static int stress(unsigned long *userfaults)
         * UFFDIO_COPY without writing zero pages into area_dst
         * because the background threads already completed).
         */
-       if (release_pages(area_src))
+       if (uffd_test_ops->release_pages(area_src))
                return 1;
 
        for (cpu = 0; cpu < nr_cpus; cpu++) {
@@ -577,12 +590,12 @@ static int faulting_process(void)
 {
        unsigned long nr;
        unsigned long long count;
+       unsigned long split_nr_pages;
 
-#ifndef HUGETLB_TEST
-       unsigned long split_nr_pages = (nr_pages + 1) / 2;
-#else
-       unsigned long split_nr_pages = nr_pages;
-#endif
+       if (test_type != TEST_HUGETLB)
+               split_nr_pages = (nr_pages + 1) / 2;
+       else
+               split_nr_pages = nr_pages;
 
        for (nr = 0; nr < split_nr_pages; nr++) {
                count = *area_count(area_dst, nr);
@@ -594,7 +607,9 @@ static int faulting_process(void)
                }
        }
 
-#ifndef HUGETLB_TEST
+       if (test_type == TEST_HUGETLB)
+               return 0;
+
        area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
                          MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
        if (area_dst == MAP_FAILED)
@@ -610,7 +625,7 @@ static int faulting_process(void)
                }
        }
 
-       if (release_pages(area_dst))
+       if (uffd_test_ops->release_pages(area_dst))
                return 1;
 
        for (nr = 0; nr < nr_pages; nr++) {
@@ -618,8 +633,6 @@ static int faulting_process(void)
                        fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
        }
 
-#endif /* HUGETLB_TEST */
-
        return 0;
 }
 
@@ -627,7 +640,9 @@ static int uffdio_zeropage(int ufd, unsigned long offset)
 {
        struct uffdio_zeropage uffdio_zeropage;
        int ret;
-       unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);
+       unsigned long has_zeropage;
+
+       has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
 
        if (offset >= nr_pages * page_size)
                fprintf(stderr, "unexpected offset %lu\n",
@@ -675,7 +690,7 @@ static int userfaultfd_zeropage_test(void)
        printf("testing UFFDIO_ZEROPAGE: ");
        fflush(stdout);
 
-       if (release_pages(area_dst))
+       if (uffd_test_ops->release_pages(area_dst))
                return 1;
 
        if (userfaultfd_open(0) < 0)
@@ -686,7 +701,7 @@ static int userfaultfd_zeropage_test(void)
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                fprintf(stderr, "register failure\n"), exit(1);
 
-       expected_ioctls = EXPECTED_IOCTLS;
+       expected_ioctls = uffd_test_ops->expected_ioctls;
        if ((uffdio_register.ioctls & expected_ioctls) !=
            expected_ioctls)
                fprintf(stderr,
@@ -716,7 +731,7 @@ static int userfaultfd_events_test(void)
        printf("testing events (fork, remap, remove): ");
        fflush(stdout);
 
-       if (release_pages(area_dst))
+       if (uffd_test_ops->release_pages(area_dst))
                return 1;
 
        features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
@@ -731,7 +746,7 @@ static int userfaultfd_events_test(void)
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
                fprintf(stderr, "register failure\n"), exit(1);
 
-       expected_ioctls = EXPECTED_IOCTLS;
+       expected_ioctls = uffd_test_ops->expected_ioctls;
        if ((uffdio_register.ioctls & expected_ioctls) !=
            expected_ioctls)
                fprintf(stderr,
@@ -773,10 +788,10 @@ static int userfaultfd_stress(void)
        int err;
        unsigned long userfaults[nr_cpus];
 
-       allocate_area((void **)&area_src);
+       uffd_test_ops->allocate_area((void **)&area_src);
        if (!area_src)
                return 1;
-       allocate_area((void **)&area_dst);
+       uffd_test_ops->allocate_area((void **)&area_dst);
        if (!area_dst)
                return 1;
 
@@ -856,7 +871,7 @@ static int userfaultfd_stress(void)
                        fprintf(stderr, "register failure\n");
                        return 1;
                }
-               expected_ioctls = EXPECTED_IOCTLS;
+               expected_ioctls = uffd_test_ops->expected_ioctls;
                if ((uffdio_register.ioctls & expected_ioctls) !=
                    expected_ioctls) {
                        fprintf(stderr,
@@ -888,7 +903,7 @@ static int userfaultfd_stress(void)
                 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
                 * required to MADV_DONTNEED here.
                 */
-               if (release_pages(area_dst))
+               if (uffd_test_ops->release_pages(area_dst))
                        return 1;
 
                /* bounce pass */
@@ -934,36 +949,6 @@ static int userfaultfd_stress(void)
        return userfaultfd_zeropage_test() || userfaultfd_events_test();
 }
 
-#ifndef HUGETLB_TEST
-
-int main(int argc, char **argv)
-{
-       if (argc < 3)
-               fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
-       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-       page_size = sysconf(_SC_PAGE_SIZE);
-       if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
-           > page_size)
-               fprintf(stderr, "Impossible to run this test\n"), exit(2);
-       nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
-               nr_cpus;
-       if (!nr_pages_per_cpu) {
-               fprintf(stderr, "invalid MiB\n");
-               fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
-       }
-       bounces = atoi(argv[2]);
-       if (bounces <= 0) {
-               fprintf(stderr, "invalid bounces\n");
-               fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
-       }
-       nr_pages = nr_pages_per_cpu * nr_cpus;
-       printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
-              nr_pages, nr_pages_per_cpu);
-       return userfaultfd_stress();
-}
-
-#else /* HUGETLB_TEST */
-
 /*
  * Copied from mlock2-tests.c
  */
@@ -988,48 +973,78 @@ unsigned long default_huge_page_size(void)
        return hps;
 }
 
-int main(int argc, char **argv)
+static void set_test_type(const char *type)
 {
-       if (argc < 4)
-               fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
-                               exit(1);
-       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-       page_size = default_huge_page_size();
+       if (!strcmp(type, "anon")) {    /* map the type name to test_type and its ops table */
+               test_type = TEST_ANON;
+               uffd_test_ops = &anon_uffd_test_ops;
+       } else if (!strcmp(type, "hugetlb")) {
+               test_type = TEST_HUGETLB;
+               uffd_test_ops = &hugetlb_uffd_test_ops;
+       } else if (!strcmp(type, "shmem")) {
+               test_type = TEST_SHMEM;
+               uffd_test_ops = &shmem_uffd_test_ops;
+       } else {
+               fprintf(stderr, "Unknown test type: %s\n", type), exit(1);
+       }
+
+       if (test_type == TEST_HUGETLB)  /* hugetlb uses the default huge page size */
+               page_size = default_huge_page_size();
+       else                            /* anon and shmem use the base page size */
+               page_size = sysconf(_SC_PAGE_SIZE);
+
        if (!page_size)
-               fprintf(stderr, "Unable to determine huge page size\n"),
+               fprintf(stderr, "Unable to determine page size\n"),
                                exit(2);
        if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
            > page_size)
                fprintf(stderr, "Impossible to run this test\n"), exit(2);
-       nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
+}
+
+int main(int argc, char **argv)
+{
+       if (argc < 4)
+               fprintf(stderr, "Usage: <test type> <MiB> <bounces> [hugetlbfs_file]\n"),
+                               exit(1);
+
+       set_test_type(argv[1]);         /* sets test_type, uffd_test_ops, page_size */
+
+       nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+       nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
                 nr_cpus;
        if (!nr_pages_per_cpu) {
                fprintf(stderr, "invalid MiB\n");
                fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
        }
-       bounces = atoi(argv[2]);
+
+       bounces = atoi(argv[3]);
        if (bounces <= 0) {
                fprintf(stderr, "invalid bounces\n");
                fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
        }
        nr_pages = nr_pages_per_cpu * nr_cpus;
-       huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
-       if (huge_fd < 0) {
-               fprintf(stderr, "Open of %s failed", argv[3]);
-               perror("open");
-               exit(1);
-       }
-       if (ftruncate(huge_fd, 0)) {
-               fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
-               perror("ftruncate");
-               exit(1);
+
+       if (test_type == TEST_HUGETLB) {        /* only hugetlb needs a backing file */
+               if (argc < 5)
+                       fprintf(stderr, "Usage: hugetlb <MiB> <bounces> <hugetlbfs_file>\n"),
+                               exit(1);
+               huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
+               if (huge_fd < 0) {
+                       fprintf(stderr, "Open of %s failed", argv[4]); /* path is argv[4]; argv[3] is bounces */
+                       perror("open");
+                       exit(1);
+               }
+               if (ftruncate(huge_fd, 0)) {
+                       fprintf(stderr, "ftruncate %s to size 0 failed", argv[4]); /* same file as opened above */
+                       perror("ftruncate");
+                       exit(1);
+               }
        }
        printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
               nr_pages, nr_pages_per_cpu);
        return userfaultfd_stress();
 }
 
-#endif
 #else /* __NR_userfaultfd */
 
 #warning "missing __NR_userfaultfd definition"