/*
* Add a shrinker callback to be called from the vm.
*/
-int register_shrinker(struct shrinker *shrinker)
+int prealloc_shrinker(struct shrinker *shrinker)
{
size_t size = sizeof(*shrinker->nr_deferred);
shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
if (!shrinker->nr_deferred)
return -ENOMEM;
+ return 0;
+}
+
+void free_prealloced_shrinker(struct shrinker *shrinker)
+{
+ kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
+}
+void register_shrinker_prepared(struct shrinker *shrinker)
+{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
up_write(&shrinker_rwsem);
+}
+
+int register_shrinker(struct shrinker *shrinker)
+{
+ int err = prealloc_shrinker(shrinker);
+
+ if (err)
+ return err;
+ register_shrinker_prepared(shrinker);
return 0;
}
EXPORT_SYMBOL(register_shrinker);
sc.nid = 0;
freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+ /*
+ * Bail out if someone want to register a new shrinker to
+ * prevent the regsitration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
}
up_read(&shrinker_rwsem);
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !__PageMovable(page)) {
+ !__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
if (PageDirty(page)) {
struct address_space *mapping;
+ bool migrate_dirty;
/*
* Only pages without mappings or that have a
* ->migratepage callback are possible to migrate
- * without blocking
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
*/
+ if (!trylock_page(page))
+ return ret;
+
mapping = page_mapping(page);
- if (mapping && !mapping->a_ops->migratepage)
+ migrate_dirty = !mapping || mapping->a_ops->migratepage;
+ unlock_page(page);
+ if (!migrate_dirty)
return ret;
}
}
if (stat.nr_writeback && stat.nr_writeback == nr_taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+ /*
+ * If dirty pages are scanned that are not queued for IO, it
+ * implies that flushers are not doing their job. This can
+ * happen when memory pressure pushes dirty pages to the end of
+ * the LRU before the dirty limits are breached and the dirty
+ * data has expired. It can also happen when the proportion of
+ * dirty pages grows not through writes but through memory
+ * pressure reclaiming all the clean cache. And in some cases,
+ * the flushers simply cannot keep up with the allocation
+ * rate. Nudge the flusher threads in case they are asleep.
+ */
+ if (stat.nr_unqueued_dirty == nr_taken)
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
+
/*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling here.
if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
set_bit(PGDAT_CONGESTED, &pgdat->flags);
- /*
- * If dirty pages are scanned that are not queued for IO, it
- * implies that flushers are not doing their job. This can
- * happen when memory pressure pushes dirty pages to the end of
- * the LRU before the dirty limits are breached and the dirty
- * data has expired. It can also happen when the proportion of
- * dirty pages grows not through writes but through memory
- * pressure reclaiming all the clean cache. And in some cases,
- * the flushers simply cannot keep up with the allocation
- * rate. Nudge the flusher threads in case they are asleep, but
- * also allow kswapd to start writing pages during reclaim.
- */
- if (stat.nr_unqueued_dirty == nr_taken) {
- wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /* Allow kswapd to start writing pages during reclaim. */
+ if (stat.nr_unqueued_dirty == nr_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);
- }
/*
* If kswapd scans pages marked marked for immediate
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct mem_cgroup *memcg,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
/*
* When refaults are being observed, it means a new workingset
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc)
+ struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru),
- memcg, sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
* anonymous pages on the LRU in eligible zones.
* Otherwise, the small LRU gets thrashed.
*/
- if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, false, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
>> sc->priority) {
scan_balance = SCAN_ANON;
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page
+ * because of a round-off error.
*/
- scan = div64_u64(scan * fraction[file],
- denominator);
+ scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, memcg, sc);
+ lruvec, sc);
}
}
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
unsigned long refaults;
struct lruvec *lruvec;
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
/*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
+ * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
+ * a valid index then either kswapd runs for first time or kswapd couldn't sleep
+ * after previous reclaim attempt (node is still unbalanced). In that case
+ * return the zone index of the previous kswapd reclaim cycle.
*/
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type classzone_idx)
+ enum zone_type prev_classzone_idx)
{
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return classzone_idx;
-
- return max(pgdat->kswapd_classzone_idx, classzone_idx);
+ return prev_classzone_idx;
+ return pgdat->kswapd_classzone_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = kswapd_classzone_idx(pgdat, 0);
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
- classzone_idx);
+
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ pgdat->kswapd_classzone_idx = classzone_idx;
+ else
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
*/
int page_evictable(struct page *page)
{
- return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ int ret;
+
+ /* Prevent address_space of inode and swap cache from being freed */
+ rcu_read_lock();
+ ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+ rcu_read_unlock();
+ return ret;
}
#ifdef CONFIG_SHMEM