// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>

#include "internal.h"

#define NUMA_STATS_THRESHOLD (U16_MAX - 2)

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_stat[item], 0);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
						= 0;
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
		atomic_long_set(&vm_numa_stat[item], 0);
}

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
#endif

f8891e5e
CL
107#ifdef CONFIG_VM_EVENT_COUNTERS
108DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
109EXPORT_PER_CPU_SYMBOL(vm_event_states);
110
31f961a8 111static void sum_vm_events(unsigned long *ret)
f8891e5e 112{
9eccf2a8 113 int cpu;
f8891e5e
CL
114 int i;
115
116 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
117
31f961a8 118 for_each_online_cpu(cpu) {
f8891e5e
CL
119 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
120
f8891e5e
CL
121 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
122 ret[i] += this->event[i];
123 }
124}
125
126/*
127 * Accumulate the vm event counters across all CPUs.
128 * The result is unavoidably approximate - it can change
129 * during and after execution of this function.
130*/
131void all_vm_events(unsigned long *ret)
132{
b5be1132 133 get_online_cpus();
31f961a8 134 sum_vm_events(ret);
b5be1132 135 put_online_cpus();
f8891e5e 136}
32dd66fc 137EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 138
f8891e5e
CL
139/*
140 * Fold the foreign cpu events into our own.
141 *
142 * This is adding to the events on one processor
143 * but keeps the global counts constant.
144 */
145void vm_events_fold_cpu(int cpu)
146{
147 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
148 int i;
149
150 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
151 count_vm_events(i, fold_state->event[i]);
152 fold_state->event[i] = 0;
153 }
154}
f8891e5e
CL
155
156#endif /* CONFIG_VM_EVENT_COUNTERS */
157
2244b95a
CL
158/*
159 * Manage combined zone based / global counters
160 *
161 * vm_stat contains the global counters
162 */
75ef7184 163atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
3a321d2a 164atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
75ef7184
MG
165atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
166EXPORT_SYMBOL(vm_zone_stat);
3a321d2a 167EXPORT_SYMBOL(vm_numa_stat);
75ef7184 168EXPORT_SYMBOL(vm_node_stat);
2244b95a
CL
169
170#ifdef CONFIG_SMP
171
b44129b3 172int calculate_pressure_threshold(struct zone *zone)
88f5acf8
MG
173{
174 int threshold;
175 int watermark_distance;
176
177 /*
178 * As vmstats are not up to date, there is drift between the estimated
179 * and real values. For high thresholds and a high number of CPUs, it
180 * is possible for the min watermark to be breached while the estimated
181 * value looks fine. The pressure threshold is a reduced value such
182 * that even the maximum amount of drift will not accidentally breach
183 * the min watermark
184 */
185 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
186 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
187
188 /*
189 * Maximum threshold is 125
190 */
191 threshold = min(125, threshold);
192
193 return threshold;
194}
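
/*
 * Worked example for calculate_pressure_threshold(): with a low-to-min
 * watermark gap of 1024 pages and 8 online CPUs, the per-cpu threshold is
 * max(1, 1024 / 8) = 128, which the 125 ceiling then reduces to 125.
 */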
195
b44129b3 196int calculate_normal_threshold(struct zone *zone)
df9ecaba
CL
197{
198 int threshold;
199 int mem; /* memory in 128 MB units */
200
201 /*
202 * The threshold scales with the number of processors and the amount
203 * of memory per zone. More memory means that we can defer updates for
204 * longer, more processors could lead to more contention.
205 * fls() is used to have a cheap way of logarithmic scaling.
206 *
207 * Some sample thresholds:
208 *
209 * Threshold Processors (fls) Zonesize fls(mem+1)
210 * ------------------------------------------------------------------
211 * 8 1 1 0.9-1 GB 4
212 * 16 2 2 0.9-1 GB 4
213 * 20 2 2 1-2 GB 5
214 * 24 2 2 2-4 GB 6
215 * 28 2 2 4-8 GB 7
216 * 32 2 2 8-16 GB 8
217 * 4 2 2 <128M 1
218 * 30 4 3 2-4 GB 5
219 * 48 4 3 8-16 GB 8
220 * 32 8 4 1-2 GB 4
221 * 32 8 4 0.9-1GB 4
222 * 10 16 5 <128M 1
223 * 40 16 5 900M 4
224 * 70 64 7 2-4 GB 5
225 * 84 64 7 4-8 GB 6
226 * 108 512 9 4-8 GB 6
227 * 125 1024 10 8-16 GB 8
228 * 125 1024 10 16-32 GB 9
229 */
230
9705bea5 231 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
df9ecaba
CL
232
233 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
234
235 /*
236 * Maximum threshold is 125
237 */
238 threshold = min(125, threshold);
239
240 return threshold;
241}
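
/*
 * Worked example for calculate_normal_threshold(): an 8 GB zone has
 * mem = 8 GB / 128 MB = 64, so fls(mem) = 7; with 4 online CPUs,
 * fls(4) = 3 and threshold = 2 * 3 * (1 + 7) = 48, matching the
 * "48 4 3 8-16 GB" row in the table above.
 */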
2244b95a
CL
242
243/*
df9ecaba 244 * Refresh the thresholds for each zone.
2244b95a 245 */
a6cccdc3 246void refresh_zone_stat_thresholds(void)
2244b95a 247{
75ef7184 248 struct pglist_data *pgdat;
df9ecaba
CL
249 struct zone *zone;
250 int cpu;
251 int threshold;
252
75ef7184
MG
253 /* Zero current pgdat thresholds */
254 for_each_online_pgdat(pgdat) {
255 for_each_online_cpu(cpu) {
256 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
257 }
258 }
259
ee99c71c 260 for_each_populated_zone(zone) {
75ef7184 261 struct pglist_data *pgdat = zone->zone_pgdat;
aa454840
CL
262 unsigned long max_drift, tolerate_drift;
263
b44129b3 264 threshold = calculate_normal_threshold(zone);
df9ecaba 265
75ef7184
MG
266 for_each_online_cpu(cpu) {
267 int pgdat_threshold;
268
99dcc3e5
CL
269 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
270 = threshold;
1d90ca89 271
75ef7184
MG
272 /* Base nodestat threshold on the largest populated zone. */
273 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
274 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
275 = max(threshold, pgdat_threshold);
276 }
277
aa454840
CL
278 /*
279 * Only set percpu_drift_mark if there is a danger that
280 * NR_FREE_PAGES reports the low watermark is ok when in fact
281 * the min watermark could be breached by an allocation
282 */
283 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
284 max_drift = num_online_cpus() * threshold;
285 if (max_drift > tolerate_drift)
286 zone->percpu_drift_mark = high_wmark_pages(zone) +
287 max_drift;
df9ecaba 288 }
2244b95a
CL
289}
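
/*
 * Example of the drift calculation above: with 64 online CPUs and a per-cpu
 * threshold of 125, up to 64 * 125 = 8000 pages may be hidden in per-cpu
 * diffs. If that exceeds the low-to-min watermark gap, percpu_drift_mark is
 * set to high_wmark_pages(zone) + 8000, flagging that NR_FREE_PAGES readings
 * near the watermarks cannot be trusted without a more careful sum.
 */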
290
b44129b3
MG
291void set_pgdat_percpu_threshold(pg_data_t *pgdat,
292 int (*calculate_pressure)(struct zone *))
88f5acf8
MG
293{
294 struct zone *zone;
295 int cpu;
296 int threshold;
297 int i;
298
88f5acf8
MG
299 for (i = 0; i < pgdat->nr_zones; i++) {
300 zone = &pgdat->node_zones[i];
301 if (!zone->percpu_drift_mark)
302 continue;
303
b44129b3 304 threshold = (*calculate_pressure)(zone);
1d90ca89 305 for_each_online_cpu(cpu)
88f5acf8
MG
306 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
307 = threshold;
308 }
88f5acf8
MG
309}
310
2244b95a 311/*
bea04b07
JZ
312 * For use when we know that interrupts are disabled,
313 * or when we know that preemption is disabled and that
314 * particular counter cannot be updated from interrupt context.
2244b95a
CL
315 */
316void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 317 long delta)
2244b95a 318{
12938a92
CL
319 struct per_cpu_pageset __percpu *pcp = zone->pageset;
320 s8 __percpu *p = pcp->vm_stat_diff + item;
2244b95a 321 long x;
12938a92
CL
322 long t;
323
324 x = delta + __this_cpu_read(*p);
2244b95a 325
12938a92 326 t = __this_cpu_read(pcp->stat_threshold);
2244b95a 327
40610076 328 if (unlikely(abs(x) > t)) {
2244b95a
CL
329 zone_page_state_add(x, zone, item);
330 x = 0;
331 }
12938a92 332 __this_cpu_write(*p, x);
2244b95a
CL
333}
334EXPORT_SYMBOL(__mod_zone_page_state);
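
/*
 * Example of the fold in __mod_zone_page_state(): with stat_threshold = 32,
 * a per-cpu diff of 30 and a delta of +5, x becomes 35 > 32, so 35 is added
 * to the global zone counter and the per-cpu diff is reset to 0.
 */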
335
75ef7184
MG
336void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
337 long delta)
338{
339 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
340 s8 __percpu *p = pcp->vm_node_stat_diff + item;
341 long x;
342 long t;
343
ea426c2a
RG
344 if (vmstat_item_in_bytes(item)) {
345 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
346 delta >>= PAGE_SHIFT;
347 }
348
75ef7184
MG
349 x = delta + __this_cpu_read(*p);
350
351 t = __this_cpu_read(pcp->stat_threshold);
352
40610076 353 if (unlikely(abs(x) > t)) {
75ef7184
MG
354 node_page_state_add(x, pgdat, item);
355 x = 0;
356 }
357 __this_cpu_write(*p, x);
358}
359EXPORT_SYMBOL(__mod_node_page_state);
360
2244b95a
CL
361/*
362 * Optimized increment and decrement functions.
363 *
364 * These are only for a single page and therefore can take a struct page *
365 * argument instead of struct zone *. This allows the inclusion of the code
366 * generated for page_zone(page) into the optimized functions.
367 *
368 * No overflow check is necessary and therefore the differential can be
369 * incremented or decremented in place which may allow the compilers to
370 * generate better code.
2244b95a
CL
371 * The increment or decrement is known and therefore one boundary check can
372 * be omitted.
373 *
df9ecaba
CL
374 * NOTE: These functions are very performance sensitive. Change only
375 * with care.
376 *
2244b95a
CL
377 * Some processors have inc/dec instructions that are atomic vs an interrupt.
378 * However, the code must first determine the differential location in a zone
379 * based on the processor number and then inc/dec the counter. There is no
380 * guarantee without disabling preemption that the processor will not change
381 * in between and therefore the atomicity vs. interrupt cannot be exploited
382 * in a useful way here.
383 */
c8785385 384void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 385{
12938a92
CL
386 struct per_cpu_pageset __percpu *pcp = zone->pageset;
387 s8 __percpu *p = pcp->vm_stat_diff + item;
388 s8 v, t;
2244b95a 389
908ee0f1 390 v = __this_cpu_inc_return(*p);
12938a92
CL
391 t = __this_cpu_read(pcp->stat_threshold);
392 if (unlikely(v > t)) {
393 s8 overstep = t >> 1;
df9ecaba 394
12938a92
CL
395 zone_page_state_add(v + overstep, zone, item);
396 __this_cpu_write(*p, -overstep);
2244b95a
CL
397 }
398}
ca889e6c 399
75ef7184
MG
400void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
401{
402 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
403 s8 __percpu *p = pcp->vm_node_stat_diff + item;
404 s8 v, t;
405
ea426c2a
RG
406 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
407
75ef7184
MG
408 v = __this_cpu_inc_return(*p);
409 t = __this_cpu_read(pcp->stat_threshold);
410 if (unlikely(v > t)) {
411 s8 overstep = t >> 1;
412
413 node_page_state_add(v + overstep, pgdat, item);
414 __this_cpu_write(*p, -overstep);
415 }
416}
417
ca889e6c
CL
418void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
419{
420 __inc_zone_state(page_zone(page), item);
421}
2244b95a
CL
422EXPORT_SYMBOL(__inc_zone_page_state);
423
75ef7184
MG
424void __inc_node_page_state(struct page *page, enum node_stat_item item)
425{
426 __inc_node_state(page_pgdat(page), item);
427}
428EXPORT_SYMBOL(__inc_node_page_state);
429
c8785385 430void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 431{
12938a92
CL
432 struct per_cpu_pageset __percpu *pcp = zone->pageset;
433 s8 __percpu *p = pcp->vm_stat_diff + item;
434 s8 v, t;
2244b95a 435
908ee0f1 436 v = __this_cpu_dec_return(*p);
12938a92
CL
437 t = __this_cpu_read(pcp->stat_threshold);
438 if (unlikely(v < - t)) {
439 s8 overstep = t >> 1;
2244b95a 440
12938a92
CL
441 zone_page_state_add(v - overstep, zone, item);
442 __this_cpu_write(*p, overstep);
2244b95a
CL
443 }
444}
c8785385 445
75ef7184
MG
446void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
447{
448 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
449 s8 __percpu *p = pcp->vm_node_stat_diff + item;
450 s8 v, t;
451
ea426c2a
RG
452 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
453
75ef7184
MG
454 v = __this_cpu_dec_return(*p);
455 t = __this_cpu_read(pcp->stat_threshold);
456 if (unlikely(v < - t)) {
457 s8 overstep = t >> 1;
458
459 node_page_state_add(v - overstep, pgdat, item);
460 __this_cpu_write(*p, overstep);
461 }
462}
463
c8785385
CL
464void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
465{
466 __dec_zone_state(page_zone(page), item);
467}
2244b95a
CL
468EXPORT_SYMBOL(__dec_zone_page_state);
469
75ef7184
MG
470void __dec_node_page_state(struct page *page, enum node_stat_item item)
471{
472 __dec_node_state(page_pgdat(page), item);
473}
474EXPORT_SYMBOL(__dec_node_page_state);
475
4156153c 476#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *     0       No overstepping
 *     1       Overstepping half of threshold
 *     -1      Overstepping minus half of threshold
 */
75ef7184
MG
489static inline void mod_zone_state(struct zone *zone,
490 enum zone_stat_item item, long delta, int overstep_mode)
7c839120
CL
491{
492 struct per_cpu_pageset __percpu *pcp = zone->pageset;
493 s8 __percpu *p = pcp->vm_stat_diff + item;
494 long o, n, t, z;
495
496 do {
497 z = 0; /* overflow to zone counters */
498
		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyway
		 * for all cpus in a zone.
		 */
509 t = this_cpu_read(pcp->stat_threshold);
510
511 o = this_cpu_read(*p);
512 n = delta + o;
513
40610076 514 if (abs(n) > t) {
7c839120
CL
515 int os = overstep_mode * (t >> 1) ;
516
517 /* Overflow must be added to zone counters */
518 z = n + os;
519 n = -os;
520 }
521 } while (this_cpu_cmpxchg(*p, o, n) != o);
522
523 if (z)
524 zone_page_state_add(z, zone, item);
525}
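
/*
 * Example of overstepping in mod_zone_state(): with stat_threshold = 32,
 * a per-cpu diff of 30 and a delta of +3 (overstep mode 1, as used by
 * inc_zone_page_state()), n = 33 exceeds the threshold, so
 * z = 33 + 16 = 49 is folded into the zone counter and the per-cpu diff is
 * left at -16; the net count of 33 is preserved.
 */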
526
527void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 528 long delta)
7c839120 529{
75ef7184 530 mod_zone_state(zone, item, delta, 0);
7c839120
CL
531}
532EXPORT_SYMBOL(mod_zone_page_state);
533
7c839120
CL
534void inc_zone_page_state(struct page *page, enum zone_stat_item item)
535{
75ef7184 536 mod_zone_state(page_zone(page), item, 1, 1);
7c839120
CL
537}
538EXPORT_SYMBOL(inc_zone_page_state);
539
540void dec_zone_page_state(struct page *page, enum zone_stat_item item)
541{
75ef7184 542 mod_zone_state(page_zone(page), item, -1, -1);
7c839120
CL
543}
544EXPORT_SYMBOL(dec_zone_page_state);
75ef7184
MG
545
546static inline void mod_node_state(struct pglist_data *pgdat,
547 enum node_stat_item item, int delta, int overstep_mode)
548{
549 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
550 s8 __percpu *p = pcp->vm_node_stat_diff + item;
551 long o, n, t, z;
552
ea426c2a
RG
553 if (vmstat_item_in_bytes(item)) {
554 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
555 delta >>= PAGE_SHIFT;
556 }
557
75ef7184
MG
558 do {
559 z = 0; /* overflow to node counters */
560
		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyway
		 * for all cpus in a node.
		 */
571 t = this_cpu_read(pcp->stat_threshold);
572
573 o = this_cpu_read(*p);
574 n = delta + o;
575
40610076 576 if (abs(n) > t) {
75ef7184
MG
577 int os = overstep_mode * (t >> 1) ;
578
579 /* Overflow must be added to node counters */
580 z = n + os;
581 n = -os;
582 }
583 } while (this_cpu_cmpxchg(*p, o, n) != o);
584
585 if (z)
586 node_page_state_add(z, pgdat, item);
587}
588
589void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
590 long delta)
591{
592 mod_node_state(pgdat, item, delta, 0);
593}
594EXPORT_SYMBOL(mod_node_page_state);
595
596void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
597{
598 mod_node_state(pgdat, item, 1, 1);
599}
600
601void inc_node_page_state(struct page *page, enum node_stat_item item)
602{
603 mod_node_state(page_pgdat(page), item, 1, 1);
604}
605EXPORT_SYMBOL(inc_node_page_state);
606
607void dec_node_page_state(struct page *page, enum node_stat_item item)
608{
609 mod_node_state(page_pgdat(page), item, -1, -1);
610}
611EXPORT_SYMBOL(dec_node_page_state);
7c839120
CL
612#else
613/*
614 * Use interrupt disable to serialize counter updates
615 */
616void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 617 long delta)
7c839120
CL
618{
619 unsigned long flags;
620
621 local_irq_save(flags);
622 __mod_zone_page_state(zone, item, delta);
623 local_irq_restore(flags);
624}
625EXPORT_SYMBOL(mod_zone_page_state);
626
2244b95a
CL
627void inc_zone_page_state(struct page *page, enum zone_stat_item item)
628{
629 unsigned long flags;
630 struct zone *zone;
2244b95a
CL
631
632 zone = page_zone(page);
633 local_irq_save(flags);
ca889e6c 634 __inc_zone_state(zone, item);
2244b95a
CL
635 local_irq_restore(flags);
636}
637EXPORT_SYMBOL(inc_zone_page_state);
638
639void dec_zone_page_state(struct page *page, enum zone_stat_item item)
640{
641 unsigned long flags;
2244b95a 642
2244b95a 643 local_irq_save(flags);
a302eb4e 644 __dec_zone_page_state(page, item);
2244b95a
CL
645 local_irq_restore(flags);
646}
647EXPORT_SYMBOL(dec_zone_page_state);
648
75ef7184
MG
649void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
650{
651 unsigned long flags;
652
653 local_irq_save(flags);
654 __inc_node_state(pgdat, item);
655 local_irq_restore(flags);
656}
657EXPORT_SYMBOL(inc_node_state);
658
659void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
660 long delta)
661{
662 unsigned long flags;
663
664 local_irq_save(flags);
665 __mod_node_page_state(pgdat, item, delta);
666 local_irq_restore(flags);
667}
668EXPORT_SYMBOL(mod_node_page_state);
669
670void inc_node_page_state(struct page *page, enum node_stat_item item)
671{
672 unsigned long flags;
673 struct pglist_data *pgdat;
674
675 pgdat = page_pgdat(page);
676 local_irq_save(flags);
677 __inc_node_state(pgdat, item);
678 local_irq_restore(flags);
679}
680EXPORT_SYMBOL(inc_node_page_state);
681
682void dec_node_page_state(struct page *page, enum node_stat_item item)
683{
684 unsigned long flags;
685
686 local_irq_save(flags);
687 __dec_node_page_state(page, item);
688 local_irq_restore(flags);
689}
690EXPORT_SYMBOL(dec_node_page_state);
691#endif
7cc36bbd
CL
692
693/*
694 * Fold a differential into the global counters.
695 * Returns the number of counters updated.
696 */
3a321d2a
KW
697#ifdef CONFIG_NUMA
698static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
699{
700 int i;
701 int changes = 0;
702
703 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
704 if (zone_diff[i]) {
705 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
706 changes++;
707 }
708
709 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
710 if (numa_diff[i]) {
711 atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
712 changes++;
713 }
714
715 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
716 if (node_diff[i]) {
717 atomic_long_add(node_diff[i], &vm_node_stat[i]);
718 changes++;
719 }
720 return changes;
721}
722#else
75ef7184 723static int fold_diff(int *zone_diff, int *node_diff)
4edb0748
CL
724{
725 int i;
7cc36bbd 726 int changes = 0;
4edb0748
CL
727
728 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
75ef7184
MG
729 if (zone_diff[i]) {
730 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
731 changes++;
732 }
733
734 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
735 if (node_diff[i]) {
736 atomic_long_add(node_diff[i], &vm_node_stat[i]);
7cc36bbd
CL
737 changes++;
738 }
739 return changes;
4edb0748 740}
3a321d2a 741#endif /* CONFIG_NUMA */
4edb0748 742
2244b95a 743/*
2bb921e5 744 * Update the zone counters for the current cpu.
a7f75e25 745 *
4037d452
CL
746 * Note that refresh_cpu_vm_stats strives to only access
747 * node local memory. The per cpu pagesets on remote zones are placed
748 * in the memory local to the processor using that pageset. So the
749 * loop over all zones will access a series of cachelines local to
750 * the processor.
751 *
752 * The call to zone_page_state_add updates the cachelines with the
753 * statistics in the remote zone struct as well as the global cachelines
754 * with the global counters. These could cause remote node cache line
755 * bouncing and will have to be only done when necessary.
7cc36bbd
CL
756 *
757 * The function returns the number of global counters updated.
2244b95a 758 */
0eb77e98 759static int refresh_cpu_vm_stats(bool do_pagesets)
2244b95a 760{
75ef7184 761 struct pglist_data *pgdat;
2244b95a
CL
762 struct zone *zone;
763 int i;
75ef7184 764 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
3a321d2a
KW
765#ifdef CONFIG_NUMA
766 int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
767#endif
75ef7184 768 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
7cc36bbd 769 int changes = 0;
2244b95a 770
ee99c71c 771 for_each_populated_zone(zone) {
fbc2edb0 772 struct per_cpu_pageset __percpu *p = zone->pageset;
2244b95a 773
fbc2edb0
CL
774 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
775 int v;
2244b95a 776
fbc2edb0
CL
777 v = this_cpu_xchg(p->vm_stat_diff[i], 0);
778 if (v) {
a7f75e25 779
a7f75e25 780 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 781 global_zone_diff[i] += v;
4037d452
CL
782#ifdef CONFIG_NUMA
783 /* 3 seconds idle till flush */
fbc2edb0 784 __this_cpu_write(p->expire, 3);
4037d452 785#endif
2244b95a 786 }
fbc2edb0 787 }
4037d452 788#ifdef CONFIG_NUMA
3a321d2a
KW
789 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
790 int v;
791
792 v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
793 if (v) {
794
795 atomic_long_add(v, &zone->vm_numa_stat[i]);
796 global_numa_diff[i] += v;
797 __this_cpu_write(p->expire, 3);
798 }
799 }
800
0eb77e98
CL
801 if (do_pagesets) {
802 cond_resched();
			/*
			 * Deal with draining the remote pageset of this
			 * processor.
			 *
			 * Check if there are pages remaining in this pageset;
			 * if not then there is nothing to expire.
			 */
810 if (!__this_cpu_read(p->expire) ||
fbc2edb0 811 !__this_cpu_read(p->pcp.count))
0eb77e98 812 continue;
4037d452 813
0eb77e98
CL
814 /*
815 * We never drain zones local to this processor.
816 */
817 if (zone_to_nid(zone) == numa_node_id()) {
818 __this_cpu_write(p->expire, 0);
819 continue;
820 }
4037d452 821
0eb77e98
CL
822 if (__this_cpu_dec_return(p->expire))
823 continue;
4037d452 824
0eb77e98
CL
825 if (__this_cpu_read(p->pcp.count)) {
826 drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
827 changes++;
828 }
7cc36bbd 829 }
4037d452 830#endif
2244b95a 831 }
75ef7184
MG
832
833 for_each_online_pgdat(pgdat) {
834 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
835
836 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
837 int v;
838
839 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
840 if (v) {
841 atomic_long_add(v, &pgdat->vm_stat[i]);
842 global_node_diff[i] += v;
843 }
844 }
845 }
846
3a321d2a
KW
847#ifdef CONFIG_NUMA
848 changes += fold_diff(global_zone_diff, global_numa_diff,
849 global_node_diff);
850#else
75ef7184 851 changes += fold_diff(global_zone_diff, global_node_diff);
3a321d2a 852#endif
7cc36bbd 853 return changes;
2244b95a
CL
854}
855
2bb921e5
CL
856/*
857 * Fold the data for an offline cpu into the global array.
858 * There cannot be any access by the offline cpu and therefore
859 * synchronization is simplified.
860 */
861void cpu_vm_stats_fold(int cpu)
862{
75ef7184 863 struct pglist_data *pgdat;
2bb921e5
CL
864 struct zone *zone;
865 int i;
75ef7184 866 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
3a321d2a
KW
867#ifdef CONFIG_NUMA
868 int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
869#endif
75ef7184 870 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
2bb921e5
CL
871
872 for_each_populated_zone(zone) {
873 struct per_cpu_pageset *p;
874
875 p = per_cpu_ptr(zone->pageset, cpu);
876
877 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
878 if (p->vm_stat_diff[i]) {
879 int v;
880
881 v = p->vm_stat_diff[i];
882 p->vm_stat_diff[i] = 0;
883 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 884 global_zone_diff[i] += v;
2bb921e5 885 }
3a321d2a
KW
886
887#ifdef CONFIG_NUMA
888 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
889 if (p->vm_numa_stat_diff[i]) {
890 int v;
891
892 v = p->vm_numa_stat_diff[i];
893 p->vm_numa_stat_diff[i] = 0;
894 atomic_long_add(v, &zone->vm_numa_stat[i]);
895 global_numa_diff[i] += v;
896 }
897#endif
2bb921e5
CL
898 }
899
75ef7184
MG
900 for_each_online_pgdat(pgdat) {
901 struct per_cpu_nodestat *p;
902
903 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
904
905 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
906 if (p->vm_node_stat_diff[i]) {
907 int v;
908
909 v = p->vm_node_stat_diff[i];
910 p->vm_node_stat_diff[i] = 0;
911 atomic_long_add(v, &pgdat->vm_stat[i]);
912 global_node_diff[i] += v;
913 }
914 }
915
3a321d2a
KW
916#ifdef CONFIG_NUMA
917 fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
918#else
75ef7184 919 fold_diff(global_zone_diff, global_node_diff);
3a321d2a 920#endif
2bb921e5
CL
921}
922
/*
 * this is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
5a883813
MK
927void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
928{
929 int i;
930
931 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
932 if (pset->vm_stat_diff[i]) {
933 int v = pset->vm_stat_diff[i];
934 pset->vm_stat_diff[i] = 0;
935 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 936 atomic_long_add(v, &vm_zone_stat[i]);
5a883813 937 }
3a321d2a
KW
938
939#ifdef CONFIG_NUMA
940 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
941 if (pset->vm_numa_stat_diff[i]) {
942 int v = pset->vm_numa_stat_diff[i];
943
944 pset->vm_numa_stat_diff[i] = 0;
945 atomic_long_add(v, &zone->vm_numa_stat[i]);
946 atomic_long_add(v, &vm_numa_stat[i]);
947 }
948#endif
5a883813 949}
2244b95a
CL
950#endif
951
ca889e6c 952#ifdef CONFIG_NUMA
3a321d2a
KW
953void __inc_numa_state(struct zone *zone,
954 enum numa_stat_item item)
955{
956 struct per_cpu_pageset __percpu *pcp = zone->pageset;
1d90ca89
KW
957 u16 __percpu *p = pcp->vm_numa_stat_diff + item;
958 u16 v;
3a321d2a
KW
959
960 v = __this_cpu_inc_return(*p);
3a321d2a 961
1d90ca89
KW
962 if (unlikely(v > NUMA_STATS_THRESHOLD)) {
963 zone_numa_state_add(v, zone, item);
964 __this_cpu_write(*p, 0);
3a321d2a
KW
965 }
966}
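
/*
 * The NUMA diffs above are u16 counters: once a per-cpu value exceeds
 * NUMA_STATS_THRESHOLD (U16_MAX - 2 = 65533) it is folded into the zone's
 * NUMA counters via zone_numa_state_add() and the per-cpu counter restarts
 * from zero.
 */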
967
c2d42c16 968/*
75ef7184
MG
969 * Determine the per node value of a stat item. This function
970 * is called frequently in a NUMA machine, so try to be as
971 * frugal as possible.
c2d42c16 972 */
75ef7184
MG
973unsigned long sum_zone_node_page_state(int node,
974 enum zone_stat_item item)
c2d42c16
AM
975{
976 struct zone *zones = NODE_DATA(node)->node_zones;
e87d59f7
JK
977 int i;
978 unsigned long count = 0;
c2d42c16 979
e87d59f7
JK
980 for (i = 0; i < MAX_NR_ZONES; i++)
981 count += zone_page_state(zones + i, item);
982
983 return count;
c2d42c16
AM
984}
985
63803222
KW
986/*
987 * Determine the per node value of a numa stat item. To avoid deviation,
988 * the per cpu stat number in vm_numa_stat_diff[] is also included.
989 */
3a321d2a
KW
990unsigned long sum_zone_numa_state(int node,
991 enum numa_stat_item item)
992{
993 struct zone *zones = NODE_DATA(node)->node_zones;
994 int i;
995 unsigned long count = 0;
996
997 for (i = 0; i < MAX_NR_ZONES; i++)
63803222 998 count += zone_numa_state_snapshot(zones + i, item);
3a321d2a
KW
999
1000 return count;
1001}
1002
75ef7184
MG
1003/*
1004 * Determine the per node value of a stat item.
1005 */
ea426c2a
RG
1006unsigned long node_page_state_pages(struct pglist_data *pgdat,
1007 enum node_stat_item item)
75ef7184
MG
1008{
1009 long x = atomic_long_read(&pgdat->vm_stat[item]);
1010#ifdef CONFIG_SMP
1011 if (x < 0)
1012 x = 0;
1013#endif
1014 return x;
1015}
ea426c2a
RG
1016
1017unsigned long node_page_state(struct pglist_data *pgdat,
1018 enum node_stat_item item)
1019{
1020 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1021
1022 return node_page_state_pages(pgdat, item);
1023}
ca889e6c
CL
1024#endif
1025
d7a5752c 1026#ifdef CONFIG_COMPACTION
36deb0be 1027
d7a5752c
MG
1028struct contig_page_info {
1029 unsigned long free_pages;
1030 unsigned long free_blocks_total;
1031 unsigned long free_blocks_suitable;
1032};
1033
1034/*
1035 * Calculate the number of free pages in a zone, how many contiguous
1036 * pages are free and how many are large enough to satisfy an allocation of
1037 * the target size. Note that this function makes no attempt to estimate
1038 * how many suitable free blocks there *might* be if MOVABLE pages were
1039 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace.
1041 */
1042static void fill_contig_page_info(struct zone *zone,
1043 unsigned int suitable_order,
1044 struct contig_page_info *info)
1045{
1046 unsigned int order;
1047
1048 info->free_pages = 0;
1049 info->free_blocks_total = 0;
1050 info->free_blocks_suitable = 0;
1051
1052 for (order = 0; order < MAX_ORDER; order++) {
1053 unsigned long blocks;
1054
1055 /* Count number of free blocks */
1056 blocks = zone->free_area[order].nr_free;
1057 info->free_blocks_total += blocks;
1058
1059 /* Count free base pages */
1060 info->free_pages += blocks << order;
1061
1062 /* Count the suitable free blocks */
1063 if (order >= suitable_order)
1064 info->free_blocks_suitable += blocks <<
1065 (order - suitable_order);
1066 }
1067}
f1a5ab12
MG
1068
1069/*
1070 * A fragmentation index only makes sense if an allocation of a requested
1071 * size would fail. If that is true, the fragmentation index indicates
1072 * whether external fragmentation or a lack of memory was the problem.
1073 * The value can be used to determine if page reclaim or compaction
1074 * should be used
1075 */
56de7263 1076static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
f1a5ab12
MG
1077{
1078 unsigned long requested = 1UL << order;
1079
88d6ac40
WY
1080 if (WARN_ON_ONCE(order >= MAX_ORDER))
1081 return 0;
1082
f1a5ab12
MG
1083 if (!info->free_blocks_total)
1084 return 0;
1085
1086 /* Fragmentation index only makes sense when a request would fail */
1087 if (info->free_blocks_suitable)
1088 return -1000;
1089
1090 /*
1091 * Index is between 0 and 1 so return within 3 decimal places
1092 *
1093 * 0 => allocation would fail due to lack of memory
1094 * 1 => allocation would fail due to fragmentation
1095 */
1096 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1097}
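
/*
 * Worked example of the index above: for an order-4 request (16 pages),
 * 1000 free pages spread over 500 free blocks, none of which is order-4 or
 * larger, gives 1000 - (1000 + (1000 * 1000 / 16)) / 500 = 1000 - 127 = 873,
 * i.e. close to 1000, so fragmentation rather than lack of memory is the
 * problem.
 */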
56de7263 1098
facdaa91
NG
1099/*
1100 * Calculates external fragmentation within a zone wrt the given order.
1101 * It is defined as the percentage of pages found in blocks of size
1102 * less than 1 << order. It returns values in range [0, 100].
1103 */
d34c0a75 1104unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
facdaa91
NG
1105{
1106 struct contig_page_info info;
1107
1108 fill_contig_page_info(zone, order, &info);
1109 if (info.free_pages == 0)
1110 return 0;
1111
1112 return div_u64((info.free_pages -
1113 (info.free_blocks_suitable << order)) * 100,
1114 info.free_pages);
1115}
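
/*
 * Worked example for extfrag_for_order(): with order = 3, 1000 free pages
 * and 50 suitable free blocks, the pages in suitable blocks account for
 * 50 << 3 = 400 pages, so the external fragmentation is
 * (1000 - 400) * 100 / 1000 = 60 percent.
 */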
1116
56de7263
MG
1117/* Same as __fragmentation index but allocs contig_page_info on stack */
1118int fragmentation_index(struct zone *zone, unsigned int order)
1119{
1120 struct contig_page_info info;
1121
1122 fill_contig_page_info(zone, order, &info);
1123 return __fragmentation_index(order, &info);
1124}
d7a5752c
MG
1125#endif
1126
ebc5d83d
KK
1127#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1128 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
fa25c503
KM
1129#ifdef CONFIG_ZONE_DMA
1130#define TEXT_FOR_DMA(xx) xx "_dma",
1131#else
1132#define TEXT_FOR_DMA(xx)
1133#endif
1134
1135#ifdef CONFIG_ZONE_DMA32
1136#define TEXT_FOR_DMA32(xx) xx "_dma32",
1137#else
1138#define TEXT_FOR_DMA32(xx)
1139#endif
1140
1141#ifdef CONFIG_HIGHMEM
1142#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1143#else
1144#define TEXT_FOR_HIGHMEM(xx)
1145#endif
1146
1147#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1148 TEXT_FOR_HIGHMEM(xx) xx "_movable",
1149
1150const char * const vmstat_text[] = {
8d92890b 1151 /* enum zone_stat_item counters */
fa25c503 1152 "nr_free_pages",
71c799f4
MK
1153 "nr_zone_inactive_anon",
1154 "nr_zone_active_anon",
1155 "nr_zone_inactive_file",
1156 "nr_zone_active_file",
1157 "nr_zone_unevictable",
5a1c84b4 1158 "nr_zone_write_pending",
fa25c503 1159 "nr_mlock",
fa25c503 1160 "nr_bounce",
91537fee
MK
1161#if IS_ENABLED(CONFIG_ZSMALLOC)
1162 "nr_zspages",
1163#endif
3a321d2a
KW
1164 "nr_free_cma",
1165
1166 /* enum numa_stat_item counters */
fa25c503
KM
1167#ifdef CONFIG_NUMA
1168 "numa_hit",
1169 "numa_miss",
1170 "numa_foreign",
1171 "numa_interleave",
1172 "numa_local",
1173 "numa_other",
1174#endif
09316c09 1175
9d7ea9a2 1176 /* enum node_stat_item counters */
599d0c95
MG
1177 "nr_inactive_anon",
1178 "nr_active_anon",
1179 "nr_inactive_file",
1180 "nr_active_file",
1181 "nr_unevictable",
385386cf
JW
1182 "nr_slab_reclaimable",
1183 "nr_slab_unreclaimable",
599d0c95
MG
1184 "nr_isolated_anon",
1185 "nr_isolated_file",
68d48e6a 1186 "workingset_nodes",
170b04b7
JK
1187 "workingset_refault_anon",
1188 "workingset_refault_file",
1189 "workingset_activate_anon",
1190 "workingset_activate_file",
1191 "workingset_restore_anon",
1192 "workingset_restore_file",
1e6b1085 1193 "workingset_nodereclaim",
50658e2e
MG
1194 "nr_anon_pages",
1195 "nr_mapped",
11fb9989
MG
1196 "nr_file_pages",
1197 "nr_dirty",
1198 "nr_writeback",
1199 "nr_writeback_temp",
1200 "nr_shmem",
1201 "nr_shmem_hugepages",
1202 "nr_shmem_pmdmapped",
60fbf0ab
SL
1203 "nr_file_hugepages",
1204 "nr_file_pmdmapped",
11fb9989 1205 "nr_anon_transparent_hugepages",
c4a25635
MG
1206 "nr_vmscan_write",
1207 "nr_vmscan_immediate_reclaim",
1208 "nr_dirtied",
1209 "nr_written",
b29940c1 1210 "nr_kernel_misc_reclaimable",
1970dc6f
JH
1211 "nr_foll_pin_acquired",
1212 "nr_foll_pin_released",
991e7673
SB
1213 "nr_kernel_stack",
1214#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1215 "nr_shadow_call_stack",
1216#endif
f0c0c115 1217 "nr_page_table_pages",
599d0c95 1218
09316c09 1219 /* enum writeback_stat_item counters */
fa25c503
KM
1220 "nr_dirty_threshold",
1221 "nr_dirty_background_threshold",
1222
ebc5d83d 1223#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1224 /* enum vm_event_item counters */
fa25c503
KM
1225 "pgpgin",
1226 "pgpgout",
1227 "pswpin",
1228 "pswpout",
1229
1230 TEXTS_FOR_ZONES("pgalloc")
7cc30fcf
MG
1231 TEXTS_FOR_ZONES("allocstall")
1232 TEXTS_FOR_ZONES("pgskip")
fa25c503
KM
1233
1234 "pgfree",
1235 "pgactivate",
1236 "pgdeactivate",
f7ad2a6c 1237 "pglazyfree",
fa25c503
KM
1238
1239 "pgfault",
1240 "pgmajfault",
854e9ed0 1241 "pglazyfreed",
fa25c503 1242
599d0c95 1243 "pgrefill",
798a6b87 1244 "pgreuse",
599d0c95
MG
1245 "pgsteal_kswapd",
1246 "pgsteal_direct",
1247 "pgscan_kswapd",
1248 "pgscan_direct",
68243e76 1249 "pgscan_direct_throttle",
497a6c1b
JW
1250 "pgscan_anon",
1251 "pgscan_file",
1252 "pgsteal_anon",
1253 "pgsteal_file",
fa25c503
KM
1254
1255#ifdef CONFIG_NUMA
1256 "zone_reclaim_failed",
1257#endif
1258 "pginodesteal",
1259 "slabs_scanned",
fa25c503
KM
1260 "kswapd_inodesteal",
1261 "kswapd_low_wmark_hit_quickly",
1262 "kswapd_high_wmark_hit_quickly",
fa25c503 1263 "pageoutrun",
fa25c503
KM
1264
1265 "pgrotated",
1266
5509a5d2
DH
1267 "drop_pagecache",
1268 "drop_slab",
8e675f7a 1269 "oom_kill",
5509a5d2 1270
03c5a6e1
MG
1271#ifdef CONFIG_NUMA_BALANCING
1272 "numa_pte_updates",
72403b4a 1273 "numa_huge_pte_updates",
03c5a6e1
MG
1274 "numa_hint_faults",
1275 "numa_hint_faults_local",
1276 "numa_pages_migrated",
1277#endif
5647bc29
MG
1278#ifdef CONFIG_MIGRATION
1279 "pgmigrate_success",
1280 "pgmigrate_fail",
1a5bae25
AK
1281 "thp_migration_success",
1282 "thp_migration_fail",
1283 "thp_migration_split",
5647bc29 1284#endif
fa25c503 1285#ifdef CONFIG_COMPACTION
397487db
MG
1286 "compact_migrate_scanned",
1287 "compact_free_scanned",
1288 "compact_isolated",
fa25c503
KM
1289 "compact_stall",
1290 "compact_fail",
1291 "compact_success",
698b1b30 1292 "compact_daemon_wake",
7f354a54
DR
1293 "compact_daemon_migrate_scanned",
1294 "compact_daemon_free_scanned",
fa25c503
KM
1295#endif
1296
1297#ifdef CONFIG_HUGETLB_PAGE
1298 "htlb_buddy_alloc_success",
1299 "htlb_buddy_alloc_fail",
1300#endif
1301 "unevictable_pgs_culled",
1302 "unevictable_pgs_scanned",
1303 "unevictable_pgs_rescued",
1304 "unevictable_pgs_mlocked",
1305 "unevictable_pgs_munlocked",
1306 "unevictable_pgs_cleared",
1307 "unevictable_pgs_stranded",
fa25c503
KM
1308
1309#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1310 "thp_fault_alloc",
1311 "thp_fault_fallback",
85b9f46e 1312 "thp_fault_fallback_charge",
fa25c503
KM
1313 "thp_collapse_alloc",
1314 "thp_collapse_alloc_failed",
95ecedcd 1315 "thp_file_alloc",
dcdf11ee 1316 "thp_file_fallback",
85b9f46e 1317 "thp_file_fallback_charge",
95ecedcd 1318 "thp_file_mapped",
122afea9
KS
1319 "thp_split_page",
1320 "thp_split_page_failed",
f9719a03 1321 "thp_deferred_split_page",
122afea9 1322 "thp_split_pmd",
ce9311cf
YX
1323#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1324 "thp_split_pud",
1325#endif
d8a8e1f0
KS
1326 "thp_zero_page_alloc",
1327 "thp_zero_page_alloc_failed",
225311a4 1328 "thp_swpout",
fe490cc0 1329 "thp_swpout_fallback",
fa25c503 1330#endif
09316c09
KK
1331#ifdef CONFIG_MEMORY_BALLOON
1332 "balloon_inflate",
1333 "balloon_deflate",
1334#ifdef CONFIG_BALLOON_COMPACTION
1335 "balloon_migrate",
1336#endif
1337#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1338#ifdef CONFIG_DEBUG_TLBFLUSH
9824cf97
DH
1339 "nr_tlb_remote_flush",
1340 "nr_tlb_remote_flush_received",
1341 "nr_tlb_local_flush_all",
1342 "nr_tlb_local_flush_one",
ec659934 1343#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1344
4f115147
DB
1345#ifdef CONFIG_DEBUG_VM_VMACACHE
1346 "vmacache_find_calls",
1347 "vmacache_find_hits",
1348#endif
cbc65df2
HY
1349#ifdef CONFIG_SWAP
1350 "swap_ra",
1351 "swap_ra_hit",
1352#endif
ebc5d83d 1353#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1354};
ebc5d83d 1355#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1356
3c486871
AM
1357#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1358 defined(CONFIG_PROC_FS)
1359static void *frag_start(struct seq_file *m, loff_t *pos)
1360{
1361 pg_data_t *pgdat;
1362 loff_t node = *pos;
1363
1364 for (pgdat = first_online_pgdat();
1365 pgdat && node;
1366 pgdat = next_online_pgdat(pgdat))
1367 --node;
1368
1369 return pgdat;
1370}
1371
1372static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1373{
1374 pg_data_t *pgdat = (pg_data_t *)arg;
1375
1376 (*pos)++;
1377 return next_online_pgdat(pgdat);
1378}
1379
1380static void frag_stop(struct seq_file *m, void *arg)
1381{
1382}
1383
b2bd8598
DR
1384/*
1385 * Walk zones in a node and print using a callback.
1386 * If @assert_populated is true, only use callback for zones that are populated.
1387 */
3c486871 1388static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1389 bool assert_populated, bool nolock,
3c486871
AM
1390 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1391{
1392 struct zone *zone;
1393 struct zone *node_zones = pgdat->node_zones;
1394 unsigned long flags;
1395
1396 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1397 if (assert_populated && !populated_zone(zone))
3c486871
AM
1398 continue;
1399
727c080f
VM
1400 if (!nolock)
1401 spin_lock_irqsave(&zone->lock, flags);
3c486871 1402 print(m, pgdat, zone);
727c080f
VM
1403 if (!nolock)
1404 spin_unlock_irqrestore(&zone->lock, flags);
3c486871
AM
1405 }
1406}
1407#endif
1408
d7a5752c 1409#ifdef CONFIG_PROC_FS
467c996c
MG
1410static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1411 struct zone *zone)
1412{
1413 int order;
1414
1415 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1416 for (order = 0; order < MAX_ORDER; ++order)
1417 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1418 seq_putc(m, '\n');
1419}
1420
1421/*
1422 * This walks the free areas for each zone.
1423 */
1424static int frag_show(struct seq_file *m, void *arg)
1425{
1426 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1427 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
467c996c
MG
1428 return 0;
1429}
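
/*
 * Each line produced by frag_show_print() above has the form
 *	Node 0, zone   Normal    216     55     189 ...
 * (illustrative counts), i.e. one free-block count per order, which is the
 * format exposed through /proc/buddyinfo.
 */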
1430
1431static void pagetypeinfo_showfree_print(struct seq_file *m,
1432 pg_data_t *pgdat, struct zone *zone)
1433{
1434 int order, mtype;
1435
1436 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1437 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1438 pgdat->node_id,
1439 zone->name,
1440 migratetype_names[mtype]);
1441 for (order = 0; order < MAX_ORDER; ++order) {
1442 unsigned long freecount = 0;
1443 struct free_area *area;
1444 struct list_head *curr;
93b3a674 1445 bool overflow = false;
467c996c
MG
1446
1447 area = &(zone->free_area[order]);
1448
93b3a674
MH
1449 list_for_each(curr, &area->free_list[mtype]) {
1450 /*
1451 * Cap the free_list iteration because it might
1452 * be really large and we are under a spinlock
1453 * so a long time spent here could trigger a
1454 * hard lockup detector. Anyway this is a
1455 * debugging tool so knowing there is a handful
1456 * of pages of this order should be more than
1457 * sufficient.
1458 */
1459 if (++freecount >= 100000) {
1460 overflow = true;
1461 break;
1462 }
1463 }
1464 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1465 spin_unlock_irq(&zone->lock);
1466 cond_resched();
1467 spin_lock_irq(&zone->lock);
467c996c 1468 }
f6ac2354
CL
1469 seq_putc(m, '\n');
1470 }
467c996c
MG
1471}
1472
/* Print out the free pages at each order for each migratetype */
1474static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
1475{
1476 int order;
1477 pg_data_t *pgdat = (pg_data_t *)arg;
1478
1479 /* Print header */
1480 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1481 for (order = 0; order < MAX_ORDER; ++order)
1482 seq_printf(m, "%6d ", order);
1483 seq_putc(m, '\n');
1484
727c080f 1485 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1486
1487 return 0;
1488}
1489
1490static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1491 pg_data_t *pgdat, struct zone *zone)
1492{
1493 int mtype;
1494 unsigned long pfn;
1495 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1496 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1497 unsigned long count[MIGRATE_TYPES] = { 0, };
1498
1499 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1500 struct page *page;
1501
d336e94e
MH
1502 page = pfn_to_online_page(pfn);
1503 if (!page)
467c996c
MG
1504 continue;
1505
a91c43c7
JK
1506 if (page_zone(page) != zone)
1507 continue;
1508
467c996c
MG
1509 mtype = get_pageblock_migratetype(page);
1510
e80d6a24
MG
1511 if (mtype < MIGRATE_TYPES)
1512 count[mtype]++;
467c996c
MG
1513 }
1514
1515 /* Print counts */
1516 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1517 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1518 seq_printf(m, "%12lu ", count[mtype]);
1519 seq_putc(m, '\n');
1520}
1521
f113e641 1522/* Print out the number of pageblocks for each migratetype */
467c996c
MG
1523static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1524{
1525 int mtype;
1526 pg_data_t *pgdat = (pg_data_t *)arg;
1527
1528 seq_printf(m, "\n%-23s", "Number of blocks type ");
1529 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1530 seq_printf(m, "%12s ", migratetype_names[mtype]);
1531 seq_putc(m, '\n');
727c080f
VM
1532 walk_zones_in_node(m, pgdat, true, false,
1533 pagetypeinfo_showblockcount_print);
467c996c
MG
1534
1535 return 0;
1536}
1537
48c96a36
JK
1538/*
1539 * Print out the number of pageblocks for each migratetype that contain pages
1540 * of other types. This gives an indication of how well fallbacks are being
1541 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1542 * to determine what is going on
1543 */
1544static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1545{
1546#ifdef CONFIG_PAGE_OWNER
1547 int mtype;
1548
7dd80b8a 1549 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1550 return;
1551
1552 drain_all_pages(NULL);
1553
1554 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1555 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1556 seq_printf(m, "%12s ", migratetype_names[mtype]);
1557 seq_putc(m, '\n');
1558
727c080f
VM
1559 walk_zones_in_node(m, pgdat, true, true,
1560 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1561#endif /* CONFIG_PAGE_OWNER */
1562}
1563
467c996c
MG
1564/*
1565 * This prints out statistics in relation to grouping pages by mobility.
1566 * It is expensive to collect so do not constantly read the file.
1567 */
1568static int pagetypeinfo_show(struct seq_file *m, void *arg)
1569{
1570 pg_data_t *pgdat = (pg_data_t *)arg;
1571
41b25a37 1572 /* check memoryless node */
a47b53c5 1573 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1574 return 0;
1575
467c996c
MG
1576 seq_printf(m, "Page block order: %d\n", pageblock_order);
1577 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1578 seq_putc(m, '\n');
1579 pagetypeinfo_showfree(m, pgdat);
1580 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1581 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1582
f6ac2354
CL
1583 return 0;
1584}
1585
8f32f7e5 1586static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1587 .start = frag_start,
1588 .next = frag_next,
1589 .stop = frag_stop,
1590 .show = frag_show,
1591};
1592
74e2e8e8 1593static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1594 .start = frag_start,
1595 .next = frag_next,
1596 .stop = frag_stop,
1597 .show = pagetypeinfo_show,
1598};
1599
e2ecc8a7
MG
1600static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1601{
1602 int zid;
1603
1604 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1605 struct zone *compare = &pgdat->node_zones[zid];
1606
1607 if (populated_zone(compare))
1608 return zone == compare;
1609 }
1610
e2ecc8a7
MG
1611 return false;
1612}
1613
467c996c
MG
1614static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1615 struct zone *zone)
f6ac2354 1616{
467c996c
MG
1617 int i;
1618 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1619 if (is_zone_first_populated(pgdat, zone)) {
1620 seq_printf(m, "\n per-node stats");
1621 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
9d7ea9a2 1622 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
ea426c2a 1623 node_page_state_pages(pgdat, i));
e2ecc8a7
MG
1624 }
1625 }
467c996c
MG
1626 seq_printf(m,
1627 "\n pages free %lu"
1628 "\n min %lu"
1629 "\n low %lu"
1630 "\n high %lu"
467c996c 1631 "\n spanned %lu"
9feedc9d
JL
1632 "\n present %lu"
1633 "\n managed %lu",
88f5acf8 1634 zone_page_state(zone, NR_FREE_PAGES),
41858966
MG
1635 min_wmark_pages(zone),
1636 low_wmark_pages(zone),
1637 high_wmark_pages(zone),
467c996c 1638 zone->spanned_pages,
9feedc9d 1639 zone->present_pages,
9705bea5 1640 zone_managed_pages(zone));
467c996c 1641
467c996c 1642 seq_printf(m,
3484b2de 1643 "\n protection: (%ld",
467c996c
MG
1644 zone->lowmem_reserve[0]);
1645 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1646 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1647 seq_putc(m, ')');
1648
a8a4b7ae
BH
1649 /* If unpopulated, no other information is useful */
1650 if (!populated_zone(zone)) {
1651 seq_putc(m, '\n');
1652 return;
1653 }
1654
7dfb8bf3 1655 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1656 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1657 zone_page_state(zone, i));
7dfb8bf3 1658
3a321d2a
KW
1659#ifdef CONFIG_NUMA
1660 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
9d7ea9a2
KK
1661 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
1662 zone_numa_state_snapshot(zone, i));
3a321d2a
KW
1663#endif
1664
7dfb8bf3 1665 seq_printf(m, "\n pagesets");
467c996c
MG
1666 for_each_online_cpu(i) {
1667 struct per_cpu_pageset *pageset;
467c996c 1668
99dcc3e5 1669 pageset = per_cpu_ptr(zone->pageset, i);
3dfa5721
CL
1670 seq_printf(m,
1671 "\n cpu: %i"
1672 "\n count: %i"
1673 "\n high: %i"
1674 "\n batch: %i",
1675 i,
1676 pageset->pcp.count,
1677 pageset->pcp.high,
1678 pageset->pcp.batch);
df9ecaba 1679#ifdef CONFIG_SMP
467c996c
MG
1680 seq_printf(m, "\n vm stats threshold: %d",
1681 pageset->stat_threshold);
df9ecaba 1682#endif
f6ac2354 1683 }
467c996c 1684 seq_printf(m,
599d0c95 1685 "\n node_unreclaimable: %u"
3a50d14d 1686 "\n start_pfn: %lu",
c73322d0 1687 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1688 zone->zone_start_pfn);
467c996c
MG
1689 seq_putc(m, '\n');
1690}
1691
1692/*
b2bd8598
DR
1693 * Output information about zones in @pgdat. All zones are printed regardless
1694 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1695 * set of all zones and userspace would not be aware of such zones if they are
1696 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1697 */
1698static int zoneinfo_show(struct seq_file *m, void *arg)
1699{
1700 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1701 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1702 return 0;
1703}
1704
5c9fe628 1705static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1706 .start = frag_start, /* iterate over all zones. The same as in
1707 * fragmentation. */
1708 .next = frag_next,
1709 .stop = frag_stop,
1710 .show = zoneinfo_show,
1711};
1712
9d7ea9a2
KK
1713#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1714 NR_VM_NUMA_STAT_ITEMS + \
1715 NR_VM_NODE_STAT_ITEMS + \
1716 NR_VM_WRITEBACK_STAT_ITEMS + \
1717 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1718 NR_VM_EVENT_ITEMS : 0))
79da826a 1719
f6ac2354
CL
1720static void *vmstat_start(struct seq_file *m, loff_t *pos)
1721{
2244b95a 1722 unsigned long *v;
9d7ea9a2 1723 int i;
f6ac2354 1724
9d7ea9a2 1725 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1726 return NULL;
79da826a 1727
9d7ea9a2
KK
1728 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1729 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1730 m->private = v;
1731 if (!v)
f6ac2354 1732 return ERR_PTR(-ENOMEM);
2244b95a 1733 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1734 v[i] = global_zone_page_state(i);
79da826a
MR
1735 v += NR_VM_ZONE_STAT_ITEMS;
1736
3a321d2a
KW
1737#ifdef CONFIG_NUMA
1738 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
1739 v[i] = global_numa_state(i);
1740 v += NR_VM_NUMA_STAT_ITEMS;
1741#endif
1742
75ef7184 1743 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
ea426c2a 1744 v[i] = global_node_page_state_pages(i);
75ef7184
MG
1745 v += NR_VM_NODE_STAT_ITEMS;
1746
79da826a
MR
1747 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1748 v + NR_DIRTY_THRESHOLD);
1749 v += NR_VM_WRITEBACK_STAT_ITEMS;
1750
f8891e5e 1751#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1752 all_vm_events(v);
1753 v[PGPGIN] /= 2; /* sectors -> kbytes */
1754 v[PGPGOUT] /= 2;
f8891e5e 1755#endif
ff8b16d7 1756 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1757}
1758
1759static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1760{
1761 (*pos)++;
9d7ea9a2 1762 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1763 return NULL;
1764 return (unsigned long *)m->private + *pos;
1765}
1766
1767static int vmstat_show(struct seq_file *m, void *arg)
1768{
1769 unsigned long *l = arg;
1770 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1771
1772 seq_puts(m, vmstat_text[off]);
75ba1d07 1773 seq_put_decimal_ull(m, " ", *l);
68ba0326 1774 seq_putc(m, '\n');
8d92890b
N
1775
1776 if (off == NR_VMSTAT_ITEMS - 1) {
1777 /*
1778 * We've come to the end - add any deprecated counters to avoid
1779 * breaking userspace which might depend on them being present.
1780 */
1781 seq_puts(m, "nr_unstable 0\n");
1782 }
f6ac2354
CL
1783 return 0;
1784}
1785
1786static void vmstat_stop(struct seq_file *m, void *arg)
1787{
1788 kfree(m->private);
1789 m->private = NULL;
1790}
1791
b6aa44ab 1792static const struct seq_operations vmstat_op = {
f6ac2354
CL
1793 .start = vmstat_start,
1794 .next = vmstat_next,
1795 .stop = vmstat_stop,
1796 .show = vmstat_show,
1797};
f6ac2354
CL
1798#endif /* CONFIG_PROC_FS */
1799
df9ecaba 1800#ifdef CONFIG_SMP
d1187ed2 1801static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1802int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1803
52b6f46b
HD
1804#ifdef CONFIG_PROC_FS
1805static void refresh_vm_stats(struct work_struct *work)
1806{
1807 refresh_cpu_vm_stats(true);
1808}
1809
1810int vmstat_refresh(struct ctl_table *table, int write,
32927393 1811 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1812{
1813 long val;
1814 int err;
1815 int i;
1816
1817 /*
1818 * The regular update, every sysctl_stat_interval, may come later
1819 * than expected: leaving a significant amount in per_cpu buckets.
1820 * This is particularly misleading when checking a quantity of HUGE
1821 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1822 * which can equally be echo'ed to or cat'ted from (by root),
1823 * can be used to update the stats just before reading them.
1824 *
c41f012a 1825 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1826 * transiently negative values, report an error here if any of
1827 * the stats is negative, so we know to go looking for imbalance.
1828 */
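	/*
	 * For example, running "cat /proc/sys/vm/stat_refresh" as root right
	 * after a test forces the flush below, so that an immediately
	 * following read of /proc/vmstat reflects up-to-date totals.
	 */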
1829 err = schedule_on_each_cpu(refresh_vm_stats);
1830 if (err)
1831 return err;
1832 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75ef7184 1833 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1834 if (val < 0) {
c822f622 1835 pr_warn("%s: %s %ld\n",
9d7ea9a2 1836 __func__, zone_stat_name(i), val);
c822f622 1837 err = -EINVAL;
52b6f46b
HD
1838 }
1839 }
3a321d2a
KW
1840#ifdef CONFIG_NUMA
1841 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
1842 val = atomic_long_read(&vm_numa_stat[i]);
1843 if (val < 0) {
1844 pr_warn("%s: %s %ld\n",
9d7ea9a2 1845 __func__, numa_stat_name(i), val);
3a321d2a
KW
1846 err = -EINVAL;
1847 }
1848 }
1849#endif
52b6f46b
HD
1850 if (err)
1851 return err;
1852 if (write)
1853 *ppos += *lenp;
1854 else
1855 *lenp = 0;
1856 return 0;
1857}
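/*
 * Illustrative sketch, not part of vmstat.c: as the comment above explains,
 * writing to /proc/sys/vm/stat_refresh folds the per-cpu diffs into the
 * global counters just before they are read. A minimal userspace sequence
 * might look like this; everything except the procfs paths is an assumption
 * for the example.
 */
#if 0	/* illustrative only, never compiled into the kernel */
#include <stdio.h>

int main(void)
{
	FILE *refresh = fopen("/proc/sys/vm/stat_refresh", "w");

	if (!refresh)
		return 1;
	/* any write (root only) triggers the refresh; the value is ignored */
	fputs("1\n", refresh);
	fclose(refresh);
	/* /proc/vmstat and /proc/zoneinfo now reflect freshly folded counters */
	return 0;
}
#endif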
1858#endif /* CONFIG_PROC_FS */
1859
d1187ed2
CL
1860static void vmstat_update(struct work_struct *w)
1861{
0eb77e98 1862 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1863 /*
1864 * Counters were updated so we expect more updates
1865 * to occur in the future. Keep on running the
1866 * update worker thread.
1867 */
ce612879 1868 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1869 this_cpu_ptr(&vmstat_work),
1870 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1871 }
1872}
1873
7cc36bbd
CL
1879/*
1880 * Check if the diffs for a certain cpu indicate that
1881 * an update is needed.
1882 */
1883static bool need_update(int cpu)
1884{
1885 struct zone *zone;
1886
1887 for_each_populated_zone(zone) {
1888 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1889
1890 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
3a321d2a 1891#ifdef CONFIG_NUMA
1d90ca89 1892 BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
3a321d2a 1893#endif
63803222 1894
7cc36bbd
CL
1895 /*
1896 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1897 */
13c9aaf7
JH
1898 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
1899 sizeof(p->vm_stat_diff[0])))
7cc36bbd 1900 return true;
3a321d2a 1901#ifdef CONFIG_NUMA
13c9aaf7
JH
1902 if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
1903 sizeof(p->vm_numa_stat_diff[0])))
3a321d2a
KW
1904 return true;
1905#endif
7cc36bbd
CL
1906 }
1907 return false;
1908}
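/*
 * Illustrative sketch, not part of vmstat.c: need_update() relies on
 * memchr_inv() returning NULL only when every byte of a diff array is zero,
 * which is why the BUILD_BUG_ONs above pin the element sizes. A userspace
 * analogue of that check, with a stand-in for the kernel-internal
 * memchr_inv(), might look like:
 */
#if 0	/* illustrative only, never compiled into the kernel */
#include <stdbool.h>
#include <stddef.h>

/* stand-in for the kernel's memchr_inv(p, 0, size) == NULL test */
static bool all_bytes_zero(const void *p, size_t size)
{
	const unsigned char *b = p;
	size_t i;

	for (i = 0; i < size; i++)
		if (b[i])
			return false;
	return true;
}

/* true if any per-cpu counter still carries an unfolded differential */
static bool diffs_pending(const signed char *diff, size_t nr_items)
{
	return !all_bytes_zero(diff, nr_items * sizeof(*diff));
}
#endif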
1909
7b8da4c7
CL
1910/*
1911 * Switch off vmstat processing and then fold all the remaining differentials
1912 * until the diffs stay at zero. The function is used by NOHZ and can only be
1913 * invoked when tick processing is not active.
1914 */
f01f17d3
MH
1915void quiet_vmstat(void)
1916{
1917 if (system_state != SYSTEM_RUNNING)
1918 return;
1919
7b8da4c7 1920 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1921 return;
1922
1923 if (!need_update(smp_processor_id()))
1924 return;
1925
1926 /*
1927 * Just refresh the counters and do not worry about the pending delayed
1928 * vmstat_update: it does not fire often enough to matter, and cancelling
1929 * it from this path would be too expensive.
1930 * vmstat_shepherd will take care of it for us.
1931 */
1932 refresh_cpu_vm_stats(false);
1933}
1934
7cc36bbd
CL
1935/*
1936 * Shepherd worker that checks the per-cpu
1937 * differentials of processors whose vmstat update
1938 * workers have been switched off because of
1939 * inactivity, and requeues those workers as needed.
1940 */
1941static void vmstat_shepherd(struct work_struct *w);
1942
0eb77e98 1943static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
1944
1945static void vmstat_shepherd(struct work_struct *w)
1946{
1947 int cpu;
1948
1949 get_online_cpus();
1950 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 1951 for_each_online_cpu(cpu) {
f01f17d3 1952 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 1953
7b8da4c7 1954 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 1955 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
f01f17d3 1956 }
7cc36bbd
CL
1957 put_online_cpus();
1958
1959 schedule_delayed_work(&shepherd,
98f4ebb2 1960 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1961}
1962
7cc36bbd 1963static void __init start_shepherd_timer(void)
d1187ed2 1964{
7cc36bbd
CL
1965 int cpu;
1966
1967 for_each_possible_cpu(cpu)
ccde8bd4 1968 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
1969 vmstat_update);
1970
7cc36bbd
CL
1971 schedule_delayed_work(&shepherd,
1972 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1973}
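/*
 * Illustrative sketch, not part of vmstat.c: sysctl_stat_interval above is
 * exposed to userspace as the vm.stat_interval sysctl (registered outside
 * this file), so the shepherd/update cadence can be relaxed on mostly idle
 * systems. A minimal, hypothetical way to stretch it to 10 seconds from C:
 */
#if 0	/* illustrative only, never compiled into the kernel */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/stat_interval", "w");

	if (!f)
		return 1;
	fputs("10\n", f);	/* interval in seconds between refresh passes */
	fclose(f);
	return 0;
}
#endif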
1974
03e86dba
TC
1975static void __init init_cpu_node_state(void)
1976{
4c501327 1977 int node;
03e86dba 1978
4c501327
SAS
1979 for_each_online_node(node) {
1980 if (cpumask_weight(cpumask_of_node(node)) > 0)
1981 node_set_state(node, N_CPU);
1982 }
03e86dba
TC
1983}
1984
5438da97
SAS
1985static int vmstat_cpu_online(unsigned int cpu)
1986{
1987 refresh_zone_stat_thresholds();
1988 node_set_state(cpu_to_node(cpu), N_CPU);
1989 return 0;
1990}
1991
1992static int vmstat_cpu_down_prep(unsigned int cpu)
1993{
1994 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1995 return 0;
1996}
1997
1998static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 1999{
4c501327 2000 const struct cpumask *node_cpus;
5438da97 2001 int node;
807a1bd2 2002
5438da97
SAS
2003 node = cpu_to_node(cpu);
2004
2005 refresh_zone_stat_thresholds();
4c501327
SAS
2006 node_cpus = cpumask_of_node(node);
2007 if (cpumask_weight(node_cpus) > 0)
5438da97 2008 return 0;
807a1bd2
TK
2009
2010 node_clear_state(node, N_CPU);
5438da97 2011 return 0;
807a1bd2
TK
2012}
2013
8f32f7e5 2014#endif
df9ecaba 2015
ce612879
MH
2016struct workqueue_struct *mm_percpu_wq;
2017
597b7305 2018void __init init_mm_internals(void)
df9ecaba 2019{
ce612879 2020 int ret __maybe_unused;
5438da97 2021
80d136e1 2022 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2023
2024#ifdef CONFIG_SMP
5438da97
SAS
2025 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2026 NULL, vmstat_cpu_dead);
2027 if (ret < 0)
2028 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2029
2030 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2031 vmstat_cpu_online,
2032 vmstat_cpu_down_prep);
2033 if (ret < 0)
2034 pr_err("vmstat: failed to register 'online' hotplug state\n");
2035
2036 get_online_cpus();
03e86dba 2037 init_cpu_node_state();
5438da97 2038 put_online_cpus();
d1187ed2 2039
7cc36bbd 2040 start_shepherd_timer();
8f32f7e5
AD
2041#endif
2042#ifdef CONFIG_PROC_FS
fddda2b7 2043 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2044 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2045 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2046 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2047#endif
df9ecaba 2048}
d7a5752c
MG
2049
2050#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2051
2052/*
2053 * Return an index indicating how much of the available free memory is
2054 * unusable for an allocation of the requested size.
2055 */
2056static int unusable_free_index(unsigned int order,
2057 struct contig_page_info *info)
2058{
2059 /* No free memory is interpreted as all free memory is unusable */
2060 if (info->free_pages == 0)
2061 return 1000;
2062
2063 /*
2064 * Index should be a value between 0 and 1. Return a value to 3
2065 * decimal places.
2066 *
2067 * 0 => no fragmentation
2068 * 1 => high fragmentation
2069 */
2070 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2071
2072}
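/*
 * Worked example with made-up numbers: for an order-2 request with
 * free_pages = 1000 and free_blocks_suitable = 200, the free pages usable
 * for the request are 200 << 2 = 800, so the index is
 * (1000 - 800) * 1000 / 1000 = 200, which unusable_show_print() below
 * prints as "0.200", i.e. 20% of the free memory is unusable at that order.
 */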
2073
2074static void unusable_show_print(struct seq_file *m,
2075 pg_data_t *pgdat, struct zone *zone)
2076{
2077 unsigned int order;
2078 int index;
2079 struct contig_page_info info;
2080
2081 seq_printf(m, "Node %d, zone %8s ",
2082 pgdat->node_id,
2083 zone->name);
2084 for (order = 0; order < MAX_ORDER; ++order) {
2085 fill_contig_page_info(zone, order, &info);
2086 index = unusable_free_index(order, &info);
2087 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2088 }
2089
2090 seq_putc(m, '\n');
2091}
2092
2093/*
2094 * Display unusable free space index
2095 *
2096 * The unusable free space index measures how much of the available free
2097 * memory cannot be used to satisfy an allocation of a given size and is a
2098 * value between 0 and 1. The higher the value, the more of the free memory
2099 * is unusable and, by implication, the worse the external fragmentation is.
2100 * This can be expressed as a percentage by multiplying by 100.
2101 */
2102static int unusable_show(struct seq_file *m, void *arg)
2103{
2104 pg_data_t *pgdat = (pg_data_t *)arg;
2105
2106 /* check memoryless node */
a47b53c5 2107 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2108 return 0;
2109
727c080f 2110 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2111
2112 return 0;
2113}
2114
01a99560 2115static const struct seq_operations unusable_sops = {
d7a5752c
MG
2116 .start = frag_start,
2117 .next = frag_next,
2118 .stop = frag_stop,
2119 .show = unusable_show,
2120};
2121
01a99560 2122DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2123
f1a5ab12
MG
2124static void extfrag_show_print(struct seq_file *m,
2125 pg_data_t *pgdat, struct zone *zone)
2126{
2127 unsigned int order;
2128 int index;
2129
2130 /* Alloc on stack as interrupts are disabled for zone walk */
2131 struct contig_page_info info;
2132
2133 seq_printf(m, "Node %d, zone %8s ",
2134 pgdat->node_id,
2135 zone->name);
2136 for (order = 0; order < MAX_ORDER; ++order) {
2137 fill_contig_page_info(zone, order, &info);
56de7263 2138 index = __fragmentation_index(order, &info);
f1a5ab12
MG
2139 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2140 }
2141
2142 seq_putc(m, '\n');
2143}
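/*
 * Worked example of the output format: given the "%d.%03d " specifier above,
 * a computed index of, say, 432 is printed as "0.432" for that order, so each
 * zone produces one "Node <nid>, zone <name>" line followed by a
 * three-decimal value per order up to MAX_ORDER - 1.
 */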
2144
2145/*
2146 * Display the fragmentation index for the orders at which allocations would fail
2147 */
2148static int extfrag_show(struct seq_file *m, void *arg)
2149{
2150 pg_data_t *pgdat = (pg_data_t *)arg;
2151
727c080f 2152 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2153
2154 return 0;
2155}
2156
01a99560 2157static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2158 .start = frag_start,
2159 .next = frag_next,
2160 .stop = frag_stop,
2161 .show = extfrag_show,
2162};
2163
01a99560 2164DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2165
d7a5752c
MG
2166static int __init extfrag_debug_init(void)
2167{
bde8bd8a
S
2168 struct dentry *extfrag_debug_root;
2169
d7a5752c 2170 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2171
d9f7979c 2172 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2173 &unusable_fops);
d7a5752c 2174
d9f7979c 2175 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2176 &extfrag_fops);
f1a5ab12 2177
d7a5752c
MG
2178 return 0;
2179}
2180
2181module_init(extfrag_debug_init);
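/*
 * Illustrative sketch, not part of vmstat.c: with debugfs mounted at the
 * usual /sys/kernel/debug, the two files created above appear as
 * /sys/kernel/debug/extfrag/unusable_index and
 * /sys/kernel/debug/extfrag/extfrag_index. A minimal userspace dump of one
 * of them might look like this; the mount point and buffer size are
 * assumptions for the example.
 */
#if 0	/* illustrative only, never compiled into the kernel */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/sys/kernel/debug/extfrag/unusable_index", "r");

	if (!f)
		return 1;
	/* one "Node N, zone NAME idx idx ..." line per populated zone */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
#endif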
2182#endif