]> git.proxmox.com Git - mirror_ubuntu-kernels.git/blame - mm/vmstat.c
Merge tag 'regulator-fix-v6.0-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git...
[mirror_ubuntu-kernels.git] / mm / vmstat.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
f6ac2354
CL
2/*
3 * linux/mm/vmstat.c
4 *
5 * Manages VM statistics
6 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
2244b95a
CL
7 *
8 * zoned VM statistics
9 * Copyright (C) 2006 Silicon Graphics, Inc.,
10 * Christoph Lameter <christoph@lameter.com>
7cc36bbd 11 * Copyright (C) 2008-2014 Christoph Lameter
f6ac2354 12 */
8f32f7e5 13#include <linux/fs.h>
f6ac2354 14#include <linux/mm.h>
4e950f6f 15#include <linux/err.h>
2244b95a 16#include <linux/module.h>
5a0e3ad6 17#include <linux/slab.h>
df9ecaba 18#include <linux/cpu.h>
7cc36bbd 19#include <linux/cpumask.h>
c748e134 20#include <linux/vmstat.h>
3c486871
AM
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/debugfs.h>
e8edc6e0 24#include <linux/sched.h>
f1a5ab12 25#include <linux/math64.h>
79da826a 26#include <linux/writeback.h>
36deb0be 27#include <linux/compaction.h>
6e543d57 28#include <linux/mm_inline.h>
48c96a36
JK
29#include <linux/page_ext.h>
30#include <linux/page_owner.h>
734c1570 31#include <linux/migrate.h>
6e543d57
LD
32
33#include "internal.h"
f6ac2354 34
4518085e
KW
35#ifdef CONFIG_NUMA
36int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
37
38/* zero numa counters within a zone */
39static void zero_zone_numa_counters(struct zone *zone)
40{
41 int item, cpu;
42
f19298b9
MG
43 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
44 atomic_long_set(&zone->vm_numa_event[item], 0);
45 for_each_online_cpu(cpu) {
46 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
4518085e 47 = 0;
f19298b9 48 }
4518085e
KW
49 }
50}
51
52/* zero numa counters of all the populated zones */
53static void zero_zones_numa_counters(void)
54{
55 struct zone *zone;
56
57 for_each_populated_zone(zone)
58 zero_zone_numa_counters(zone);
59}
60
61/* zero global numa counters */
62static void zero_global_numa_counters(void)
63{
64 int item;
65
f19298b9
MG
66 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
67 atomic_long_set(&vm_numa_event[item], 0);
4518085e
KW
68}
69
70static void invalid_numa_statistics(void)
71{
72 zero_zones_numa_counters();
73 zero_global_numa_counters();
74}
75
76static DEFINE_MUTEX(vm_numa_stat_lock);
77
78int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
32927393 79 void *buffer, size_t *length, loff_t *ppos)
4518085e
KW
80{
81 int ret, oldval;
82
83 mutex_lock(&vm_numa_stat_lock);
84 if (write)
85 oldval = sysctl_vm_numa_stat;
86 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
87 if (ret || !write)
88 goto out;
89
90 if (oldval == sysctl_vm_numa_stat)
91 goto out;
92 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
93 static_branch_enable(&vm_numa_stat_key);
94 pr_info("enable numa statistics\n");
95 } else {
96 static_branch_disable(&vm_numa_stat_key);
97 invalid_numa_statistics();
98 pr_info("disable numa statistics, and clear numa counters\n");
99 }
100
101out:
102 mutex_unlock(&vm_numa_stat_lock);
103 return ret;
104}
105#endif
106
f8891e5e
CL
107#ifdef CONFIG_VM_EVENT_COUNTERS
108DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
109EXPORT_PER_CPU_SYMBOL(vm_event_states);
110
31f961a8 111static void sum_vm_events(unsigned long *ret)
f8891e5e 112{
9eccf2a8 113 int cpu;
f8891e5e
CL
114 int i;
115
116 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
117
31f961a8 118 for_each_online_cpu(cpu) {
f8891e5e
CL
119 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
120
f8891e5e
CL
121 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
122 ret[i] += this->event[i];
123 }
124}
125
126/*
127 * Accumulate the vm event counters across all CPUs.
128 * The result is unavoidably approximate - it can change
129 * during and after execution of this function.
130*/
131void all_vm_events(unsigned long *ret)
132{
7625eccd 133 cpus_read_lock();
31f961a8 134 sum_vm_events(ret);
7625eccd 135 cpus_read_unlock();
f8891e5e 136}
32dd66fc 137EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 138
f8891e5e
CL
139/*
140 * Fold the foreign cpu events into our own.
141 *
142 * This is adding to the events on one processor
143 * but keeps the global counts constant.
144 */
145void vm_events_fold_cpu(int cpu)
146{
147 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
148 int i;
149
150 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
151 count_vm_events(i, fold_state->event[i]);
152 fold_state->event[i] = 0;
153 }
154}
f8891e5e
CL
155
156#endif /* CONFIG_VM_EVENT_COUNTERS */
157
2244b95a
CL
158/*
159 * Manage combined zone based / global counters
160 *
161 * vm_stat contains the global counters
162 */
75ef7184
MG
163atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
164atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
f19298b9 165atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
75ef7184
MG
166EXPORT_SYMBOL(vm_zone_stat);
167EXPORT_SYMBOL(vm_node_stat);
2244b95a 168
ebeac3ea
GU
169#ifdef CONFIG_NUMA
170static void fold_vm_zone_numa_events(struct zone *zone)
171{
172 unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
173 int cpu;
174 enum numa_stat_item item;
175
176 for_each_online_cpu(cpu) {
177 struct per_cpu_zonestat *pzstats;
178
179 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
180 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
181 zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
182 }
183
184 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
185 zone_numa_event_add(zone_numa_events[item], zone, item);
186}
187
188void fold_vm_numa_events(void)
189{
190 struct zone *zone;
191
192 for_each_populated_zone(zone)
193 fold_vm_zone_numa_events(zone);
194}
195#endif
196
2244b95a
CL
197#ifdef CONFIG_SMP
198
b44129b3 199int calculate_pressure_threshold(struct zone *zone)
88f5acf8
MG
200{
201 int threshold;
202 int watermark_distance;
203
204 /*
205 * As vmstats are not up to date, there is drift between the estimated
206 * and real values. For high thresholds and a high number of CPUs, it
207 * is possible for the min watermark to be breached while the estimated
208 * value looks fine. The pressure threshold is a reduced value such
209 * that even the maximum amount of drift will not accidentally breach
210 * the min watermark
211 */
212 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
213 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
214
215 /*
216 * Maximum threshold is 125
217 */
218 threshold = min(125, threshold);
219
220 return threshold;
221}
222
b44129b3 223int calculate_normal_threshold(struct zone *zone)
df9ecaba
CL
224{
225 int threshold;
226 int mem; /* memory in 128 MB units */
227
228 /*
229 * The threshold scales with the number of processors and the amount
230 * of memory per zone. More memory means that we can defer updates for
231 * longer, more processors could lead to more contention.
232 * fls() is used to have a cheap way of logarithmic scaling.
233 *
234 * Some sample thresholds:
235 *
ea15ba17 236 * Threshold Processors (fls) Zonesize fls(mem)+1
df9ecaba
CL
237 * ------------------------------------------------------------------
238 * 8 1 1 0.9-1 GB 4
239 * 16 2 2 0.9-1 GB 4
240 * 20 2 2 1-2 GB 5
241 * 24 2 2 2-4 GB 6
242 * 28 2 2 4-8 GB 7
243 * 32 2 2 8-16 GB 8
244 * 4 2 2 <128M 1
245 * 30 4 3 2-4 GB 5
246 * 48 4 3 8-16 GB 8
247 * 32 8 4 1-2 GB 4
248 * 32 8 4 0.9-1GB 4
249 * 10 16 5 <128M 1
250 * 40 16 5 900M 4
251 * 70 64 7 2-4 GB 5
252 * 84 64 7 4-8 GB 6
253 * 108 512 9 4-8 GB 6
254 * 125 1024 10 8-16 GB 8
255 * 125 1024 10 16-32 GB 9
256 */
257
9705bea5 258 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
df9ecaba
CL
259
260 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
261
262 /*
263 * Maximum threshold is 125
264 */
265 threshold = min(125, threshold);
266
267 return threshold;
268}
2244b95a
CL
269
270/*
df9ecaba 271 * Refresh the thresholds for each zone.
2244b95a 272 */
a6cccdc3 273void refresh_zone_stat_thresholds(void)
2244b95a 274{
75ef7184 275 struct pglist_data *pgdat;
df9ecaba
CL
276 struct zone *zone;
277 int cpu;
278 int threshold;
279
75ef7184
MG
280 /* Zero current pgdat thresholds */
281 for_each_online_pgdat(pgdat) {
282 for_each_online_cpu(cpu) {
283 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
284 }
285 }
286
ee99c71c 287 for_each_populated_zone(zone) {
75ef7184 288 struct pglist_data *pgdat = zone->zone_pgdat;
aa454840
CL
289 unsigned long max_drift, tolerate_drift;
290
b44129b3 291 threshold = calculate_normal_threshold(zone);
df9ecaba 292
75ef7184
MG
293 for_each_online_cpu(cpu) {
294 int pgdat_threshold;
295
28f836b6 296 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
99dcc3e5 297 = threshold;
1d90ca89 298
75ef7184
MG
299 /* Base nodestat threshold on the largest populated zone. */
300 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
301 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
302 = max(threshold, pgdat_threshold);
303 }
304
aa454840
CL
305 /*
306 * Only set percpu_drift_mark if there is a danger that
307 * NR_FREE_PAGES reports the low watermark is ok when in fact
308 * the min watermark could be breached by an allocation
309 */
310 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
311 max_drift = num_online_cpus() * threshold;
312 if (max_drift > tolerate_drift)
313 zone->percpu_drift_mark = high_wmark_pages(zone) +
314 max_drift;
df9ecaba 315 }
2244b95a
CL
316}
317
b44129b3
MG
318void set_pgdat_percpu_threshold(pg_data_t *pgdat,
319 int (*calculate_pressure)(struct zone *))
88f5acf8
MG
320{
321 struct zone *zone;
322 int cpu;
323 int threshold;
324 int i;
325
88f5acf8
MG
326 for (i = 0; i < pgdat->nr_zones; i++) {
327 zone = &pgdat->node_zones[i];
328 if (!zone->percpu_drift_mark)
329 continue;
330
b44129b3 331 threshold = (*calculate_pressure)(zone);
1d90ca89 332 for_each_online_cpu(cpu)
28f836b6 333 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
88f5acf8
MG
334 = threshold;
335 }
88f5acf8
MG
336}
337
2244b95a 338/*
bea04b07
JZ
339 * For use when we know that interrupts are disabled,
340 * or when we know that preemption is disabled and that
341 * particular counter cannot be updated from interrupt context.
2244b95a
CL
342 */
343void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 344 long delta)
2244b95a 345{
28f836b6 346 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92 347 s8 __percpu *p = pcp->vm_stat_diff + item;
2244b95a 348 long x;
12938a92
CL
349 long t;
350
c68ed794
IM
351 /*
352 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
353 * atomicity is provided by IRQs being disabled -- either explicitly
354 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
355 * CPU migrations and preemption potentially corrupts a counter so
356 * disable preemption.
357 */
358 if (IS_ENABLED(CONFIG_PREEMPT_RT))
359 preempt_disable();
360
12938a92 361 x = delta + __this_cpu_read(*p);
2244b95a 362
12938a92 363 t = __this_cpu_read(pcp->stat_threshold);
2244b95a 364
40610076 365 if (unlikely(abs(x) > t)) {
2244b95a
CL
366 zone_page_state_add(x, zone, item);
367 x = 0;
368 }
12938a92 369 __this_cpu_write(*p, x);
c68ed794
IM
370
371 if (IS_ENABLED(CONFIG_PREEMPT_RT))
372 preempt_enable();
2244b95a
CL
373}
374EXPORT_SYMBOL(__mod_zone_page_state);
375
75ef7184
MG
376void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
377 long delta)
378{
379 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
380 s8 __percpu *p = pcp->vm_node_stat_diff + item;
381 long x;
382 long t;
383
ea426c2a 384 if (vmstat_item_in_bytes(item)) {
629484ae
JW
385 /*
386 * Only cgroups use subpage accounting right now; at
387 * the global level, these items still change in
388 * multiples of whole pages. Store them as pages
389 * internally to keep the per-cpu counters compact.
390 */
ea426c2a
RG
391 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
392 delta >>= PAGE_SHIFT;
393 }
394
c68ed794
IM
395 /* See __mod_node_page_state */
396 if (IS_ENABLED(CONFIG_PREEMPT_RT))
397 preempt_disable();
398
75ef7184
MG
399 x = delta + __this_cpu_read(*p);
400
401 t = __this_cpu_read(pcp->stat_threshold);
402
40610076 403 if (unlikely(abs(x) > t)) {
75ef7184
MG
404 node_page_state_add(x, pgdat, item);
405 x = 0;
406 }
407 __this_cpu_write(*p, x);
c68ed794
IM
408
409 if (IS_ENABLED(CONFIG_PREEMPT_RT))
410 preempt_enable();
75ef7184
MG
411}
412EXPORT_SYMBOL(__mod_node_page_state);
413
2244b95a
CL
414/*
415 * Optimized increment and decrement functions.
416 *
417 * These are only for a single page and therefore can take a struct page *
418 * argument instead of struct zone *. This allows the inclusion of the code
419 * generated for page_zone(page) into the optimized functions.
420 *
421 * No overflow check is necessary and therefore the differential can be
422 * incremented or decremented in place which may allow the compilers to
423 * generate better code.
2244b95a
CL
424 * The increment or decrement is known and therefore one boundary check can
425 * be omitted.
426 *
df9ecaba
CL
427 * NOTE: These functions are very performance sensitive. Change only
428 * with care.
429 *
2244b95a
CL
430 * Some processors have inc/dec instructions that are atomic vs an interrupt.
431 * However, the code must first determine the differential location in a zone
432 * based on the processor number and then inc/dec the counter. There is no
433 * guarantee without disabling preemption that the processor will not change
434 * in between and therefore the atomicity vs. interrupt cannot be exploited
435 * in a useful way here.
436 */
c8785385 437void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 438{
28f836b6 439 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
440 s8 __percpu *p = pcp->vm_stat_diff + item;
441 s8 v, t;
2244b95a 442
c68ed794
IM
443 /* See __mod_node_page_state */
444 if (IS_ENABLED(CONFIG_PREEMPT_RT))
445 preempt_disable();
446
908ee0f1 447 v = __this_cpu_inc_return(*p);
12938a92
CL
448 t = __this_cpu_read(pcp->stat_threshold);
449 if (unlikely(v > t)) {
450 s8 overstep = t >> 1;
df9ecaba 451
12938a92
CL
452 zone_page_state_add(v + overstep, zone, item);
453 __this_cpu_write(*p, -overstep);
2244b95a 454 }
c68ed794
IM
455
456 if (IS_ENABLED(CONFIG_PREEMPT_RT))
457 preempt_enable();
2244b95a 458}
ca889e6c 459
75ef7184
MG
460void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
461{
462 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
463 s8 __percpu *p = pcp->vm_node_stat_diff + item;
464 s8 v, t;
465
ea426c2a
RG
466 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
467
c68ed794
IM
468 /* See __mod_node_page_state */
469 if (IS_ENABLED(CONFIG_PREEMPT_RT))
470 preempt_disable();
471
75ef7184
MG
472 v = __this_cpu_inc_return(*p);
473 t = __this_cpu_read(pcp->stat_threshold);
474 if (unlikely(v > t)) {
475 s8 overstep = t >> 1;
476
477 node_page_state_add(v + overstep, pgdat, item);
478 __this_cpu_write(*p, -overstep);
479 }
c68ed794
IM
480
481 if (IS_ENABLED(CONFIG_PREEMPT_RT))
482 preempt_enable();
75ef7184
MG
483}
484
ca889e6c
CL
485void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
486{
487 __inc_zone_state(page_zone(page), item);
488}
2244b95a
CL
489EXPORT_SYMBOL(__inc_zone_page_state);
490
75ef7184
MG
491void __inc_node_page_state(struct page *page, enum node_stat_item item)
492{
493 __inc_node_state(page_pgdat(page), item);
494}
495EXPORT_SYMBOL(__inc_node_page_state);
496
c8785385 497void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 498{
28f836b6 499 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
500 s8 __percpu *p = pcp->vm_stat_diff + item;
501 s8 v, t;
2244b95a 502
c68ed794
IM
503 /* See __mod_node_page_state */
504 if (IS_ENABLED(CONFIG_PREEMPT_RT))
505 preempt_disable();
506
908ee0f1 507 v = __this_cpu_dec_return(*p);
12938a92
CL
508 t = __this_cpu_read(pcp->stat_threshold);
509 if (unlikely(v < - t)) {
510 s8 overstep = t >> 1;
2244b95a 511
12938a92
CL
512 zone_page_state_add(v - overstep, zone, item);
513 __this_cpu_write(*p, overstep);
2244b95a 514 }
c68ed794
IM
515
516 if (IS_ENABLED(CONFIG_PREEMPT_RT))
517 preempt_enable();
2244b95a 518}
c8785385 519
75ef7184
MG
520void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
521{
522 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
523 s8 __percpu *p = pcp->vm_node_stat_diff + item;
524 s8 v, t;
525
ea426c2a
RG
526 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
527
c68ed794
IM
528 /* See __mod_node_page_state */
529 if (IS_ENABLED(CONFIG_PREEMPT_RT))
530 preempt_disable();
531
75ef7184
MG
532 v = __this_cpu_dec_return(*p);
533 t = __this_cpu_read(pcp->stat_threshold);
534 if (unlikely(v < - t)) {
535 s8 overstep = t >> 1;
536
537 node_page_state_add(v - overstep, pgdat, item);
538 __this_cpu_write(*p, overstep);
539 }
c68ed794
IM
540
541 if (IS_ENABLED(CONFIG_PREEMPT_RT))
542 preempt_enable();
75ef7184
MG
543}
544
c8785385
CL
545void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
546{
547 __dec_zone_state(page_zone(page), item);
548}
2244b95a
CL
549EXPORT_SYMBOL(__dec_zone_page_state);
550
75ef7184
MG
551void __dec_node_page_state(struct page *page, enum node_stat_item item)
552{
553 __dec_node_state(page_pgdat(page), item);
554}
555EXPORT_SYMBOL(__dec_node_page_state);
556
4156153c 557#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
7c839120
CL
558/*
559 * If we have cmpxchg_local support then we do not need to incur the overhead
560 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
561 *
562 * mod_state() modifies the zone counter state through atomic per cpu
563 * operations.
564 *
565 * Overstep mode specifies how overstep should handled:
566 * 0 No overstepping
567 * 1 Overstepping half of threshold
568 * -1 Overstepping minus half of threshold
569*/
75ef7184
MG
570static inline void mod_zone_state(struct zone *zone,
571 enum zone_stat_item item, long delta, int overstep_mode)
7c839120 572{
28f836b6 573 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
7c839120
CL
574 s8 __percpu *p = pcp->vm_stat_diff + item;
575 long o, n, t, z;
576
577 do {
578 z = 0; /* overflow to zone counters */
579
580 /*
581 * The fetching of the stat_threshold is racy. We may apply
582 * a counter threshold to the wrong the cpu if we get
d3bc2367
CL
583 * rescheduled while executing here. However, the next
584 * counter update will apply the threshold again and
585 * therefore bring the counter under the threshold again.
586 *
587 * Most of the time the thresholds are the same anyways
588 * for all cpus in a zone.
7c839120
CL
589 */
590 t = this_cpu_read(pcp->stat_threshold);
591
592 o = this_cpu_read(*p);
593 n = delta + o;
594
40610076 595 if (abs(n) > t) {
7c839120
CL
596 int os = overstep_mode * (t >> 1) ;
597
598 /* Overflow must be added to zone counters */
599 z = n + os;
600 n = -os;
601 }
602 } while (this_cpu_cmpxchg(*p, o, n) != o);
603
604 if (z)
605 zone_page_state_add(z, zone, item);
606}
607
608void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 609 long delta)
7c839120 610{
75ef7184 611 mod_zone_state(zone, item, delta, 0);
7c839120
CL
612}
613EXPORT_SYMBOL(mod_zone_page_state);
614
7c839120
CL
615void inc_zone_page_state(struct page *page, enum zone_stat_item item)
616{
75ef7184 617 mod_zone_state(page_zone(page), item, 1, 1);
7c839120
CL
618}
619EXPORT_SYMBOL(inc_zone_page_state);
620
621void dec_zone_page_state(struct page *page, enum zone_stat_item item)
622{
75ef7184 623 mod_zone_state(page_zone(page), item, -1, -1);
7c839120
CL
624}
625EXPORT_SYMBOL(dec_zone_page_state);
75ef7184
MG
626
627static inline void mod_node_state(struct pglist_data *pgdat,
628 enum node_stat_item item, int delta, int overstep_mode)
629{
630 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
631 s8 __percpu *p = pcp->vm_node_stat_diff + item;
632 long o, n, t, z;
633
ea426c2a 634 if (vmstat_item_in_bytes(item)) {
629484ae
JW
635 /*
636 * Only cgroups use subpage accounting right now; at
637 * the global level, these items still change in
638 * multiples of whole pages. Store them as pages
639 * internally to keep the per-cpu counters compact.
640 */
ea426c2a
RG
641 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
642 delta >>= PAGE_SHIFT;
643 }
644
75ef7184
MG
645 do {
646 z = 0; /* overflow to node counters */
647
648 /*
649 * The fetching of the stat_threshold is racy. We may apply
650 * a counter threshold to the wrong the cpu if we get
651 * rescheduled while executing here. However, the next
652 * counter update will apply the threshold again and
653 * therefore bring the counter under the threshold again.
654 *
655 * Most of the time the thresholds are the same anyways
656 * for all cpus in a node.
657 */
658 t = this_cpu_read(pcp->stat_threshold);
659
660 o = this_cpu_read(*p);
661 n = delta + o;
662
40610076 663 if (abs(n) > t) {
75ef7184
MG
664 int os = overstep_mode * (t >> 1) ;
665
666 /* Overflow must be added to node counters */
667 z = n + os;
668 n = -os;
669 }
670 } while (this_cpu_cmpxchg(*p, o, n) != o);
671
672 if (z)
673 node_page_state_add(z, pgdat, item);
674}
675
676void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
677 long delta)
678{
679 mod_node_state(pgdat, item, delta, 0);
680}
681EXPORT_SYMBOL(mod_node_page_state);
682
683void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
684{
685 mod_node_state(pgdat, item, 1, 1);
686}
687
688void inc_node_page_state(struct page *page, enum node_stat_item item)
689{
690 mod_node_state(page_pgdat(page), item, 1, 1);
691}
692EXPORT_SYMBOL(inc_node_page_state);
693
694void dec_node_page_state(struct page *page, enum node_stat_item item)
695{
696 mod_node_state(page_pgdat(page), item, -1, -1);
697}
698EXPORT_SYMBOL(dec_node_page_state);
7c839120
CL
699#else
700/*
701 * Use interrupt disable to serialize counter updates
702 */
703void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 704 long delta)
7c839120
CL
705{
706 unsigned long flags;
707
708 local_irq_save(flags);
709 __mod_zone_page_state(zone, item, delta);
710 local_irq_restore(flags);
711}
712EXPORT_SYMBOL(mod_zone_page_state);
713
2244b95a
CL
714void inc_zone_page_state(struct page *page, enum zone_stat_item item)
715{
716 unsigned long flags;
717 struct zone *zone;
2244b95a
CL
718
719 zone = page_zone(page);
720 local_irq_save(flags);
ca889e6c 721 __inc_zone_state(zone, item);
2244b95a
CL
722 local_irq_restore(flags);
723}
724EXPORT_SYMBOL(inc_zone_page_state);
725
726void dec_zone_page_state(struct page *page, enum zone_stat_item item)
727{
728 unsigned long flags;
2244b95a 729
2244b95a 730 local_irq_save(flags);
a302eb4e 731 __dec_zone_page_state(page, item);
2244b95a
CL
732 local_irq_restore(flags);
733}
734EXPORT_SYMBOL(dec_zone_page_state);
735
75ef7184
MG
736void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
737{
738 unsigned long flags;
739
740 local_irq_save(flags);
741 __inc_node_state(pgdat, item);
742 local_irq_restore(flags);
743}
744EXPORT_SYMBOL(inc_node_state);
745
746void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
747 long delta)
748{
749 unsigned long flags;
750
751 local_irq_save(flags);
752 __mod_node_page_state(pgdat, item, delta);
753 local_irq_restore(flags);
754}
755EXPORT_SYMBOL(mod_node_page_state);
756
757void inc_node_page_state(struct page *page, enum node_stat_item item)
758{
759 unsigned long flags;
760 struct pglist_data *pgdat;
761
762 pgdat = page_pgdat(page);
763 local_irq_save(flags);
764 __inc_node_state(pgdat, item);
765 local_irq_restore(flags);
766}
767EXPORT_SYMBOL(inc_node_page_state);
768
769void dec_node_page_state(struct page *page, enum node_stat_item item)
770{
771 unsigned long flags;
772
773 local_irq_save(flags);
774 __dec_node_page_state(page, item);
775 local_irq_restore(flags);
776}
777EXPORT_SYMBOL(dec_node_page_state);
778#endif
7cc36bbd
CL
779
780/*
781 * Fold a differential into the global counters.
782 * Returns the number of counters updated.
783 */
f19298b9 784static int fold_diff(int *zone_diff, int *node_diff)
3a321d2a
KW
785{
786 int i;
787 int changes = 0;
788
789 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
790 if (zone_diff[i]) {
791 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
792 changes++;
793 }
794
3a321d2a
KW
795 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
796 if (node_diff[i]) {
797 atomic_long_add(node_diff[i], &vm_node_stat[i]);
798 changes++;
799 }
800 return changes;
801}
f19298b9 802
2244b95a 803/*
2bb921e5 804 * Update the zone counters for the current cpu.
a7f75e25 805 *
4037d452
CL
806 * Note that refresh_cpu_vm_stats strives to only access
807 * node local memory. The per cpu pagesets on remote zones are placed
808 * in the memory local to the processor using that pageset. So the
809 * loop over all zones will access a series of cachelines local to
810 * the processor.
811 *
812 * The call to zone_page_state_add updates the cachelines with the
813 * statistics in the remote zone struct as well as the global cachelines
814 * with the global counters. These could cause remote node cache line
815 * bouncing and will have to be only done when necessary.
7cc36bbd
CL
816 *
817 * The function returns the number of global counters updated.
2244b95a 818 */
0eb77e98 819static int refresh_cpu_vm_stats(bool do_pagesets)
2244b95a 820{
75ef7184 821 struct pglist_data *pgdat;
2244b95a
CL
822 struct zone *zone;
823 int i;
75ef7184
MG
824 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
825 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
7cc36bbd 826 int changes = 0;
2244b95a 827
ee99c71c 828 for_each_populated_zone(zone) {
28f836b6
MG
829 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
830#ifdef CONFIG_NUMA
831 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
832#endif
2244b95a 833
fbc2edb0
CL
834 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
835 int v;
2244b95a 836
28f836b6 837 v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
fbc2edb0 838 if (v) {
a7f75e25 839
a7f75e25 840 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 841 global_zone_diff[i] += v;
4037d452
CL
842#ifdef CONFIG_NUMA
843 /* 3 seconds idle till flush */
28f836b6 844 __this_cpu_write(pcp->expire, 3);
4037d452 845#endif
2244b95a 846 }
fbc2edb0 847 }
4037d452 848#ifdef CONFIG_NUMA
3a321d2a 849
0eb77e98
CL
850 if (do_pagesets) {
851 cond_resched();
852 /*
853 * Deal with draining the remote pageset of this
854 * processor
855 *
856 * Check if there are pages remaining in this pageset
857 * if not then there is nothing to expire.
858 */
28f836b6
MG
859 if (!__this_cpu_read(pcp->expire) ||
860 !__this_cpu_read(pcp->count))
0eb77e98 861 continue;
4037d452 862
0eb77e98
CL
863 /*
864 * We never drain zones local to this processor.
865 */
866 if (zone_to_nid(zone) == numa_node_id()) {
28f836b6 867 __this_cpu_write(pcp->expire, 0);
0eb77e98
CL
868 continue;
869 }
4037d452 870
28f836b6 871 if (__this_cpu_dec_return(pcp->expire))
0eb77e98 872 continue;
4037d452 873
28f836b6
MG
874 if (__this_cpu_read(pcp->count)) {
875 drain_zone_pages(zone, this_cpu_ptr(pcp));
0eb77e98
CL
876 changes++;
877 }
7cc36bbd 878 }
4037d452 879#endif
2244b95a 880 }
75ef7184
MG
881
882 for_each_online_pgdat(pgdat) {
883 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
884
885 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
886 int v;
887
888 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
889 if (v) {
890 atomic_long_add(v, &pgdat->vm_stat[i]);
891 global_node_diff[i] += v;
892 }
893 }
894 }
895
896 changes += fold_diff(global_zone_diff, global_node_diff);
7cc36bbd 897 return changes;
2244b95a
CL
898}
899
2bb921e5
CL
900/*
901 * Fold the data for an offline cpu into the global array.
902 * There cannot be any access by the offline cpu and therefore
903 * synchronization is simplified.
904 */
905void cpu_vm_stats_fold(int cpu)
906{
75ef7184 907 struct pglist_data *pgdat;
2bb921e5
CL
908 struct zone *zone;
909 int i;
75ef7184
MG
910 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
911 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
2bb921e5
CL
912
913 for_each_populated_zone(zone) {
28f836b6 914 struct per_cpu_zonestat *pzstats;
2bb921e5 915
28f836b6 916 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bb921e5 917
f19298b9 918 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 919 if (pzstats->vm_stat_diff[i]) {
2bb921e5
CL
920 int v;
921
28f836b6
MG
922 v = pzstats->vm_stat_diff[i];
923 pzstats->vm_stat_diff[i] = 0;
2bb921e5 924 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 925 global_zone_diff[i] += v;
2bb921e5 926 }
f19298b9 927 }
3a321d2a 928#ifdef CONFIG_NUMA
f19298b9
MG
929 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
930 if (pzstats->vm_numa_event[i]) {
931 unsigned long v;
3a321d2a 932
f19298b9
MG
933 v = pzstats->vm_numa_event[i];
934 pzstats->vm_numa_event[i] = 0;
935 zone_numa_event_add(v, zone, i);
3a321d2a 936 }
f19298b9 937 }
3a321d2a 938#endif
2bb921e5
CL
939 }
940
75ef7184
MG
941 for_each_online_pgdat(pgdat) {
942 struct per_cpu_nodestat *p;
943
944 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
945
946 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
947 if (p->vm_node_stat_diff[i]) {
948 int v;
949
950 v = p->vm_node_stat_diff[i];
951 p->vm_node_stat_diff[i] = 0;
952 atomic_long_add(v, &pgdat->vm_stat[i]);
953 global_node_diff[i] += v;
954 }
955 }
956
957 fold_diff(global_zone_diff, global_node_diff);
2bb921e5
CL
958}
959
40f4b1ea
CS
960/*
961 * this is only called if !populated_zone(zone), which implies no other users of
f0953a1b 962 * pset->vm_stat_diff[] exist.
40f4b1ea 963 */
28f836b6 964void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
5a883813 965{
f19298b9 966 unsigned long v;
5a883813
MK
967 int i;
968
f19298b9 969 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 970 if (pzstats->vm_stat_diff[i]) {
f19298b9 971 v = pzstats->vm_stat_diff[i];
28f836b6 972 pzstats->vm_stat_diff[i] = 0;
f19298b9 973 zone_page_state_add(v, zone, i);
5a883813 974 }
f19298b9 975 }
3a321d2a
KW
976
977#ifdef CONFIG_NUMA
f19298b9
MG
978 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
979 if (pzstats->vm_numa_event[i]) {
980 v = pzstats->vm_numa_event[i];
981 pzstats->vm_numa_event[i] = 0;
982 zone_numa_event_add(v, zone, i);
3a321d2a 983 }
f19298b9 984 }
3a321d2a 985#endif
5a883813 986}
2244b95a
CL
987#endif
988
ca889e6c 989#ifdef CONFIG_NUMA
c2d42c16 990/*
75ef7184
MG
991 * Determine the per node value of a stat item. This function
992 * is called frequently in a NUMA machine, so try to be as
993 * frugal as possible.
c2d42c16 994 */
75ef7184
MG
995unsigned long sum_zone_node_page_state(int node,
996 enum zone_stat_item item)
c2d42c16
AM
997{
998 struct zone *zones = NODE_DATA(node)->node_zones;
e87d59f7
JK
999 int i;
1000 unsigned long count = 0;
c2d42c16 1001
e87d59f7
JK
1002 for (i = 0; i < MAX_NR_ZONES; i++)
1003 count += zone_page_state(zones + i, item);
1004
1005 return count;
c2d42c16
AM
1006}
1007
f19298b9
MG
1008/* Determine the per node value of a numa stat item. */
1009unsigned long sum_zone_numa_event_state(int node,
3a321d2a
KW
1010 enum numa_stat_item item)
1011{
1012 struct zone *zones = NODE_DATA(node)->node_zones;
3a321d2a 1013 unsigned long count = 0;
f19298b9 1014 int i;
3a321d2a
KW
1015
1016 for (i = 0; i < MAX_NR_ZONES; i++)
f19298b9 1017 count += zone_numa_event_state(zones + i, item);
3a321d2a
KW
1018
1019 return count;
1020}
1021
75ef7184
MG
1022/*
1023 * Determine the per node value of a stat item.
1024 */
ea426c2a
RG
1025unsigned long node_page_state_pages(struct pglist_data *pgdat,
1026 enum node_stat_item item)
75ef7184
MG
1027{
1028 long x = atomic_long_read(&pgdat->vm_stat[item]);
1029#ifdef CONFIG_SMP
1030 if (x < 0)
1031 x = 0;
1032#endif
1033 return x;
1034}
ea426c2a
RG
1035
1036unsigned long node_page_state(struct pglist_data *pgdat,
1037 enum node_stat_item item)
1038{
1039 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1040
1041 return node_page_state_pages(pgdat, item);
1042}
ca889e6c
CL
1043#endif
1044
d7a5752c 1045#ifdef CONFIG_COMPACTION
36deb0be 1046
d7a5752c
MG
1047struct contig_page_info {
1048 unsigned long free_pages;
1049 unsigned long free_blocks_total;
1050 unsigned long free_blocks_suitable;
1051};
1052
1053/*
1054 * Calculate the number of free pages in a zone, how many contiguous
1055 * pages are free and how many are large enough to satisfy an allocation of
1056 * the target size. Note that this function makes no attempt to estimate
1057 * how many suitable free blocks there *might* be if MOVABLE pages were
1058 * migrated. Calculating that is possible, but expensive and can be
1059 * figured out from userspace
1060 */
1061static void fill_contig_page_info(struct zone *zone,
1062 unsigned int suitable_order,
1063 struct contig_page_info *info)
1064{
1065 unsigned int order;
1066
1067 info->free_pages = 0;
1068 info->free_blocks_total = 0;
1069 info->free_blocks_suitable = 0;
1070
1071 for (order = 0; order < MAX_ORDER; order++) {
1072 unsigned long blocks;
1073
af1c31ac
LS
1074 /*
1075 * Count number of free blocks.
1076 *
1077 * Access to nr_free is lockless as nr_free is used only for
1078 * diagnostic purposes. Use data_race to avoid KCSAN warning.
1079 */
1080 blocks = data_race(zone->free_area[order].nr_free);
d7a5752c
MG
1081 info->free_blocks_total += blocks;
1082
1083 /* Count free base pages */
1084 info->free_pages += blocks << order;
1085
1086 /* Count the suitable free blocks */
1087 if (order >= suitable_order)
1088 info->free_blocks_suitable += blocks <<
1089 (order - suitable_order);
1090 }
1091}
f1a5ab12
MG
1092
1093/*
1094 * A fragmentation index only makes sense if an allocation of a requested
1095 * size would fail. If that is true, the fragmentation index indicates
1096 * whether external fragmentation or a lack of memory was the problem.
1097 * The value can be used to determine if page reclaim or compaction
1098 * should be used
1099 */
56de7263 1100static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
f1a5ab12
MG
1101{
1102 unsigned long requested = 1UL << order;
1103
88d6ac40
WY
1104 if (WARN_ON_ONCE(order >= MAX_ORDER))
1105 return 0;
1106
f1a5ab12
MG
1107 if (!info->free_blocks_total)
1108 return 0;
1109
1110 /* Fragmentation index only makes sense when a request would fail */
1111 if (info->free_blocks_suitable)
1112 return -1000;
1113
1114 /*
1115 * Index is between 0 and 1 so return within 3 decimal places
1116 *
1117 * 0 => allocation would fail due to lack of memory
1118 * 1 => allocation would fail due to fragmentation
1119 */
1120 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1121}
56de7263 1122
facdaa91
NG
1123/*
1124 * Calculates external fragmentation within a zone wrt the given order.
1125 * It is defined as the percentage of pages found in blocks of size
1126 * less than 1 << order. It returns values in range [0, 100].
1127 */
d34c0a75 1128unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
facdaa91
NG
1129{
1130 struct contig_page_info info;
1131
1132 fill_contig_page_info(zone, order, &info);
1133 if (info.free_pages == 0)
1134 return 0;
1135
1136 return div_u64((info.free_pages -
1137 (info.free_blocks_suitable << order)) * 100,
1138 info.free_pages);
1139}
1140
56de7263
MG
1141/* Same as __fragmentation index but allocs contig_page_info on stack */
1142int fragmentation_index(struct zone *zone, unsigned int order)
1143{
1144 struct contig_page_info info;
1145
1146 fill_contig_page_info(zone, order, &info);
1147 return __fragmentation_index(order, &info);
1148}
d7a5752c
MG
1149#endif
1150
ebc5d83d
KK
1151#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1152 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
fa25c503
KM
1153#ifdef CONFIG_ZONE_DMA
1154#define TEXT_FOR_DMA(xx) xx "_dma",
1155#else
1156#define TEXT_FOR_DMA(xx)
1157#endif
1158
1159#ifdef CONFIG_ZONE_DMA32
1160#define TEXT_FOR_DMA32(xx) xx "_dma32",
1161#else
1162#define TEXT_FOR_DMA32(xx)
1163#endif
1164
1165#ifdef CONFIG_HIGHMEM
1166#define TEXT_FOR_HIGHMEM(xx) xx "_high",
1167#else
1168#define TEXT_FOR_HIGHMEM(xx)
1169#endif
1170
a39c5d3c
HL
1171#ifdef CONFIG_ZONE_DEVICE
1172#define TEXT_FOR_DEVICE(xx) xx "_device",
1173#else
1174#define TEXT_FOR_DEVICE(xx)
1175#endif
1176
fa25c503 1177#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
a39c5d3c
HL
1178 TEXT_FOR_HIGHMEM(xx) xx "_movable", \
1179 TEXT_FOR_DEVICE(xx)
fa25c503
KM
1180
1181const char * const vmstat_text[] = {
8d92890b 1182 /* enum zone_stat_item counters */
fa25c503 1183 "nr_free_pages",
71c799f4
MK
1184 "nr_zone_inactive_anon",
1185 "nr_zone_active_anon",
1186 "nr_zone_inactive_file",
1187 "nr_zone_active_file",
1188 "nr_zone_unevictable",
5a1c84b4 1189 "nr_zone_write_pending",
fa25c503 1190 "nr_mlock",
fa25c503 1191 "nr_bounce",
91537fee
MK
1192#if IS_ENABLED(CONFIG_ZSMALLOC)
1193 "nr_zspages",
1194#endif
3a321d2a
KW
1195 "nr_free_cma",
1196
1197 /* enum numa_stat_item counters */
fa25c503
KM
1198#ifdef CONFIG_NUMA
1199 "numa_hit",
1200 "numa_miss",
1201 "numa_foreign",
1202 "numa_interleave",
1203 "numa_local",
1204 "numa_other",
1205#endif
09316c09 1206
9d7ea9a2 1207 /* enum node_stat_item counters */
599d0c95
MG
1208 "nr_inactive_anon",
1209 "nr_active_anon",
1210 "nr_inactive_file",
1211 "nr_active_file",
1212 "nr_unevictable",
385386cf
JW
1213 "nr_slab_reclaimable",
1214 "nr_slab_unreclaimable",
599d0c95
MG
1215 "nr_isolated_anon",
1216 "nr_isolated_file",
68d48e6a 1217 "workingset_nodes",
170b04b7
JK
1218 "workingset_refault_anon",
1219 "workingset_refault_file",
1220 "workingset_activate_anon",
1221 "workingset_activate_file",
1222 "workingset_restore_anon",
1223 "workingset_restore_file",
1e6b1085 1224 "workingset_nodereclaim",
50658e2e
MG
1225 "nr_anon_pages",
1226 "nr_mapped",
11fb9989
MG
1227 "nr_file_pages",
1228 "nr_dirty",
1229 "nr_writeback",
1230 "nr_writeback_temp",
1231 "nr_shmem",
1232 "nr_shmem_hugepages",
1233 "nr_shmem_pmdmapped",
60fbf0ab
SL
1234 "nr_file_hugepages",
1235 "nr_file_pmdmapped",
11fb9989 1236 "nr_anon_transparent_hugepages",
c4a25635
MG
1237 "nr_vmscan_write",
1238 "nr_vmscan_immediate_reclaim",
1239 "nr_dirtied",
1240 "nr_written",
8cd7c588 1241 "nr_throttled_written",
b29940c1 1242 "nr_kernel_misc_reclaimable",
1970dc6f
JH
1243 "nr_foll_pin_acquired",
1244 "nr_foll_pin_released",
991e7673
SB
1245 "nr_kernel_stack",
1246#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1247 "nr_shadow_call_stack",
1248#endif
f0c0c115 1249 "nr_page_table_pages",
b6038942
SB
1250#ifdef CONFIG_SWAP
1251 "nr_swapcached",
1252#endif
e39bb6be
HY
1253#ifdef CONFIG_NUMA_BALANCING
1254 "pgpromote_success",
1255#endif
599d0c95 1256
09316c09 1257 /* enum writeback_stat_item counters */
fa25c503
KM
1258 "nr_dirty_threshold",
1259 "nr_dirty_background_threshold",
1260
ebc5d83d 1261#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1262 /* enum vm_event_item counters */
fa25c503
KM
1263 "pgpgin",
1264 "pgpgout",
1265 "pswpin",
1266 "pswpout",
1267
1268 TEXTS_FOR_ZONES("pgalloc")
7cc30fcf
MG
1269 TEXTS_FOR_ZONES("allocstall")
1270 TEXTS_FOR_ZONES("pgskip")
fa25c503
KM
1271
1272 "pgfree",
1273 "pgactivate",
1274 "pgdeactivate",
f7ad2a6c 1275 "pglazyfree",
fa25c503
KM
1276
1277 "pgfault",
1278 "pgmajfault",
854e9ed0 1279 "pglazyfreed",
fa25c503 1280
599d0c95 1281 "pgrefill",
798a6b87 1282 "pgreuse",
599d0c95
MG
1283 "pgsteal_kswapd",
1284 "pgsteal_direct",
668e4147
YS
1285 "pgdemote_kswapd",
1286 "pgdemote_direct",
599d0c95
MG
1287 "pgscan_kswapd",
1288 "pgscan_direct",
68243e76 1289 "pgscan_direct_throttle",
497a6c1b
JW
1290 "pgscan_anon",
1291 "pgscan_file",
1292 "pgsteal_anon",
1293 "pgsteal_file",
fa25c503
KM
1294
1295#ifdef CONFIG_NUMA
1296 "zone_reclaim_failed",
1297#endif
1298 "pginodesteal",
1299 "slabs_scanned",
fa25c503
KM
1300 "kswapd_inodesteal",
1301 "kswapd_low_wmark_hit_quickly",
1302 "kswapd_high_wmark_hit_quickly",
fa25c503 1303 "pageoutrun",
fa25c503
KM
1304
1305 "pgrotated",
1306
5509a5d2
DH
1307 "drop_pagecache",
1308 "drop_slab",
8e675f7a 1309 "oom_kill",
5509a5d2 1310
03c5a6e1
MG
1311#ifdef CONFIG_NUMA_BALANCING
1312 "numa_pte_updates",
72403b4a 1313 "numa_huge_pte_updates",
03c5a6e1
MG
1314 "numa_hint_faults",
1315 "numa_hint_faults_local",
1316 "numa_pages_migrated",
1317#endif
5647bc29
MG
1318#ifdef CONFIG_MIGRATION
1319 "pgmigrate_success",
1320 "pgmigrate_fail",
1a5bae25
AK
1321 "thp_migration_success",
1322 "thp_migration_fail",
1323 "thp_migration_split",
5647bc29 1324#endif
fa25c503 1325#ifdef CONFIG_COMPACTION
397487db
MG
1326 "compact_migrate_scanned",
1327 "compact_free_scanned",
1328 "compact_isolated",
fa25c503
KM
1329 "compact_stall",
1330 "compact_fail",
1331 "compact_success",
698b1b30 1332 "compact_daemon_wake",
7f354a54
DR
1333 "compact_daemon_migrate_scanned",
1334 "compact_daemon_free_scanned",
fa25c503
KM
1335#endif
1336
1337#ifdef CONFIG_HUGETLB_PAGE
1338 "htlb_buddy_alloc_success",
1339 "htlb_buddy_alloc_fail",
bbb26920
MK
1340#endif
1341#ifdef CONFIG_CMA
1342 "cma_alloc_success",
1343 "cma_alloc_fail",
fa25c503
KM
1344#endif
1345 "unevictable_pgs_culled",
1346 "unevictable_pgs_scanned",
1347 "unevictable_pgs_rescued",
1348 "unevictable_pgs_mlocked",
1349 "unevictable_pgs_munlocked",
1350 "unevictable_pgs_cleared",
1351 "unevictable_pgs_stranded",
fa25c503
KM
1352
1353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1354 "thp_fault_alloc",
1355 "thp_fault_fallback",
85b9f46e 1356 "thp_fault_fallback_charge",
fa25c503
KM
1357 "thp_collapse_alloc",
1358 "thp_collapse_alloc_failed",
95ecedcd 1359 "thp_file_alloc",
dcdf11ee 1360 "thp_file_fallback",
85b9f46e 1361 "thp_file_fallback_charge",
95ecedcd 1362 "thp_file_mapped",
122afea9
KS
1363 "thp_split_page",
1364 "thp_split_page_failed",
f9719a03 1365 "thp_deferred_split_page",
122afea9 1366 "thp_split_pmd",
e9ea874a
YY
1367 "thp_scan_exceed_none_pte",
1368 "thp_scan_exceed_swap_pte",
1369 "thp_scan_exceed_share_pte",
ce9311cf
YX
1370#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1371 "thp_split_pud",
1372#endif
d8a8e1f0
KS
1373 "thp_zero_page_alloc",
1374 "thp_zero_page_alloc_failed",
225311a4 1375 "thp_swpout",
fe490cc0 1376 "thp_swpout_fallback",
fa25c503 1377#endif
09316c09
KK
1378#ifdef CONFIG_MEMORY_BALLOON
1379 "balloon_inflate",
1380 "balloon_deflate",
1381#ifdef CONFIG_BALLOON_COMPACTION
1382 "balloon_migrate",
1383#endif
1384#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1385#ifdef CONFIG_DEBUG_TLBFLUSH
9824cf97
DH
1386 "nr_tlb_remote_flush",
1387 "nr_tlb_remote_flush_received",
1388 "nr_tlb_local_flush_all",
1389 "nr_tlb_local_flush_one",
ec659934 1390#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1391
4f115147
DB
1392#ifdef CONFIG_DEBUG_VM_VMACACHE
1393 "vmacache_find_calls",
1394 "vmacache_find_hits",
1395#endif
cbc65df2
HY
1396#ifdef CONFIG_SWAP
1397 "swap_ra",
1398 "swap_ra_hit",
4d45c3af
YY
1399#ifdef CONFIG_KSM
1400 "ksm_swpin_copy",
1401#endif
cbc65df2 1402#endif
94bfe85b
YY
1403#ifdef CONFIG_KSM
1404 "cow_ksm",
1405#endif
f6498b77
JW
1406#ifdef CONFIG_ZSWAP
1407 "zswpin",
1408 "zswpout",
1409#endif
575299ea
S
1410#ifdef CONFIG_X86
1411 "direct_map_level2_splits",
1412 "direct_map_level3_splits",
1413#endif
ebc5d83d 1414#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1415};
ebc5d83d 1416#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1417
3c486871
AM
1418#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1419 defined(CONFIG_PROC_FS)
1420static void *frag_start(struct seq_file *m, loff_t *pos)
1421{
1422 pg_data_t *pgdat;
1423 loff_t node = *pos;
1424
1425 for (pgdat = first_online_pgdat();
1426 pgdat && node;
1427 pgdat = next_online_pgdat(pgdat))
1428 --node;
1429
1430 return pgdat;
1431}
1432
1433static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1434{
1435 pg_data_t *pgdat = (pg_data_t *)arg;
1436
1437 (*pos)++;
1438 return next_online_pgdat(pgdat);
1439}
1440
1441static void frag_stop(struct seq_file *m, void *arg)
1442{
1443}
1444
b2bd8598
DR
1445/*
1446 * Walk zones in a node and print using a callback.
1447 * If @assert_populated is true, only use callback for zones that are populated.
1448 */
3c486871 1449static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1450 bool assert_populated, bool nolock,
3c486871
AM
1451 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1452{
1453 struct zone *zone;
1454 struct zone *node_zones = pgdat->node_zones;
1455 unsigned long flags;
1456
1457 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1458 if (assert_populated && !populated_zone(zone))
3c486871
AM
1459 continue;
1460
727c080f
VM
1461 if (!nolock)
1462 spin_lock_irqsave(&zone->lock, flags);
3c486871 1463 print(m, pgdat, zone);
727c080f
VM
1464 if (!nolock)
1465 spin_unlock_irqrestore(&zone->lock, flags);
3c486871
AM
1466 }
1467}
1468#endif
1469
d7a5752c 1470#ifdef CONFIG_PROC_FS
467c996c
MG
1471static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1472 struct zone *zone)
1473{
1474 int order;
1475
1476 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1477 for (order = 0; order < MAX_ORDER; ++order)
af1c31ac
LS
1478 /*
1479 * Access to nr_free is lockless as nr_free is used only for
1480 * printing purposes. Use data_race to avoid KCSAN warning.
1481 */
1482 seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
467c996c
MG
1483 seq_putc(m, '\n');
1484}
1485
1486/*
1487 * This walks the free areas for each zone.
1488 */
1489static int frag_show(struct seq_file *m, void *arg)
1490{
1491 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1492 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
467c996c
MG
1493 return 0;
1494}
1495
1496static void pagetypeinfo_showfree_print(struct seq_file *m,
1497 pg_data_t *pgdat, struct zone *zone)
1498{
1499 int order, mtype;
1500
1501 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1502 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1503 pgdat->node_id,
1504 zone->name,
1505 migratetype_names[mtype]);
1506 for (order = 0; order < MAX_ORDER; ++order) {
1507 unsigned long freecount = 0;
1508 struct free_area *area;
1509 struct list_head *curr;
93b3a674 1510 bool overflow = false;
467c996c
MG
1511
1512 area = &(zone->free_area[order]);
1513
93b3a674
MH
1514 list_for_each(curr, &area->free_list[mtype]) {
1515 /*
1516 * Cap the free_list iteration because it might
1517 * be really large and we are under a spinlock
1518 * so a long time spent here could trigger a
1519 * hard lockup detector. Anyway this is a
1520 * debugging tool so knowing there is a handful
1521 * of pages of this order should be more than
1522 * sufficient.
1523 */
1524 if (++freecount >= 100000) {
1525 overflow = true;
1526 break;
1527 }
1528 }
1529 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1530 spin_unlock_irq(&zone->lock);
1531 cond_resched();
1532 spin_lock_irq(&zone->lock);
467c996c 1533 }
f6ac2354
CL
1534 seq_putc(m, '\n');
1535 }
467c996c
MG
1536}
1537
1538/* Print out the free pages at each order for each migatetype */
33090af9 1539static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
467c996c
MG
1540{
1541 int order;
1542 pg_data_t *pgdat = (pg_data_t *)arg;
1543
1544 /* Print header */
1545 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1546 for (order = 0; order < MAX_ORDER; ++order)
1547 seq_printf(m, "%6d ", order);
1548 seq_putc(m, '\n');
1549
727c080f 1550 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1551}
1552
1553static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1554 pg_data_t *pgdat, struct zone *zone)
1555{
1556 int mtype;
1557 unsigned long pfn;
1558 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1559 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1560 unsigned long count[MIGRATE_TYPES] = { 0, };
1561
1562 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1563 struct page *page;
1564
d336e94e
MH
1565 page = pfn_to_online_page(pfn);
1566 if (!page)
467c996c
MG
1567 continue;
1568
a91c43c7
JK
1569 if (page_zone(page) != zone)
1570 continue;
1571
467c996c
MG
1572 mtype = get_pageblock_migratetype(page);
1573
e80d6a24
MG
1574 if (mtype < MIGRATE_TYPES)
1575 count[mtype]++;
467c996c
MG
1576 }
1577
1578 /* Print counts */
1579 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1580 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1581 seq_printf(m, "%12lu ", count[mtype]);
1582 seq_putc(m, '\n');
1583}
1584
f113e641 1585/* Print out the number of pageblocks for each migratetype */
33090af9 1586static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
467c996c
MG
1587{
1588 int mtype;
1589 pg_data_t *pgdat = (pg_data_t *)arg;
1590
1591 seq_printf(m, "\n%-23s", "Number of blocks type ");
1592 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1593 seq_printf(m, "%12s ", migratetype_names[mtype]);
1594 seq_putc(m, '\n');
727c080f
VM
1595 walk_zones_in_node(m, pgdat, true, false,
1596 pagetypeinfo_showblockcount_print);
467c996c
MG
1597}
1598
48c96a36
JK
1599/*
1600 * Print out the number of pageblocks for each migratetype that contain pages
1601 * of other types. This gives an indication of how well fallbacks are being
1602 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1603 * to determine what is going on
1604 */
1605static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1606{
1607#ifdef CONFIG_PAGE_OWNER
1608 int mtype;
1609
7dd80b8a 1610 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1611 return;
1612
1613 drain_all_pages(NULL);
1614
1615 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1616 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1617 seq_printf(m, "%12s ", migratetype_names[mtype]);
1618 seq_putc(m, '\n');
1619
727c080f
VM
1620 walk_zones_in_node(m, pgdat, true, true,
1621 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1622#endif /* CONFIG_PAGE_OWNER */
1623}
1624
467c996c
MG
1625/*
1626 * This prints out statistics in relation to grouping pages by mobility.
1627 * It is expensive to collect so do not constantly read the file.
1628 */
1629static int pagetypeinfo_show(struct seq_file *m, void *arg)
1630{
1631 pg_data_t *pgdat = (pg_data_t *)arg;
1632
41b25a37 1633 /* check memoryless node */
a47b53c5 1634 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1635 return 0;
1636
467c996c
MG
1637 seq_printf(m, "Page block order: %d\n", pageblock_order);
1638 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1639 seq_putc(m, '\n');
1640 pagetypeinfo_showfree(m, pgdat);
1641 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1642 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1643
f6ac2354
CL
1644 return 0;
1645}
1646
8f32f7e5 1647static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1648 .start = frag_start,
1649 .next = frag_next,
1650 .stop = frag_stop,
1651 .show = frag_show,
1652};
1653
74e2e8e8 1654static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1655 .start = frag_start,
1656 .next = frag_next,
1657 .stop = frag_stop,
1658 .show = pagetypeinfo_show,
1659};
1660
e2ecc8a7
MG
1661static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1662{
1663 int zid;
1664
1665 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1666 struct zone *compare = &pgdat->node_zones[zid];
1667
1668 if (populated_zone(compare))
1669 return zone == compare;
1670 }
1671
e2ecc8a7
MG
1672 return false;
1673}
1674
467c996c
MG
1675static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1676 struct zone *zone)
f6ac2354 1677{
467c996c
MG
1678 int i;
1679 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1680 if (is_zone_first_populated(pgdat, zone)) {
1681 seq_printf(m, "\n per-node stats");
1682 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
69473e5d
MS
1683 unsigned long pages = node_page_state_pages(pgdat, i);
1684
1685 if (vmstat_item_print_in_thp(i))
1686 pages /= HPAGE_PMD_NR;
9d7ea9a2 1687 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1688 pages);
e2ecc8a7
MG
1689 }
1690 }
467c996c
MG
1691 seq_printf(m,
1692 "\n pages free %lu"
a6ea8b5b 1693 "\n boost %lu"
467c996c
MG
1694 "\n min %lu"
1695 "\n low %lu"
1696 "\n high %lu"
467c996c 1697 "\n spanned %lu"
9feedc9d 1698 "\n present %lu"
3c381db1
DH
1699 "\n managed %lu"
1700 "\n cma %lu",
88f5acf8 1701 zone_page_state(zone, NR_FREE_PAGES),
a6ea8b5b 1702 zone->watermark_boost,
41858966
MG
1703 min_wmark_pages(zone),
1704 low_wmark_pages(zone),
1705 high_wmark_pages(zone),
467c996c 1706 zone->spanned_pages,
9feedc9d 1707 zone->present_pages,
3c381db1
DH
1708 zone_managed_pages(zone),
1709 zone_cma_pages(zone));
467c996c 1710
467c996c 1711 seq_printf(m,
3484b2de 1712 "\n protection: (%ld",
467c996c
MG
1713 zone->lowmem_reserve[0]);
1714 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1715 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1716 seq_putc(m, ')');
1717
a8a4b7ae
BH
1718 /* If unpopulated, no other information is useful */
1719 if (!populated_zone(zone)) {
1720 seq_putc(m, '\n');
1721 return;
1722 }
1723
7dfb8bf3 1724 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1725 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1726 zone_page_state(zone, i));
7dfb8bf3 1727
3a321d2a 1728#ifdef CONFIG_NUMA
f19298b9 1729 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
9d7ea9a2 1730 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
f19298b9 1731 zone_numa_event_state(zone, i));
3a321d2a
KW
1732#endif
1733
7dfb8bf3 1734 seq_printf(m, "\n pagesets");
467c996c 1735 for_each_online_cpu(i) {
28f836b6
MG
1736 struct per_cpu_pages *pcp;
1737 struct per_cpu_zonestat __maybe_unused *pzstats;
467c996c 1738
28f836b6 1739 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
3dfa5721
CL
1740 seq_printf(m,
1741 "\n cpu: %i"
1742 "\n count: %i"
1743 "\n high: %i"
1744 "\n batch: %i",
1745 i,
28f836b6
MG
1746 pcp->count,
1747 pcp->high,
1748 pcp->batch);
df9ecaba 1749#ifdef CONFIG_SMP
28f836b6 1750 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
467c996c 1751 seq_printf(m, "\n vm stats threshold: %d",
28f836b6 1752 pzstats->stat_threshold);
df9ecaba 1753#endif
f6ac2354 1754 }
467c996c 1755 seq_printf(m,
599d0c95 1756 "\n node_unreclaimable: %u"
3a50d14d 1757 "\n start_pfn: %lu",
c73322d0 1758 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1759 zone->zone_start_pfn);
467c996c
MG
1760 seq_putc(m, '\n');
1761}
1762
1763/*
b2bd8598
DR
1764 * Output information about zones in @pgdat. All zones are printed regardless
1765 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1766 * set of all zones and userspace would not be aware of such zones if they are
1767 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1768 */
1769static int zoneinfo_show(struct seq_file *m, void *arg)
1770{
1771 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1772 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1773 return 0;
1774}
1775
5c9fe628 1776static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1777 .start = frag_start, /* iterate over all zones. The same as in
1778 * fragmentation. */
1779 .next = frag_next,
1780 .stop = frag_stop,
1781 .show = zoneinfo_show,
1782};
1783
9d7ea9a2 1784#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
f19298b9 1785 NR_VM_NUMA_EVENT_ITEMS + \
9d7ea9a2
KK
1786 NR_VM_NODE_STAT_ITEMS + \
1787 NR_VM_WRITEBACK_STAT_ITEMS + \
1788 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1789 NR_VM_EVENT_ITEMS : 0))
79da826a 1790
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i;

	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;

	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
	fold_vm_numa_events();
	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_zone_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
		v[i] = global_numa_event_state(i);
	v += NR_VM_NUMA_EVENT_ITEMS;
#endif

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
		v[i] = global_node_page_state_pages(i);
		if (vmstat_item_print_in_thp(i))
			v[i] /= HPAGE_PMD_NR;
	}
	v += NR_VM_NODE_STAT_ITEMS;

	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_puts(m, vmstat_text[off]);
	seq_put_decimal_ull(m, " ", *l);
	seq_putc(m, '\n');

	if (off == NR_VMSTAT_ITEMS - 1) {
		/*
		 * We've come to the end - add any deprecated counters to avoid
		 * breaking userspace which might depend on them being present.
		 */
		seq_puts(m, "nr_unstable 0\n");
	}
	return 0;
}
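
Each record that vmstat_show() emits is therefore "<counter name> <decimal value>\n", with the legacy "nr_unstable 0" line appended at the very end. A userspace consumer can rely on that layout; the reader below is an illustrative sketch (not taken from the kernel tree) that extracts one counter, using pgpgin, which the code above has already converted from sectors to kilobytes:

#include <stdio.h>
#include <string.h>

/* Minimal /proc/vmstat reader: print the value of one named counter. */
int main(void)
{
	char name[128];
	unsigned long long value;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fscanf(f, "%127s %llu", name, &value) == 2) {
		if (strcmp(name, "pgpgin") == 0)
			printf("pgpgin = %llu KiB\n", value);
	}
	fclose(f);
	return 0;
}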

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start	= vmstat_start,
	.next	= vmstat_next,
	.stop	= vmstat_stop,
	.show	= vmstat_show,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
	refresh_cpu_vm_stats(true);
}

int vmstat_refresh(struct ctl_table *table, int write,
		   void *buffer, size_t *lenp, loff_t *ppos)
{
	long val;
	int err;
	int i;

	/*
	 * The regular update, every sysctl_stat_interval, may come later
	 * than expected: leaving a significant amount in per_cpu buckets.
	 * This is particularly misleading when checking a quantity of HUGE
	 * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
	 * which can equally be echo'ed to or cat'ted from (by root),
	 * can be used to update the stats just before reading them.
	 *
	 * Oh, and since global_zone_page_state() etc. are so careful to hide
	 * transiently negative values, report an error here if any of
	 * the stats is negative, so we know to go looking for imbalance.
	 */
	err = schedule_on_each_cpu(refresh_vm_stats);
	if (err)
		return err;
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
		/*
		 * Skip checking stats known to go negative occasionally.
		 */
		switch (i) {
		case NR_ZONE_WRITE_PENDING:
		case NR_FREE_CMA_PAGES:
			continue;
		}
		val = atomic_long_read(&vm_zone_stat[i]);
		if (val < 0) {
			pr_warn("%s: %s %ld\n",
				__func__, zone_stat_name(i), val);
		}
	}
	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
		/*
		 * Skip checking stats known to go negative occasionally.
		 */
		switch (i) {
		case NR_WRITEBACK:
			continue;
		}
		val = atomic_long_read(&vm_node_stat[i]);
		if (val < 0) {
			pr_warn("%s: %s %ld\n",
				__func__, node_stat_name(i), val);
		}
	}
	if (write)
		*ppos += *lenp;
	else
		*lenp = 0;
	return 0;
}
#endif /* CONFIG_PROC_FS */
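
As the comment above notes, /proc/sys/vm/stat_refresh lets a privileged test fold the per-cpu deltas into the global counters immediately before sampling them; reading the file has the same side effect as writing to it. A minimal sketch of the write path (illustrative userspace code, not from the kernel tree), which could be combined with the /proc/vmstat reader shown earlier:

#include <stdio.h>

/*
 * Fold per-cpu vmstat deltas into the global counters right now, so that a
 * subsequent read of /proc/vmstat or /proc/zoneinfo is up to date.
 * Writing to /proc/sys/vm/stat_refresh requires root.
 */
int main(void)
{
	FILE *f = fopen("/proc/sys/vm/stat_refresh", "w");

	if (!f) {
		perror("/proc/sys/vm/stat_refresh");
		return 1;
	}
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}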

static void vmstat_update(struct work_struct *w)
{
	if (refresh_cpu_vm_stats(true)) {
		/*
		 * Counters were updated so we expect more updates
		 * to occur in the future. Keep on running the
		 * update worker thread.
		 */
		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
				this_cpu_ptr(&vmstat_work),
				round_jiffies_relative(sysctl_stat_interval));
	}
}

/*
 * Check if the diffs for a certain cpu indicate that
 * an update is needed.
 */
static bool need_update(int cpu)
{
	pg_data_t *last_pgdat = NULL;
	struct zone *zone;

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
		struct per_cpu_nodestat *n;

		/*
		 * The fast way of checking if there are any vmstat diffs.
		 */
		if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
			return true;

		if (last_pgdat == zone->zone_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;
		n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
		if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
			return true;
	}
	return false;
}
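
need_update() leans on memchr_inv(), a kernel string helper that returns NULL when every byte of the buffer equals the given value and a pointer to the first mismatching byte otherwise, so a whole per-cpu diff array is checked for pending deltas in one call. The sketch below demonstrates the same all-zero test in userspace with a hand-rolled stand-in, since memchr_inv() itself is kernel-only; all names here are illustrative:

#include <stddef.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's memchr_inv(): NULL if all bytes == c. */
static const void *memchr_inv_demo(const void *start, int c, size_t bytes)
{
	const unsigned char *p = start;
	size_t i;

	for (i = 0; i < bytes; i++)
		if (p[i] != (unsigned char)c)
			return p + i;
	return NULL;
}

int main(void)
{
	signed char vm_stat_diff[16] = { 0 };	/* stands in for a per-cpu diff array */

	printf("clean: %s\n",
	       memchr_inv_demo(vm_stat_diff, 0, sizeof(vm_stat_diff)) ?
	       "update needed" : "all zero");

	vm_stat_diff[5] = 3;	/* pretend one counter has a pending delta */
	printf("dirty: %s\n",
	       memchr_inv_demo(vm_stat_diff, 0, sizeof(vm_stat_diff)) ?
	       "update needed" : "all zero");
	return 0;
}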

/*
 * Switch off vmstat processing and then fold all the remaining differentials
 * until the diffs stay at zero. The function is used by NOHZ and can only be
 * invoked when tick processing is not active.
 */
void quiet_vmstat(void)
{
	if (system_state != SYSTEM_RUNNING)
		return;

	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
		return;

	if (!need_update(smp_processor_id()))
		return;

	/*
	 * Just refresh counters and do not care about the pending delayed
	 * vmstat_update. It doesn't fire often enough to matter, and
	 * cancelling it would be too expensive from this path.
	 * vmstat_shepherd will take care of that for us.
	 */
	refresh_cpu_vm_stats(false);
}

/*
 * Shepherd worker thread that checks the differentials of processors that
 * have their worker threads for vm statistics updates disabled because of
 * inactivity.
 */
static void vmstat_shepherd(struct work_struct *w);

static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);

static void vmstat_shepherd(struct work_struct *w)
{
	int cpu;

	cpus_read_lock();
	/* Check processors whose vmstat worker threads have been disabled */
	for_each_online_cpu(cpu) {
		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);

		if (!delayed_work_pending(dw) && need_update(cpu))
			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);

		cond_resched();
	}
	cpus_read_unlock();

	schedule_delayed_work(&shepherd,
		round_jiffies_relative(sysctl_stat_interval));
}

static void __init start_shepherd_timer(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
			vmstat_update);

	schedule_delayed_work(&shepherd,
		round_jiffies_relative(sysctl_stat_interval));
}

static void __init init_cpu_node_state(void)
{
	int node;

	for_each_online_node(node) {
		if (!cpumask_empty(cpumask_of_node(node)))
			node_set_state(node, N_CPU);
	}
}

static int vmstat_cpu_online(unsigned int cpu)
{
	refresh_zone_stat_thresholds();

	if (!node_state(cpu_to_node(cpu), N_CPU)) {
		node_set_state(cpu_to_node(cpu), N_CPU);
		set_migration_target_nodes();
	}

	return 0;
}

static int vmstat_cpu_down_prep(unsigned int cpu)
{
	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
	return 0;
}

static int vmstat_cpu_dead(unsigned int cpu)
{
	const struct cpumask *node_cpus;
	int node;

	node = cpu_to_node(cpu);

	refresh_zone_stat_thresholds();
	node_cpus = cpumask_of_node(node);
	if (!cpumask_empty(node_cpus))
		return 0;

	node_clear_state(node, N_CPU);
	set_migration_target_nodes();

	return 0;
}

#endif

struct workqueue_struct *mm_percpu_wq;

void __init init_mm_internals(void)
{
	int ret __maybe_unused;

	mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);

#ifdef CONFIG_SMP
	ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
					NULL, vmstat_cpu_dead);
	if (ret < 0)
		pr_err("vmstat: failed to register 'dead' hotplug state\n");

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
					vmstat_cpu_online,
					vmstat_cpu_down_prep);
	if (ret < 0)
		pr_err("vmstat: failed to register 'online' hotplug state\n");

	cpus_read_lock();
	init_cpu_node_state();
	cpus_read_unlock();

	start_shepherd_timer();
#endif
	migrate_on_reclaim_init();
#ifdef CONFIG_PROC_FS
	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
	proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
	proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
}

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)

/*
 * Return an index indicating how much of the available free memory is
 * unusable for an allocation of the requested size.
 */
static int unusable_free_index(unsigned int order,
				struct contig_page_info *info)
{
	/* No free memory is interpreted as all free memory is unusable */
	if (info->free_pages == 0)
		return 1000;

	/*
	 * Index should be a value between 0 and 1. Return a value to 3
	 * decimal places.
	 *
	 * 0 => no fragmentation
	 * 1 => high fragmentation
	 */
	return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);

}
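
A worked example of the index above, with made-up numbers: for an order-2 request against a zone with 1000 free pages, of which 200 free blocks are order 2 or larger, the suitable blocks cover 200 << 2 = 800 pages, so the index is (1000 - 800) * 1000 / 1000 = 200, which unusable_show_print() below renders as 0.200. The snippet restates that calculation in plain C purely for illustration:

#include <stdio.h>

/* Userspace restatement of unusable_free_index() for illustration only. */
static unsigned long unusable_free_index_demo(unsigned int order,
					      unsigned long free_pages,
					      unsigned long free_blocks_suitable)
{
	if (free_pages == 0)
		return 1000;	/* no free memory: all of it counts as unusable */
	return (free_pages - (free_blocks_suitable << order)) * 1000 / free_pages;
}

int main(void)
{
	unsigned long index = unusable_free_index_demo(2, 1000, 200);

	/* Same "%d.%03d" presentation that unusable_show_print() uses below. */
	printf("unusable index: %lu.%03lu\n", index / 1000, index % 1000);
	return 0;
}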

static void unusable_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = unusable_free_index(order, &info);
		seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display unusable free space index
 *
 * The unusable free space index measures how much of the available free
 * memory cannot be used to satisfy an allocation of a given size and is a
 * value between 0 and 1. The higher the value, the more of the free memory
 * is unusable and, by implication, the worse the external fragmentation is.
 * This can be expressed as a percentage by multiplying by 100.
 */
static int unusable_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	walk_zones_in_node(m, pgdat, true, false, unusable_show_print);

	return 0;
}

static const struct seq_operations unusable_sops = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= unusable_show,
};

DEFINE_SEQ_ATTRIBUTE(unusable);

static void extfrag_show_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	unsigned int order;
	int index;

	/* Alloc on stack as interrupts are disabled for zone walk */
	struct contig_page_info info;

	seq_printf(m, "Node %d, zone %8s ",
				pgdat->node_id,
				zone->name);
	for (order = 0; order < MAX_ORDER; ++order) {
		fill_contig_page_info(zone, order, &info);
		index = __fragmentation_index(order, &info);
		seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
	}

	seq_putc(m, '\n');
}

/*
 * Display fragmentation index for orders that allocations would fail for
 */
static int extfrag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);

	return 0;
}

static const struct seq_operations extfrag_sops = {
	.start	= frag_start,
	.next	= frag_next,
	.stop	= frag_stop,
	.show	= extfrag_show,
};

DEFINE_SEQ_ATTRIBUTE(extfrag);

static int __init extfrag_debug_init(void)
{
	struct dentry *extfrag_debug_root;

	extfrag_debug_root = debugfs_create_dir("extfrag", NULL);

	debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
			    &unusable_fops);

	debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
			    &extfrag_fops);

	return 0;
}

module_init(extfrag_debug_init);
#endif
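
Assuming debugfs is mounted at its conventional /sys/kernel/debug location (an assumption about the running system, not something this file controls), the two files created above appear as /sys/kernel/debug/extfrag/unusable_index and /sys/kernel/debug/extfrag/extfrag_index, with one line per node/zone pair. The reader below is an illustrative sketch for dumping both; root privileges and a mounted debugfs are assumed:

#include <stdio.h>

/* Dump one debugfs file verbatim to stdout. */
static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/sys/kernel/debug/extfrag/unusable_index");
	dump("/sys/kernel/debug/extfrag/extfrag_index");
	return 0;
}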