[mirror_ubuntu-hirsute-kernel.git] / mm / vmstat.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>

#include "internal.h"

#define NUMA_STATS_THRESHOLD (U16_MAX - 2)

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_stat[item], 0);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
						= 0;
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
		atomic_long_set(&vm_numa_stat[item], 0);
}

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
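
/*
 * Illustrative usage (editor's note, not part of the original source):
 * this handler backs the vm.numa_stat sysctl, so NUMA statistics
 * collection can be toggled at runtime from userspace, e.g.:
 *
 *	echo 0 > /proc/sys/vm/numa_stat		(disable and clear counters)
 *	echo 1 > /proc/sys/vm/numa_stat		(re-enable counters)
 *
 * The proc path is assumed from the usual sysctl naming convention
 * rather than taken from this file.
 */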
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	get_online_cpus();
	sum_vm_events(ret);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_numa_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
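
/*
 * Worked example (editor's note, not part of the original source): with a
 * watermark distance of 1000 pages and 8 online CPUs, the pressure
 * threshold is max(1, 1000 / 8) = 125, which the cap leaves unchanged;
 * with 64 CPUs it drops to 15, so the combined per-cpu drift can never
 * hide a breach of the min watermark.
 */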

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
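
/*
 * Worked example (editor's note, not part of the original source): for a
 * 4 GB zone on a 4-CPU machine, mem = 4 GB / 128 MB = 32, so
 * threshold = 2 * fls(4) * (1 + fls(32)) = 2 * 3 * 7 = 42 (below the 125
 * cap).  Each per-cpu diff may therefore stray by up to 42 before being
 * folded into the zone and global atomics.
 */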

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int cpu;
	int threshold;

	/* Zero current pgdat thresholds */
	for_each_online_pgdat(pgdat) {
		for_each_online_cpu(cpu) {
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
		}
	}

	for_each_populated_zone(zone) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu) {
			int pgdat_threshold;

			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;

			/* Base nodestat threshold on the largest populated zone. */
			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
				= max(threshold, pgdat_threshold);
		}

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
							= threshold;
	}
}

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			   long delta)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(x > t || x < -t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);

void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
				long delta)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long x;
	long t;

	if (vmstat_item_in_bytes(item)) {
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(x > t || x < -t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_node_page_state);

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v + overstep, pgdat, item);
		__this_cpu_write(*p, -overstep);
	}
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v - overstep, pgdat, item);
		__this_cpu_write(*p, overstep);
	}
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *     0       No overstepping
 *     1       Overstepping half of threshold
 *     -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
	enum zone_stat_item item, long delta, int overstep_mode)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long o, n, t, z;

	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (n > t || n < -t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		zone_page_state_add(z, zone, item);
}
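
/*
 * Worked example (editor's note, not part of the original source): with a
 * threshold t = 32, overstep_mode = 1 and a per-cpu diff already at 32, an
 * increment makes n = 33 > t, so z = 33 + 16 = 49 is folded into the zone
 * counter and the per-cpu diff restarts at -16.  Overstepping by t/2
 * roughly halves how often a steadily incrementing counter has to touch
 * the shared atomics.
 */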

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);

static inline void mod_node_state(struct pglist_data *pgdat,
	enum node_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long o, n, t, z;

	if (vmstat_item_in_bytes(item)) {
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	do {
		z = 0;  /* overflow to node counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a node.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		o = this_cpu_read(*p);
		n = delta + o;

		if (n > t || n < -t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to node counters */
			z = n + os;
			n = -os;
		}
	} while (this_cpu_cmpxchg(*p, o, n) != o);

	if (z)
		node_page_state_add(z, pgdat, item);
}

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	mod_node_state(pgdat, item, 1, 1);
}

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_node_page_state(pgdat, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;
	struct pglist_data *pgdat;

	pgdat = page_pgdat(page);
	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_node_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif

/*
 * Fold a differential into the global counters.
 * Returns the number of counters updated.
 */
#ifdef CONFIG_NUMA
static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
		}

	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		if (numa_diff[i]) {
			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
			changes++;
		}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
		}
	return changes;
}
#else
static int fold_diff(int *zone_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
		}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
		}
	return changes;
}
#endif /* CONFIG_NUMA */

/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
#ifdef CONFIG_NUMA
	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
#endif
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
	int changes = 0;

	for_each_populated_zone(zone) {
		struct per_cpu_pageset __percpu *p = zone->pageset;

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(p->expire, 3);
#endif
			}
		}
#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_numa_stat[i]);
				global_numa_diff[i] += v;
				__this_cpu_write(p->expire, 3);
			}
		}

		if (do_pagesets) {
			cond_resched();
			/*
			 * Deal with draining the remote pageset of this
			 * processor
			 *
			 * Check if there are pages remaining in this pageset
			 * if not then there is nothing to expire.
			 */
			if (!__this_cpu_read(p->expire) ||
			    !__this_cpu_read(p->pcp.count))
				continue;

			/*
			 * We never drain zones local to this processor.
			 */
			if (zone_to_nid(zone) == numa_node_id()) {
				__this_cpu_write(p->expire, 0);
				continue;
			}

			if (__this_cpu_dec_return(p->expire))
				continue;

			if (__this_cpu_read(p->pcp.count)) {
				drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
				changes++;
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
		}
	}

#ifdef CONFIG_NUMA
	changes += fold_diff(global_zone_diff, global_numa_diff,
			     global_node_diff);
#else
	changes += fold_diff(global_zone_diff, global_node_diff);
#endif
	return changes;
}

/*
 * Fold the data for an offline cpu into the global array.
 * There cannot be any access by the offline cpu and therefore
 * synchronization is simplified.
 */
void cpu_vm_stats_fold(int cpu)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
#ifdef CONFIG_NUMA
	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
#endif
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_pageset *p;

		p = per_cpu_ptr(zone->pageset, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
			if (p->vm_stat_diff[i]) {
				int v;

				v = p->vm_stat_diff[i];
				p->vm_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
			}

#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
			if (p->vm_numa_stat_diff[i]) {
				int v;

				v = p->vm_numa_stat_diff[i];
				p->vm_numa_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_numa_stat[i]);
				global_numa_diff[i] += v;
			}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			if (p->vm_node_stat_diff[i]) {
				int v;

				v = p->vm_node_stat_diff[i];
				p->vm_node_stat_diff[i] = 0;
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
	}

#ifdef CONFIG_NUMA
	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
#else
	fold_diff(global_zone_diff, global_node_diff);
#endif
}

/*
 * this is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
{
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			int v = pset->vm_stat_diff[i];
			pset->vm_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_stat[i]);
			atomic_long_add(v, &vm_zone_stat[i]);
		}

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		if (pset->vm_numa_stat_diff[i]) {
			int v = pset->vm_numa_stat_diff[i];

			pset->vm_numa_stat_diff[i] = 0;
			atomic_long_add(v, &zone->vm_numa_stat[i]);
			atomic_long_add(v, &vm_numa_stat[i]);
		}
#endif
}
#endif

#ifdef CONFIG_NUMA
void __inc_numa_state(struct zone *zone,
				 enum numa_stat_item item)
{
	struct per_cpu_pageset __percpu *pcp = zone->pageset;
	u16 __percpu *p = pcp->vm_numa_stat_diff + item;
	u16 v;

	v = __this_cpu_inc_return(*p);

	if (unlikely(v > NUMA_STATS_THRESHOLD)) {
		zone_numa_state_add(v, zone, item);
		__this_cpu_write(*p, 0);
	}
}

/*
 * Determine the per node value of a stat item. This function
 * is called frequently in a NUMA machine, so try to be as
 * frugal as possible.
 */
unsigned long sum_zone_node_page_state(int node,
				 enum zone_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_page_state(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a numa stat item. To avoid deviation,
 * the per cpu stat number in vm_numa_stat_diff[] is also included.
 */
unsigned long sum_zone_numa_state(int node,
				 enum numa_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_numa_state_snapshot(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a stat item.
 */
unsigned long node_page_state_pages(struct pglist_data *pgdat,
				    enum node_stat_item item)
{
	long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return node_page_state_pages(pgdat, item);
}
#endif

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < MAX_ORDER; order++) {
		unsigned long blocks;

		/* Count number of free blocks */
		blocks = zone->free_area[order].nr_free;
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (WARN_ON_ONCE(order >= MAX_ORDER))
		return 0;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
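
/*
 * Worked example (editor's note, not part of the original source): for an
 * order-4 request (16 pages) with 100 free blocks totalling 400 free
 * pages and no block of order >= 4, the index is
 * 1000 - (1000 + 400 * 1000 / 16) / 100 = 1000 - 260 = 740, i.e. the
 * failure is mostly due to external fragmentation rather than a plain
 * lack of memory, so compaction is more useful than reclaim.
 */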

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	if (info.free_pages == 0)
		return 0;

	return div_u64((info.free_pages -
			(info.free_blocks_suitable << order)) * 100,
			info.free_pages);
}
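
/*
 * Worked example (editor's note, not part of the original source): if a
 * zone has 400 free pages of which 5 order-4 blocks (80 pages) could
 * satisfy an order-4 request, extfrag_for_order(zone, 4) returns
 * (400 - 80) * 100 / 400 = 80, i.e. 80% of the free memory sits in blocks
 * too small for that allocation.
 */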

/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",

const char * const vmstat_text[] = {
	/* enum zone_stat_item counters */
	"nr_free_pages",
	"nr_zone_inactive_anon",
	"nr_zone_active_anon",
	"nr_zone_inactive_file",
	"nr_zone_active_file",
	"nr_zone_unevictable",
	"nr_zone_write_pending",
	"nr_mlock",
	"nr_page_table_pages",
	"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
	"nr_zspages",
#endif
	"nr_free_cma",

	/* enum numa_stat_item counters */
#ifdef CONFIG_NUMA
	"numa_hit",
	"numa_miss",
	"numa_foreign",
	"numa_interleave",
	"numa_local",
	"numa_other",
#endif

	/* enum node_stat_item counters */
	"nr_inactive_anon",
	"nr_active_anon",
	"nr_inactive_file",
	"nr_active_file",
	"nr_unevictable",
	"nr_slab_reclaimable",
	"nr_slab_unreclaimable",
	"nr_isolated_anon",
	"nr_isolated_file",
	"workingset_nodes",
	"workingset_refault_anon",
	"workingset_refault_file",
	"workingset_activate_anon",
	"workingset_activate_file",
	"workingset_restore_anon",
	"workingset_restore_file",
	"workingset_nodereclaim",
	"nr_anon_pages",
	"nr_mapped",
	"nr_file_pages",
	"nr_dirty",
	"nr_writeback",
	"nr_writeback_temp",
	"nr_shmem",
	"nr_shmem_hugepages",
	"nr_shmem_pmdmapped",
	"nr_file_hugepages",
	"nr_file_pmdmapped",
	"nr_anon_transparent_hugepages",
	"nr_vmscan_write",
	"nr_vmscan_immediate_reclaim",
	"nr_dirtied",
	"nr_written",
	"nr_kernel_misc_reclaimable",
	"nr_foll_pin_acquired",
	"nr_foll_pin_released",
	"nr_kernel_stack",
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
	"nr_shadow_call_stack",
#endif

	/* enum writeback_stat_item counters */
	"nr_dirty_threshold",
	"nr_dirty_background_threshold",

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
	/* enum vm_event_item counters */
	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	TEXTS_FOR_ZONES("pgalloc")
	TEXTS_FOR_ZONES("allocstall")
	TEXTS_FOR_ZONES("pgskip")

	"pgfree",
	"pgactivate",
	"pgdeactivate",
	"pglazyfree",

	"pgfault",
	"pgmajfault",
	"pglazyfreed",

	"pgrefill",
	"pgreuse",
	"pgsteal_kswapd",
	"pgsteal_direct",
	"pgscan_kswapd",
	"pgscan_direct",
	"pgscan_direct_throttle",
	"pgscan_anon",
	"pgscan_file",
	"pgsteal_anon",
	"pgsteal_file",

#ifdef CONFIG_NUMA
	"zone_reclaim_failed",
#endif
	"pginodesteal",
	"slabs_scanned",
	"kswapd_inodesteal",
	"kswapd_low_wmark_hit_quickly",
	"kswapd_high_wmark_hit_quickly",
	"pageoutrun",

	"pgrotated",

	"drop_pagecache",
	"drop_slab",
	"oom_kill",

#ifdef CONFIG_NUMA_BALANCING
	"numa_pte_updates",
	"numa_huge_pte_updates",
	"numa_hint_faults",
	"numa_hint_faults_local",
	"numa_pages_migrated",
#endif
#ifdef CONFIG_MIGRATION
	"pgmigrate_success",
	"pgmigrate_fail",
	"thp_migration_success",
	"thp_migration_fail",
	"thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
	"compact_migrate_scanned",
	"compact_free_scanned",
	"compact_isolated",
	"compact_stall",
	"compact_fail",
	"compact_success",
	"compact_daemon_wake",
	"compact_daemon_migrate_scanned",
	"compact_daemon_free_scanned",
#endif

#ifdef CONFIG_HUGETLB_PAGE
	"htlb_buddy_alloc_success",
	"htlb_buddy_alloc_fail",
#endif
	"unevictable_pgs_culled",
	"unevictable_pgs_scanned",
	"unevictable_pgs_rescued",
	"unevictable_pgs_mlocked",
	"unevictable_pgs_munlocked",
	"unevictable_pgs_cleared",
	"unevictable_pgs_stranded",

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	"thp_fault_alloc",
	"thp_fault_fallback",
	"thp_fault_fallback_charge",
	"thp_collapse_alloc",
	"thp_collapse_alloc_failed",
	"thp_file_alloc",
	"thp_file_fallback",
	"thp_file_fallback_charge",
	"thp_file_mapped",
	"thp_split_page",
	"thp_split_page_failed",
	"thp_deferred_split_page",
	"thp_split_pmd",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
	"thp_split_pud",
#endif
	"thp_zero_page_alloc",
	"thp_zero_page_alloc_failed",
	"thp_swpout",
	"thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
	"balloon_inflate",
	"balloon_deflate",
#ifdef CONFIG_BALLOON_COMPACTION
	"balloon_migrate",
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */

#ifdef CONFIG_DEBUG_VM_VMACACHE
	"vmacache_find_calls",
	"vmacache_find_hits",
#endif
#ifdef CONFIG_SWAP
	"swap_ra",
	"swap_ra_hit",
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */

#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
     defined(CONFIG_PROC_FS)
static void *frag_start(struct seq_file *m, loff_t *pos)
{
	pg_data_t *pgdat;
	loff_t node = *pos;

	for (pgdat = first_online_pgdat();
	     pgdat && node;
	     pgdat = next_online_pgdat(pgdat))
		--node;

	return pgdat;
}

static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	(*pos)++;
	return next_online_pgdat(pgdat);
}

static void frag_stop(struct seq_file *m, void *arg)
{
}

/*
 * Walk zones in a node and print using a callback.
 * If @assert_populated is true, only use callback for zones that are populated.
 */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
		bool assert_populated, bool nolock,
		void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
	struct zone *zone;
	struct zone *node_zones = pgdat->node_zones;
	unsigned long flags;

	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
		if (assert_populated && !populated_zone(zone))
			continue;

		if (!nolock)
			spin_lock_irqsave(&zone->lock, flags);
		print(m, pgdat, zone);
		if (!nolock)
			spin_unlock_irqrestore(&zone->lock, flags);
	}
}
#endif

#ifdef CONFIG_PROC_FS
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
						struct zone *zone)
{
	int order;

	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
	seq_putc(m, '\n');
}

/*
 * This walks the free areas for each zone.
 */
static int frag_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, true, false, frag_show_print);
	return 0;
}

static void pagetypeinfo_showfree_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int order, mtype;

	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
		seq_printf(m, "Node %4d, zone %8s, type %12s ",
					pgdat->node_id,
					zone->name,
					migratetype_names[mtype]);
		for (order = 0; order < MAX_ORDER; ++order) {
			unsigned long freecount = 0;
			struct free_area *area;
			struct list_head *curr;
			bool overflow = false;

			area = &(zone->free_area[order]);

			list_for_each(curr, &area->free_list[mtype]) {
				/*
				 * Cap the free_list iteration because it might
				 * be really large and we are under a spinlock
				 * so a long time spent here could trigger a
				 * hard lockup detector. Anyway this is a
				 * debugging tool so knowing there is a handful
				 * of pages of this order should be more than
				 * sufficient.
				 */
				if (++freecount >= 100000) {
					overflow = true;
					break;
				}
			}
			seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
			spin_unlock_irq(&zone->lock);
			cond_resched();
			spin_lock_irq(&zone->lock);
		}
		seq_putc(m, '\n');
	}
}

/* Print out the free pages at each order for each migratetype */
static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
{
	int order;
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* Print header */
	seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
	for (order = 0; order < MAX_ORDER; ++order)
		seq_printf(m, "%6d ", order);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);

	return 0;
}

static void pagetypeinfo_showblockcount_print(struct seq_file *m,
					pg_data_t *pgdat, struct zone *zone)
{
	int mtype;
	unsigned long pfn;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long count[MIGRATE_TYPES] = { 0, };

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		page = pfn_to_online_page(pfn);
		if (!page)
			continue;

		/* Watch for unexpected holes punched in the memmap */
		if (!memmap_valid_within(pfn, page, zone))
			continue;

		if (page_zone(page) != zone)
			continue;

		mtype = get_pageblock_migratetype(page);

		if (mtype < MIGRATE_TYPES)
			count[mtype]++;
	}

	/* Print counts */
	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12lu ", count[mtype]);
	seq_putc(m, '\n');
}

/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
	int mtype;
	pg_data_t *pgdat = (pg_data_t *)arg;

	seq_printf(m, "\n%-23s", "Number of blocks type ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');
	walk_zones_in_node(m, pgdat, true, false,
		pagetypeinfo_showblockcount_print);

	return 0;
}

/*
 * Print out the number of pageblocks for each migratetype that contain pages
 * of other types. This gives an indication of how well fallbacks are being
 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
 * to determine what is going on
 */
static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
{
#ifdef CONFIG_PAGE_OWNER
	int mtype;

	if (!static_branch_unlikely(&page_owner_inited))
		return;

	drain_all_pages(NULL);

	seq_printf(m, "\n%-23s", "Number of mixed blocks ");
	for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
		seq_printf(m, "%12s ", migratetype_names[mtype]);
	seq_putc(m, '\n');

	walk_zones_in_node(m, pgdat, true, true,
		pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}

/*
 * This prints out statistics in relation to grouping pages by mobility.
 * It is expensive to collect so do not constantly read the file.
 */
static int pagetypeinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;

	/* check memoryless node */
	if (!node_state(pgdat->node_id, N_MEMORY))
		return 0;

	seq_printf(m, "Page block order: %d\n", pageblock_order);
	seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
	seq_putc(m, '\n');
	pagetypeinfo_showfree(m, pgdat);
	pagetypeinfo_showblockcount(m, pgdat);
	pagetypeinfo_showmixedcount(m, pgdat);

	return 0;
}

static const struct seq_operations fragmentation_op = {
	.start = frag_start,
	.next = frag_next,
	.stop = frag_stop,
	.show = frag_show,
};

static const struct seq_operations pagetypeinfo_op = {
	.start = frag_start,
	.next = frag_next,
	.stop = frag_stop,
	.show = pagetypeinfo_show,
};

static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
{
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct zone *compare = &pgdat->node_zones[zid];

		if (populated_zone(compare))
			return zone == compare;
	}

	return false;
}

static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
							struct zone *zone)
{
	int i;
	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
	if (is_zone_first_populated(pgdat, zone)) {
		seq_printf(m, "\n per-node stats");
		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			seq_printf(m, "\n %-12s %lu", node_stat_name(i),
				   node_page_state_pages(pgdat, i));
		}
	}
	seq_printf(m,
		   "\n pages free %lu"
		   "\n min %lu"
		   "\n low %lu"
		   "\n high %lu"
		   "\n spanned %lu"
		   "\n present %lu"
		   "\n managed %lu",
		   zone_page_state(zone, NR_FREE_PAGES),
		   min_wmark_pages(zone),
		   low_wmark_pages(zone),
		   high_wmark_pages(zone),
		   zone->spanned_pages,
		   zone->present_pages,
		   zone_managed_pages(zone));

	seq_printf(m,
		   "\n protection: (%ld",
		   zone->lowmem_reserve[0]);
	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
	seq_putc(m, ')');

	/* If unpopulated, no other information is useful */
	if (!populated_zone(zone)) {
		seq_putc(m, '\n');
		return;
	}

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
			   zone_page_state(zone, i));

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
			   zone_numa_state_snapshot(zone, i));
#endif

	seq_printf(m, "\n pagesets");
	for_each_online_cpu(i) {
		struct per_cpu_pageset *pageset;

		pageset = per_cpu_ptr(zone->pageset, i);
		seq_printf(m,
			   "\n cpu: %i"
			   "\n count: %i"
			   "\n high: %i"
			   "\n batch: %i",
			   i,
			   pageset->pcp.count,
			   pageset->pcp.high,
			   pageset->pcp.batch);
#ifdef CONFIG_SMP
		seq_printf(m, "\n vm stats threshold: %d",
				pageset->stat_threshold);
#endif
	}
	seq_printf(m,
		   "\n node_unreclaimable: %u"
		   "\n start_pfn: %lu",
		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
		   zone->zone_start_pfn);
	seq_putc(m, '\n');
}

/*
 * Output information about zones in @pgdat. All zones are printed regardless
 * of whether they are populated or not: lowmem_reserve_ratio operates on the
 * set of all zones and userspace would not be aware of such zones if they are
 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
 */
static int zoneinfo_show(struct seq_file *m, void *arg)
{
	pg_data_t *pgdat = (pg_data_t *)arg;
	walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
	return 0;
}

static const struct seq_operations zoneinfo_op = {
	.start = frag_start, /* iterate over all zones. The same as in
			      * fragmentation. */
	.next = frag_next,
	.stop = frag_stop,
	.show = zoneinfo_show,
};

#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_STAT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_WRITEBACK_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))

static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
	unsigned long *v;
	int i;

	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;

	BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
	v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
	m->private = v;
	if (!v)
		return ERR_PTR(-ENOMEM);
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		v[i] = global_zone_page_state(i);
	v += NR_VM_ZONE_STAT_ITEMS;

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		v[i] = global_numa_state(i);
	v += NR_VM_NUMA_STAT_ITEMS;
#endif

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		v[i] = global_node_page_state_pages(i);
	v += NR_VM_NODE_STAT_ITEMS;

	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
			    v + NR_DIRTY_THRESHOLD);
	v += NR_VM_WRITEBACK_STAT_ITEMS;

#ifdef CONFIG_VM_EVENT_COUNTERS
	all_vm_events(v);
	v[PGPGIN] /= 2;		/* sectors -> kbytes */
	v[PGPGOUT] /= 2;
#endif
	return (unsigned long *)m->private + *pos;
}

static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
	(*pos)++;
	if (*pos >= NR_VMSTAT_ITEMS)
		return NULL;
	return (unsigned long *)m->private + *pos;
}

static int vmstat_show(struct seq_file *m, void *arg)
{
	unsigned long *l = arg;
	unsigned long off = l - (unsigned long *)m->private;

	seq_puts(m, vmstat_text[off]);
	seq_put_decimal_ull(m, " ", *l);
	seq_putc(m, '\n');

	if (off == NR_VMSTAT_ITEMS - 1) {
		/*
		 * We've come to the end - add any deprecated counters to avoid
		 * breaking userspace which might depend on them being present.
		 */
		seq_puts(m, "nr_unstable 0\n");
	}
	return 0;
}

static void vmstat_stop(struct seq_file *m, void *arg)
{
	kfree(m->private);
	m->private = NULL;
}

static const struct seq_operations vmstat_op = {
	.start = vmstat_start,
	.next = vmstat_next,
	.stop = vmstat_stop,
	.show = vmstat_show,
};
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;

#ifdef CONFIG_PROC_FS
static void refresh_vm_stats(struct work_struct *work)
{
	refresh_cpu_vm_stats(true);
}

int vmstat_refresh(struct ctl_table *table, int write,
		   void *buffer, size_t *lenp, loff_t *ppos)
{
	long val;
	int err;
	int i;

	/*
	 * The regular update, every sysctl_stat_interval, may come later
	 * than expected: leaving a significant amount in per_cpu buckets.
	 * This is particularly misleading when checking a quantity of HUGE
	 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
	 * which can equally be echo'ed to or cat'ted from (by root),
	 * can be used to update the stats just before reading them.
	 *
	 * Oh, and since global_zone_page_state() etc. are so careful to hide
	 * transiently negative values, report an error here if any of
	 * the stats is negative, so we know to go looking for imbalance.
	 */
	err = schedule_on_each_cpu(refresh_vm_stats);
	if (err)
		return err;
	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
		val = atomic_long_read(&vm_zone_stat[i]);
		if (val < 0) {
			pr_warn("%s: %s %ld\n",
				__func__, zone_stat_name(i), val);
			err = -EINVAL;
		}
	}
#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
		val = atomic_long_read(&vm_numa_stat[i]);
		if (val < 0) {
			pr_warn("%s: %s %ld\n",
				__func__, numa_stat_name(i), val);
			err = -EINVAL;
		}
	}
#endif
	if (err)
		return err;
	if (write)
		*ppos += *lenp;
	else
		*lenp = 0;
	return 0;
}
1862#endif /* CONFIG_PROC_FS */
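
As the comment in vmstat_refresh() says, /proc/sys/vm/stat_refresh can be written to (or read from) by root to fold the per-cpu diffs just before sampling the counters. A hedged user-space sketch of that sequence, with error handling kept minimal:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char line[256];
	FILE *f;
	int fd;

	/* Any write folds the per-cpu diffs into the global counters. */
	fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);
	if (fd < 0 || write(fd, "1\n", 2) < 0)
		perror("stat_refresh");	/* needs root; EINVAL if a counter went negative */
	if (fd >= 0)
		close(fd);

	/* The freshly folded counters are now visible in /proc/vmstat. */
	f = fopen("/proc/vmstat", "r");
	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
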
1863
d1187ed2
CL
1864static void vmstat_update(struct work_struct *w)
1865{
0eb77e98 1866 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1867 /*
1868 * Counters were updated so we expect more updates
1869 * to occur in the future. Keep on running the
1870 * update worker thread.
1871 */
ce612879 1872 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1873 this_cpu_ptr(&vmstat_work),
1874 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1875 }
1876}
1877
7cc36bbd
CL
1883/*
1884 * Check if the diffs for a certain cpu indicate that
1885 * an update is needed.
1886 */
1887static bool need_update(int cpu)
1888{
1889 struct zone *zone;
1890
1891 for_each_populated_zone(zone) {
1892 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
1893
1894 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
3a321d2a 1895#ifdef CONFIG_NUMA
1d90ca89 1896 BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
3a321d2a 1897#endif
63803222 1898
7cc36bbd
CL
1899 /*
1900 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1901 */
13c9aaf7
JH
1902 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
1903 sizeof(p->vm_stat_diff[0])))
7cc36bbd 1904 return true;
3a321d2a 1905#ifdef CONFIG_NUMA
13c9aaf7
JH
1906 if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
1907 sizeof(p->vm_numa_stat_diff[0])))
3a321d2a
KW
1908 return true;
1909#endif
7cc36bbd
CL
1910 }
1911 return false;
1912}
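
need_update() uses memchr_inv() to find a non-zero byte in the per-cpu diff arrays in a single pass instead of testing each item; because every vm_stat_diff entry is one byte (and every vm_numa_stat_diff entry two bytes), any pending delta shows up as at least one non-zero byte. A rough user-space equivalent of that check (the helper and struct names below are made up for illustration):

#include <stddef.h>
#include <stdio.h>

/*
 * Return nonzero if any byte in the buffer differs from 'c' --
 * the question memchr_inv() answers (it returns a pointer instead).
 */
static int any_byte_not(const void *buf, int c, size_t len)
{
	const unsigned char *p = buf;
	size_t i;

	for (i = 0; i < len; i++)
		if (p[i] != (unsigned char)c)
			return 1;
	return 0;
}

/* Toy stand-in for struct per_cpu_pageset, for illustration only. */
struct toy_pageset {
	signed char vm_stat_diff[16];
};

static int toy_need_update(const struct toy_pageset *p)
{
	return any_byte_not(p->vm_stat_diff, 0, sizeof(p->vm_stat_diff));
}

int main(void)
{
	struct toy_pageset ps = { .vm_stat_diff = { 0 } };

	ps.vm_stat_diff[3] = -2;	/* one pending per-cpu delta */
	printf("update needed: %d\n", toy_need_update(&ps));	/* prints 1 */
	return 0;
}
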
1913
7b8da4c7
CL
1914/*
1915 * Switch off vmstat processing and then fold all the remaining differentials
1916 * until the diffs stay at zero. The function is used by NOHZ and can only be
1917 * invoked when tick processing is not active.
1918 */
f01f17d3
MH
1919void quiet_vmstat(void)
1920{
1921 if (system_state != SYSTEM_RUNNING)
1922 return;
1923
7b8da4c7 1924 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1925 return;
1926
1927 if (!need_update(smp_processor_id()))
1928 return;
1929
1930 /*
1931 * Just refresh counters and do not care about the pending delayed
 1932 * vmstat_update. It does not fire often enough to matter, and cancelling
 1933 * it would be too expensive from this path.
 1934 * vmstat_shepherd will take care of it for us.
1935 */
1936 refresh_cpu_vm_stats(false);
1937}
1938
7cc36bbd
CL
1939/*
 1940 * Shepherd worker thread that checks the
 1941 * differentials of processors whose vmstat update
 1942 * workers have been disabled because of
 1943 * inactivity.
1944 */
1945static void vmstat_shepherd(struct work_struct *w);
1946
0eb77e98 1947static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
1948
1949static void vmstat_shepherd(struct work_struct *w)
1950{
1951 int cpu;
1952
1953 get_online_cpus();
1954 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 1955 for_each_online_cpu(cpu) {
f01f17d3 1956 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 1957
7b8da4c7 1958 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 1959 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
f01f17d3 1960 }
7cc36bbd
CL
1961 put_online_cpus();
1962
1963 schedule_delayed_work(&shepherd,
98f4ebb2 1964 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1965}
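
Taken together, vmstat_update() and vmstat_shepherd() implement a two-level polling scheme: each per-cpu worker keeps rescheduling itself only while its last pass actually folded some diffs, and a single deferrable shepherd re-arms workers that went idle but have accumulated diffs again. A simplified, single-threaded user-space sketch of that decision logic (all names here are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

struct cpu_state {
	bool worker_scheduled;	/* delayed_work_pending() analogue */
	int pending_diffs;	/* stand-in for the per-cpu diff arrays */
};

static struct cpu_state cpus[NCPUS];

/* vmstat_update(): fold diffs; keep the worker alive only if there was work. */
static void worker_tick(struct cpu_state *c)
{
	bool did_work = c->pending_diffs != 0;

	c->pending_diffs = 0;		/* refresh_cpu_vm_stats() analogue */
	c->worker_scheduled = did_work;	/* requeue only while updates keep coming */
}

/* vmstat_shepherd(): wake idle workers that have pending diffs again. */
static void shepherd_tick(void)
{
	for (int i = 0; i < NCPUS; i++)
		if (!cpus[i].worker_scheduled && cpus[i].pending_diffs)
			cpus[i].worker_scheduled = true;
}

int main(void)
{
	cpus[1].pending_diffs = 3;	/* CPU 1 dirtied some counters */
	shepherd_tick();		/* shepherd arms CPU 1's worker */
	worker_tick(&cpus[1]);		/* worker folds them and stays armed once */
	worker_tick(&cpus[1]);		/* nothing new: worker lets itself lapse */
	printf("cpu1 worker scheduled: %d\n", cpus[1].worker_scheduled);	/* 0 */
	return 0;
}
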
1966
7cc36bbd 1967static void __init start_shepherd_timer(void)
d1187ed2 1968{
7cc36bbd
CL
1969 int cpu;
1970
1971 for_each_possible_cpu(cpu)
ccde8bd4 1972 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
1973 vmstat_update);
1974
7cc36bbd
CL
1975 schedule_delayed_work(&shepherd,
1976 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
1977}
1978
03e86dba
TC
1979static void __init init_cpu_node_state(void)
1980{
4c501327 1981 int node;
03e86dba 1982
4c501327
SAS
1983 for_each_online_node(node) {
1984 if (cpumask_weight(cpumask_of_node(node)) > 0)
1985 node_set_state(node, N_CPU);
1986 }
03e86dba
TC
1987}
1988
5438da97
SAS
1989static int vmstat_cpu_online(unsigned int cpu)
1990{
1991 refresh_zone_stat_thresholds();
1992 node_set_state(cpu_to_node(cpu), N_CPU);
1993 return 0;
1994}
1995
1996static int vmstat_cpu_down_prep(unsigned int cpu)
1997{
1998 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1999 return 0;
2000}
2001
2002static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2003{
4c501327 2004 const struct cpumask *node_cpus;
5438da97 2005 int node;
807a1bd2 2006
5438da97
SAS
2007 node = cpu_to_node(cpu);
2008
2009 refresh_zone_stat_thresholds();
4c501327
SAS
2010 node_cpus = cpumask_of_node(node);
2011 if (cpumask_weight(node_cpus) > 0)
5438da97 2012 return 0;
807a1bd2
TK
2013
2014 node_clear_state(node, N_CPU);
5438da97 2015 return 0;
807a1bd2
TK
2016}
2017
8f32f7e5 2018#endif
df9ecaba 2019
ce612879
MH
2020struct workqueue_struct *mm_percpu_wq;
2021
597b7305 2022void __init init_mm_internals(void)
df9ecaba 2023{
ce612879 2024 int ret __maybe_unused;
5438da97 2025
80d136e1 2026 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2027
2028#ifdef CONFIG_SMP
5438da97
SAS
2029 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2030 NULL, vmstat_cpu_dead);
2031 if (ret < 0)
2032 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2033
2034 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2035 vmstat_cpu_online,
2036 vmstat_cpu_down_prep);
2037 if (ret < 0)
2038 pr_err("vmstat: failed to register 'online' hotplug state\n");
2039
2040 get_online_cpus();
03e86dba 2041 init_cpu_node_state();
5438da97 2042 put_online_cpus();
d1187ed2 2043
7cc36bbd 2044 start_shepherd_timer();
8f32f7e5
AD
2045#endif
2046#ifdef CONFIG_PROC_FS
fddda2b7 2047 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2048 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2049 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2050 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2051#endif
df9ecaba 2052}
d7a5752c
MG
2053
2054#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2055
2056/*
2057 * Return an index indicating how much of the available free memory is
2058 * unusable for an allocation of the requested size.
2059 */
2060static int unusable_free_index(unsigned int order,
2061 struct contig_page_info *info)
2062{
2063 /* No free memory is interpreted as all free memory is unusable */
2064 if (info->free_pages == 0)
2065 return 1000;
2066
2067 /*
 2068 * The index is conceptually a value between 0 and 1; return it
 2069 * scaled by 1000 to give 3 decimal places.
2070 *
2071 * 0 => no fragmentation
2072 * 1 => high fragmentation
2073 */
2074 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2075
2076}
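
unusable_free_index() scales the fraction of free pages that cannot back an order-N allocation by 1000, so a reported 0.840 means 84% of the zone's free pages are unusable at that order. A small user-space rehearsal of the same arithmetic (the struct and the sample values are illustrative):

#include <stdio.h>

struct contig_info {
	unsigned long free_pages;		/* total free pages in the zone */
	unsigned long free_blocks_suitable;	/* free blocks of at least this order */
};

/* Same formula as unusable_free_index(), in plain C. */
static unsigned long unusable_index(unsigned int order, const struct contig_info *info)
{
	if (info->free_pages == 0)
		return 1000;	/* no free memory: all of it is "unusable" */
	return (info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL
		/ info->free_pages;
}

int main(void)
{
	/* 1000 free pages, only 10 free blocks big enough for order-4 (16 pages). */
	struct contig_info info = { .free_pages = 1000, .free_blocks_suitable = 10 };
	unsigned long idx = unusable_index(4, &info);

	printf("unusable index: %lu.%03lu\n", idx / 1000, idx % 1000);	/* 0.840 */
	return 0;
}
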
2077
2078static void unusable_show_print(struct seq_file *m,
2079 pg_data_t *pgdat, struct zone *zone)
2080{
2081 unsigned int order;
2082 int index;
2083 struct contig_page_info info;
2084
2085 seq_printf(m, "Node %d, zone %8s ",
2086 pgdat->node_id,
2087 zone->name);
2088 for (order = 0; order < MAX_ORDER; ++order) {
2089 fill_contig_page_info(zone, order, &info);
2090 index = unusable_free_index(order, &info);
2091 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2092 }
2093
2094 seq_putc(m, '\n');
2095}
2096
2097/*
2098 * Display unusable free space index
2099 *
2100 * The unusable free space index measures how much of the available free
2101 * memory cannot be used to satisfy an allocation of a given size and is a
 2102 * value between 0 and 1. The higher the value, the more of the free memory
 2103 * is unusable and, by implication, the worse the external fragmentation. This
2104 * can be expressed as a percentage by multiplying by 100.
2105 */
2106static int unusable_show(struct seq_file *m, void *arg)
2107{
2108 pg_data_t *pgdat = (pg_data_t *)arg;
2109
2110 /* check memoryless node */
a47b53c5 2111 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2112 return 0;
2113
727c080f 2114 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2115
2116 return 0;
2117}
2118
01a99560 2119static const struct seq_operations unusable_sops = {
d7a5752c
MG
2120 .start = frag_start,
2121 .next = frag_next,
2122 .stop = frag_stop,
2123 .show = unusable_show,
2124};
2125
01a99560 2126DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2127
f1a5ab12
MG
2128static void extfrag_show_print(struct seq_file *m,
2129 pg_data_t *pgdat, struct zone *zone)
2130{
2131 unsigned int order;
2132 int index;
2133
2134 /* Alloc on stack as interrupts are disabled for zone walk */
2135 struct contig_page_info info;
2136
2137 seq_printf(m, "Node %d, zone %8s ",
2138 pgdat->node_id,
2139 zone->name);
2140 for (order = 0; order < MAX_ORDER; ++order) {
2141 fill_contig_page_info(zone, order, &info);
56de7263 2142 index = __fragmentation_index(order, &info);
f1a5ab12
MG
2143 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2144 }
2145
2146 seq_putc(m, '\n');
2147}
2148
2149/*
2150 * Display fragmentation index for orders that allocations would fail for
2151 */
2152static int extfrag_show(struct seq_file *m, void *arg)
2153{
2154 pg_data_t *pgdat = (pg_data_t *)arg;
2155
727c080f 2156 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2157
2158 return 0;
2159}
2160
01a99560 2161static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2162 .start = frag_start,
2163 .next = frag_next,
2164 .stop = frag_stop,
2165 .show = extfrag_show,
2166};
2167
01a99560 2168DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2169
d7a5752c
MG
2170static int __init extfrag_debug_init(void)
2171{
bde8bd8a
S
2172 struct dentry *extfrag_debug_root;
2173
d7a5752c 2174 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2175
d9f7979c 2176 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2177 &unusable_fops);
d7a5752c 2178
d9f7979c 2179 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2180 &extfrag_fops);
f1a5ab12 2181
d7a5752c
MG
2182 return 0;
2183}
2184
2185module_init(extfrag_debug_init);
2186#endif
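
extfrag_debug_init() exposes the two indices as debugfs files. Assuming debugfs is mounted at its usual location, /sys/kernel/debug (an assumption, not something this file controls), they can be dumped like any other text file; a minimal sketch:

#include <stdio.h>

/* Dump one of the extfrag debugfs files created above. */
static void dump(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);	/* usually needs root and a mounted debugfs */
		return;
	}
	printf("== %s ==\n", path);
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/sys/kernel/debug/extfrag/unusable_index");
	dump("/sys/kernel/debug/extfrag/extfrag_index");
	return 0;
}
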