]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame - mm/vmstat.c
KVM: x86/speculation: Disable Fill buffer clear within guests
[mirror_ubuntu-jammy-kernel.git] / mm / vmstat.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
f6ac2354
CL
2/*
3 * linux/mm/vmstat.c
4 *
5 * Manages VM statistics
6 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
2244b95a
CL
7 *
8 * zoned VM statistics
9 * Copyright (C) 2006 Silicon Graphics, Inc.,
10 * Christoph Lameter <christoph@lameter.com>
7cc36bbd 11 * Copyright (C) 2008-2014 Christoph Lameter
f6ac2354 12 */
8f32f7e5 13#include <linux/fs.h>
f6ac2354 14#include <linux/mm.h>
4e950f6f 15#include <linux/err.h>
2244b95a 16#include <linux/module.h>
5a0e3ad6 17#include <linux/slab.h>
df9ecaba 18#include <linux/cpu.h>
7cc36bbd 19#include <linux/cpumask.h>
c748e134 20#include <linux/vmstat.h>
3c486871
AM
21#include <linux/proc_fs.h>
22#include <linux/seq_file.h>
23#include <linux/debugfs.h>
e8edc6e0 24#include <linux/sched.h>
f1a5ab12 25#include <linux/math64.h>
79da826a 26#include <linux/writeback.h>
36deb0be 27#include <linux/compaction.h>
6e543d57 28#include <linux/mm_inline.h>
48c96a36
JK
29#include <linux/page_ext.h>
30#include <linux/page_owner.h>
6e543d57
LD
31
32#include "internal.h"
f6ac2354 33
4518085e
KW
34#ifdef CONFIG_NUMA
35int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
36
37/* zero numa counters within a zone */
38static void zero_zone_numa_counters(struct zone *zone)
39{
40 int item, cpu;
41
f19298b9
MG
42 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
43 atomic_long_set(&zone->vm_numa_event[item], 0);
44 for_each_online_cpu(cpu) {
45 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
4518085e 46 = 0;
f19298b9 47 }
4518085e
KW
48 }
49}
50
51/* zero numa counters of all the populated zones */
52static void zero_zones_numa_counters(void)
53{
54 struct zone *zone;
55
56 for_each_populated_zone(zone)
57 zero_zone_numa_counters(zone);
58}
59
60/* zero global numa counters */
61static void zero_global_numa_counters(void)
62{
63 int item;
64
f19298b9
MG
65 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
66 atomic_long_set(&vm_numa_event[item], 0);
4518085e
KW
67}
68
/*
 * Throw away all collected NUMA statistics: first the per-zone counters
 * (including their per-cpu parts), then the global ones.
 */
static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}
74
75static DEFINE_MUTEX(vm_numa_stat_lock);
76
77int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
32927393 78 void *buffer, size_t *length, loff_t *ppos)
4518085e
KW
79{
80 int ret, oldval;
81
82 mutex_lock(&vm_numa_stat_lock);
83 if (write)
84 oldval = sysctl_vm_numa_stat;
85 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
86 if (ret || !write)
87 goto out;
88
89 if (oldval == sysctl_vm_numa_stat)
90 goto out;
91 else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
92 static_branch_enable(&vm_numa_stat_key);
93 pr_info("enable numa statistics\n");
94 } else {
95 static_branch_disable(&vm_numa_stat_key);
96 invalid_numa_statistics();
97 pr_info("disable numa statistics, and clear numa counters\n");
98 }
99
100out:
101 mutex_unlock(&vm_numa_stat_lock);
102 return ret;
103}
104#endif
105
f8891e5e
CL
106#ifdef CONFIG_VM_EVENT_COUNTERS
107DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
108EXPORT_PER_CPU_SYMBOL(vm_event_states);
109
31f961a8 110static void sum_vm_events(unsigned long *ret)
f8891e5e 111{
9eccf2a8 112 int cpu;
f8891e5e
CL
113 int i;
114
115 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
116
31f961a8 117 for_each_online_cpu(cpu) {
f8891e5e
CL
118 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
119
f8891e5e
CL
120 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
121 ret[i] += this->event[i];
122 }
123}
124
/*
 * Sum the VM event counters of all online CPUs into @ret.
 *
 * The CPU hotplug read lock is held across the summation. The result is
 * unavoidably approximate: it can change during and after execution of
 * this function.
 */
void all_vm_events(unsigned long *ret)
{
	cpus_read_lock();
	sum_vm_events(ret);
	cpus_read_unlock();
}
32dd66fc 136EXPORT_SYMBOL_GPL(all_vm_events);
f8891e5e 137
f8891e5e
CL
138/*
139 * Fold the foreign cpu events into our own.
140 *
141 * This is adding to the events on one processor
142 * but keeps the global counts constant.
143 */
144void vm_events_fold_cpu(int cpu)
145{
146 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
147 int i;
148
149 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
150 count_vm_events(i, fold_state->event[i]);
151 fold_state->event[i] = 0;
152 }
153}
f8891e5e
CL
154
155#endif /* CONFIG_VM_EVENT_COUNTERS */
156
2244b95a
CL
157/*
158 * Manage combined zone based / global counters
159 *
160 * vm_stat contains the global counters
161 */
75ef7184
MG
162atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
163atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
f19298b9 164atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
75ef7184
MG
165EXPORT_SYMBOL(vm_zone_stat);
166EXPORT_SYMBOL(vm_node_stat);
2244b95a
CL
167
168#ifdef CONFIG_SMP
169
/*
 * Compute the reduced per-cpu stat threshold for @zone.
 *
 * As vmstats are not up to date, there is drift between the estimated
 * and real values. For high thresholds and a high number of CPUs, it is
 * possible for the min watermark to be breached while the estimated value
 * looks fine. The pressure threshold is a reduced value such that even
 * the maximum amount of drift will not accidentally breach the min
 * watermark.
 *
 * Returns the low-to-min watermark distance divided by the number of
 * online CPUs, clamped to the range [1, 125].
 */
int calculate_pressure_threshold(struct zone *zone)
{
	int watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	int per_cpu_share = (int)(watermark_distance / num_online_cpus());

	/* Never below 1, never above the maximum threshold of 125. */
	return min(125, max(1, per_cpu_share));
}
193
b44129b3 194int calculate_normal_threshold(struct zone *zone)
df9ecaba
CL
195{
196 int threshold;
197 int mem; /* memory in 128 MB units */
198
199 /*
200 * The threshold scales with the number of processors and the amount
201 * of memory per zone. More memory means that we can defer updates for
202 * longer, more processors could lead to more contention.
203 * fls() is used to have a cheap way of logarithmic scaling.
204 *
205 * Some sample thresholds:
206 *
ea15ba17 207 * Threshold Processors (fls) Zonesize fls(mem)+1
df9ecaba
CL
208 * ------------------------------------------------------------------
209 * 8 1 1 0.9-1 GB 4
210 * 16 2 2 0.9-1 GB 4
211 * 20 2 2 1-2 GB 5
212 * 24 2 2 2-4 GB 6
213 * 28 2 2 4-8 GB 7
214 * 32 2 2 8-16 GB 8
215 * 4 2 2 <128M 1
216 * 30 4 3 2-4 GB 5
217 * 48 4 3 8-16 GB 8
218 * 32 8 4 1-2 GB 4
219 * 32 8 4 0.9-1GB 4
220 * 10 16 5 <128M 1
221 * 40 16 5 900M 4
222 * 70 64 7 2-4 GB 5
223 * 84 64 7 4-8 GB 6
224 * 108 512 9 4-8 GB 6
225 * 125 1024 10 8-16 GB 8
226 * 125 1024 10 16-32 GB 9
227 */
228
9705bea5 229 mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
df9ecaba
CL
230
231 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
232
233 /*
234 * Maximum threshold is 125
235 */
236 threshold = min(125, threshold);
237
238 return threshold;
239}
2244b95a
CL
240
241/*
df9ecaba 242 * Refresh the thresholds for each zone.
2244b95a 243 */
a6cccdc3 244void refresh_zone_stat_thresholds(void)
2244b95a 245{
75ef7184 246 struct pglist_data *pgdat;
df9ecaba
CL
247 struct zone *zone;
248 int cpu;
249 int threshold;
250
75ef7184
MG
251 /* Zero current pgdat thresholds */
252 for_each_online_pgdat(pgdat) {
253 for_each_online_cpu(cpu) {
254 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
255 }
256 }
257
ee99c71c 258 for_each_populated_zone(zone) {
75ef7184 259 struct pglist_data *pgdat = zone->zone_pgdat;
aa454840
CL
260 unsigned long max_drift, tolerate_drift;
261
b44129b3 262 threshold = calculate_normal_threshold(zone);
df9ecaba 263
75ef7184
MG
264 for_each_online_cpu(cpu) {
265 int pgdat_threshold;
266
28f836b6 267 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
99dcc3e5 268 = threshold;
1d90ca89 269
75ef7184
MG
270 /* Base nodestat threshold on the largest populated zone. */
271 pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
272 per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
273 = max(threshold, pgdat_threshold);
274 }
275
aa454840
CL
276 /*
277 * Only set percpu_drift_mark if there is a danger that
278 * NR_FREE_PAGES reports the low watermark is ok when in fact
279 * the min watermark could be breached by an allocation
280 */
281 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
282 max_drift = num_online_cpus() * threshold;
283 if (max_drift > tolerate_drift)
284 zone->percpu_drift_mark = high_wmark_pages(zone) +
285 max_drift;
df9ecaba 286 }
2244b95a
CL
287}
288
b44129b3
MG
289void set_pgdat_percpu_threshold(pg_data_t *pgdat,
290 int (*calculate_pressure)(struct zone *))
88f5acf8
MG
291{
292 struct zone *zone;
293 int cpu;
294 int threshold;
295 int i;
296
88f5acf8
MG
297 for (i = 0; i < pgdat->nr_zones; i++) {
298 zone = &pgdat->node_zones[i];
299 if (!zone->percpu_drift_mark)
300 continue;
301
b44129b3 302 threshold = (*calculate_pressure)(zone);
1d90ca89 303 for_each_online_cpu(cpu)
28f836b6 304 per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
88f5acf8
MG
305 = threshold;
306 }
88f5acf8
MG
307}
308
2244b95a 309/*
bea04b07
JZ
310 * For use when we know that interrupts are disabled,
311 * or when we know that preemption is disabled and that
312 * particular counter cannot be updated from interrupt context.
2244b95a
CL
313 */
314void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 315 long delta)
2244b95a 316{
28f836b6 317 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92 318 s8 __percpu *p = pcp->vm_stat_diff + item;
2244b95a 319 long x;
12938a92
CL
320 long t;
321
c68ed794
IM
322 /*
323 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
324 * atomicity is provided by IRQs being disabled -- either explicitly
325 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
326 * CPU migrations and preemption potentially corrupts a counter so
327 * disable preemption.
328 */
329 if (IS_ENABLED(CONFIG_PREEMPT_RT))
330 preempt_disable();
331
12938a92 332 x = delta + __this_cpu_read(*p);
2244b95a 333
12938a92 334 t = __this_cpu_read(pcp->stat_threshold);
2244b95a 335
40610076 336 if (unlikely(abs(x) > t)) {
2244b95a
CL
337 zone_page_state_add(x, zone, item);
338 x = 0;
339 }
12938a92 340 __this_cpu_write(*p, x);
c68ed794
IM
341
342 if (IS_ENABLED(CONFIG_PREEMPT_RT))
343 preempt_enable();
2244b95a
CL
344}
345EXPORT_SYMBOL(__mod_zone_page_state);
346
75ef7184
MG
347void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
348 long delta)
349{
350 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
351 s8 __percpu *p = pcp->vm_node_stat_diff + item;
352 long x;
353 long t;
354
ea426c2a 355 if (vmstat_item_in_bytes(item)) {
629484ae
JW
356 /*
357 * Only cgroups use subpage accounting right now; at
358 * the global level, these items still change in
359 * multiples of whole pages. Store them as pages
360 * internally to keep the per-cpu counters compact.
361 */
ea426c2a
RG
362 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
363 delta >>= PAGE_SHIFT;
364 }
365
c68ed794
IM
366 /* See __mod_node_page_state */
367 if (IS_ENABLED(CONFIG_PREEMPT_RT))
368 preempt_disable();
369
75ef7184
MG
370 x = delta + __this_cpu_read(*p);
371
372 t = __this_cpu_read(pcp->stat_threshold);
373
40610076 374 if (unlikely(abs(x) > t)) {
75ef7184
MG
375 node_page_state_add(x, pgdat, item);
376 x = 0;
377 }
378 __this_cpu_write(*p, x);
c68ed794
IM
379
380 if (IS_ENABLED(CONFIG_PREEMPT_RT))
381 preempt_enable();
75ef7184
MG
382}
383EXPORT_SYMBOL(__mod_node_page_state);
384
2244b95a
CL
385/*
386 * Optimized increment and decrement functions.
387 *
388 * These are only for a single page and therefore can take a struct page *
389 * argument instead of struct zone *. This allows the inclusion of the code
390 * generated for page_zone(page) into the optimized functions.
391 *
392 * No overflow check is necessary and therefore the differential can be
393 * incremented or decremented in place which may allow the compilers to
394 * generate better code.
2244b95a
CL
395 * The increment or decrement is known and therefore one boundary check can
396 * be omitted.
397 *
df9ecaba
CL
398 * NOTE: These functions are very performance sensitive. Change only
399 * with care.
400 *
2244b95a
CL
401 * Some processors have inc/dec instructions that are atomic vs an interrupt.
402 * However, the code must first determine the differential location in a zone
403 * based on the processor number and then inc/dec the counter. There is no
404 * guarantee without disabling preemption that the processor will not change
405 * in between and therefore the atomicity vs. interrupt cannot be exploited
406 * in a useful way here.
407 */
c8785385 408void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 409{
28f836b6 410 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
411 s8 __percpu *p = pcp->vm_stat_diff + item;
412 s8 v, t;
2244b95a 413
c68ed794
IM
414 /* See __mod_node_page_state */
415 if (IS_ENABLED(CONFIG_PREEMPT_RT))
416 preempt_disable();
417
908ee0f1 418 v = __this_cpu_inc_return(*p);
12938a92
CL
419 t = __this_cpu_read(pcp->stat_threshold);
420 if (unlikely(v > t)) {
421 s8 overstep = t >> 1;
df9ecaba 422
12938a92
CL
423 zone_page_state_add(v + overstep, zone, item);
424 __this_cpu_write(*p, -overstep);
2244b95a 425 }
c68ed794
IM
426
427 if (IS_ENABLED(CONFIG_PREEMPT_RT))
428 preempt_enable();
2244b95a 429}
ca889e6c 430
75ef7184
MG
431void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
432{
433 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
434 s8 __percpu *p = pcp->vm_node_stat_diff + item;
435 s8 v, t;
436
ea426c2a
RG
437 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
438
c68ed794
IM
439 /* See __mod_node_page_state */
440 if (IS_ENABLED(CONFIG_PREEMPT_RT))
441 preempt_disable();
442
75ef7184
MG
443 v = __this_cpu_inc_return(*p);
444 t = __this_cpu_read(pcp->stat_threshold);
445 if (unlikely(v > t)) {
446 s8 overstep = t >> 1;
447
448 node_page_state_add(v + overstep, pgdat, item);
449 __this_cpu_write(*p, -overstep);
450 }
c68ed794
IM
451
452 if (IS_ENABLED(CONFIG_PREEMPT_RT))
453 preempt_enable();
75ef7184
MG
454}
455
ca889e6c
CL
456void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
457{
458 __inc_zone_state(page_zone(page), item);
459}
2244b95a
CL
460EXPORT_SYMBOL(__inc_zone_page_state);
461
75ef7184
MG
462void __inc_node_page_state(struct page *page, enum node_stat_item item)
463{
464 __inc_node_state(page_pgdat(page), item);
465}
466EXPORT_SYMBOL(__inc_node_page_state);
467
c8785385 468void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
2244b95a 469{
28f836b6 470 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
12938a92
CL
471 s8 __percpu *p = pcp->vm_stat_diff + item;
472 s8 v, t;
2244b95a 473
c68ed794
IM
474 /* See __mod_node_page_state */
475 if (IS_ENABLED(CONFIG_PREEMPT_RT))
476 preempt_disable();
477
908ee0f1 478 v = __this_cpu_dec_return(*p);
12938a92
CL
479 t = __this_cpu_read(pcp->stat_threshold);
480 if (unlikely(v < - t)) {
481 s8 overstep = t >> 1;
2244b95a 482
12938a92
CL
483 zone_page_state_add(v - overstep, zone, item);
484 __this_cpu_write(*p, overstep);
2244b95a 485 }
c68ed794
IM
486
487 if (IS_ENABLED(CONFIG_PREEMPT_RT))
488 preempt_enable();
2244b95a 489}
c8785385 490
75ef7184
MG
491void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
492{
493 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
494 s8 __percpu *p = pcp->vm_node_stat_diff + item;
495 s8 v, t;
496
ea426c2a
RG
497 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
498
c68ed794
IM
499 /* See __mod_node_page_state */
500 if (IS_ENABLED(CONFIG_PREEMPT_RT))
501 preempt_disable();
502
75ef7184
MG
503 v = __this_cpu_dec_return(*p);
504 t = __this_cpu_read(pcp->stat_threshold);
505 if (unlikely(v < - t)) {
506 s8 overstep = t >> 1;
507
508 node_page_state_add(v - overstep, pgdat, item);
509 __this_cpu_write(*p, overstep);
510 }
c68ed794
IM
511
512 if (IS_ENABLED(CONFIG_PREEMPT_RT))
513 preempt_enable();
75ef7184
MG
514}
515
c8785385
CL
516void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
517{
518 __dec_zone_state(page_zone(page), item);
519}
2244b95a
CL
520EXPORT_SYMBOL(__dec_zone_page_state);
521
75ef7184
MG
522void __dec_node_page_state(struct page *page, enum node_stat_item item)
523{
524 __dec_node_state(page_pgdat(page), item);
525}
526EXPORT_SYMBOL(__dec_node_page_state);
527
4156153c 528#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
7c839120
CL
529/*
530 * If we have cmpxchg_local support then we do not need to incur the overhead
531 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
532 *
533 * mod_state() modifies the zone counter state through atomic per cpu
534 * operations.
535 *
536 * Overstep mode specifies how overstep should handled:
537 * 0 No overstepping
538 * 1 Overstepping half of threshold
539 * -1 Overstepping minus half of threshold
540*/
75ef7184
MG
541static inline void mod_zone_state(struct zone *zone,
542 enum zone_stat_item item, long delta, int overstep_mode)
7c839120 543{
28f836b6 544 struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
7c839120
CL
545 s8 __percpu *p = pcp->vm_stat_diff + item;
546 long o, n, t, z;
547
548 do {
549 z = 0; /* overflow to zone counters */
550
551 /*
552 * The fetching of the stat_threshold is racy. We may apply
553 * a counter threshold to the wrong the cpu if we get
d3bc2367
CL
554 * rescheduled while executing here. However, the next
555 * counter update will apply the threshold again and
556 * therefore bring the counter under the threshold again.
557 *
558 * Most of the time the thresholds are the same anyways
559 * for all cpus in a zone.
7c839120
CL
560 */
561 t = this_cpu_read(pcp->stat_threshold);
562
563 o = this_cpu_read(*p);
564 n = delta + o;
565
40610076 566 if (abs(n) > t) {
7c839120
CL
567 int os = overstep_mode * (t >> 1) ;
568
569 /* Overflow must be added to zone counters */
570 z = n + os;
571 n = -os;
572 }
573 } while (this_cpu_cmpxchg(*p, o, n) != o);
574
575 if (z)
576 zone_page_state_add(z, zone, item);
577}
578
579void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 580 long delta)
7c839120 581{
75ef7184 582 mod_zone_state(zone, item, delta, 0);
7c839120
CL
583}
584EXPORT_SYMBOL(mod_zone_page_state);
585
7c839120
CL
586void inc_zone_page_state(struct page *page, enum zone_stat_item item)
587{
75ef7184 588 mod_zone_state(page_zone(page), item, 1, 1);
7c839120
CL
589}
590EXPORT_SYMBOL(inc_zone_page_state);
591
592void dec_zone_page_state(struct page *page, enum zone_stat_item item)
593{
75ef7184 594 mod_zone_state(page_zone(page), item, -1, -1);
7c839120
CL
595}
596EXPORT_SYMBOL(dec_zone_page_state);
75ef7184
MG
597
598static inline void mod_node_state(struct pglist_data *pgdat,
599 enum node_stat_item item, int delta, int overstep_mode)
600{
601 struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
602 s8 __percpu *p = pcp->vm_node_stat_diff + item;
603 long o, n, t, z;
604
ea426c2a 605 if (vmstat_item_in_bytes(item)) {
629484ae
JW
606 /*
607 * Only cgroups use subpage accounting right now; at
608 * the global level, these items still change in
609 * multiples of whole pages. Store them as pages
610 * internally to keep the per-cpu counters compact.
611 */
ea426c2a
RG
612 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
613 delta >>= PAGE_SHIFT;
614 }
615
75ef7184
MG
616 do {
617 z = 0; /* overflow to node counters */
618
619 /*
620 * The fetching of the stat_threshold is racy. We may apply
621 * a counter threshold to the wrong the cpu if we get
622 * rescheduled while executing here. However, the next
623 * counter update will apply the threshold again and
624 * therefore bring the counter under the threshold again.
625 *
626 * Most of the time the thresholds are the same anyways
627 * for all cpus in a node.
628 */
629 t = this_cpu_read(pcp->stat_threshold);
630
631 o = this_cpu_read(*p);
632 n = delta + o;
633
40610076 634 if (abs(n) > t) {
75ef7184
MG
635 int os = overstep_mode * (t >> 1) ;
636
637 /* Overflow must be added to node counters */
638 z = n + os;
639 n = -os;
640 }
641 } while (this_cpu_cmpxchg(*p, o, n) != o);
642
643 if (z)
644 node_page_state_add(z, pgdat, item);
645}
646
647void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
648 long delta)
649{
650 mod_node_state(pgdat, item, delta, 0);
651}
652EXPORT_SYMBOL(mod_node_page_state);
653
654void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
655{
656 mod_node_state(pgdat, item, 1, 1);
657}
658
659void inc_node_page_state(struct page *page, enum node_stat_item item)
660{
661 mod_node_state(page_pgdat(page), item, 1, 1);
662}
663EXPORT_SYMBOL(inc_node_page_state);
664
665void dec_node_page_state(struct page *page, enum node_stat_item item)
666{
667 mod_node_state(page_pgdat(page), item, -1, -1);
668}
669EXPORT_SYMBOL(dec_node_page_state);
7c839120
CL
670#else
671/*
672 * Use interrupt disable to serialize counter updates
673 */
674void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
6cdb18ad 675 long delta)
7c839120
CL
676{
677 unsigned long flags;
678
679 local_irq_save(flags);
680 __mod_zone_page_state(zone, item, delta);
681 local_irq_restore(flags);
682}
683EXPORT_SYMBOL(mod_zone_page_state);
684
2244b95a
CL
685void inc_zone_page_state(struct page *page, enum zone_stat_item item)
686{
687 unsigned long flags;
688 struct zone *zone;
2244b95a
CL
689
690 zone = page_zone(page);
691 local_irq_save(flags);
ca889e6c 692 __inc_zone_state(zone, item);
2244b95a
CL
693 local_irq_restore(flags);
694}
695EXPORT_SYMBOL(inc_zone_page_state);
696
697void dec_zone_page_state(struct page *page, enum zone_stat_item item)
698{
699 unsigned long flags;
2244b95a 700
2244b95a 701 local_irq_save(flags);
a302eb4e 702 __dec_zone_page_state(page, item);
2244b95a
CL
703 local_irq_restore(flags);
704}
705EXPORT_SYMBOL(dec_zone_page_state);
706
75ef7184
MG
707void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
708{
709 unsigned long flags;
710
711 local_irq_save(flags);
712 __inc_node_state(pgdat, item);
713 local_irq_restore(flags);
714}
715EXPORT_SYMBOL(inc_node_state);
716
717void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
718 long delta)
719{
720 unsigned long flags;
721
722 local_irq_save(flags);
723 __mod_node_page_state(pgdat, item, delta);
724 local_irq_restore(flags);
725}
726EXPORT_SYMBOL(mod_node_page_state);
727
728void inc_node_page_state(struct page *page, enum node_stat_item item)
729{
730 unsigned long flags;
731 struct pglist_data *pgdat;
732
733 pgdat = page_pgdat(page);
734 local_irq_save(flags);
735 __inc_node_state(pgdat, item);
736 local_irq_restore(flags);
737}
738EXPORT_SYMBOL(inc_node_page_state);
739
740void dec_node_page_state(struct page *page, enum node_stat_item item)
741{
742 unsigned long flags;
743
744 local_irq_save(flags);
745 __dec_node_page_state(page, item);
746 local_irq_restore(flags);
747}
748EXPORT_SYMBOL(dec_node_page_state);
749#endif
7cc36bbd
CL
750
751/*
752 * Fold a differential into the global counters.
753 * Returns the number of counters updated.
754 */
f19298b9 755static int fold_diff(int *zone_diff, int *node_diff)
3a321d2a
KW
756{
757 int i;
758 int changes = 0;
759
760 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
761 if (zone_diff[i]) {
762 atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
763 changes++;
764 }
765
3a321d2a
KW
766 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
767 if (node_diff[i]) {
768 atomic_long_add(node_diff[i], &vm_node_stat[i]);
769 changes++;
770 }
771 return changes;
772}
f19298b9
MG
773
774#ifdef CONFIG_NUMA
775static void fold_vm_zone_numa_events(struct zone *zone)
4edb0748 776{
f19298b9
MG
777 unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
778 int cpu;
779 enum numa_stat_item item;
4edb0748 780
f19298b9
MG
781 for_each_online_cpu(cpu) {
782 struct per_cpu_zonestat *pzstats;
75ef7184 783
f19298b9
MG
784 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
785 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
786 zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
7cc36bbd 787 }
f19298b9
MG
788
789 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
790 zone_numa_event_add(zone_numa_events[item], zone, item);
4edb0748 791}
f19298b9
MG
792
793void fold_vm_numa_events(void)
794{
795 struct zone *zone;
796
797 for_each_populated_zone(zone)
798 fold_vm_zone_numa_events(zone);
799}
800#endif
4edb0748 801
2244b95a 802/*
2bb921e5 803 * Update the zone counters for the current cpu.
a7f75e25 804 *
4037d452
CL
805 * Note that refresh_cpu_vm_stats strives to only access
806 * node local memory. The per cpu pagesets on remote zones are placed
807 * in the memory local to the processor using that pageset. So the
808 * loop over all zones will access a series of cachelines local to
809 * the processor.
810 *
811 * The call to zone_page_state_add updates the cachelines with the
812 * statistics in the remote zone struct as well as the global cachelines
813 * with the global counters. These could cause remote node cache line
814 * bouncing and will have to be only done when necessary.
7cc36bbd
CL
815 *
816 * The function returns the number of global counters updated.
2244b95a 817 */
0eb77e98 818static int refresh_cpu_vm_stats(bool do_pagesets)
2244b95a 819{
75ef7184 820 struct pglist_data *pgdat;
2244b95a
CL
821 struct zone *zone;
822 int i;
75ef7184
MG
823 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
824 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
7cc36bbd 825 int changes = 0;
2244b95a 826
ee99c71c 827 for_each_populated_zone(zone) {
28f836b6
MG
828 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
829#ifdef CONFIG_NUMA
830 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
831#endif
2244b95a 832
fbc2edb0
CL
833 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
834 int v;
2244b95a 835
28f836b6 836 v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
fbc2edb0 837 if (v) {
a7f75e25 838
a7f75e25 839 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 840 global_zone_diff[i] += v;
4037d452
CL
841#ifdef CONFIG_NUMA
842 /* 3 seconds idle till flush */
28f836b6 843 __this_cpu_write(pcp->expire, 3);
4037d452 844#endif
2244b95a 845 }
fbc2edb0 846 }
4037d452 847#ifdef CONFIG_NUMA
3a321d2a 848
0eb77e98
CL
849 if (do_pagesets) {
850 cond_resched();
851 /*
852 * Deal with draining the remote pageset of this
853 * processor
854 *
855 * Check if there are pages remaining in this pageset
856 * if not then there is nothing to expire.
857 */
28f836b6
MG
858 if (!__this_cpu_read(pcp->expire) ||
859 !__this_cpu_read(pcp->count))
0eb77e98 860 continue;
4037d452 861
0eb77e98
CL
862 /*
863 * We never drain zones local to this processor.
864 */
865 if (zone_to_nid(zone) == numa_node_id()) {
28f836b6 866 __this_cpu_write(pcp->expire, 0);
0eb77e98
CL
867 continue;
868 }
4037d452 869
28f836b6 870 if (__this_cpu_dec_return(pcp->expire))
0eb77e98 871 continue;
4037d452 872
28f836b6
MG
873 if (__this_cpu_read(pcp->count)) {
874 drain_zone_pages(zone, this_cpu_ptr(pcp));
0eb77e98
CL
875 changes++;
876 }
7cc36bbd 877 }
4037d452 878#endif
2244b95a 879 }
75ef7184
MG
880
881 for_each_online_pgdat(pgdat) {
882 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
883
884 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
885 int v;
886
887 v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
888 if (v) {
889 atomic_long_add(v, &pgdat->vm_stat[i]);
890 global_node_diff[i] += v;
891 }
892 }
893 }
894
895 changes += fold_diff(global_zone_diff, global_node_diff);
7cc36bbd 896 return changes;
2244b95a
CL
897}
898
2bb921e5
CL
899/*
900 * Fold the data for an offline cpu into the global array.
901 * There cannot be any access by the offline cpu and therefore
902 * synchronization is simplified.
903 */
904void cpu_vm_stats_fold(int cpu)
905{
75ef7184 906 struct pglist_data *pgdat;
2bb921e5
CL
907 struct zone *zone;
908 int i;
75ef7184
MG
909 int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
910 int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
2bb921e5
CL
911
912 for_each_populated_zone(zone) {
28f836b6 913 struct per_cpu_zonestat *pzstats;
2bb921e5 914
28f836b6 915 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bb921e5 916
f19298b9 917 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 918 if (pzstats->vm_stat_diff[i]) {
2bb921e5
CL
919 int v;
920
28f836b6
MG
921 v = pzstats->vm_stat_diff[i];
922 pzstats->vm_stat_diff[i] = 0;
2bb921e5 923 atomic_long_add(v, &zone->vm_stat[i]);
75ef7184 924 global_zone_diff[i] += v;
2bb921e5 925 }
f19298b9 926 }
3a321d2a 927#ifdef CONFIG_NUMA
f19298b9
MG
928 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
929 if (pzstats->vm_numa_event[i]) {
930 unsigned long v;
3a321d2a 931
f19298b9
MG
932 v = pzstats->vm_numa_event[i];
933 pzstats->vm_numa_event[i] = 0;
934 zone_numa_event_add(v, zone, i);
3a321d2a 935 }
f19298b9 936 }
3a321d2a 937#endif
2bb921e5
CL
938 }
939
75ef7184
MG
940 for_each_online_pgdat(pgdat) {
941 struct per_cpu_nodestat *p;
942
943 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
944
945 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
946 if (p->vm_node_stat_diff[i]) {
947 int v;
948
949 v = p->vm_node_stat_diff[i];
950 p->vm_node_stat_diff[i] = 0;
951 atomic_long_add(v, &pgdat->vm_stat[i]);
952 global_node_diff[i] += v;
953 }
954 }
955
956 fold_diff(global_zone_diff, global_node_diff);
2bb921e5
CL
957}
958
40f4b1ea
CS
959/*
960 * this is only called if !populated_zone(zone), which implies no other users of
f0953a1b 961 * pset->vm_stat_diff[] exist.
40f4b1ea 962 */
28f836b6 963void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
5a883813 964{
f19298b9 965 unsigned long v;
5a883813
MK
966 int i;
967
f19298b9 968 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
28f836b6 969 if (pzstats->vm_stat_diff[i]) {
f19298b9 970 v = pzstats->vm_stat_diff[i];
28f836b6 971 pzstats->vm_stat_diff[i] = 0;
f19298b9 972 zone_page_state_add(v, zone, i);
5a883813 973 }
f19298b9 974 }
3a321d2a
KW
975
976#ifdef CONFIG_NUMA
f19298b9
MG
977 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
978 if (pzstats->vm_numa_event[i]) {
979 v = pzstats->vm_numa_event[i];
980 pzstats->vm_numa_event[i] = 0;
981 zone_numa_event_add(v, zone, i);
3a321d2a 982 }
f19298b9 983 }
3a321d2a 984#endif
5a883813 985}
2244b95a
CL
986#endif
987
ca889e6c 988#ifdef CONFIG_NUMA
c2d42c16 989/*
75ef7184
MG
990 * Determine the per node value of a stat item. This function
991 * is called frequently in a NUMA machine, so try to be as
992 * frugal as possible.
c2d42c16 993 */
75ef7184
MG
994unsigned long sum_zone_node_page_state(int node,
995 enum zone_stat_item item)
c2d42c16
AM
996{
997 struct zone *zones = NODE_DATA(node)->node_zones;
e87d59f7
JK
998 int i;
999 unsigned long count = 0;
c2d42c16 1000
e87d59f7
JK
1001 for (i = 0; i < MAX_NR_ZONES; i++)
1002 count += zone_page_state(zones + i, item);
1003
1004 return count;
c2d42c16
AM
1005}
1006
f19298b9
MG
1007/* Determine the per node value of a numa stat item. */
1008unsigned long sum_zone_numa_event_state(int node,
3a321d2a
KW
1009 enum numa_stat_item item)
1010{
1011 struct zone *zones = NODE_DATA(node)->node_zones;
3a321d2a 1012 unsigned long count = 0;
f19298b9 1013 int i;
3a321d2a
KW
1014
1015 for (i = 0; i < MAX_NR_ZONES; i++)
f19298b9 1016 count += zone_numa_event_state(zones + i, item);
3a321d2a
KW
1017
1018 return count;
1019}
1020
75ef7184
MG
1021/*
1022 * Determine the per node value of a stat item.
1023 */
ea426c2a
RG
1024unsigned long node_page_state_pages(struct pglist_data *pgdat,
1025 enum node_stat_item item)
75ef7184
MG
1026{
1027 long x = atomic_long_read(&pgdat->vm_stat[item]);
1028#ifdef CONFIG_SMP
1029 if (x < 0)
1030 x = 0;
1031#endif
1032 return x;
1033}
ea426c2a
RG
1034
1035unsigned long node_page_state(struct pglist_data *pgdat,
1036 enum node_stat_item item)
1037{
1038 VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1039
1040 return node_page_state_pages(pgdat, item);
1041}
ca889e6c
CL
1042#endif
1043
d7a5752c 1044#ifdef CONFIG_COMPACTION
36deb0be 1045
d7a5752c
MG
/* Free-memory summary of one zone, filled in by fill_contig_page_info(). */
struct contig_page_info {
	/* Free base pages, summed over all orders. */
	unsigned long free_pages;
	/* Free blocks of any order. */
	unsigned long free_blocks_total;
	/* Free blocks that can hold the target order, in target-order units. */
	unsigned long free_blocks_suitable;
};
1051
1052/*
1053 * Calculate the number of free pages in a zone, how many contiguous
1054 * pages are free and how many are large enough to satisfy an allocation of
1055 * the target size. Note that this function makes no attempt to estimate
1056 * how many suitable free blocks there *might* be if MOVABLE pages were
1057 * migrated. Calculating that is possible, but expensive and can be
1058 * figured out from userspace
1059 */
1060static void fill_contig_page_info(struct zone *zone,
1061 unsigned int suitable_order,
1062 struct contig_page_info *info)
1063{
1064 unsigned int order;
1065
1066 info->free_pages = 0;
1067 info->free_blocks_total = 0;
1068 info->free_blocks_suitable = 0;
1069
1070 for (order = 0; order < MAX_ORDER; order++) {
1071 unsigned long blocks;
1072
1073 /* Count number of free blocks */
1074 blocks = zone->free_area[order].nr_free;
1075 info->free_blocks_total += blocks;
1076
1077 /* Count free base pages */
1078 info->free_pages += blocks << order;
1079
1080 /* Count the suitable free blocks */
1081 if (order >= suitable_order)
1082 info->free_blocks_suitable += blocks <<
1083 (order - suitable_order);
1084 }
1085}
f1a5ab12
MG
1086
1087/*
1088 * A fragmentation index only makes sense if an allocation of a requested
1089 * size would fail. If that is true, the fragmentation index indicates
1090 * whether external fragmentation or a lack of memory was the problem.
1091 * The value can be used to determine if page reclaim or compaction
1092 * should be used
1093 */
56de7263 1094static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
f1a5ab12
MG
1095{
1096 unsigned long requested = 1UL << order;
1097
88d6ac40
WY
1098 if (WARN_ON_ONCE(order >= MAX_ORDER))
1099 return 0;
1100
f1a5ab12
MG
1101 if (!info->free_blocks_total)
1102 return 0;
1103
1104 /* Fragmentation index only makes sense when a request would fail */
1105 if (info->free_blocks_suitable)
1106 return -1000;
1107
1108 /*
1109 * Index is between 0 and 1 so return within 3 decimal places
1110 *
1111 * 0 => allocation would fail due to lack of memory
1112 * 1 => allocation would fail due to fragmentation
1113 */
1114 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1115}
56de7263 1116
facdaa91
NG
1117/*
1118 * Calculates external fragmentation within a zone wrt the given order.
1119 * It is defined as the percentage of pages found in blocks of size
1120 * less than 1 << order. It returns values in range [0, 100].
1121 */
d34c0a75 1122unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
facdaa91
NG
1123{
1124 struct contig_page_info info;
1125
1126 fill_contig_page_info(zone, order, &info);
1127 if (info.free_pages == 0)
1128 return 0;
1129
1130 return div_u64((info.free_pages -
1131 (info.free_blocks_suitable << order)) * 100,
1132 info.free_pages);
1133}
1134
56de7263
MG
1135/* Same as __fragmentation index but allocs contig_page_info on stack */
1136int fragmentation_index(struct zone *zone, unsigned int order)
1137{
1138 struct contig_page_info info;
1139
1140 fill_contig_page_info(zone, order, &info);
1141 return __fragmentation_index(order, &info);
1142}
d7a5752c
MG
1143#endif
1144
ebc5d83d
KK
1145#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1146 defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
fa25c503
KM
/*
 * Each TEXT_FOR_<zone>(xx) helper expands to the string literal xx with a
 * zone suffix appended, followed by a comma, or to nothing when the
 * corresponding zone is not configured.
 */
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

/* One array entry per configured zone; "_normal" and "_movable" always exist. */
#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
					TEXT_FOR_HIGHMEM(xx) xx "_movable",
1167
1168const char * const vmstat_text[] = {
8d92890b 1169 /* enum zone_stat_item counters */
fa25c503 1170 "nr_free_pages",
71c799f4
MK
1171 "nr_zone_inactive_anon",
1172 "nr_zone_active_anon",
1173 "nr_zone_inactive_file",
1174 "nr_zone_active_file",
1175 "nr_zone_unevictable",
5a1c84b4 1176 "nr_zone_write_pending",
fa25c503 1177 "nr_mlock",
fa25c503 1178 "nr_bounce",
91537fee
MK
1179#if IS_ENABLED(CONFIG_ZSMALLOC)
1180 "nr_zspages",
1181#endif
3a321d2a
KW
1182 "nr_free_cma",
1183
1184 /* enum numa_stat_item counters */
fa25c503
KM
1185#ifdef CONFIG_NUMA
1186 "numa_hit",
1187 "numa_miss",
1188 "numa_foreign",
1189 "numa_interleave",
1190 "numa_local",
1191 "numa_other",
1192#endif
09316c09 1193
9d7ea9a2 1194 /* enum node_stat_item counters */
599d0c95
MG
1195 "nr_inactive_anon",
1196 "nr_active_anon",
1197 "nr_inactive_file",
1198 "nr_active_file",
1199 "nr_unevictable",
385386cf
JW
1200 "nr_slab_reclaimable",
1201 "nr_slab_unreclaimable",
599d0c95
MG
1202 "nr_isolated_anon",
1203 "nr_isolated_file",
68d48e6a 1204 "workingset_nodes",
170b04b7
JK
1205 "workingset_refault_anon",
1206 "workingset_refault_file",
1207 "workingset_activate_anon",
1208 "workingset_activate_file",
1209 "workingset_restore_anon",
1210 "workingset_restore_file",
1e6b1085 1211 "workingset_nodereclaim",
50658e2e
MG
1212 "nr_anon_pages",
1213 "nr_mapped",
11fb9989
MG
1214 "nr_file_pages",
1215 "nr_dirty",
1216 "nr_writeback",
1217 "nr_writeback_temp",
1218 "nr_shmem",
1219 "nr_shmem_hugepages",
1220 "nr_shmem_pmdmapped",
60fbf0ab
SL
1221 "nr_file_hugepages",
1222 "nr_file_pmdmapped",
11fb9989 1223 "nr_anon_transparent_hugepages",
c4a25635
MG
1224 "nr_vmscan_write",
1225 "nr_vmscan_immediate_reclaim",
1226 "nr_dirtied",
1227 "nr_written",
b29940c1 1228 "nr_kernel_misc_reclaimable",
1970dc6f
JH
1229 "nr_foll_pin_acquired",
1230 "nr_foll_pin_released",
991e7673
SB
1231 "nr_kernel_stack",
1232#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1233 "nr_shadow_call_stack",
1234#endif
f0c0c115 1235 "nr_page_table_pages",
b6038942
SB
1236#ifdef CONFIG_SWAP
1237 "nr_swapcached",
1238#endif
599d0c95 1239
09316c09 1240 /* enum writeback_stat_item counters */
fa25c503
KM
1241 "nr_dirty_threshold",
1242 "nr_dirty_background_threshold",
1243
ebc5d83d 1244#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
09316c09 1245 /* enum vm_event_item counters */
fa25c503
KM
1246 "pgpgin",
1247 "pgpgout",
1248 "pswpin",
1249 "pswpout",
1250
1251 TEXTS_FOR_ZONES("pgalloc")
7cc30fcf
MG
1252 TEXTS_FOR_ZONES("allocstall")
1253 TEXTS_FOR_ZONES("pgskip")
fa25c503
KM
1254
1255 "pgfree",
1256 "pgactivate",
1257 "pgdeactivate",
f7ad2a6c 1258 "pglazyfree",
fa25c503
KM
1259
1260 "pgfault",
1261 "pgmajfault",
854e9ed0 1262 "pglazyfreed",
fa25c503 1263
599d0c95 1264 "pgrefill",
798a6b87 1265 "pgreuse",
599d0c95
MG
1266 "pgsteal_kswapd",
1267 "pgsteal_direct",
668e4147
YS
1268 "pgdemote_kswapd",
1269 "pgdemote_direct",
599d0c95
MG
1270 "pgscan_kswapd",
1271 "pgscan_direct",
68243e76 1272 "pgscan_direct_throttle",
497a6c1b
JW
1273 "pgscan_anon",
1274 "pgscan_file",
1275 "pgsteal_anon",
1276 "pgsteal_file",
fa25c503
KM
1277
1278#ifdef CONFIG_NUMA
1279 "zone_reclaim_failed",
1280#endif
1281 "pginodesteal",
1282 "slabs_scanned",
fa25c503
KM
1283 "kswapd_inodesteal",
1284 "kswapd_low_wmark_hit_quickly",
1285 "kswapd_high_wmark_hit_quickly",
fa25c503 1286 "pageoutrun",
fa25c503
KM
1287
1288 "pgrotated",
1289
5509a5d2
DH
1290 "drop_pagecache",
1291 "drop_slab",
8e675f7a 1292 "oom_kill",
5509a5d2 1293
03c5a6e1
MG
1294#ifdef CONFIG_NUMA_BALANCING
1295 "numa_pte_updates",
72403b4a 1296 "numa_huge_pte_updates",
03c5a6e1
MG
1297 "numa_hint_faults",
1298 "numa_hint_faults_local",
1299 "numa_pages_migrated",
1300#endif
5647bc29
MG
1301#ifdef CONFIG_MIGRATION
1302 "pgmigrate_success",
1303 "pgmigrate_fail",
1a5bae25
AK
1304 "thp_migration_success",
1305 "thp_migration_fail",
1306 "thp_migration_split",
5647bc29 1307#endif
fa25c503 1308#ifdef CONFIG_COMPACTION
397487db
MG
1309 "compact_migrate_scanned",
1310 "compact_free_scanned",
1311 "compact_isolated",
fa25c503
KM
1312 "compact_stall",
1313 "compact_fail",
1314 "compact_success",
698b1b30 1315 "compact_daemon_wake",
7f354a54
DR
1316 "compact_daemon_migrate_scanned",
1317 "compact_daemon_free_scanned",
fa25c503
KM
1318#endif
1319
1320#ifdef CONFIG_HUGETLB_PAGE
1321 "htlb_buddy_alloc_success",
1322 "htlb_buddy_alloc_fail",
bbb26920
MK
1323#endif
1324#ifdef CONFIG_CMA
1325 "cma_alloc_success",
1326 "cma_alloc_fail",
fa25c503
KM
1327#endif
1328 "unevictable_pgs_culled",
1329 "unevictable_pgs_scanned",
1330 "unevictable_pgs_rescued",
1331 "unevictable_pgs_mlocked",
1332 "unevictable_pgs_munlocked",
1333 "unevictable_pgs_cleared",
1334 "unevictable_pgs_stranded",
fa25c503
KM
1335
1336#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1337 "thp_fault_alloc",
1338 "thp_fault_fallback",
85b9f46e 1339 "thp_fault_fallback_charge",
fa25c503
KM
1340 "thp_collapse_alloc",
1341 "thp_collapse_alloc_failed",
95ecedcd 1342 "thp_file_alloc",
dcdf11ee 1343 "thp_file_fallback",
85b9f46e 1344 "thp_file_fallback_charge",
95ecedcd 1345 "thp_file_mapped",
122afea9
KS
1346 "thp_split_page",
1347 "thp_split_page_failed",
f9719a03 1348 "thp_deferred_split_page",
122afea9 1349 "thp_split_pmd",
ce9311cf
YX
1350#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1351 "thp_split_pud",
1352#endif
d8a8e1f0
KS
1353 "thp_zero_page_alloc",
1354 "thp_zero_page_alloc_failed",
225311a4 1355 "thp_swpout",
fe490cc0 1356 "thp_swpout_fallback",
fa25c503 1357#endif
09316c09
KK
1358#ifdef CONFIG_MEMORY_BALLOON
1359 "balloon_inflate",
1360 "balloon_deflate",
1361#ifdef CONFIG_BALLOON_COMPACTION
1362 "balloon_migrate",
1363#endif
1364#endif /* CONFIG_MEMORY_BALLOON */
ec659934 1365#ifdef CONFIG_DEBUG_TLBFLUSH
9824cf97
DH
1366 "nr_tlb_remote_flush",
1367 "nr_tlb_remote_flush_received",
1368 "nr_tlb_local_flush_all",
1369 "nr_tlb_local_flush_one",
ec659934 1370#endif /* CONFIG_DEBUG_TLBFLUSH */
fa25c503 1371
4f115147
DB
1372#ifdef CONFIG_DEBUG_VM_VMACACHE
1373 "vmacache_find_calls",
1374 "vmacache_find_hits",
1375#endif
cbc65df2
HY
1376#ifdef CONFIG_SWAP
1377 "swap_ra",
1378 "swap_ra_hit",
1379#endif
575299ea
S
1380#ifdef CONFIG_X86
1381 "direct_map_level2_splits",
1382 "direct_map_level3_splits",
1383#endif
ebc5d83d 1384#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
fa25c503 1385};
ebc5d83d 1386#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
fa25c503 1387
3c486871
AM
1388#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1389 defined(CONFIG_PROC_FS)
1390static void *frag_start(struct seq_file *m, loff_t *pos)
1391{
1392 pg_data_t *pgdat;
1393 loff_t node = *pos;
1394
1395 for (pgdat = first_online_pgdat();
1396 pgdat && node;
1397 pgdat = next_online_pgdat(pgdat))
1398 --node;
1399
1400 return pgdat;
1401}
1402
1403static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1404{
1405 pg_data_t *pgdat = (pg_data_t *)arg;
1406
1407 (*pos)++;
1408 return next_online_pgdat(pgdat);
1409}
1410
/* seq_file ->stop: the node walk holds no resources, so there is nothing to undo. */
static void frag_stop(struct seq_file *m, void *arg)
{
}
1414
b2bd8598
DR
1415/*
1416 * Walk zones in a node and print using a callback.
1417 * If @assert_populated is true, only use callback for zones that are populated.
1418 */
3c486871 1419static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
727c080f 1420 bool assert_populated, bool nolock,
3c486871
AM
1421 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1422{
1423 struct zone *zone;
1424 struct zone *node_zones = pgdat->node_zones;
1425 unsigned long flags;
1426
1427 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
b2bd8598 1428 if (assert_populated && !populated_zone(zone))
3c486871
AM
1429 continue;
1430
727c080f
VM
1431 if (!nolock)
1432 spin_lock_irqsave(&zone->lock, flags);
3c486871 1433 print(m, pgdat, zone);
727c080f
VM
1434 if (!nolock)
1435 spin_unlock_irqrestore(&zone->lock, flags);
3c486871
AM
1436 }
1437}
1438#endif
1439
d7a5752c 1440#ifdef CONFIG_PROC_FS
467c996c
MG
1441static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1442 struct zone *zone)
1443{
1444 int order;
1445
1446 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1447 for (order = 0; order < MAX_ORDER; ++order)
1448 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1449 seq_putc(m, '\n');
1450}
1451
1452/*
1453 * This walks the free areas for each zone.
1454 */
1455static int frag_show(struct seq_file *m, void *arg)
1456{
1457 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1458 walk_zones_in_node(m, pgdat, true, false, frag_show_print);
467c996c
MG
1459 return 0;
1460}
1461
1462static void pagetypeinfo_showfree_print(struct seq_file *m,
1463 pg_data_t *pgdat, struct zone *zone)
1464{
1465 int order, mtype;
1466
1467 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1468 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1469 pgdat->node_id,
1470 zone->name,
1471 migratetype_names[mtype]);
1472 for (order = 0; order < MAX_ORDER; ++order) {
1473 unsigned long freecount = 0;
1474 struct free_area *area;
1475 struct list_head *curr;
93b3a674 1476 bool overflow = false;
467c996c
MG
1477
1478 area = &(zone->free_area[order]);
1479
93b3a674
MH
1480 list_for_each(curr, &area->free_list[mtype]) {
1481 /*
1482 * Cap the free_list iteration because it might
1483 * be really large and we are under a spinlock
1484 * so a long time spent here could trigger a
1485 * hard lockup detector. Anyway this is a
1486 * debugging tool so knowing there is a handful
1487 * of pages of this order should be more than
1488 * sufficient.
1489 */
1490 if (++freecount >= 100000) {
1491 overflow = true;
1492 break;
1493 }
1494 }
1495 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1496 spin_unlock_irq(&zone->lock);
1497 cond_resched();
1498 spin_lock_irq(&zone->lock);
467c996c 1499 }
f6ac2354
CL
1500 seq_putc(m, '\n');
1501 }
467c996c
MG
1502}
1503
/* Print out the free pages at each order for each migratetype */
33090af9 1505static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
467c996c
MG
1506{
1507 int order;
1508 pg_data_t *pgdat = (pg_data_t *)arg;
1509
1510 /* Print header */
1511 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1512 for (order = 0; order < MAX_ORDER; ++order)
1513 seq_printf(m, "%6d ", order);
1514 seq_putc(m, '\n');
1515
727c080f 1516 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
467c996c
MG
1517}
1518
1519static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1520 pg_data_t *pgdat, struct zone *zone)
1521{
1522 int mtype;
1523 unsigned long pfn;
1524 unsigned long start_pfn = zone->zone_start_pfn;
108bcc96 1525 unsigned long end_pfn = zone_end_pfn(zone);
467c996c
MG
1526 unsigned long count[MIGRATE_TYPES] = { 0, };
1527
1528 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1529 struct page *page;
1530
d336e94e
MH
1531 page = pfn_to_online_page(pfn);
1532 if (!page)
467c996c
MG
1533 continue;
1534
a91c43c7
JK
1535 if (page_zone(page) != zone)
1536 continue;
1537
467c996c
MG
1538 mtype = get_pageblock_migratetype(page);
1539
e80d6a24
MG
1540 if (mtype < MIGRATE_TYPES)
1541 count[mtype]++;
467c996c
MG
1542 }
1543
1544 /* Print counts */
1545 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1546 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1547 seq_printf(m, "%12lu ", count[mtype]);
1548 seq_putc(m, '\n');
1549}
1550
f113e641 1551/* Print out the number of pageblocks for each migratetype */
33090af9 1552static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
467c996c
MG
1553{
1554 int mtype;
1555 pg_data_t *pgdat = (pg_data_t *)arg;
1556
1557 seq_printf(m, "\n%-23s", "Number of blocks type ");
1558 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1559 seq_printf(m, "%12s ", migratetype_names[mtype]);
1560 seq_putc(m, '\n');
727c080f
VM
1561 walk_zones_in_node(m, pgdat, true, false,
1562 pagetypeinfo_showblockcount_print);
467c996c
MG
1563}
1564
48c96a36
JK
1565/*
1566 * Print out the number of pageblocks for each migratetype that contain pages
1567 * of other types. This gives an indication of how well fallbacks are being
1568 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1569 * to determine what is going on
1570 */
1571static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1572{
1573#ifdef CONFIG_PAGE_OWNER
1574 int mtype;
1575
7dd80b8a 1576 if (!static_branch_unlikely(&page_owner_inited))
48c96a36
JK
1577 return;
1578
1579 drain_all_pages(NULL);
1580
1581 seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1582 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1583 seq_printf(m, "%12s ", migratetype_names[mtype]);
1584 seq_putc(m, '\n');
1585
727c080f
VM
1586 walk_zones_in_node(m, pgdat, true, true,
1587 pagetypeinfo_showmixedcount_print);
48c96a36
JK
1588#endif /* CONFIG_PAGE_OWNER */
1589}
1590
467c996c
MG
1591/*
1592 * This prints out statistics in relation to grouping pages by mobility.
1593 * It is expensive to collect so do not constantly read the file.
1594 */
1595static int pagetypeinfo_show(struct seq_file *m, void *arg)
1596{
1597 pg_data_t *pgdat = (pg_data_t *)arg;
1598
41b25a37 1599 /* check memoryless node */
a47b53c5 1600 if (!node_state(pgdat->node_id, N_MEMORY))
41b25a37
KM
1601 return 0;
1602
467c996c
MG
1603 seq_printf(m, "Page block order: %d\n", pageblock_order);
1604 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
1605 seq_putc(m, '\n');
1606 pagetypeinfo_showfree(m, pgdat);
1607 pagetypeinfo_showblockcount(m, pgdat);
48c96a36 1608 pagetypeinfo_showmixedcount(m, pgdat);
467c996c 1609
f6ac2354
CL
1610 return 0;
1611}
1612
8f32f7e5 1613static const struct seq_operations fragmentation_op = {
f6ac2354
CL
1614 .start = frag_start,
1615 .next = frag_next,
1616 .stop = frag_stop,
1617 .show = frag_show,
1618};
1619
74e2e8e8 1620static const struct seq_operations pagetypeinfo_op = {
467c996c
MG
1621 .start = frag_start,
1622 .next = frag_next,
1623 .stop = frag_stop,
1624 .show = pagetypeinfo_show,
1625};
1626
e2ecc8a7
MG
1627static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1628{
1629 int zid;
1630
1631 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1632 struct zone *compare = &pgdat->node_zones[zid];
1633
1634 if (populated_zone(compare))
1635 return zone == compare;
1636 }
1637
e2ecc8a7
MG
1638 return false;
1639}
1640
467c996c
MG
1641static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1642 struct zone *zone)
f6ac2354 1643{
467c996c
MG
1644 int i;
1645 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
e2ecc8a7
MG
1646 if (is_zone_first_populated(pgdat, zone)) {
1647 seq_printf(m, "\n per-node stats");
1648 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
69473e5d
MS
1649 unsigned long pages = node_page_state_pages(pgdat, i);
1650
1651 if (vmstat_item_print_in_thp(i))
1652 pages /= HPAGE_PMD_NR;
9d7ea9a2 1653 seq_printf(m, "\n %-12s %lu", node_stat_name(i),
69473e5d 1654 pages);
e2ecc8a7
MG
1655 }
1656 }
467c996c
MG
1657 seq_printf(m,
1658 "\n pages free %lu"
1659 "\n min %lu"
1660 "\n low %lu"
1661 "\n high %lu"
467c996c 1662 "\n spanned %lu"
9feedc9d 1663 "\n present %lu"
3c381db1
DH
1664 "\n managed %lu"
1665 "\n cma %lu",
88f5acf8 1666 zone_page_state(zone, NR_FREE_PAGES),
41858966
MG
1667 min_wmark_pages(zone),
1668 low_wmark_pages(zone),
1669 high_wmark_pages(zone),
467c996c 1670 zone->spanned_pages,
9feedc9d 1671 zone->present_pages,
3c381db1
DH
1672 zone_managed_pages(zone),
1673 zone_cma_pages(zone));
467c996c 1674
467c996c 1675 seq_printf(m,
3484b2de 1676 "\n protection: (%ld",
467c996c
MG
1677 zone->lowmem_reserve[0]);
1678 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
3484b2de 1679 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
7dfb8bf3
DR
1680 seq_putc(m, ')');
1681
a8a4b7ae
BH
1682 /* If unpopulated, no other information is useful */
1683 if (!populated_zone(zone)) {
1684 seq_putc(m, '\n');
1685 return;
1686 }
1687
7dfb8bf3 1688 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
9d7ea9a2
KK
1689 seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
1690 zone_page_state(zone, i));
7dfb8bf3 1691
3a321d2a 1692#ifdef CONFIG_NUMA
f19298b9 1693 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
9d7ea9a2 1694 seq_printf(m, "\n %-12s %lu", numa_stat_name(i),
f19298b9 1695 zone_numa_event_state(zone, i));
3a321d2a
KW
1696#endif
1697
7dfb8bf3 1698 seq_printf(m, "\n pagesets");
467c996c 1699 for_each_online_cpu(i) {
28f836b6
MG
1700 struct per_cpu_pages *pcp;
1701 struct per_cpu_zonestat __maybe_unused *pzstats;
467c996c 1702
28f836b6 1703 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
3dfa5721
CL
1704 seq_printf(m,
1705 "\n cpu: %i"
1706 "\n count: %i"
1707 "\n high: %i"
1708 "\n batch: %i",
1709 i,
28f836b6
MG
1710 pcp->count,
1711 pcp->high,
1712 pcp->batch);
df9ecaba 1713#ifdef CONFIG_SMP
28f836b6 1714 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
467c996c 1715 seq_printf(m, "\n vm stats threshold: %d",
28f836b6 1716 pzstats->stat_threshold);
df9ecaba 1717#endif
f6ac2354 1718 }
467c996c 1719 seq_printf(m,
599d0c95 1720 "\n node_unreclaimable: %u"
3a50d14d 1721 "\n start_pfn: %lu",
c73322d0 1722 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
3a50d14d 1723 zone->zone_start_pfn);
467c996c
MG
1724 seq_putc(m, '\n');
1725}
1726
1727/*
b2bd8598
DR
1728 * Output information about zones in @pgdat. All zones are printed regardless
1729 * of whether they are populated or not: lowmem_reserve_ratio operates on the
1730 * set of all zones and userspace would not be aware of such zones if they are
1731 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
467c996c
MG
1732 */
1733static int zoneinfo_show(struct seq_file *m, void *arg)
1734{
1735 pg_data_t *pgdat = (pg_data_t *)arg;
727c080f 1736 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
f6ac2354
CL
1737 return 0;
1738}
1739
5c9fe628 1740static const struct seq_operations zoneinfo_op = {
f6ac2354
CL
1741 .start = frag_start, /* iterate over all zones. The same as in
1742 * fragmentation. */
1743 .next = frag_next,
1744 .stop = frag_stop,
1745 .show = zoneinfo_show,
1746};
1747
/*
 * Number of values exported through vmstat: zone, NUMA event, node and
 * writeback items, plus the VM event items when CONFIG_VM_EVENT_COUNTERS
 * is enabled.
 */
#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
			 NR_VM_NUMA_EVENT_ITEMS + \
			 NR_VM_NODE_STAT_ITEMS + \
			 NR_VM_WRITEBACK_STAT_ITEMS + \
			 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
			  NR_VM_EVENT_ITEMS : 0))
79da826a 1754
f6ac2354
CL
1755static void *vmstat_start(struct seq_file *m, loff_t *pos)
1756{
2244b95a 1757 unsigned long *v;
9d7ea9a2 1758 int i;
f6ac2354 1759
9d7ea9a2 1760 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354 1761 return NULL;
79da826a 1762
9d7ea9a2 1763 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
f19298b9 1764 fold_vm_numa_events();
9d7ea9a2 1765 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
2244b95a
CL
1766 m->private = v;
1767 if (!v)
f6ac2354 1768 return ERR_PTR(-ENOMEM);
2244b95a 1769 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
c41f012a 1770 v[i] = global_zone_page_state(i);
79da826a
MR
1771 v += NR_VM_ZONE_STAT_ITEMS;
1772
3a321d2a 1773#ifdef CONFIG_NUMA
f19298b9
MG
1774 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1775 v[i] = global_numa_event_state(i);
1776 v += NR_VM_NUMA_EVENT_ITEMS;
3a321d2a
KW
1777#endif
1778
69473e5d 1779 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
ea426c2a 1780 v[i] = global_node_page_state_pages(i);
69473e5d
MS
1781 if (vmstat_item_print_in_thp(i))
1782 v[i] /= HPAGE_PMD_NR;
1783 }
75ef7184
MG
1784 v += NR_VM_NODE_STAT_ITEMS;
1785
79da826a
MR
1786 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1787 v + NR_DIRTY_THRESHOLD);
1788 v += NR_VM_WRITEBACK_STAT_ITEMS;
1789
f8891e5e 1790#ifdef CONFIG_VM_EVENT_COUNTERS
79da826a
MR
1791 all_vm_events(v);
1792 v[PGPGIN] /= 2; /* sectors -> kbytes */
1793 v[PGPGOUT] /= 2;
f8891e5e 1794#endif
ff8b16d7 1795 return (unsigned long *)m->private + *pos;
f6ac2354
CL
1796}
1797
1798static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1799{
1800 (*pos)++;
9d7ea9a2 1801 if (*pos >= NR_VMSTAT_ITEMS)
f6ac2354
CL
1802 return NULL;
1803 return (unsigned long *)m->private + *pos;
1804}
1805
1806static int vmstat_show(struct seq_file *m, void *arg)
1807{
1808 unsigned long *l = arg;
1809 unsigned long off = l - (unsigned long *)m->private;
68ba0326
AD
1810
1811 seq_puts(m, vmstat_text[off]);
75ba1d07 1812 seq_put_decimal_ull(m, " ", *l);
68ba0326 1813 seq_putc(m, '\n');
8d92890b
N
1814
1815 if (off == NR_VMSTAT_ITEMS - 1) {
1816 /*
1817 * We've come to the end - add any deprecated counters to avoid
1818 * breaking userspace which might depend on them being present.
1819 */
1820 seq_puts(m, "nr_unstable 0\n");
1821 }
f6ac2354
CL
1822 return 0;
1823}
1824
1825static void vmstat_stop(struct seq_file *m, void *arg)
1826{
1827 kfree(m->private);
1828 m->private = NULL;
1829}
1830
b6aa44ab 1831static const struct seq_operations vmstat_op = {
f6ac2354
CL
1832 .start = vmstat_start,
1833 .next = vmstat_next,
1834 .stop = vmstat_stop,
1835 .show = vmstat_show,
1836};
f6ac2354
CL
1837#endif /* CONFIG_PROC_FS */
1838
df9ecaba 1839#ifdef CONFIG_SMP
d1187ed2 1840static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
77461ab3 1841int sysctl_stat_interval __read_mostly = HZ;
d1187ed2 1842
52b6f46b
HD
1843#ifdef CONFIG_PROC_FS
1844static void refresh_vm_stats(struct work_struct *work)
1845{
1846 refresh_cpu_vm_stats(true);
1847}
1848
1849int vmstat_refresh(struct ctl_table *table, int write,
32927393 1850 void *buffer, size_t *lenp, loff_t *ppos)
52b6f46b
HD
1851{
1852 long val;
1853 int err;
1854 int i;
1855
1856 /*
1857 * The regular update, every sysctl_stat_interval, may come later
1858 * than expected: leaving a significant amount in per_cpu buckets.
1859 * This is particularly misleading when checking a quantity of HUGE
1860 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1861 * which can equally be echo'ed to or cat'ted from (by root),
1862 * can be used to update the stats just before reading them.
1863 *
c41f012a 1864 * Oh, and since global_zone_page_state() etc. are so careful to hide
52b6f46b
HD
1865 * transiently negative values, report an error here if any of
1866 * the stats is negative, so we know to go looking for imbalance.
1867 */
1868 err = schedule_on_each_cpu(refresh_vm_stats);
1869 if (err)
1870 return err;
1871 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
75083aae
HD
1872 /*
1873 * Skip checking stats known to go negative occasionally.
1874 */
1875 switch (i) {
1876 case NR_ZONE_WRITE_PENDING:
1877 case NR_FREE_CMA_PAGES:
1878 continue;
1879 }
75ef7184 1880 val = atomic_long_read(&vm_zone_stat[i]);
52b6f46b 1881 if (val < 0) {
c822f622 1882 pr_warn("%s: %s %ld\n",
9d7ea9a2 1883 __func__, zone_stat_name(i), val);
52b6f46b
HD
1884 }
1885 }
76d8cc3c 1886 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
75083aae
HD
1887 /*
1888 * Skip checking stats known to go negative occasionally.
1889 */
1890 switch (i) {
1891 case NR_WRITEBACK:
1892 continue;
1893 }
76d8cc3c
HD
1894 val = atomic_long_read(&vm_node_stat[i]);
1895 if (val < 0) {
1896 pr_warn("%s: %s %ld\n",
1897 __func__, node_stat_name(i), val);
76d8cc3c
HD
1898 }
1899 }
52b6f46b
HD
1900 if (write)
1901 *ppos += *lenp;
1902 else
1903 *lenp = 0;
1904 return 0;
1905}
1906#endif /* CONFIG_PROC_FS */
1907
d1187ed2
CL
1908static void vmstat_update(struct work_struct *w)
1909{
0eb77e98 1910 if (refresh_cpu_vm_stats(true)) {
7cc36bbd
CL
1911 /*
1912 * Counters were updated so we expect more updates
1913 * to occur in the future. Keep on running the
1914 * update worker thread.
1915 */
ce612879 1916 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
f01f17d3
MH
1917 this_cpu_ptr(&vmstat_work),
1918 round_jiffies_relative(sysctl_stat_interval));
7cc36bbd
CL
1919 }
1920}
1921
1922/*
1923 * Check if the diffs for a certain cpu indicate that
1924 * an update is needed.
1925 */
1926static bool need_update(int cpu)
1927{
2bbd00ae 1928 pg_data_t *last_pgdat = NULL;
7cc36bbd
CL
1929 struct zone *zone;
1930
1931 for_each_populated_zone(zone) {
28f836b6 1932 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2bbd00ae 1933 struct per_cpu_nodestat *n;
28f836b6 1934
7cc36bbd
CL
1935 /*
1936 * The fast way of checking if there are any vmstat diffs.
7cc36bbd 1937 */
64632fd3 1938 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
7cc36bbd 1939 return true;
f19298b9 1940
2bbd00ae
JW
1941 if (last_pgdat == zone->zone_pgdat)
1942 continue;
1943 last_pgdat = zone->zone_pgdat;
1944 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
64632fd3
ML
1945 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
1946 return true;
7cc36bbd
CL
1947 }
1948 return false;
1949}
1950
7b8da4c7
CL
1951/*
1952 * Switch off vmstat processing and then fold all the remaining differentials
1953 * until the diffs stay at zero. The function is used by NOHZ and can only be
1954 * invoked when tick processing is not active.
1955 */
f01f17d3
MH
1956void quiet_vmstat(void)
1957{
1958 if (system_state != SYSTEM_RUNNING)
1959 return;
1960
7b8da4c7 1961 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
f01f17d3
MH
1962 return;
1963
1964 if (!need_update(smp_processor_id()))
1965 return;
1966
1967 /*
1968 * Just refresh counters and do not care about the pending delayed
1969 * vmstat_update. It doesn't fire that often to matter and canceling
1970 * it would be too expensive from this path.
1971 * vmstat_shepherd will take care about that for us.
1972 */
1973 refresh_cpu_vm_stats(false);
1974}
1975
7cc36bbd
CL
1976/*
1977 * Shepherd worker thread that checks the
1978 * differentials of processors that have their worker
1979 * threads for vm statistics updates disabled because of
1980 * inactivity.
1981 */
1982static void vmstat_shepherd(struct work_struct *w);
1983
0eb77e98 1984static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
7cc36bbd
CL
1985
1986static void vmstat_shepherd(struct work_struct *w)
1987{
1988 int cpu;
1989
7625eccd 1990 cpus_read_lock();
7cc36bbd 1991 /* Check processors whose vmstat worker threads have been disabled */
7b8da4c7 1992 for_each_online_cpu(cpu) {
f01f17d3 1993 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
7cc36bbd 1994
7b8da4c7 1995 if (!delayed_work_pending(dw) && need_update(cpu))
ce612879 1996 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
fbcc8183
JB
1997
1998 cond_resched();
f01f17d3 1999 }
7625eccd 2000 cpus_read_unlock();
7cc36bbd
CL
2001
2002 schedule_delayed_work(&shepherd,
98f4ebb2 2003 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2004}
2005
7cc36bbd 2006static void __init start_shepherd_timer(void)
d1187ed2 2007{
7cc36bbd
CL
2008 int cpu;
2009
2010 for_each_possible_cpu(cpu)
ccde8bd4 2011 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
7cc36bbd
CL
2012 vmstat_update);
2013
7cc36bbd
CL
2014 schedule_delayed_work(&shepherd,
2015 round_jiffies_relative(sysctl_stat_interval));
d1187ed2
CL
2016}
2017
03e86dba
TC
2018static void __init init_cpu_node_state(void)
2019{
4c501327 2020 int node;
03e86dba 2021
4c501327
SAS
2022 for_each_online_node(node) {
2023 if (cpumask_weight(cpumask_of_node(node)) > 0)
2024 node_set_state(node, N_CPU);
2025 }
03e86dba
TC
2026}
2027
5438da97
SAS
2028static int vmstat_cpu_online(unsigned int cpu)
2029{
2030 refresh_zone_stat_thresholds();
2031 node_set_state(cpu_to_node(cpu), N_CPU);
2032 return 0;
2033}
2034
2035static int vmstat_cpu_down_prep(unsigned int cpu)
2036{
2037 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2038 return 0;
2039}
2040
2041static int vmstat_cpu_dead(unsigned int cpu)
807a1bd2 2042{
4c501327 2043 const struct cpumask *node_cpus;
5438da97 2044 int node;
807a1bd2 2045
5438da97
SAS
2046 node = cpu_to_node(cpu);
2047
2048 refresh_zone_stat_thresholds();
4c501327
SAS
2049 node_cpus = cpumask_of_node(node);
2050 if (cpumask_weight(node_cpus) > 0)
5438da97 2051 return 0;
807a1bd2
TK
2052
2053 node_clear_state(node, N_CPU);
5438da97 2054 return 0;
807a1bd2
TK
2055}
2056
8f32f7e5 2057#endif
df9ecaba 2058
ce612879
MH
2059struct workqueue_struct *mm_percpu_wq;
2060
597b7305 2061void __init init_mm_internals(void)
df9ecaba 2062{
ce612879 2063 int ret __maybe_unused;
5438da97 2064
80d136e1 2065 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
ce612879
MH
2066
2067#ifdef CONFIG_SMP
5438da97
SAS
2068 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2069 NULL, vmstat_cpu_dead);
2070 if (ret < 0)
2071 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2072
2073 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2074 vmstat_cpu_online,
2075 vmstat_cpu_down_prep);
2076 if (ret < 0)
2077 pr_err("vmstat: failed to register 'online' hotplug state\n");
2078
7625eccd 2079 cpus_read_lock();
03e86dba 2080 init_cpu_node_state();
7625eccd 2081 cpus_read_unlock();
d1187ed2 2082
7cc36bbd 2083 start_shepherd_timer();
8f32f7e5
AD
2084#endif
2085#ifdef CONFIG_PROC_FS
fddda2b7 2086 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
abaed011 2087 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
fddda2b7
CH
2088 proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2089 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
8f32f7e5 2090#endif
df9ecaba 2091}
d7a5752c
MG
2092
2093#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
d7a5752c
MG
2094
2095/*
2096 * Return an index indicating how much of the available free memory is
2097 * unusable for an allocation of the requested size.
2098 */
2099static int unusable_free_index(unsigned int order,
2100 struct contig_page_info *info)
2101{
2102 /* No free memory is interpreted as all free memory is unusable */
2103 if (info->free_pages == 0)
2104 return 1000;
2105
2106 /*
2107 * Index should be a value between 0 and 1. Return a value to 3
2108 * decimal places.
2109 *
2110 * 0 => no fragmentation
2111 * 1 => high fragmentation
2112 */
2113 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2114
2115}
2116
2117static void unusable_show_print(struct seq_file *m,
2118 pg_data_t *pgdat, struct zone *zone)
2119{
2120 unsigned int order;
2121 int index;
2122 struct contig_page_info info;
2123
2124 seq_printf(m, "Node %d, zone %8s ",
2125 pgdat->node_id,
2126 zone->name);
2127 for (order = 0; order < MAX_ORDER; ++order) {
2128 fill_contig_page_info(zone, order, &info);
2129 index = unusable_free_index(order, &info);
2130 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2131 }
2132
2133 seq_putc(m, '\n');
2134}
2135
2136/*
2137 * Display unusable free space index
2138 *
2139 * The unusable free space index measures how much of the available free
2140 * memory cannot be used to satisfy an allocation of a given size and is a
2141 * value between 0 and 1. The higher the value, the more of free memory is
2142 * unusable and by implication, the worse the external fragmentation is. This
2143 * can be expressed as a percentage by multiplying by 100.
2144 */
2145static int unusable_show(struct seq_file *m, void *arg)
2146{
2147 pg_data_t *pgdat = (pg_data_t *)arg;
2148
2149 /* check memoryless node */
a47b53c5 2150 if (!node_state(pgdat->node_id, N_MEMORY))
d7a5752c
MG
2151 return 0;
2152
727c080f 2153 walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
d7a5752c
MG
2154
2155 return 0;
2156}
2157
01a99560 2158static const struct seq_operations unusable_sops = {
d7a5752c
MG
2159 .start = frag_start,
2160 .next = frag_next,
2161 .stop = frag_stop,
2162 .show = unusable_show,
2163};
2164
01a99560 2165DEFINE_SEQ_ATTRIBUTE(unusable);
d7a5752c 2166
f1a5ab12
MG
2167static void extfrag_show_print(struct seq_file *m,
2168 pg_data_t *pgdat, struct zone *zone)
2169{
2170 unsigned int order;
2171 int index;
2172
2173 /* Alloc on stack as interrupts are disabled for zone walk */
2174 struct contig_page_info info;
2175
2176 seq_printf(m, "Node %d, zone %8s ",
2177 pgdat->node_id,
2178 zone->name);
2179 for (order = 0; order < MAX_ORDER; ++order) {
2180 fill_contig_page_info(zone, order, &info);
56de7263 2181 index = __fragmentation_index(order, &info);
f1a5ab12
MG
2182 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2183 }
2184
2185 seq_putc(m, '\n');
2186}
2187
2188/*
2189 * Display fragmentation index for orders that allocations would fail for
2190 */
2191static int extfrag_show(struct seq_file *m, void *arg)
2192{
2193 pg_data_t *pgdat = (pg_data_t *)arg;
2194
727c080f 2195 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
f1a5ab12
MG
2196
2197 return 0;
2198}
2199
01a99560 2200static const struct seq_operations extfrag_sops = {
f1a5ab12
MG
2201 .start = frag_start,
2202 .next = frag_next,
2203 .stop = frag_stop,
2204 .show = extfrag_show,
2205};
2206
01a99560 2207DEFINE_SEQ_ATTRIBUTE(extfrag);
f1a5ab12 2208
d7a5752c
MG
2209static int __init extfrag_debug_init(void)
2210{
bde8bd8a
S
2211 struct dentry *extfrag_debug_root;
2212
d7a5752c 2213 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
d7a5752c 2214
d9f7979c 2215 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
01a99560 2216 &unusable_fops);
d7a5752c 2217
d9f7979c 2218 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
01a99560 2219 &extfrag_fops);
f1a5ab12 2220
d7a5752c
MG
2221 return 0;
2222}
2223
2224module_init(extfrag_debug_init);
2225#endif