// SPDX-License-Identifier: GPL-2.0-only
/*
 * Machine check handler.
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
#include <linux/ratelimit.h>
#include <linux/rcupdate.h>
#include <linux/kobject.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>

#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>

#include "internal.h"

/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

#define SPINUNIT 100	/* 100ns */

DEFINE_PER_CPU(unsigned, mce_exception_count);

DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

struct mce_bank {
	u64	ctl;	/* subevents to enable */
	bool	init;	/* initialise bank? */
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN	16
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank_dev {
	struct device_attribute	attr;			/* device attribute */
	char			attrname[ATTR_LEN];	/* attribute name */
	u8			bank;			/* bank number */
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

struct mce_vendor_flags mce_flags __read_mostly;

struct mca_config mca_cfg __read_mostly = {
	.bootlog  = -1,
	/*
	 * Tolerant levels:
	 * 0: always panic on uncorrected errors, log corrected errors
	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
	 * 3: never panic or SIGBUS, log all errors (for testing only)
	 */
	.tolerant = 1,
	.monarch_timeout = -1
};

static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
static int cpu_missing;

/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;

static struct work_struct mce_work;
static struct irq_work mce_irq_work;

static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = m->extcpu = smp_processor_id();
	/* need the internal __ version to avoid deadlocks */
	m->time = __ktime_get_real_seconds();
	m->cpuvendor = boot_cpu_data.x86_vendor;
	m->cpuid = cpuid_eax(1);
	m->socketid = cpu_data(m->extcpu).phys_proc_id;
	m->apicid = cpu_data(m->extcpu).initial_apicid;
	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);

	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
		rdmsrl(MSR_PPIN, m->ppin);
	else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
		rdmsrl(MSR_AMD_PPIN, m->ppin);

	m->microcode = boot_cpu_data.microcode;
}

DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

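/*
 * Stash the record in the lock-less event genpool (usable from #MC/NMI
 * context) and kick irq_work so that the notifier chain runs later from
 * a context where normal locking rules apply.
 */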
void mce_log(struct mce *m)
{
	if (!mce_gen_pool_add(m))
		irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);

void mce_register_decode_chain(struct notifier_block *nb)
{
	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
		return;

	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
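
/*
 * A typical registration from a decoder module might look like the
 * following (illustrative sketch only; "my_decode" and "my_nb" are
 * made-up names):
 *
 *	static int my_decode(struct notifier_block *nb, unsigned long val,
 *			     void *data)
 *	{
 *		struct mce *m = data;
 *
 *		if (!m)
 *			return NOTIFY_DONE;
 *		// ... decode and print *m ...
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call	= my_decode,
 *		.priority	= MCE_PRIO_EDAC,
 *	};
 *
 *	mce_register_decode_chain(&my_nb);
 */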

void mce_unregister_decode_chain(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);

static inline u32 ctl_reg(int bank)
{
	return MSR_IA32_MCx_CTL(bank);
}

static inline u32 status_reg(int bank)
{
	return MSR_IA32_MCx_STATUS(bank);
}

static inline u32 addr_reg(int bank)
{
	return MSR_IA32_MCx_ADDR(bank);
}

static inline u32 misc_reg(int bank)
{
	return MSR_IA32_MCx_MISC(bank);
}

static inline u32 smca_ctl_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_CTL(bank);
}

static inline u32 smca_status_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_STATUS(bank);
}

static inline u32 smca_addr_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_ADDR(bank);
}

static inline u32 smca_misc_reg(int bank)
{
	return MSR_AMD64_SMCA_MCx_MISC(bank);
}

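/*
 * Indirection between the legacy MCA MSR layout and AMD's scalable MCA
 * (SMCA) layout. For example, msr_ops.status(0) resolves to
 * MSR_IA32_MC0_STATUS (0x401) by default, but to
 * MSR_AMD64_SMCA_MC0_STATUS (0xc0002001) once __mcheck_cpu_init_early()
 * detects X86_FEATURE_SMCA and rewires these hooks.
 */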
struct mca_msr_regs msr_ops = {
	.ctl	= ctl_reg,
	.status	= status_reg,
	.addr	= addr_reg,
	.misc	= misc_reg
};

static void __print_mce(struct mce *m)
{
	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
		 m->extcpu,
		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
		 m->mcgstatus, m->bank, m->status);

	if (m->ip) {
		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
			m->cs, m->ip);

		if (m->cs == __KERNEL_CS)
			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
		pr_cont("\n");
	}

	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
	if (m->addr)
		pr_cont("ADDR %llx ", m->addr);
	if (m->misc)
		pr_cont("MISC %llx ", m->misc);

	if (mce_flags.smca) {
		if (m->synd)
			pr_cont("SYND %llx ", m->synd);
		if (m->ipid)
			pr_cont("IPID %llx ", m->ipid);
	}

	pr_cont("\n");

	/*
	 * Note this output is parsed by external tools and old fields
	 * should not be changed.
	 */
	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
		m->microcode);
}

static void print_mce(struct mce *m)
{
	__print_mce(m);

	if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
		pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}

#define PANIC_TIMEOUT 5 /* 5 seconds */

static atomic_t mce_panicked;

static int fake_panic;
static atomic_t mce_fake_panicked;

/* Panic in progress. Enable interrupts and wait for final IPI */
static void wait_for_panic(void)
{
	long timeout = PANIC_TIMEOUT * USEC_PER_SEC;

	preempt_disable();
	local_irq_enable();
	while (timeout-- > 0)
		udelay(1);
	if (panic_timeout == 0)
		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
}

static void mce_panic(const char *msg, struct mce *final, char *exp)
{
	int apei_err = 0;
	struct llist_node *pending;
	struct mce_evt_llist *l;

	if (!fake_panic) {
		/*
		 * Make sure only one CPU runs in machine check panic
		 */
		if (atomic_inc_return(&mce_panicked) > 1)
			wait_for_panic();
		barrier();

		bust_spinlocks(1);
		console_verbose();
	} else {
		/* Don't log too much for fake panic */
		if (atomic_inc_return(&mce_fake_panicked) > 1)
			return;
	}
	pending = mce_gen_pool_prepare_records();
	/* First print corrected ones that are still unlogged */
	llist_for_each_entry(l, pending, llnode) {
		struct mce *m = &l->mce;
		if (!(m->status & MCI_STATUS_UC)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	/* Now print uncorrected but with the final one last */
	llist_for_each_entry(l, pending, llnode) {
		struct mce *m = &l->mce;
		if (!(m->status & MCI_STATUS_UC))
			continue;
		if (!final || mce_cmp(m, final)) {
			print_mce(m);
			if (!apei_err)
				apei_err = apei_write_mce(m);
		}
	}
	if (final) {
		print_mce(final);
		if (!apei_err)
			apei_err = apei_write_mce(final);
	}
	if (cpu_missing)
		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
	if (exp)
		pr_emerg(HW_ERR "Machine check: %s\n", exp);
	if (!fake_panic) {
		if (panic_timeout == 0)
			panic_timeout = mca_cfg.panic_timeout;
		panic(msg);
	} else
		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}

/* Support code for software error injection */

static int msr_to_offset(u32 msr)
{
	unsigned bank = __this_cpu_read(injectm.bank);

	if (msr == mca_cfg.rip_msr)
		return offsetof(struct mce, ip);
	if (msr == msr_ops.status(bank))
		return offsetof(struct mce, status);
	if (msr == msr_ops.addr(bank))
		return offsetof(struct mce, addr);
	if (msr == msr_ops.misc(bank))
		return offsetof(struct mce, misc);
	if (msr == MSR_IA32_MCG_STATUS)
		return offsetof(struct mce, mcgstatus);
	return -1;
}

/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
	u64 v;

	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset < 0)
			return 0;
		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
	}

	if (rdmsrl_safe(msr, &v)) {
		WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
		/*
		 * Return zero in case the access faulted. This should
		 * not happen normally but can happen if the CPU does
		 * something weird, or if the code is buggy.
		 */
		v = 0;
	}

	return v;
}

static void mce_wrmsrl(u32 msr, u64 v)
{
	if (__this_cpu_read(injectm.finished)) {
		int offset = msr_to_offset(msr);

		if (offset >= 0)
			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
		return;
	}
	wrmsrl(msr, v);
}

/*
 * Collect all global (w.r.t. this processor) status about this machine
 * check into our "mce" struct so that we can use it later to assess
 * the severity of the problem as we read per-bank specific details.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
	mce_setup(m);

	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
	if (regs) {
		/*
		 * Get the address of the instruction at the time of
		 * the machine check error.
		 */
		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
			m->ip = regs->ip;
			m->cs = regs->cs;

			/*
			 * When in VM86 mode make the cs look like ring 3
			 * always. This is a lie, but it's better than passing
			 * the additional vm86 bit around everywhere.
			 */
			if (v8086_mode(regs))
				m->cs |= 3;
		}
		/* Use accurate RIP reporting if available. */
		if (mca_cfg.rip_msr)
			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
	}
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static void mce_schedule_work(void)
{
	if (!mce_gen_pool_empty())
		schedule_work(&mce_work);
}

static void mce_irq_work_cb(struct irq_work *entry)
{
	mce_schedule_work();
}

/*
 * Check if the address reported by the CPU is in a format we can parse.
 * It would be possible to add code for most other cases, but all would
 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
 */
int mce_usable_address(struct mce *m)
{
	if (!(m->status & MCI_STATUS_ADDRV))
		return 0;

	/* Checks after this one are Intel/Zhaoxin-specific: */
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
		return 1;

	if (!(m->status & MCI_STATUS_MISCV))
		return 0;

	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
		return 0;

	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(mce_usable_address);

bool mce_is_memory_error(struct mce *m)
{
	switch (m->cpuvendor) {
	case X86_VENDOR_AMD:
	case X86_VENDOR_HYGON:
		return amd_mce_is_memory_error(m);

	case X86_VENDOR_INTEL:
	case X86_VENDOR_ZHAOXIN:
		/*
		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
		 *
		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
		 * indicating a memory error. Bit 8 is used for indicating a
		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error.
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just give more detail on what
		 * bus/interconnect error happened. Note that bit 12 can be
		 * ignored, as it's the "filter" bit.
		 */
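		/*
		 * Worked examples, per the SDM encodings cited above
		 * (MCACOD is the low 16 bits of MCi_STATUS):
		 *
		 *   memory ctrl:   0000 0000 1MMM CCCC -> (status & 0xef80) == BIT(7)
		 *   cache:         0000 0001 RRRR TTLL -> (status & 0xef00) == BIT(8)
		 *   generic cache: 0000 0000 0000 11LL -> (status & 0xeffc) == 0xc
		 *
		 * Each mask deliberately leaves out bit 12 (the "filter" bit)
		 * and the don't-care detail bits of the respective encoding.
		 */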
		return (m->status & 0xef80) == BIT(7) ||
		       (m->status & 0xef00) == BIT(8) ||
		       (m->status & 0xeffc) == 0xc;

	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);

bool mce_is_correctable(struct mce *m)
{
	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
		return false;

	if (m->status & MCI_STATUS_UC)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(mce_is_correctable);

static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	/* Emit the trace record: */
	trace_mce_record(m);

	set_bit(0, &mce_need_notify);

	mce_notify_irq();

	return NOTIFY_DONE;
}

static struct notifier_block early_nb = {
	.notifier_call	= mce_early_notifier,
	.priority	= MCE_PRIO_EARLY,
};

static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned long pfn;

	if (!mce || !mce_usable_address(mce))
		return NOTIFY_DONE;

	if (mce->severity != MCE_AO_SEVERITY &&
	    mce->severity != MCE_DEFERRED_SEVERITY)
		return NOTIFY_DONE;

	pfn = mce->addr >> PAGE_SHIFT;
	if (!memory_failure(pfn, 0)) {
		set_mce_nospec(pfn);
		mce->kflags |= MCE_HANDLED_UC;
	}

	return NOTIFY_OK;
}

static struct notifier_block mce_uc_nb = {
	.notifier_call	= uc_decode_notifier,
	.priority	= MCE_PRIO_UC,
};

static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
				void *data)
{
	struct mce *m = (struct mce *)data;

	if (!m)
		return NOTIFY_DONE;

	if (!m->kflags)
		__print_mce(m);

	return NOTIFY_DONE;
}

static struct notifier_block mce_default_nb = {
	.notifier_call	= mce_default_notifier,
	/* lowest prio, we want it to run last. */
	.priority	= MCE_PRIO_LOWEST,
};

/*
 * Read ADDR and MISC registers.
 */
static void mce_read_aux(struct mce *m, int i)
{
	if (m->status & MCI_STATUS_MISCV)
		m->misc = mce_rdmsrl(msr_ops.misc(i));

	if (m->status & MCI_STATUS_ADDRV) {
		m->addr = mce_rdmsrl(msr_ops.addr(i));

		/*
		 * Mask the reported address by the reported granularity.
		 */
		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
			m->addr >>= shift;
			m->addr <<= shift;
		}

		/*
		 * Extract [55:<lsb>] where lsb is the least significant
		 * *valid* bit of the address bits.
		 */
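		/*
		 * The lsb field lives in MCA_ADDR[61:56] on SMCA systems.
		 * For example, lsb == 12 means only 4K granularity is
		 * guaranteed: bits [11:0] are dropped and [55:12] are kept.
		 */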
		if (mce_flags.smca) {
			u8 lsb = (m->addr >> 56) & 0x3f;

			m->addr &= GENMASK_ULL(55, lsb);
		}
	}

	if (mce_flags.smca) {
		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

		if (m->status & MCI_STATUS_SYNDV)
			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
	}
}

DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	bool error_seen = false;
	struct mce m;
	int i;

	this_cpu_inc(mce_poll_count);

	mce_gather_info(&m, NULL);

	if (flags & MCP_TIMESTAMP)
		m.tsc = rdtsc();

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (!mce_banks[i].ctl || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		barrier();
		m.status = mce_rdmsrl(msr_ops.status(i));

		/* If this entry is not valid, ignore it */
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * If we are logging everything (at CPU online) or this
		 * is a corrected error, then we must log it.
		 */
		if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
			goto log_it;

		/*
		 * Newer Intel systems that support software error
		 * recovery need to make additional checks. Other
		 * CPUs should skip over uncorrected errors, but log
		 * everything else.
		 */
		if (!mca_cfg.ser) {
			if (m.status & MCI_STATUS_UC)
				continue;
			goto log_it;
		}

		/* Log "not enabled" (speculative) errors */
		if (!(m.status & MCI_STATUS_EN))
			goto log_it;

		/*
		 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
		 * UC == 1 && PCC == 0 && S == 0
		 */
		if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
			goto log_it;

		/*
		 * Skip anything else. Presumption is that our read of this
		 * bank is racing with a machine check. Leave the log alone
		 * for do_machine_check() to deal with it.
		 */
		continue;

log_it:
		error_seen = true;

		if (flags & MCP_DONTLOG)
			goto clear_it;

		mce_read_aux(&m, i);
		m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
			goto clear_it;

		mce_log(&m);

clear_it:
		/*
		 * Clear state for this bank.
		 */
		mce_wrmsrl(msr_ops.status(i), 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */

	sync_core();

	return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);

/*
 * Do a quick check if any of the events requires a panic.
 * This decides if we keep the events around or clear them.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
			  struct pt_regs *regs)
{
	char *tmp = *msg;
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		m->status = mce_rdmsrl(msr_ops.status(i));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		__set_bit(i, validp);
		if (quirk_no_way_out)
			quirk_no_way_out(i, m, regs);

		m->bank = i;
		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
			mce_read_aux(m, i);
			*msg = tmp;
			return 1;
		}
	}
	return 0;
}

/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing equals its number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 */
static atomic_t mce_callin;

/*
 * Check if a timeout waiting for other CPUs happened.
 */
static int mce_timed_out(u64 *t, const char *msg)
{
	/*
	 * The others already did panic for some reason.
	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that system_state
	 * might have been modified by someone else.
	 */
	rmb();
	if (atomic_read(&mce_panicked))
		wait_for_panic();
	if (!mca_cfg.monarch_timeout)
		goto out;
	if ((s64)*t < SPINUNIT) {
		if (mca_cfg.tolerant <= 1)
			mce_panic(msg, NULL, NULL);
		cpu_missing = 1;
		return 1;
	}
	*t -= SPINUNIT;
out:
	touch_nmi_watchdog();
	return 0;
}

/*
 * The Monarch's reign. The Monarch is the CPU who entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. If any
 * error is fatal, panic. Only then let the others continue.
 *
 * The other CPUs entering the MCE handler will be controlled by the
 * Monarch. They are called Subjects.
 *
 * This way we prevent any potential data corruption in an unrecoverable
 * case and also make sure that all CPUs' errors are examined.
 *
 * Also this detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
 * state and won't corrupt anything by itself. It's ok to let the others
 * continue for a bit first.
 *
 * All the spin loops have timeouts; when a timeout happens a CPU
 * typically elects itself to be Monarch.
 */
static void mce_reign(void)
{
	int cpu;
	struct mce *m = NULL;
	int global_worst = 0;
	char *msg = NULL;
	char *nmsg = NULL;

	/*
	 * This CPU is the Monarch and the other CPUs have run
	 * through their handlers.
	 * Grade the severity of the errors of all the CPUs.
	 */
	for_each_possible_cpu(cpu) {
		int severity = mce_severity(&per_cpu(mces_seen, cpu),
					    mca_cfg.tolerant,
					    &nmsg, true);
		if (severity > global_worst) {
			msg = nmsg;
			global_worst = severity;
			m = &per_cpu(mces_seen, cpu);
		}
	}

	/*
	 * Cannot recover? Panic here then.
	 * This dumps all the mces in the log buffer and stops the
	 * other CPUs.
	 */
	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal machine check", m, msg);

	/*
	 * For UC somewhere we let the CPU who detects it handle it.
	 * We also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
	 */

	/*
	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
	 */
	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Fatal machine check from unknown source", NULL, NULL);

	/*
	 * Now clear all the mces_seen so that they don't reappear on
	 * the next mce.
	 */
	for_each_possible_cpu(cpu)
		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}

static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 */
static int mce_start(int *no_way_out)
{
	int order;
	int cpus = num_online_cpus();
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		return -1;

	atomic_add(*no_way_out, &global_nwo);
	/*
	 * Rely on the implied barrier below, such that global_nwo
	 * is updated before mce_callin.
	 */
	order = atomic_inc_return(&mce_callin);

	/*
	 * Wait for everyone.
	 */
	while (atomic_read(&mce_callin) != cpus) {
		if (mce_timed_out(&timeout,
				  "Timeout: Not all CPUs entered broadcast exception handler")) {
			atomic_set(&global_nwo, 0);
			return -1;
		}
		ndelay(SPINUNIT);
	}

	/*
	 * mce_callin should be read before global_nwo
	 */
	smp_rmb();

	if (order == 1) {
		/*
		 * Monarch: Starts executing now, the others wait.
		 */
		atomic_set(&mce_executing, 1);
	} else {
		/*
		 * Subject: Now start the scanning loop one by one in
		 * the original callin order.
		 * This way when there are any shared banks it will be
		 * only seen by one CPU before cleared, avoiding duplicates.
		 */
		while (atomic_read(&mce_executing) < order) {
			if (mce_timed_out(&timeout,
					  "Timeout: Subject CPUs unable to finish machine check processing")) {
				atomic_set(&global_nwo, 0);
				return -1;
			}
			ndelay(SPINUNIT);
		}
	}

	/*
	 * Cache the global no_way_out state.
	 */
	*no_way_out = atomic_read(&global_nwo);

	return order;
}

/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 */
static int mce_end(int order)
{
	int ret = -1;
	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

	if (!timeout)
		goto reset;
	if (order < 0)
		goto reset;

	/*
	 * Allow others to run.
	 */
	atomic_inc(&mce_executing);

	if (order == 1) {
		/* CHECKME: Can this race with a parallel hotplug? */
		int cpus = num_online_cpus();

		/*
		 * Monarch: Wait for everyone to go through their scanning
		 * loops.
		 */
		while (atomic_read(&mce_executing) <= cpus) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU unable to finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		mce_reign();
		barrier();
		ret = 0;
	} else {
		/*
		 * Subject: Wait for Monarch to finish.
		 */
		while (atomic_read(&mce_executing) != 0) {
			if (mce_timed_out(&timeout,
					  "Timeout: Monarch CPU did not finish machine check processing"))
				goto reset;
			ndelay(SPINUNIT);
		}

		/*
		 * Don't reset anything. That's done by the Monarch.
		 */
		return 0;
	}

	/*
	 * Reset all global state.
	 */
reset:
	atomic_set(&global_nwo, 0);
	atomic_set(&mce_callin, 0);
	barrier();

	/*
	 * Let others run again.
	 */
	atomic_set(&mce_executing, 0);
	return ret;
}

static void mce_clear_state(unsigned long *toclear)
{
	int i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		if (test_bit(i, toclear))
			mce_wrmsrl(msr_ops.status(i), 0);
	}
}

static int do_memory_failure(struct mce *m)
{
	int flags = MF_ACTION_REQUIRED;
	int ret;

	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
	if (!(m->mcgstatus & MCG_STATUS_RIPV))
		flags |= MF_MUST_KILL;
	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
	if (ret)
		pr_err("Memory error not recovered");
	else
		set_mce_nospec(m->addr >> PAGE_SHIFT);
	return ret;
}

/*
 * Cases where we avoid rendezvous handler timeout:
 * 1) If this CPU is offline.
 *
 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
 *  skip those CPUs which remain looping in the 1st kernel - see
 *  crash_nmi_callback().
 *
 * Note: there still is a small window between kexec-ing and the new,
 * kdump kernel establishing a new #MC handler where a broadcasted MCE
 * might not get handled properly.
 */
static bool __mc_check_crashing_cpu(int cpu)
{
	if (cpu_is_offline(cpu) ||
	    (crashing_cpu != -1 && crashing_cpu != cpu)) {
		u64 mcgstatus;

		mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);

		if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
			if (mcgstatus & MCG_STATUS_LMCES)
				return false;
		}

		if (mcgstatus & MCG_STATUS_RIPV) {
			mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
			return true;
		}
	}
	return false;
}

static void __mc_scan_banks(struct mce *m, struct mce *final,
			    unsigned long *toclear, unsigned long *valid_banks,
			    int no_way_out, int *worst)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
	struct mca_config *cfg = &mca_cfg;
	int severity, i;

	for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
		__clear_bit(i, toclear);
		if (!test_bit(i, valid_banks))
			continue;

		if (!mce_banks[i].ctl)
			continue;

		m->misc = 0;
		m->addr = 0;
		m->bank = i;

		m->status = mce_rdmsrl(msr_ops.status(i));
		if (!(m->status & MCI_STATUS_VAL))
			continue;

		/*
		 * Corrected or non-signaled errors are handled by
		 * machine_check_poll(). Leave them alone, unless this panics.
		 */
		if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
			!no_way_out)
			continue;

		/* Set taint even when machine check was not enabled. */
		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

		severity = mce_severity(m, cfg->tolerant, NULL, true);

		/*
		 * When machine check was for corrected/deferred handler don't
		 * touch, unless we're panicking.
		 */
		if ((severity == MCE_KEEP_SEVERITY ||
		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
			continue;

		__set_bit(i, toclear);

		/* Machine check event was not enabled. Clear, but ignore. */
		if (severity == MCE_NO_SEVERITY)
			continue;

		mce_read_aux(m, i);

		/* assuming valid severity level != 0 */
		m->severity = severity;

		mce_log(m);

		if (severity > *worst) {
			*final = *m;
			*worst = severity;
		}
	}

	/* mce_clear_state will clear *final, save locally for use later */
	*m = *final;
}

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 *
 * On Intel systems this is entered on all CPUs in parallel through
 * MCE broadcast. However some CPUs might be broken beyond repair,
 * so be always careful when synchronizing with others.
 *
 * Tracing and kprobes are disabled: if we interrupted a kernel context
 * with IF=1, we need to minimize stack usage. There are also recursion
 * issues: if the machine check was due to a failure of the memory
 * backing the user stack, tracing that reads the user stack will cause
 * potentially infinite recursion.
 */
void notrace do_machine_check(struct pt_regs *regs, long error_code)
{
	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
	struct mca_config *cfg = &mca_cfg;
	int cpu = smp_processor_id();
	struct mce m, *final;
	char *msg = NULL;
	int worst = 0;

	/*
	 * Establish sequential order between the CPUs entering the machine
	 * check handler.
	 */
	int order = -1;

	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;

	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;

	/*
	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
	 * on Intel.
	 */
	int lmce = 1;

	if (__mc_check_crashing_cpu(cpu))
		return;

	ist_enter(regs);

	this_cpu_inc(mce_exception_count);

	mce_gather_info(&m, regs);
	m.tsc = rdtsc();

	final = this_cpu_ptr(&mces_seen);
	*final = m;

	memset(valid_banks, 0, sizeof(valid_banks));
	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);

	barrier();

	/*
	 * When there is no restart IP we might need to kill or panic.
	 * Assume the worst for now, but if we find the
	 * severity is MCE_AR_SEVERITY we have other options.
	 */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	/*
	 * Check whether this MCE is signaled to only this logical processor
	 * (Intel and Zhaoxin only).
	 */
	if (m.cpuvendor == X86_VENDOR_INTEL ||
	    m.cpuvendor == X86_VENDOR_ZHAOXIN)
		lmce = m.mcgstatus & MCG_STATUS_LMCES;

	/*
	 * Local machine check may already know that we have to panic.
	 * Broadcast machine check begins rendezvous in mce_start().
	 * Go through all banks in exclusion of the other CPUs. This way we
	 * don't report duplicated events on shared banks because the first one
	 * to see it will clear it.
	 */
	if (lmce) {
		if (no_way_out)
			mce_panic("Fatal local machine check", &m, msg);
	} else {
		order = mce_start(&no_way_out);
	}

	__mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);

	if (!no_way_out)
		mce_clear_state(toclear);

	/*
	 * Do most of the synchronization with other CPUs.
	 * When there's any problem use only local no_way_out state.
	 */
	if (!lmce) {
		if (mce_end(order) < 0)
			no_way_out = worst >= MCE_PANIC_SEVERITY;
	} else {
		/*
		 * If there was a fatal machine check we should have
		 * already called mce_panic earlier in this function.
		 * Since we re-read the banks, we might have found
		 * something new. Check again to see if we found a
		 * fatal error. We call "mce_severity()" again to
		 * make sure we have the right "msg".
		 */
		if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
			mce_severity(&m, cfg->tolerant, &msg, true);
			mce_panic("Local fatal machine check!", &m, msg);
		}
	}

	/*
	 * If tolerant is at an insane level we drop requests to kill
	 * processes and continue even when there is no way out.
	 */
	if (cfg->tolerant == 3)
		kill_it = 0;
	else if (no_way_out)
		mce_panic("Fatal machine check on current CPU", &m, msg);

	if (worst > 0)
		irq_work_queue(&mce_irq_work);

	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);

	sync_core();

	if (worst != MCE_AR_SEVERITY && !kill_it)
		goto out_ist;

	/* Fault was in user mode and we need to take some action */
	if ((m.cs & 3) == 3) {
		ist_begin_non_atomic(regs);
		local_irq_enable();

		if (kill_it || do_memory_failure(&m))
			force_sig(SIGBUS);
		local_irq_disable();
		ist_end_non_atomic();
	} else {
		if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
			mce_panic("Failed kernel mode recovery", &m, msg);
	}

out_ist:
	ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
NOKPROBE_SYMBOL(do_machine_check);

#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
{
	/* mce_severity() should not hand us an ACTION_REQUIRED error */
	BUG_ON(flags & MF_ACTION_REQUIRED);
	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
	       pfn);

	return 0;
}
#endif

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */
3f2f0680 1362static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
e9eee03e 1363
82f7af09 1364static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
52d168e2 1365static DEFINE_PER_CPU(struct timer_list, mce_timer);
1da177e4 1366
55babd8f
CG
1367static unsigned long mce_adjust_timer_default(unsigned long interval)
1368{
1369 return interval;
1370}
1371
3f2f0680 1372static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
55babd8f 1373
0becc0ae 1374static void __start_timer(struct timer_list *t, unsigned long interval)
27f6c573 1375{
3f2f0680
BP
1376 unsigned long when = jiffies + interval;
1377 unsigned long flags;
27f6c573 1378
3f2f0680 1379 local_irq_save(flags);
27f6c573 1380
0becc0ae
TG
1381 if (!timer_pending(t) || time_before(when, t->expires))
1382 mod_timer(t, round_jiffies(when));
3f2f0680
BP
1383
1384 local_irq_restore(flags);
27f6c573
CG
1385}
1386
92bb6cb1 1387static void mce_timer_fn(struct timer_list *t)
1da177e4 1388{
92bb6cb1 1389 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
82f7af09 1390 unsigned long iv;
52d168e2 1391
92bb6cb1 1392 WARN_ON(cpu_t != t);
3f2f0680
BP
1393
1394 iv = __this_cpu_read(mce_next_interval);
52d168e2 1395
89cbc767 1396 if (mce_available(this_cpu_ptr(&cpu_info))) {
54467353 1397 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
3f2f0680
BP
1398
1399 if (mce_intel_cmci_poll()) {
1400 iv = mce_adjust_timer(iv);
1401 goto done;
1402 }
e9eee03e 1403 }
1da177e4
LT
1404
1405 /*
3f2f0680
BP
1406 * Alert userspace if needed. If we logged an MCE, reduce the polling
1407 * interval, otherwise increase the polling interval.
1da177e4 1408 */
3f2f0680 1409 if (mce_notify_irq())
958fb3c5 1410 iv = max(iv / 2, (unsigned long) HZ/100);
3f2f0680 1411 else
82f7af09 1412 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
3f2f0680
BP
1413
1414done:
82f7af09 1415 __this_cpu_write(mce_next_interval, iv);
0becc0ae 1416 __start_timer(t, iv);
55babd8f 1417}
e02e68d3 1418
55babd8f
CG
1419/*
1420 * Ensure that the timer is firing in @interval from now.
1421 */
1422void mce_timer_kick(unsigned long interval)
1423{
89cbc767 1424 struct timer_list *t = this_cpu_ptr(&mce_timer);
55babd8f
CG
1425 unsigned long iv = __this_cpu_read(mce_next_interval);
1426
0becc0ae 1427 __start_timer(t, interval);
3f2f0680 1428
55babd8f
CG
1429 if (interval < iv)
1430 __this_cpu_write(mce_next_interval, interval);
e02e68d3
TH
1431}
1432
9aaef96f
HS
1433/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1434static void mce_timer_delete_all(void)
1435{
1436 int cpu;
1437
1438 for_each_online_cpu(cpu)
1439 del_timer_sync(&per_cpu(mce_timer, cpu));
1440}
1441
e02e68d3 1442/*
9bd98405
AK
1443 * Notify the user(s) about new machine check events.
1444 * Can be called from interrupt context, but not from machine check/NMI
1445 * context.
e02e68d3 1446 */
9ff36ee9 1447int mce_notify_irq(void)
e02e68d3 1448{
8457c84d
AK
1449 /* Not more than two messages every minute */
1450 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1451
1020bcbc 1452 if (test_and_clear_bit(0, &mce_need_notify)) {
5de97c9f 1453 mce_work_trigger();
e02e68d3 1454
8457c84d 1455 if (__ratelimit(&ratelimit))
a2d7b0d4 1456 pr_info(HW_ERR "Machine check events logged\n");
e02e68d3
TH
1457
1458 return 1;
1da177e4 1459 }
e02e68d3
TH
1460 return 0;
1461}
9ff36ee9 1462EXPORT_SYMBOL_GPL(mce_notify_irq);
8a336b0a 1463
b4914508 1464static void __mcheck_cpu_mce_banks_init(void)
cebe1820 1465{
b4914508 1466 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
c7d314f3 1467 u8 n_banks = this_cpu_read(mce_num_banks);
cebe1820
AK
1468 int i;
1469
c7d314f3 1470 for (i = 0; i < n_banks; i++) {
cebe1820 1471 struct mce_bank *b = &mce_banks[i];
11868a2d 1472
068b053d
YG
1473 /*
1474 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
1475 * the required vendor quirks before
1476 * __mcheck_cpu_init_clear_banks() does the final bank setup.
1477 */
cebe1820
AK
1478 b->ctl = -1ULL;
1479 b->init = 1;
1480 }
cebe1820
AK
1481}
1482
d88203d1 1483/*
1da177e4
LT
1484 * Initialize Machine Checks for a CPU.
1485 */
b4914508 1486static void __mcheck_cpu_cap_init(void)
1da177e4 1487{
e9eee03e 1488 u64 cap;
006c0770 1489 u8 b;
1da177e4
LT
1490
1491 rdmsrl(MSR_IA32_MCG_CAP, cap);
01c6680a
TG
1492
1493 b = cap & MCG_BANKCNT_MASK;
c7d314f3
YG
1494
1495 if (b > MAX_NR_BANKS) {
1496 pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1497 smp_processor_id(), MAX_NR_BANKS, b);
0d7482e3 1498 b = MAX_NR_BANKS;
c7d314f3 1499 }
0d7482e3 1500
c7d314f3 1501 this_cpu_write(mce_num_banks, b);
d203f0b8 1502
b4914508 1503 __mcheck_cpu_mce_banks_init();
0d7482e3 1504
94ad8474 1505 /* Use accurate RIP reporting if available. */
01c6680a 1506 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
84c2559d 1507 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1da177e4 1508
ed7290d0 1509 if (cap & MCG_SER_P)
09933946 1510 mca_cfg.ser = 1;
0d7482e3
AK
1511}
1512
5e09954a 1513static void __mcheck_cpu_init_generic(void)
0d7482e3 1514{
84c2559d 1515 enum mcp_flags m_fl = 0;
e9eee03e 1516 mce_banks_t all_banks;
0d7482e3 1517 u64 cap;
0d7482e3 1518
84c2559d
BP
1519 if (!mca_cfg.bootlog)
1520 m_fl = MCP_DONTLOG;
1521
b79109c3
AK
1522 /*
1523 * Log the machine checks left over from the previous reset.
1524 */
ee031c31 1525 bitmap_fill(all_banks, MAX_NR_BANKS);
84c2559d 1526 machine_check_poll(MCP_UC | m_fl, &all_banks);
1da177e4 1527
375074cc 1528 cr4_set_bits(X86_CR4_MCE);
1da177e4 1529
0d7482e3 1530 rdmsrl(MSR_IA32_MCG_CAP, cap);
1da177e4
LT
1531 if (cap & MCG_CTL_P)
1532 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
bb91f8c0
AG
1533}
1534
1535static void __mcheck_cpu_init_clear_banks(void)
1536{
b4914508 1537 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
bb91f8c0 1538 int i;
1da177e4 1539
c7d314f3 1540 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 1541 struct mce_bank *b = &mce_banks[i];
11868a2d 1542
cebe1820 1543 if (!b->init)
06b7a7a5 1544 continue;
d9d73fcc
YG
1545 wrmsrl(msr_ops.ctl(i), b->ctl);
1546 wrmsrl(msr_ops.status(i), 0);
d88203d1 1547 }
1da177e4
LT
1548}
1549
068b053d
YG
1550/*
1551 * Do a final check to see if there are any unused/RAZ banks.
1552 *
1553 * This must be done after the banks have been initialized and any quirks have
1554 * been applied.
1555 *
1556 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1557 * Otherwise, a user who disables a bank will not be able to re-enable it
1558 * without a system reboot.
1559 */
1560static void __mcheck_cpu_check_banks(void)
1561{
1562 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1563 u64 msrval;
1564 int i;
1565
1566 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1567 struct mce_bank *b = &mce_banks[i];
1568
1569 if (!b->init)
1570 continue;
1571
1572 rdmsrl(msr_ops.ctl(i), msrval);
1573 b->init = !!msrval;
1574 }
1575}
1576
61b0fccd
TL
1577/*
1578 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1579 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1580 * Vol 3B Table 15-20). But this confuses both the code that determines
1581 * whether the machine check occurred in kernel or user mode, and also
1582 * the severity assessment code. Pretend that EIPV was set, and take the
1583 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1584 */
1585static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1586{
1587 if (bank != 0)
1588 return;
1589 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1590 return;
1591 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1592 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1593 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1594 MCACOD)) !=
1595 (MCI_STATUS_UC|MCI_STATUS_EN|
1596 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1597 MCI_STATUS_AR|MCACOD_INSTR))
1598 return;
1599
1600 m->mcgstatus |= MCG_STATUS_EIPV;
1601 m->ip = regs->ip;
1602 m->cs = regs->cs;
1603}
1604
1da177e4 1605/* Add per CPU specific workarounds here */
148f9bb8 1606static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
d88203d1 1607{
b4914508 1608 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
d203f0b8
BP
1609 struct mca_config *cfg = &mca_cfg;
1610
e412cd25 1611 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
c767a54b 1612 pr_info("unknown CPU type - not enabling MCE support\n");
e412cd25
IM
1613 return -EOPNOTSUPP;
1614 }
1615
1da177e4 1616 /* This should be disabled by the BIOS, but isn't always */
911f6a7b 1617 if (c->x86_vendor == X86_VENDOR_AMD) {
c7d314f3 1618 if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
e9eee03e
IM
1619 /*
1620 * disable GART TBL walk error reporting, which
1621 * trips off incorrectly with the IOMMU & 3ware
1622 * & Cerberus:
1623 */
cebe1820 1624 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
e9eee03e 1625 }
6057077f 1626 if (c->x86 < 0x11 && cfg->bootlog < 0) {
e9eee03e
IM
1627 /*
1628 * Lots of broken BIOS around that don't clear them
1629 * by default and leave crap in there. Don't log:
1630 */
84c2559d 1631 cfg->bootlog = 0;
e9eee03e 1632 }
2e6f694f
AK
1633 /*
1634 * Various K7s with broken bank 0 around. Always disable
1635 * by default.
1636 */
c7d314f3 1637 if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
cebe1820 1638 mce_banks[0].ctl = 0;
575203b4 1639
bf80bbd7
AG
1640 /*
1641 * overflow_recov is supported for F15h Models 00h-0fh
1642 * even though we don't have a CPUID bit for it.
1643 */
1644 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1645 mce_flags.overflow_recov = 1;
1646
1da177e4 1647 }
e583538f 1648
06b7a7a5
AK
1649 if (c->x86_vendor == X86_VENDOR_INTEL) {
1650 /*
1651 * SDM documents that on family 6 bank 0 should not be written
1652 * because it aliases to another special BIOS controlled
1653 * register.
1654 * But it's not aliased anymore on model 0x1a+
1655 * Don't ignore bank 0 completely because there could be a
1656 * valid event later, merely don't write CTL0.
1657 */
1658
c7d314f3 1659 if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
cebe1820 1660 mce_banks[0].init = 0;
3c079792
AK
1661
1662 /*
1663 * All newer Intel systems support MCE broadcasting. Enable
1664 * synchronization with a one second timeout.
1665 */
1666 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
84c2559d
BP
1667 cfg->monarch_timeout < 0)
1668 cfg->monarch_timeout = USEC_PER_SEC;
c7f6fa44 1669
e412cd25
IM
1670 /*
1671 * There are also broken BIOSes on some Pentium M and
1672 * earlier systems:
1673 */
84c2559d
BP
1674 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1675 cfg->bootlog = 0;
61b0fccd
TL
1676
1677 if (c->x86 == 6 && c->x86_model == 45)
1678 quirk_no_way_out = quirk_sandybridge_ifu;
06b7a7a5 1679 }
6e898d2b
TW
1680
1681 if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
1682 /*
1683 * All newer Zhaoxin CPUs support MCE broadcasting. Enable
1684 * synchronization with a one second timeout.
1685 */
1686 if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1687 if (cfg->monarch_timeout < 0)
1688 cfg->monarch_timeout = USEC_PER_SEC;
1689 }
1690 }
1691
84c2559d
BP
1692 if (cfg->monarch_timeout < 0)
1693 cfg->monarch_timeout = 0;
1694 if (cfg->bootlog != 0)
7af19e4a 1695 cfg->panic_timeout = 30;
e412cd25
IM
1696
1697 return 0;
d88203d1 1698}
1da177e4 1699
148f9bb8 1700static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
4efc0670
AK
1701{
1702 if (c->x86 != 5)
3a97fc34
HS
1703 return 0;
1704
4efc0670
AK
1705 switch (c->x86_vendor) {
1706 case X86_VENDOR_INTEL:
c6978369 1707 intel_p5_mcheck_init(c);
3a97fc34 1708 return 1;
4efc0670
AK
1709 break;
1710 case X86_VENDOR_CENTAUR:
1711 winchip_mcheck_init(c);
3a97fc34 1712 return 1;
4efc0670 1713 break;
dc34bdd2
BP
1714 default:
1715 return 0;
4efc0670 1716 }
3a97fc34
HS
1717
1718 return 0;
4efc0670
AK
1719}
1720
5204bf17
YG
1721/*
1722 * Init basic CPU features needed for early decoding of MCEs.
1723 */
1724static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1da177e4 1725{
ac78bd72 1726 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
14cddfd5
YG
1727 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1728 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1729 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
c9bf318f 1730 mce_flags.amd_threshold = 1;
d9d73fcc 1731
d9d73fcc
YG
1732 if (mce_flags.smca) {
1733 msr_ops.ctl = smca_ctl_reg;
1734 msr_ops.status = smca_status_reg;
1735 msr_ops.addr = smca_addr_reg;
1736 msr_ops.misc = smca_misc_reg;
1737 }
5204bf17
YG
1738 }
1739}
c7f54d21 1740
13e85822
DW
1741static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1742{
1743 struct mca_config *cfg = &mca_cfg;
1744
1745 /*
1746 * All newer Centaur CPUs support MCE broadcasting. Enable
1747 * synchronization with a one second timeout.
1748 */
1749 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1750 c->x86 > 6) {
1751 if (cfg->monarch_timeout < 0)
1752 cfg->monarch_timeout = USEC_PER_SEC;
1753 }
1754}
1755
5a3d56a0
TW
1756static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
1757{
1758 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1759
1760 /*
1761 * These CPUs have MCA bank 8 which reports only one error type called
1762 * SVAD (System View Address Decoder). The reporting of that error is
1763 * controlled by IA32_MC8.CTL.0.
1764 *
1765 * If enabled, prefetching on these CPUs will cause SVAD MCE when
1766 * virtual machines start and result in a system panic. Always disable
1767 * bank 8 SVAD error by default.
1768 */
1769 if ((c->x86 == 7 && c->x86_model == 0x1b) ||
1770 (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
1771 if (this_cpu_read(mce_num_banks) > 8)
1772 mce_banks[8].ctl = 0;
1773 }
1774
1775 intel_init_cmci();
70f0c230 1776 intel_init_lmce();
5a3d56a0
TW
1777 mce_adjust_timer = cmci_intel_adjust_timer;
1778}
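
/*
 * Illustrative sketch (user-space C, made-up names) of the cache-then-apply
 * design the Zhaoxin quirk relies on: zeroing mce_banks[8].ctl only updates
 * the per-CPU cache; the value reaches the hardware when the banks are
 * (re)initialised and each cached ctl is written to its bank's CTL MSR.
 * A CTL of 0 masks every subevent in the bank, so SVAD can never report.
 */
#include <stdint.h>
#include <stdio.h>

struct bank { uint64_t ctl; int init; };

int main(void)
{
	struct bank banks[10];
	int i;

	for (i = 0; i < 10; i++) {		/* start fully enabled */
		banks[i].ctl = ~0ULL;
		banks[i].init = 1;
	}
	banks[8].ctl = 0;			/* the quirk: mask bank 8 (SVAD) */

	for (i = 0; i < 10; i++)		/* stands in for wrmsrl(CTL(i), ctl) */
		if (banks[i].init)
			printf("bank %d: CTL <- 0x%llx\n",
			       i, (unsigned long long)banks[i].ctl);
	return 0;
}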
1779
70f0c230
TW
1780static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
1781{
1782 intel_clear_lmce();
1783}
1784
5204bf17
YG
1785static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1786{
1787 switch (c->x86_vendor) {
1788 case X86_VENDOR_INTEL:
1789 mce_intel_feature_init(c);
1790 mce_adjust_timer = cmci_intel_adjust_timer;
1791 break;
c7f54d21 1792
5204bf17
YG
1793 case X86_VENDOR_AMD: {
1794 mce_amd_feature_init(c);
89b831ef 1795 break;
7559e13f 1796 }
ac78bd72
PW
1797
1798 case X86_VENDOR_HYGON:
1799 mce_hygon_feature_init(c);
1800 break;
1801
13e85822
DW
1802 case X86_VENDOR_CENTAUR:
1803 mce_centaur_feature_init(c);
1804 break;
7559e13f 1805
5a3d56a0
TW
1806 case X86_VENDOR_ZHAOXIN:
1807 mce_zhaoxin_feature_init(c);
1808 break;
1809
1da177e4
LT
1810 default:
1811 break;
1812 }
1813}
1814
8838eb6c
AR
1815static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1816{
1817 switch (c->x86_vendor) {
1818 case X86_VENDOR_INTEL:
1819 mce_intel_feature_clear(c);
1820 break;
70f0c230
TW
1821
1822 case X86_VENDOR_ZHAOXIN:
1823 mce_zhaoxin_feature_clear(c);
1824 break;
1825
8838eb6c
AR
1826 default:
1827 break;
1828 }
1829}
1830
0becc0ae 1831static void mce_start_timer(struct timer_list *t)
52d168e2 1832{
4f75d841 1833 unsigned long iv = check_interval * HZ;
bc09effa 1834
7af19e4a 1835 if (mca_cfg.ignore_ce || !iv)
62fdac59
HS
1836 return;
1837
0becc0ae
TG
1838 this_cpu_write(mce_next_interval, iv);
1839 __start_timer(t, iv);
52d168e2
AK
1840}
1841
39f152ff
SAS
1842static void __mcheck_cpu_setup_timer(void)
1843{
1844 struct timer_list *t = this_cpu_ptr(&mce_timer);
39f152ff 1845
92bb6cb1 1846 timer_setup(t, mce_timer_fn, TIMER_PINNED);
39f152ff
SAS
1847}
1848
26c3c283
TG
1849static void __mcheck_cpu_init_timer(void)
1850{
89cbc767 1851 struct timer_list *t = this_cpu_ptr(&mce_timer);
26c3c283 1852
92bb6cb1 1853 timer_setup(t, mce_timer_fn, TIMER_PINNED);
0becc0ae 1854 mce_start_timer(t);
26c3c283
TG
1855}
1856
45d4b7b9
YG
1857bool filter_mce(struct mce *m)
1858{
71a84402
YG
1859 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1860 return amd_filter_mce(m);
2976908e
PB
1861 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1862 return intel_filter_mce(m);
71a84402 1863
45d4b7b9
YG
1864 return false;
1865}
1866
9eda8cb3
AK
1867/* Handle unconfigured int18 (should never happen) */
1868static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1869{
c767a54b 1870 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
9eda8cb3
AK
1871 smp_processor_id());
1872}
1873
1874/* Call the installed machine check handler for this CPU setup. */
1875void (*machine_check_vector)(struct pt_regs *, long error_code) =
1876 unexpected_machine_check;
1877
55ba18d6 1878dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code)
6f41c34d
TG
1879{
1880 machine_check_vector(regs, error_code);
1881}
55ba18d6 1882NOKPROBE_SYMBOL(do_mce);
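
/*
 * A small sketch of the machine_check_vector pattern above (user-space C,
 * simplified signatures): a global function pointer starts out at a
 * "should never happen" handler and is only repointed, as mcheck_cpu_init()
 * does with do_machine_check, once the machinery is ready. A stray #MC
 * before setup therefore still lands somewhere defined.
 */
#include <stdio.h>

static void unexpected_handler(long code)
{
	printf("unexpected int18, code %ld\n", code);
}

static void real_handler(long code)
{
	printf("handling MCE, code %ld\n", code);
}

static void (*mc_vector)(long) = unexpected_handler;

int main(void)
{
	mc_vector(0);			/* before init: safe default */
	mc_vector = real_handler;	/* what mcheck_cpu_init() effectively does */
	mc_vector(0);
	return 0;
}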
6f41c34d 1883
d88203d1 1884/*
1da177e4 1885 * Called for each booted CPU to set up machine checks.
e9eee03e 1886 * Must be called with preempt off:
1da177e4 1887 */
148f9bb8 1888void mcheck_cpu_init(struct cpuinfo_x86 *c)
1da177e4 1889{
1462594b 1890 if (mca_cfg.disabled)
4efc0670
AK
1891 return;
1892
3a97fc34
HS
1893 if (__mcheck_cpu_ancient_init(c))
1894 return;
4efc0670 1895
5b4408fd 1896 if (!mce_available(c))
1da177e4
LT
1897 return;
1898
b4914508
YG
1899 __mcheck_cpu_cap_init();
1900
1901 if (__mcheck_cpu_apply_quirks(c) < 0) {
09933946 1902 mca_cfg.disabled = 1;
0d7482e3
AK
1903 return;
1904 }
0d7482e3 1905
648ed940 1906 if (mce_gen_pool_init()) {
09933946 1907 mca_cfg.disabled = 1;
648ed940
CG
1908 pr_emerg("Couldn't allocate MCE records pool!\n");
1909 return;
1910 }
1911
5d727926
AK
1912 machine_check_vector = do_machine_check;
1913
5204bf17 1914 __mcheck_cpu_init_early(c);
5e09954a
BP
1915 __mcheck_cpu_init_generic();
1916 __mcheck_cpu_init_vendor(c);
bb91f8c0 1917 __mcheck_cpu_init_clear_banks();
068b053d 1918 __mcheck_cpu_check_banks();
39f152ff 1919 __mcheck_cpu_setup_timer();
1da177e4
LT
1920}
1921
8838eb6c
AR
1922/*
 1923 * Called for each booted CPU to clear some machine-check opt-ins
1924 */
1925void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1926{
1927 if (mca_cfg.disabled)
1928 return;
1929
1930 if (!mce_available(c))
1931 return;
1932
1933 /*
 1934 * A call to clear settings generic to x86 could go here:
 1935 * __mcheck_cpu_clear_generic(c);
1936 */
1937 __mcheck_cpu_clear_vendor(c);
1938
1da177e4
LT
1939}
1940
c3d1fb56
NR
1941static void __mce_disable_bank(void *arg)
1942{
1943 int bank = *((int *)arg);
89cbc767 1944 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
c3d1fb56
NR
1945 cmci_disable_bank(bank);
1946}
1947
1948void mce_disable_bank(int bank)
1949{
c7d314f3 1950 if (bank >= this_cpu_read(mce_num_banks)) {
c3d1fb56
NR
1951 pr_warn(FW_BUG
1952 "Ignoring request to disable invalid MCA bank %d.\n",
1953 bank);
1954 return;
1955 }
1956 set_bit(bank, mce_banks_ce_disabled);
1957 on_each_cpu(__mce_disable_bank, &bank, 1);
1958}
1959
13503fa9 1960/*
62fdac59
HS
1961 * mce=off Disables machine check
1962 * mce=no_cmci Disables CMCI
88d53867 1963 * mce=no_lmce Disables LMCE
62fdac59
HS
1964 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1965 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
3c079792
AK
1966 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1967 * monarchtimeout is how long to wait for other CPUs on machine
1968 * check, or 0 to not wait
6057077f
YG
1969 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
 1970 * and older.
13503fa9 1971 * mce=nobootlog Don't log MCEs from before booting.
450cc201 1972 * mce=bios_cmci_threshold Don't program the CMCI threshold
3637efb0 1973 * mce=recovery force enable memcpy_mcsafe()
13503fa9 1974 */
1da177e4
LT
1975static int __init mcheck_enable(char *str)
1976{
d203f0b8
BP
1977 struct mca_config *cfg = &mca_cfg;
1978
e3346fc4 1979 if (*str == 0) {
4efc0670 1980 enable_p5_mce();
e3346fc4
BZ
1981 return 1;
1982 }
4efc0670
AK
1983 if (*str == '=')
1984 str++;
1da177e4 1985 if (!strcmp(str, "off"))
09933946 1986 cfg->disabled = 1;
62fdac59 1987 else if (!strcmp(str, "no_cmci"))
7af19e4a 1988 cfg->cmci_disabled = true;
88d53867 1989 else if (!strcmp(str, "no_lmce"))
09933946 1990 cfg->lmce_disabled = 1;
62fdac59 1991 else if (!strcmp(str, "dont_log_ce"))
d203f0b8 1992 cfg->dont_log_ce = true;
62fdac59 1993 else if (!strcmp(str, "ignore_ce"))
7af19e4a 1994 cfg->ignore_ce = true;
13503fa9 1995 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
84c2559d 1996 cfg->bootlog = (str[0] == 'b');
450cc201 1997 else if (!strcmp(str, "bios_cmci_threshold"))
09933946 1998 cfg->bios_cmci_threshold = 1;
0f68c088 1999 else if (!strcmp(str, "recovery"))
09933946 2000 cfg->recovery = 1;
3c079792 2001 else if (isdigit(str[0])) {
5c31b280 2002 if (get_option(&str, &cfg->tolerant) == 2)
84c2559d 2003 get_option(&str, &(cfg->monarch_timeout));
3c079792 2004 } else {
c767a54b 2005 pr_info("mce argument %s ignored. Please use /sys\n", str);
13503fa9
HS
2006 return 0;
2007 }
9b41046c 2008 return 1;
1da177e4 2009}
4efc0670 2010__setup("mce", mcheck_enable);
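
/*
 * Illustrative sketch (user-space C) of how the numeric form
 * "mce=TOLERANCELEVEL[,monarchtimeout]" above is consumed: the first number
 * is the tolerance level, and an optional comma-separated second number is
 * the monarch timeout. strtol() stands in for the kernel's get_option();
 * this mirrors the behaviour, it is not the kernel parser.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *arg = "2,100000";	/* e.g. booted with mce=2,100000 */
	char *end;
	long tolerant, timeout = -1;

	tolerant = strtol(arg, &end, 10);
	if (*end == ',')		/* get_option() returning 2 */
		timeout = strtol(end + 1, &end, 10);
	printf("tolerant=%ld monarch_timeout=%ld\n", tolerant, timeout);
	return 0;
}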
1da177e4 2011
a2202aa2 2012int __init mcheck_init(void)
b33a6363 2013{
a2202aa2 2014 mcheck_intel_therm_init();
c9c6d216 2015 mce_register_decode_chain(&early_nb);
8438b84a 2016 mce_register_decode_chain(&mce_uc_nb);
cd9c57ca 2017 mce_register_decode_chain(&mce_default_nb);
43eaa2a1 2018 mcheck_vendor_init_severity();
a2202aa2 2019
cff4c039 2020 INIT_WORK(&mce_work, mce_gen_pool_process);
061120ae
CG
2021 init_irq_work(&mce_irq_work, mce_irq_work_cb);
2022
b33a6363
BP
2023 return 0;
2024}
b33a6363 2025
d88203d1 2026/*
c7cece89 2027 * mce_syscore: PM support
d88203d1 2028 */
1da177e4 2029
973a2dd1
AK
2030/*
2031 * Disable machine checks on suspend and shutdown. We can't really handle
2032 * them later.
2033 */
6e06780a 2034static void mce_disable_error_reporting(void)
973a2dd1 2035{
b4914508 2036 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
973a2dd1
AK
2037 int i;
2038
c7d314f3 2039 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 2040 struct mce_bank *b = &mce_banks[i];
11868a2d 2041
cebe1820 2042 if (b->init)
d9d73fcc 2043 wrmsrl(msr_ops.ctl(i), 0);
06b7a7a5 2044 }
6e06780a
AR
2046}
2047
2048static void vendor_disable_error_reporting(void)
2049{
2050 /*
6e898d2b
TW
2051 * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
2052 * MSRs are socket-wide. Disabling them for just a single offlined CPU
2053 * is bad, since it will inhibit reporting for all shared resources on
2054 * the socket like the last level cache (LLC), the integrated memory
2055 * controller (iMC), etc.
6e06780a 2056 */
ec338382 2057 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
ac78bd72 2058 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
6e898d2b
TW
2059 boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
2060 boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
6e06780a
AR
2061 return;
2062
2063 mce_disable_error_reporting();
973a2dd1
AK
2064}
2065
c7cece89 2066static int mce_syscore_suspend(void)
973a2dd1 2067{
6e06780a
AR
2068 vendor_disable_error_reporting();
2069 return 0;
973a2dd1
AK
2070}
2071
c7cece89 2072static void mce_syscore_shutdown(void)
973a2dd1 2073{
6e06780a 2074 vendor_disable_error_reporting();
973a2dd1
AK
2075}
2076
e9eee03e
IM
2077/*
2078 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2079 * Only one CPU is active at this time, the others get re-added later using
2080 * CPU hotplug:
2081 */
c7cece89 2082static void mce_syscore_resume(void)
1da177e4 2083{
5e09954a 2084 __mcheck_cpu_init_generic();
89cbc767 2085 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
bb91f8c0 2086 __mcheck_cpu_init_clear_banks();
1da177e4
LT
2087}
2088
f3c6ea1b 2089static struct syscore_ops mce_syscore_ops = {
c7cece89
HS
2090 .suspend = mce_syscore_suspend,
2091 .shutdown = mce_syscore_shutdown,
2092 .resume = mce_syscore_resume,
f3c6ea1b
RW
2093};
2094
c7cece89 2095/*
8a25a2fd 2096 * mce_device: Sysfs support
c7cece89
HS
2097 */
2098
52d168e2
AK
2099static void mce_cpu_restart(void *data)
2100{
89cbc767 2101 if (!mce_available(raw_cpu_ptr(&cpu_info)))
33edbf02 2102 return;
5e09954a 2103 __mcheck_cpu_init_generic();
bb91f8c0 2104 __mcheck_cpu_init_clear_banks();
5e09954a 2105 __mcheck_cpu_init_timer();
52d168e2
AK
2106}
2107
1da177e4 2108/* Reinit MCEs after user configuration changes */
d88203d1
TG
2109static void mce_restart(void)
2110{
9aaef96f 2111 mce_timer_delete_all();
52d168e2 2112 on_each_cpu(mce_cpu_restart, NULL, 1);
1da177e4
LT
2113}
2114
9af43b54 2115/* Toggle features for corrected errors */
9aaef96f 2116static void mce_disable_cmci(void *data)
9af43b54 2117{
89cbc767 2118 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54 2119 return;
9af43b54
HS
2120 cmci_clear();
2121}
2122
2123static void mce_enable_ce(void *all)
2124{
89cbc767 2125 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54
HS
2126 return;
2127 cmci_reenable();
2128 cmci_recheck();
2129 if (all)
5e09954a 2130 __mcheck_cpu_init_timer();
9af43b54
HS
2131}
2132
8a25a2fd 2133static struct bus_type mce_subsys = {
e9eee03e 2134 .name = "machinecheck",
8a25a2fd 2135 .dev_name = "machinecheck",
1da177e4
LT
2136};
2137
d6126ef5 2138DEFINE_PER_CPU(struct device *, mce_device);
e9eee03e 2139
b4914508 2140static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
cebe1820 2141{
b4914508 2142 return container_of(attr, struct mce_bank_dev, attr);
cebe1820 2143}
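
/*
 * attr_to_bank() above is the classic container_of() idiom: sysfs hands the
 * callback a pointer to the embedded device_attribute, and container_of()
 * recovers the enclosing mce_bank_dev by subtracting the member's offset.
 * A self-contained user-space sketch (container_of here is plain offsetof
 * arithmetic; struct names are illustrative):
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct attr { const char *name; };
struct bank_dev { int bank; struct attr attr; };

int main(void)
{
	struct bank_dev dev = { .bank = 7, .attr = { "bank7" } };
	struct attr *a = &dev.attr;	/* what the callback receives */

	printf("bank %d\n", container_of(a, struct bank_dev, attr)->bank);
	return 0;
}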
0d7482e3 2144
8a25a2fd 2145static ssize_t show_bank(struct device *s, struct device_attribute *attr,
0d7482e3
AK
2146 char *buf)
2147{
b4914508
YG
2148 u8 bank = attr_to_bank(attr)->bank;
2149 struct mce_bank *b;
2150
c7d314f3 2151 if (bank >= per_cpu(mce_num_banks, s->id))
b4914508
YG
2152 return -EINVAL;
2153
2154 b = &per_cpu(mce_banks_array, s->id)[bank];
2155
068b053d
YG
2156 if (!b->init)
2157 return -ENODEV;
2158
b4914508 2159 return sprintf(buf, "%llx\n", b->ctl);
0d7482e3
AK
2160}
2161
8a25a2fd 2162static ssize_t set_bank(struct device *s, struct device_attribute *attr,
9319cec8 2163 const char *buf, size_t size)
0d7482e3 2164{
b4914508
YG
2165 u8 bank = attr_to_bank(attr)->bank;
2166 struct mce_bank *b;
9319cec8 2167 u64 new;
e9eee03e 2168
164109e3 2169 if (kstrtou64(buf, 0, &new) < 0)
0d7482e3 2170 return -EINVAL;
e9eee03e 2171
c7d314f3 2172 if (bank >= per_cpu(mce_num_banks, s->id))
b4914508
YG
2173 return -EINVAL;
2174
2175 b = &per_cpu(mce_banks_array, s->id)[bank];
2176
068b053d
YG
2177 if (!b->init)
2178 return -ENODEV;
2179
b4914508 2180 b->ctl = new;
0d7482e3 2181 mce_restart();
e9eee03e 2182
9319cec8 2183 return size;
0d7482e3 2184}
a98f0dd3 2185
8a25a2fd
KS
2186static ssize_t set_ignore_ce(struct device *s,
2187 struct device_attribute *attr,
9af43b54
HS
2188 const char *buf, size_t size)
2189{
2190 u64 new;
2191
164109e3 2192 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2193 return -EINVAL;
2194
b3b7c479 2195 mutex_lock(&mce_sysfs_mutex);
7af19e4a 2196 if (mca_cfg.ignore_ce ^ !!new) {
9af43b54
HS
2197 if (new) {
2198 /* disable ce features */
9aaef96f
HS
2199 mce_timer_delete_all();
2200 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2201 mca_cfg.ignore_ce = true;
9af43b54
HS
2202 } else {
2203 /* enable ce features */
7af19e4a 2204 mca_cfg.ignore_ce = false;
9af43b54
HS
2205 on_each_cpu(mce_enable_ce, (void *)1, 1);
2206 }
2207 }
b3b7c479
SH
2208 mutex_unlock(&mce_sysfs_mutex);
2209
9af43b54
HS
2210 return size;
2211}
2212
8a25a2fd
KS
2213static ssize_t set_cmci_disabled(struct device *s,
2214 struct device_attribute *attr,
9af43b54
HS
2215 const char *buf, size_t size)
2216{
2217 u64 new;
2218
164109e3 2219 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2220 return -EINVAL;
2221
b3b7c479 2222 mutex_lock(&mce_sysfs_mutex);
7af19e4a 2223 if (mca_cfg.cmci_disabled ^ !!new) {
9af43b54
HS
2224 if (new) {
2225 /* disable cmci */
9aaef96f 2226 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2227 mca_cfg.cmci_disabled = true;
9af43b54
HS
2228 } else {
2229 /* enable cmci */
7af19e4a 2230 mca_cfg.cmci_disabled = false;
9af43b54
HS
2231 on_each_cpu(mce_enable_ce, NULL, 1);
2232 }
2233 }
b3b7c479
SH
2234 mutex_unlock(&mce_sysfs_mutex);
2235
9af43b54
HS
2236 return size;
2237}
2238
8a25a2fd
KS
2239static ssize_t store_int_with_restart(struct device *s,
2240 struct device_attribute *attr,
b56f642d
AK
2241 const char *buf, size_t size)
2242{
b3b7c479
SH
2243 unsigned long old_check_interval = check_interval;
2244 ssize_t ret = device_store_ulong(s, attr, buf, size);
2245
2246 if (check_interval == old_check_interval)
2247 return ret;
2248
b3b7c479 2249 mutex_lock(&mce_sysfs_mutex);
b56f642d 2250 mce_restart();
b3b7c479
SH
2251 mutex_unlock(&mce_sysfs_mutex);
2252
b56f642d
AK
2253 return ret;
2254}
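
/*
 * store_int_with_restart() above skips the expensive mce_restart() (an
 * on_each_cpu() broadcast) when the written value equals the current one.
 * A sketch of that compare-before-commit pattern in user-space C; names
 * are illustrative.
 */
#include <stdio.h>

static long check_interval = 300;

static int store_interval(long new)
{
	if (new == check_interval)
		return 0;		/* nothing changed: skip the restart */
	check_interval = new;
	printf("restarting timers for interval %ld\n", new);
	return 1;
}

int main(void)
{
	store_interval(300);		/* no-op */
	store_interval(60);		/* triggers the restart */
	return 0;
}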
2255
d203f0b8 2256static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
84c2559d 2257static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
d203f0b8 2258static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
e9eee03e 2259
8a25a2fd
KS
2260static struct dev_ext_attribute dev_attr_check_interval = {
2261 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
b56f642d
AK
2262 &check_interval
2263};
e9eee03e 2264
8a25a2fd 2265static struct dev_ext_attribute dev_attr_ignore_ce = {
7af19e4a
BP
2266 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2267 &mca_cfg.ignore_ce
9af43b54
HS
2268};
2269
8a25a2fd 2270static struct dev_ext_attribute dev_attr_cmci_disabled = {
7af19e4a
BP
2271 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2272 &mca_cfg.cmci_disabled
9af43b54
HS
2273};
2274
8a25a2fd
KS
2275static struct device_attribute *mce_device_attrs[] = {
2276 &dev_attr_tolerant.attr,
2277 &dev_attr_check_interval.attr,
5de97c9f 2278#ifdef CONFIG_X86_MCELOG_LEGACY
8a25a2fd 2279 &dev_attr_trigger,
5de97c9f 2280#endif
8a25a2fd
KS
2281 &dev_attr_monarch_timeout.attr,
2282 &dev_attr_dont_log_ce.attr,
2283 &dev_attr_ignore_ce.attr,
2284 &dev_attr_cmci_disabled.attr,
a98f0dd3
AK
2285 NULL
2286};
1da177e4 2287
8a25a2fd 2288static cpumask_var_t mce_device_initialized;
bae19fe0 2289
e032d807
GKH
2290static void mce_device_release(struct device *dev)
2291{
2292 kfree(dev);
2293}
2294
b4914508 2295/* Per CPU device init. All of the CPUs still share the same bank device: */
148f9bb8 2296static int mce_device_create(unsigned int cpu)
1da177e4 2297{
e032d807 2298 struct device *dev;
1da177e4 2299 int err;
b1f49f95 2300 int i, j;
92cb7612 2301
90367556 2302 if (!mce_available(&boot_cpu_data))
91c6d400
AK
2303 return -EIO;
2304
7f34b935
SAS
2305 dev = per_cpu(mce_device, cpu);
2306 if (dev)
2307 return 0;
2308
0e96f31e 2309 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
e032d807
GKH
2310 if (!dev)
2311 return -ENOMEM;
8a25a2fd
KS
2312 dev->id = cpu;
2313 dev->bus = &mce_subsys;
e032d807 2314 dev->release = &mce_device_release;
91c6d400 2315
8a25a2fd 2316 err = device_register(dev);
853d9b18
LK
2317 if (err) {
2318 put_device(dev);
d435d862 2319 return err;
853d9b18 2320 }
d435d862 2321
8a25a2fd
KS
2322 for (i = 0; mce_device_attrs[i]; i++) {
2323 err = device_create_file(dev, mce_device_attrs[i]);
d435d862
AM
2324 if (err)
2325 goto error;
2326 }
c7d314f3 2327 for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
b4914508 2328 err = device_create_file(dev, &mce_bank_devs[j].attr);
0d7482e3
AK
2329 if (err)
2330 goto error2;
2331 }
8a25a2fd 2332 cpumask_set_cpu(cpu, mce_device_initialized);
d6126ef5 2333 per_cpu(mce_device, cpu) = dev;
91c6d400 2334
d435d862 2335 return 0;
0d7482e3 2336error2:
b1f49f95 2337 while (--j >= 0)
b4914508 2338 device_remove_file(dev, &mce_bank_devs[j].attr);
d435d862 2339error:
cb491fca 2340 while (--i >= 0)
8a25a2fd 2341 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2342
8a25a2fd 2343 device_unregister(dev);
d435d862 2344
91c6d400
AK
2345 return err;
2346}
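
/*
 * mce_device_create() above uses the kernel's goto-unwind convention: each
 * failure jumps to a label that undoes exactly the steps already completed,
 * walking the loop counter backwards. A compact user-space rendering of the
 * same shape (resource names are made up):
 */
#include <stdio.h>

static int create(int i) { return i == 3 ? -1 : 0; }	/* fail on the 4th */
static void destroy(int i) { printf("undo %d\n", i); }

int main(void)
{
	int i, err = 0;

	for (i = 0; i < 5; i++) {
		err = create(i);
		if (err)
			goto error;
	}
	return 0;
error:
	while (--i >= 0)	/* roll back only what succeeded */
		destroy(i);
	return err;
}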
2347
148f9bb8 2348static void mce_device_remove(unsigned int cpu)
91c6d400 2349{
d6126ef5 2350 struct device *dev = per_cpu(mce_device, cpu);
73ca5358
SL
2351 int i;
2352
8a25a2fd 2353 if (!cpumask_test_cpu(cpu, mce_device_initialized))
bae19fe0
AH
2354 return;
2355
8a25a2fd
KS
2356 for (i = 0; mce_device_attrs[i]; i++)
2357 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2358
c7d314f3 2359 for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
b4914508 2360 device_remove_file(dev, &mce_bank_devs[i].attr);
cb491fca 2361
8a25a2fd
KS
2362 device_unregister(dev);
2363 cpumask_clear_cpu(cpu, mce_device_initialized);
d6126ef5 2364 per_cpu(mce_device, cpu) = NULL;
91c6d400 2365}
91c6d400 2366
d6b75584 2367/* Make sure there are no machine checks on offlined CPUs. */
39f152ff 2368static void mce_disable_cpu(void)
d6b75584 2369{
89cbc767 2370 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2371 return;
767df1bd 2372
39f152ff 2373 if (!cpuhp_tasks_frozen)
88ccbedd 2374 cmci_clear();
11868a2d 2375
6e06780a 2376 vendor_disable_error_reporting();
d6b75584
AK
2377}
2378
39f152ff 2379static void mce_reenable_cpu(void)
d6b75584 2380{
b4914508 2381 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
e9eee03e 2382 int i;
d6b75584 2383
89cbc767 2384 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2385 return;
e9eee03e 2386
39f152ff 2387 if (!cpuhp_tasks_frozen)
88ccbedd 2388 cmci_reenable();
c7d314f3 2389 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 2390 struct mce_bank *b = &mce_banks[i];
11868a2d 2391
cebe1820 2392 if (b->init)
d9d73fcc 2393 wrmsrl(msr_ops.ctl(i), b->ctl);
06b7a7a5 2394 }
d6b75584
AK
2395}
2396
0e285d36 2397static int mce_cpu_dead(unsigned int cpu)
91c6d400 2398{
0e285d36 2399 mce_intel_hcpu_update(cpu);
91c6d400 2400
0e285d36
SAS
2401 /* intentionally ignoring frozen here */
2402 if (!cpuhp_tasks_frozen)
2403 cmci_rediscover();
2404 return 0;
91c6d400
AK
2405}
2406
8c0eeac8 2407static int mce_cpu_online(unsigned int cpu)
91c6d400 2408{
0becc0ae 2409 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8 2410 int ret;
91c6d400 2411
8c0eeac8 2412 mce_device_create(cpu);
38356c1f 2413
8c0eeac8
SAS
2414 ret = mce_threshold_create_device(cpu);
2415 if (ret) {
2416 mce_device_remove(cpu);
2417 return ret;
1a65f970 2418 }
8c0eeac8 2419 mce_reenable_cpu();
0becc0ae 2420 mce_start_timer(t);
8c0eeac8 2421 return 0;
91c6d400
AK
2422}
2423
8c0eeac8
SAS
2424static int mce_cpu_pre_down(unsigned int cpu)
2425{
0becc0ae 2426 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8
SAS
2427
2428 mce_disable_cpu();
2429 del_timer_sync(t);
2430 mce_threshold_remove_device(cpu);
2431 mce_device_remove(cpu);
2432 return 0;
2433}
91c6d400 2434
cebe1820 2435static __init void mce_init_banks(void)
0d7482e3
AK
2436{
2437 int i;
2438
b4914508
YG
2439 for (i = 0; i < MAX_NR_BANKS; i++) {
2440 struct mce_bank_dev *b = &mce_bank_devs[i];
8a25a2fd 2441 struct device_attribute *a = &b->attr;
e9eee03e 2442
b4914508
YG
2443 b->bank = i;
2444
a07e4156 2445 sysfs_attr_init(&a->attr);
cebe1820
AK
2446 a->attr.name = b->attrname;
2447 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
e9eee03e
IM
2448
2449 a->attr.mode = 0644;
2450 a->show = show_bank;
2451 a->store = set_bank;
0d7482e3 2452 }
0d7482e3
AK
2453}
2454
6e7a41c6
TG
2455/*
2456 * When running on XEN, this initcall is ordered against the XEN mcelog
2457 * initcall:
2458 *
2459 * device_initcall(xen_late_init_mcelog);
2460 * device_initcall_sync(mcheck_init_device);
2461 */
5e09954a 2462static __init int mcheck_init_device(void)
91c6d400
AK
2463{
2464 int err;
91c6d400 2465
c65e774f
KS
2466 /*
2467 * Check if we have a spare virtual bit. This will only become
2468 * a problem if/when we move beyond 5-level page tables.
2469 */
2470 MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2471
9c15a24b
MS
2472 if (!mce_available(&boot_cpu_data)) {
2473 err = -EIO;
2474 goto err_out;
2475 }
0d7482e3 2476
9c15a24b
MS
2477 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2478 err = -ENOMEM;
2479 goto err_out;
2480 }
996867d0 2481
cebe1820 2482 mce_init_banks();
0d7482e3 2483
8a25a2fd 2484 err = subsys_system_register(&mce_subsys, NULL);
d435d862 2485 if (err)
9c15a24b 2486 goto err_out_mem;
91c6d400 2487
0e285d36
SAS
2488 err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2489 mce_cpu_dead);
2490 if (err)
2491 goto err_out_mem;
91c6d400 2492
6e7a41c6
TG
2493 /*
2494 * Invokes mce_cpu_online() on all CPUs which are online when
2495 * the state is installed.
2496 */
8c0eeac8
SAS
2497 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2498 mce_cpu_online, mce_cpu_pre_down);
2499 if (err < 0)
0e285d36 2500 goto err_out_online;
93b62c3c 2501
9c15a24b
MS
2502 register_syscore_ops(&mce_syscore_ops);
2503
9c15a24b
MS
2504 return 0;
2505
0e285d36
SAS
2506err_out_online:
2507 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
9c15a24b
MS
2508
2509err_out_mem:
2510 free_cpumask_var(mce_device_initialized);
2511
2512err_out:
5de97c9f 2513 pr_err("Unable to init MCE device (rc: %d)\n", err);
e9eee03e 2514
1da177e4 2515 return err;
1da177e4 2516}
cef12ee5 2517device_initcall_sync(mcheck_init_device);
a988d334 2518
d7c3c9a6
AK
2519/*
2520 * Old style boot options parsing. Only for compatibility.
2521 */
2522static int __init mcheck_disable(char *str)
2523{
09933946 2524 mca_cfg.disabled = 1;
d7c3c9a6
AK
2525 return 1;
2526}
2527__setup("nomce", mcheck_disable);
a988d334 2528
5be9ed25
HY
2529#ifdef CONFIG_DEBUG_FS
2530struct dentry *mce_get_debugfs_dir(void)
a988d334 2531{
5be9ed25 2532 static struct dentry *dmce;
a988d334 2533
5be9ed25
HY
2534 if (!dmce)
2535 dmce = debugfs_create_dir("mce", NULL);
a988d334 2536
5be9ed25
HY
2537 return dmce;
2538}
a988d334 2539
bf783f9f
HY
2540static void mce_reset(void)
2541{
2542 cpu_missing = 0;
c7c9b392 2543 atomic_set(&mce_fake_panicked, 0);
bf783f9f
HY
2544 atomic_set(&mce_executing, 0);
2545 atomic_set(&mce_callin, 0);
2546 atomic_set(&global_nwo, 0);
2547}
a988d334 2548
bf783f9f
HY
2549static int fake_panic_get(void *data, u64 *val)
2550{
2551 *val = fake_panic;
2552 return 0;
a988d334
IM
2553}
2554
bf783f9f 2555static int fake_panic_set(void *data, u64 val)
a988d334 2556{
bf783f9f
HY
2557 mce_reset();
2558 fake_panic = val;
2559 return 0;
a988d334 2560}
a988d334 2561
28156d76
Y
2562DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2563 "%llu\n");
d7c3c9a6 2564
6e4f929e 2565static void __init mcheck_debugfs_init(void)
d7c3c9a6 2566{
6e4f929e 2567 struct dentry *dmce;
bf783f9f
HY
2568
2569 dmce = mce_get_debugfs_dir();
6e4f929e
GKH
2570 debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2571 &fake_panic_fops);
d7c3c9a6 2572}
fd4cf79f 2573#else
6e4f929e 2574static void __init mcheck_debugfs_init(void) { }
5be9ed25 2575#endif
fd4cf79f 2576
3637efb0
TL
2577DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2578EXPORT_SYMBOL_GPL(mcsafe_key);
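
/*
 * A hedged sketch of how a static branch like mcsafe_key is consumed: the
 * check compiles to a patchable NOP/jump rather than a load-and-test, so
 * the recovery-aware path costs nothing unless mce=recovery flipped the key
 * via static_branch_inc() in mcheck_late_init() below. The function here is
 * illustrative; the real memcpy_mcsafe() wiring lives in the copy routines,
 * not in this file.
 */
#include <linux/jump_label.h>
#include <linux/printk.h>
#include <linux/string.h>

DEFINE_STATIC_KEY_FALSE(example_key);

static void copy_example(void *dst, const void *src, size_t len)
{
	if (static_branch_unlikely(&example_key))	/* patched in at runtime */
		pr_info("taking the recovery-aware path\n");
	memcpy(dst, src, len);
}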
2579
fd4cf79f
CG
2580static int __init mcheck_late_init(void)
2581{
3637efb0
TL
2582 if (mca_cfg.recovery)
2583 static_branch_inc(&mcsafe_key);
2584
fd4cf79f
CG
2585 mcheck_debugfs_init();
2586
2587 /*
2588 * Flush out everything that has been logged during early boot, now that
2589 * everything has been initialized (workqueues, decoders, ...).
2590 */
2591 mce_schedule_work();
2592
2593 return 0;
2594}
2595late_initcall(mcheck_late_init);