git.proxmox.com Git - mirror_ubuntu-focal-kernel.git / blame - arch/x86/kernel/cpu/mce/core.c
UBUNTU: Ubuntu-5.4.0-117.132
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Machine check handler.
e9eee03e 4 *
1da177e4 5 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
6 * Rest from unknown author(s).
7 * 2004 Andi Kleen. Rewrote most of it.
8 * Copyright 2008 Intel Corporation
9 * Author: Andi Kleen
1da177e4 10 */
c767a54b 11
12#include <linux/thread_info.h>
13#include <linux/capability.h>
14#include <linux/miscdevice.h>
15#include <linux/ratelimit.h>
e9eee03e 16#include <linux/rcupdate.h>
e9eee03e 17#include <linux/kobject.h>
14a02530 18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
1da177e4 22#include <linux/string.h>
8a25a2fd 23#include <linux/device.h>
f3c6ea1b 24#include <linux/syscore_ops.h>
3c079792 25#include <linux/delay.h>
8c566ef5 26#include <linux/ctype.h>
e9eee03e 27#include <linux/sched.h>
0d7482e3 28#include <linux/sysfs.h>
e9eee03e 29#include <linux/types.h>
5a0e3ad6 30#include <linux/slab.h>
31#include <linux/init.h>
32#include <linux/kmod.h>
33#include <linux/poll.h>
3c079792 34#include <linux/nmi.h>
e9eee03e 35#include <linux/cpu.h>
011d8261 36#include <linux/ras.h>
14a02530 37#include <linux/smp.h>
e9eee03e 38#include <linux/fs.h>
9b1beaf2 39#include <linux/mm.h>
5be9ed25 40#include <linux/debugfs.h>
b77e70bf 41#include <linux/irq_work.h>
69c60c88 42#include <linux/export.h>
3637efb0 43#include <linux/jump_label.h>
284ce401 44#include <linux/set_memory.h>
e9eee03e 45
3f5a7896 46#include <asm/intel-family.h>
d88203d1 47#include <asm/processor.h>
95927475 48#include <asm/traps.h>
375074cc 49#include <asm/tlbflush.h>
50#include <asm/mce.h>
51#include <asm/msr.h>
5bc32950 52#include <asm/reboot.h>
1da177e4 53
21afaf18 54#include "internal.h"
711c2e48 55
5de97c9f 56static DEFINE_MUTEX(mce_log_mutex);
f56e8a07 57
58/* sysfs synchronization */
59static DEFINE_MUTEX(mce_sysfs_mutex);
60
61#define CREATE_TRACE_POINTS
62#include <trace/events/mce.h>
63
3f2f0680 64#define SPINUNIT 100 /* 100ns */
3c079792 65
66DEFINE_PER_CPU(unsigned, mce_exception_count);
67
68DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
69
70struct mce_bank {
71 u64 ctl; /* subevents to enable */
72 bool init; /* initialise bank? */
73};
74static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
75
76#define ATTR_LEN 16
77/* One object for each MCE bank, shared by all CPUs */
78struct mce_bank_dev {
79 struct device_attribute attr; /* device attribute */
80 char attrname[ATTR_LEN]; /* attribute name */
b4914508 81 u8 bank; /* bank number */
95fdce6b 82};
b4914508 83static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
95fdce6b 84
bf80bbd7 85struct mce_vendor_flags mce_flags __read_mostly;
cebe1820 86
d203f0b8 87struct mca_config mca_cfg __read_mostly = {
84c2559d 88 .bootlog = -1,
89 /*
90 * Tolerant levels:
91 * 0: always panic on uncorrected errors, log corrected errors
92 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
93 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
94 * 3: never panic or SIGBUS, log all errors (for testing only)
95 */
96 .tolerant = 1,
97 .monarch_timeout = -1
98};
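/*
 * Note: the values above are only boot-time defaults. tolerant and the other
 * mca_cfg knobs are normally adjusted at runtime through the machinecheck
 * sysfs device (e.g. /sys/devices/system/machinecheck/machinecheck0/tolerant)
 * or the "mce=" boot options parsed later in this file, so treat this
 * initializer as a fallback rather than as policy.
 */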
99
3c079792 100static DEFINE_PER_CPU(struct mce, mces_seen);
101static unsigned long mce_need_notify;
102static int cpu_missing;
3c079792 103
104/*
105 * MCA banks polled by the period polling timer for corrected events.
106 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
107 */
108DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
109 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
110};
111
112/*
113 * MCA banks controlled through firmware first for corrected errors.
114 * This is a global list of banks for which we won't enable CMCI and we
115 * won't poll. Firmware controls these banks and is responsible for
116 * reporting corrected errors through GHES. Uncorrected/recoverable
117 * errors are still notified through a machine check.
118 */
119mce_banks_t mce_banks_ce_disabled;
120
121static struct work_struct mce_work;
122static struct irq_work mce_irq_work;
9b1beaf2 123
124static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
125
126/*
127 * CPU/chipset specific EDAC code can register a notifier call here to print
128 * MCE errors in a human-readable form.
129 */
0dc9c639 130BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
3653ada5 131
132/* Do initial initialization of a struct mce */
133void mce_setup(struct mce *m)
134{
135 memset(m, 0, sizeof(struct mce));
d620c67f 136 m->cpu = m->extcpu = smp_processor_id();
137 /* need the internal __ version to avoid deadlocks */
138 m->time = __ktime_get_real_seconds();
139 m->cpuvendor = boot_cpu_data.x86_vendor;
140 m->cpuid = cpuid_eax(1);
8ee08347 141 m->socketid = cpu_data(m->extcpu).phys_proc_id;
142 m->apicid = cpu_data(m->extcpu).initial_apicid;
143 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
144
145 if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
146 rdmsrl(MSR_PPIN, m->ppin);
147
148 m->microcode = boot_cpu_data.microcode;
149}
150
151DEFINE_PER_CPU(struct mce, injectm);
152EXPORT_PER_CPU_SYMBOL_GPL(injectm);
153
fe3ed20f 154void mce_log(struct mce *m)
1da177e4 155{
fe3ed20f 156 if (!mce_gen_pool_add(m))
f29a7aff 157 irq_work_queue(&mce_irq_work);
158}
159
a79da384 160void mce_inject_log(struct mce *m)
09371957 161{
5de97c9f 162 mutex_lock(&mce_log_mutex);
a79da384 163 mce_log(m);
5de97c9f 164 mutex_unlock(&mce_log_mutex);
09371957 165}
a79da384 166EXPORT_SYMBOL_GPL(mce_inject_log);
09371957 167
fd4cf79f 168static struct notifier_block mce_srao_nb;
09371957 169
170/*
171 * We run the default notifier if we have only the SRAO, the first and the
172 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
173 * notifiers registered on the chain.
174 */
175#define NUM_DEFAULT_NOTIFIERS 3
176static atomic_t num_notifiers;
177
178void mce_register_decode_chain(struct notifier_block *nb)
179{
415601b1 180 if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
32b40a82 181 return;
cd9c57ca 182
32b40a82 183 atomic_inc(&num_notifiers);
fd4cf79f 184
0dc9c639 185 blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
186}
187EXPORT_SYMBOL_GPL(mce_register_decode_chain);
188
189void mce_unregister_decode_chain(struct notifier_block *nb)
190{
191 atomic_dec(&num_notifiers);
192
0dc9c639 193 blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
194}
195EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
196
197static inline u32 ctl_reg(int bank)
198{
199 return MSR_IA32_MCx_CTL(bank);
200}
201
202static inline u32 status_reg(int bank)
203{
204 return MSR_IA32_MCx_STATUS(bank);
205}
206
207static inline u32 addr_reg(int bank)
208{
209 return MSR_IA32_MCx_ADDR(bank);
210}
211
212static inline u32 misc_reg(int bank)
213{
214 return MSR_IA32_MCx_MISC(bank);
215}
216
217static inline u32 smca_ctl_reg(int bank)
218{
219 return MSR_AMD64_SMCA_MCx_CTL(bank);
220}
221
222static inline u32 smca_status_reg(int bank)
223{
224 return MSR_AMD64_SMCA_MCx_STATUS(bank);
225}
226
227static inline u32 smca_addr_reg(int bank)
228{
229 return MSR_AMD64_SMCA_MCx_ADDR(bank);
230}
231
232static inline u32 smca_misc_reg(int bank)
233{
234 return MSR_AMD64_SMCA_MCx_MISC(bank);
235}
236
237struct mca_msr_regs msr_ops = {
238 .ctl = ctl_reg,
239 .status = status_reg,
240 .addr = addr_reg,
241 .misc = misc_reg
242};
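/*
 * These defaults address the legacy MCA MSR space (MSR_IA32_MCx_*). On AMD
 * Scalable MCA systems, __mcheck_cpu_init_early() below repoints all four
 * callbacks at the smca_*_reg() helpers, so the rest of this file can stay
 * MSR-layout agnostic and simply go through msr_ops.
 */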
243
cd9c57ca 244static void __print_mce(struct mce *m)
1da177e4 245{
246 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
247 m->extcpu,
248 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
249 m->mcgstatus, m->bank, m->status);
f436f8bb 250
65ea5b03 251 if (m->ip) {
a2d7b0d4 252 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
f436f8bb 253 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
cd9c57ca 254 m->cs, m->ip);
f436f8bb 255
1da177e4 256 if (m->cs == __KERNEL_CS)
c80c5ec1 257 pr_cont("{%pS}", (void *)(unsigned long)m->ip);
f436f8bb 258 pr_cont("\n");
1da177e4 259 }
f436f8bb 260
a2d7b0d4 261 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
1da177e4 262 if (m->addr)
f436f8bb 263 pr_cont("ADDR %llx ", m->addr);
1da177e4 264 if (m->misc)
f436f8bb 265 pr_cont("MISC %llx ", m->misc);
549d042d 266
267 if (mce_flags.smca) {
268 if (m->synd)
269 pr_cont("SYND %llx ", m->synd);
270 if (m->ipid)
271 pr_cont("IPID %llx ", m->ipid);
272 }
273
f436f8bb 274 pr_cont("\n");
275 /*
276 * Note this output is parsed by external tools and old fields
277 * should not be changed.
278 */
881e23e5 279 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
506ed6b5 280 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
fa94d0c6 281 m->microcode);
282}
283
284static void print_mce(struct mce *m)
285{
cd9c57ca 286 __print_mce(m);
b2fbf6f2 287
ac78bd72 288 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
b2fbf6f2 289 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
290}
291
292#define PANIC_TIMEOUT 5 /* 5 seconds */
293
c7c9b392 294static atomic_t mce_panicked;
f94b61c2 295
bf783f9f 296static int fake_panic;
c7c9b392 297static atomic_t mce_fake_panicked;
bf783f9f 298
299/* Panic in progress. Enable interrupts and wait for final IPI */
300static void wait_for_panic(void)
301{
302 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
f436f8bb 303
304 preempt_disable();
305 local_irq_enable();
306 while (timeout-- > 0)
307 udelay(1);
29b0f591 308 if (panic_timeout == 0)
7af19e4a 309 panic_timeout = mca_cfg.panic_timeout;
310 panic("Panicing machine check CPU died");
311}
312
5f39df22 313static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
d88203d1 314{
315 struct llist_node *pending;
316 struct mce_evt_llist *l;
317 int apei_err = 0;
318
319 /*
320 * Allow instrumentation around external facilities usage. Not that it
321 * matters a whole lot since the machine is going to panic anyway.
322 */
323 instrumentation_begin();
e02e68d3 324
325 if (!fake_panic) {
326 /*
327 * Make sure only one CPU runs in machine check panic
328 */
c7c9b392 329 if (atomic_inc_return(&mce_panicked) > 1)
330 wait_for_panic();
331 barrier();
f94b61c2 332
333 bust_spinlocks(1);
334 console_verbose();
335 } else {
336 /* Don't log too much for fake panic */
c7c9b392 337 if (atomic_inc_return(&mce_fake_panicked) > 1)
5f39df22 338 goto out;
bf783f9f 339 }
5541c93c 340 pending = mce_gen_pool_prepare_records();
a0189c70 341 /* First print corrected ones that are still unlogged */
342 llist_for_each_entry(l, pending, llnode) {
343 struct mce *m = &l->mce;
482908b4 344 if (!(m->status & MCI_STATUS_UC)) {
77e26cca 345 print_mce(m);
346 if (!apei_err)
347 apei_err = apei_write_mce(m);
348 }
349 }
350 /* Now print uncorrected but with the final one last */
351 llist_for_each_entry(l, pending, llnode) {
352 struct mce *m = &l->mce;
353 if (!(m->status & MCI_STATUS_UC))
354 continue;
5541c93c 355 if (!final || mce_cmp(m, final)) {
77e26cca 356 print_mce(m);
357 if (!apei_err)
358 apei_err = apei_write_mce(m);
359 }
1da177e4 360 }
482908b4 361 if (final) {
77e26cca 362 print_mce(final);
363 if (!apei_err)
364 apei_err = apei_write_mce(final);
365 }
3c079792 366 if (cpu_missing)
a2d7b0d4 367 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
bd19a5e6 368 if (exp)
a2d7b0d4 369 pr_emerg(HW_ERR "Machine check: %s\n", exp);
370 if (!fake_panic) {
371 if (panic_timeout == 0)
7af19e4a 372 panic_timeout = mca_cfg.panic_timeout;
373 panic(msg);
374 } else
a2d7b0d4 375 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
376
377out:
378 instrumentation_end();
d88203d1 379}
1da177e4 380
381/* Support code for software error injection */
382
383static int msr_to_offset(u32 msr)
384{
0a3aee0d 385 unsigned bank = __this_cpu_read(injectm.bank);
f436f8bb 386
84c2559d 387 if (msr == mca_cfg.rip_msr)
ea149b36 388 return offsetof(struct mce, ip);
d9d73fcc 389 if (msr == msr_ops.status(bank))
ea149b36 390 return offsetof(struct mce, status);
d9d73fcc 391 if (msr == msr_ops.addr(bank))
ea149b36 392 return offsetof(struct mce, addr);
d9d73fcc 393 if (msr == msr_ops.misc(bank))
394 return offsetof(struct mce, misc);
395 if (msr == MSR_IA32_MCG_STATUS)
396 return offsetof(struct mce, mcgstatus);
397 return -1;
398}
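/*
 * msr_to_offset() only matters on the software error-injection path: when
 * injectm.finished is set for this CPU, the mce_rdmsrl()/mce_wrmsrl()
 * wrappers below redirect the access into the per-CPU struct mce "injectm"
 * instead of touching the real hardware registers.
 */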
399
400__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
401 struct pt_regs *regs, int trapnr,
402 unsigned long error_code,
403 unsigned long fault_addr)
404{
405 pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
406 (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
407
408 show_stack_regs(regs);
409
410 panic("MCA architectural violation!\n");
411
412 while (true)
413 cpu_relax();
414
415 return true;
416}
417
418/* MSR access wrappers used for error injection */
419static u64 mce_rdmsrl(u32 msr)
420{
2fcb0d72 421 DECLARE_ARGS(val, low, high);
11868a2d 422
0a3aee0d 423 if (__this_cpu_read(injectm.finished)) {
ea149b36 424 int offset = msr_to_offset(msr);
11868a2d 425
426 if (offset < 0)
427 return 0;
89cbc767 428 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
ea149b36 429 }
11868a2d 430
431 /*
432 * RDMSR on MCA MSRs should not fault. If they do, this is very much an
433 * architectural violation and needs to be reported to hw vendor. Panic
434 * the box to not allow any further progress.
435 */
436 asm volatile("1: rdmsr\n"
437 "2:\n"
438 _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
439 : EAX_EDX_RET(val, low, high) : "c" (msr));
11868a2d 440
441
442 return EAX_EDX_VAL(val, low, high);
443}
444
445__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
446 struct pt_regs *regs, int trapnr,
447 unsigned long error_code,
448 unsigned long fault_addr)
449{
450 pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
451 (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
452 regs->ip, (void *)regs->ip);
453
454 show_stack_regs(regs);
455
456 panic("MCA architectural violation!\n");
457
458 while (true)
459 cpu_relax();
460
461 return true;
462}
463
464static void mce_wrmsrl(u32 msr, u64 v)
465{
466 u32 low, high;
467
0a3aee0d 468 if (__this_cpu_read(injectm.finished)) {
ea149b36 469 int offset = msr_to_offset(msr);
11868a2d 470
ea149b36 471 if (offset >= 0)
89cbc767 472 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
473 return;
474 }
475
476 low = (u32)v;
477 high = (u32)(v >> 32);
478
479 /* See comment in mce_rdmsrl() */
480 asm volatile("1: wrmsr\n"
481 "2:\n"
482 _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault)
483 : : "c" (msr), "a"(low), "d" (high) : "memory");
484}
485
486/*
487 * Collect all global (w.r.t. this processor) status about this machine
488 * check into our "mce" struct so that we can use it later to assess
489 * the severity of the problem as we read per-bank specific details.
490 */
491static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
492{
493 mce_setup(m);
494
495 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
496 if (regs) {
497 /*
498 * Get the address of the instruction at the time of
499 * the machine check error.
500 */
501 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
502 m->ip = regs->ip;
503 m->cs = regs->cs;
504
505 /*
506 * When in VM86 mode make the cs look like ring 3
507 * always. This is a lie, but it's better than passing
508 * the additional vm86 bit around everywhere.
509 */
510 if (v8086_mode(regs))
511 m->cs |= 3;
512 }
513 /* Use accurate RIP reporting if available. */
514 if (mca_cfg.rip_msr)
515 m->ip = mce_rdmsrl(mca_cfg.rip_msr);
516 }
517}
518
88ccbedd 519int mce_available(struct cpuinfo_x86 *c)
1da177e4 520{
1462594b 521 if (mca_cfg.disabled)
5b4408fd 522 return 0;
3d1712c9 523 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
524}
525
526static void mce_schedule_work(void)
527{
a2c2727d 528 if (!mce_gen_pool_empty())
061120ae 529 schedule_work(&mce_work);
530}
531
b77e70bf 532static void mce_irq_work_cb(struct irq_work *entry)
ccc3c319 533{
9b1beaf2 534 mce_schedule_work();
ccc3c319 535}
ccc3c319 536
537/*
538 * Check if the address reported by the CPU is in a format we can parse.
539 * It would be possible to add code for most other cases, but all would
540 * be somewhat complicated (e.g. segment offset would require an instruction
541 * parser). So only support physical addresses up to page granularity for now.
542 */
e8a308e5 543int mce_usable_address(struct mce *m)
feab21f8 544{
c6a9583f 545 if (!(m->status & MCI_STATUS_ADDRV))
546 return 0;
547
548 /* Checks after this one are Intel-specific: */
549 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
550 return 1;
551
552 if (!(m->status & MCI_STATUS_MISCV))
553 return 0;
554
555 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
556 return 0;
c6a9583f 557
558 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
559 return 0;
c6a9583f 560
561 return 1;
562}
e8a308e5 563EXPORT_SYMBOL_GPL(mce_usable_address);
feab21f8 564
2d1f4061 565bool mce_is_memory_error(struct mce *m)
011d8261 566{
567 if (m->cpuvendor == X86_VENDOR_AMD ||
568 m->cpuvendor == X86_VENDOR_HYGON) {
c6708d50 569 return amd_mce_is_memory_error(m);
2d1f4061 570 } else if (m->cpuvendor == X86_VENDOR_INTEL) {
571 /*
572 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
573 *
574 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
575 * indicating a memory error. Bit 8 is used for indicating a
576 * cache hierarchy error. The combination of bit 2 and bit 3
577 * is used for indicating a `generic' cache hierarchy error
578 * But we can't just blindly check the above bits, because if
579 * bit 11 is set, then it is a bus/interconnect error - and
580 * either way the above bits just gives more detail on what
581 * bus/interconnect error happened. Note that bit 12 can be
582 * ignored, as it's the "filter" bit.
583 */
584 return (m->status & 0xef80) == BIT(7) ||
585 (m->status & 0xef00) == BIT(8) ||
586 (m->status & 0xeffc) == 0xc;
587 }
588
589 return false;
590}
2d1f4061 591EXPORT_SYMBOL_GPL(mce_is_memory_error);
011d8261 592
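/*
 * Helper for the recovery paths below: decide whether the poisoning reported
 * in MCi_MISC covers the whole page. With no usable MISC information we
 * conservatively assume it does; otherwise the recoverable address LSB is
 * compared against PAGE_SHIFT. The result is passed on to set_mce_nospec().
 */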
593static bool whole_page(struct mce *m)
594{
595 if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
596 return true;
597 return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
598}
599
5d96c934 600bool mce_is_correctable(struct mce *m)
601{
602 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
603 return false;
604
605 if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
606 return false;
607
608 if (m->status & MCI_STATUS_UC)
609 return false;
610
611 return true;
612}
5d96c934 613EXPORT_SYMBOL_GPL(mce_is_correctable);
179eb850 614
615static bool cec_add_mce(struct mce *m)
616{
617 if (!m)
618 return false;
619
620 /* We eat only correctable DRAM errors with usable addresses. */
2d1f4061 621 if (mce_is_memory_error(m) &&
179eb850 622 mce_is_correctable(m) &&
623 mce_usable_address(m))
624 if (!cec_add_elem(m->addr >> PAGE_SHIFT))
625 return true;
626
627 return false;
628}
629
630static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
631 void *data)
632{
633 struct mce *m = (struct mce *)data;
634
635 if (!m)
636 return NOTIFY_DONE;
637
638 if (cec_add_mce(m))
639 return NOTIFY_STOP;
640
641 /* Emit the trace record: */
642 trace_mce_record(m);
643
644 set_bit(0, &mce_need_notify);
645
646 mce_notify_irq();
647
648 return NOTIFY_DONE;
649}
650
651static struct notifier_block first_nb = {
652 .notifier_call = mce_first_notifier,
653 .priority = MCE_PRIO_FIRST,
654};
655
656static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
657 void *data)
658{
659 struct mce *mce = (struct mce *)data;
660 unsigned long pfn;
661
662 if (!mce)
663 return NOTIFY_DONE;
664
c0ec382e 665 if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
fd4cf79f 666 pfn = mce->addr >> PAGE_SHIFT;
fd0e786d 667 if (!memory_failure(pfn, 0))
68da5df9 668 set_mce_nospec(pfn, whole_page(mce));
669 }
670
671 return NOTIFY_OK;
ccc3c319 672}
673static struct notifier_block mce_srao_nb = {
674 .notifier_call = srao_decode_notifier,
9026cc82 675 .priority = MCE_PRIO_SRAO,
fd4cf79f 676};
ccc3c319 677
678static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
679 void *data)
680{
681 struct mce *m = (struct mce *)data;
682
683 if (!m)
684 return NOTIFY_DONE;
685
011d8261 686 if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
687 return NOTIFY_DONE;
688
689 __print_mce(m);
690
691 return NOTIFY_DONE;
692}
693
694static struct notifier_block mce_default_nb = {
695 .notifier_call = mce_default_notifier,
696 /* lowest prio, we want it to run last. */
9026cc82 697 .priority = MCE_PRIO_LOWEST,
698};
699
700/*
701 * Read ADDR and MISC registers.
702 */
34f2c743 703static noinstr void mce_read_aux(struct mce *m, int i)
704{
705 if (m->status & MCI_STATUS_MISCV)
d9d73fcc 706 m->misc = mce_rdmsrl(msr_ops.misc(i));
db819d60 707
85f92694 708 if (m->status & MCI_STATUS_ADDRV) {
d9d73fcc 709 m->addr = mce_rdmsrl(msr_ops.addr(i));
710
711 /*
712 * Mask the reported address by the reported granularity.
713 */
1462594b 714 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
715 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
716 m->addr >>= shift;
717 m->addr <<= shift;
718 }
719
720 /*
721 * Extract [55:<lsb>] where lsb is the least significant
722 * *valid* bit of the address bits.
723 */
724 if (mce_flags.smca) {
725 u8 lsb = (m->addr >> 56) & 0x3f;
726
727 m->addr &= GENMASK_ULL(55, lsb);
728 }
85f92694 729 }
db819d60 730
731 if (mce_flags.smca) {
732 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
733
734 if (m->status & MCI_STATUS_SYNDV)
735 m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
736 }
737}
738
739DEFINE_PER_CPU(unsigned, mce_poll_count);
740
d88203d1 741/*
742 * Poll for corrected events or events that happened before reset.
743 * Those are just logged through /dev/mcelog.
744 *
745 * This is executed in standard interrupt context.
746 *
747 * Note: the spec recommends panicking for fatal unsignalled
748 * errors here. However this would be quite problematic --
749 * we would need to reimplement the Monarch handling and
750 * it would mess up the exclusion between the exception handler
a97673a1 751 * and the poll handler -- so we skip this for now.
752 * These cases should not happen anyway, or only when the CPU
753 * is already totally confused. In this case it's likely it will
754 * not fully execute the machine check handler either.
b79109c3 755 */
3f2f0680 756bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
b79109c3 757{
b4914508 758 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
8b38937b 759 bool error_seen = false;
760 struct mce m;
761 int i;
762
c6ae41e7 763 this_cpu_inc(mce_poll_count);
ca84f696 764
b8325c5b 765 mce_gather_info(&m, NULL);
b79109c3 766
767 if (flags & MCP_TIMESTAMP)
768 m.tsc = rdtsc();
54467353 769
c7d314f3 770 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 771 if (!mce_banks[i].ctl || !test_bit(i, *b))
772 continue;
773
774 m.misc = 0;
775 m.addr = 0;
776 m.bank = i;
777
778 barrier();
d9d73fcc 779 m.status = mce_rdmsrl(msr_ops.status(i));
780
781 /* If this entry is not valid, ignore it */
782 if (!(m.status & MCI_STATUS_VAL))
783 continue;
784
785 /*
786 * If we are logging everything (at CPU online) or this
787 * is a corrected error, then we must log it.
b79109c3 788 */
789 if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
790 goto log_it;
791
792 /*
793 * Newer Intel systems that support software error
794 * recovery need to make additional checks. Other
795 * CPUs should skip over uncorrected errors, but log
796 * everything else.
797 */
798 if (!mca_cfg.ser) {
799 if (m.status & MCI_STATUS_UC)
800 continue;
801 goto log_it;
802 }
803
804 /* Log "not enabled" (speculative) errors */
805 if (!(m.status & MCI_STATUS_EN))
806 goto log_it;
807
808 /*
809 * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
810 * UC == 1 && PCC == 0 && S == 0
811 */
812 if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
813 goto log_it;
814
815 /*
816 * Skip anything else. Presumption is that our read of this
817 * bank is racing with a machine check. Leave the log alone
818 * for do_machine_check() to deal with it.
819 */
820 continue;
b79109c3 821
f19501aa 822log_it:
823 error_seen = true;
824
85f92694 825 mce_read_aux(&m, i);
b79109c3 826
e2de64ec 827 m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
fa92c586 828
829 /*
830 * Don't get the IP here because it's unlikely to
831 * have anything to do with the actual error location.
832 */
8b38937b 833 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
5679af4c 834 mce_log(&m);
c0ec382e 835 else if (mce_usable_address(&m)) {
836 /*
837 * Although we skipped logging this, we still want
838 * to take action. Add to the pool so the registered
839 * notifiers will see it.
840 */
841 if (!mce_gen_pool_add(&m))
842 mce_schedule_work();
3f2f0680 843 }
844
845 /*
846 * Clear state for this bank.
847 */
d9d73fcc 848 mce_wrmsrl(msr_ops.status(i), 0);
849 }
850
851 /*
852 * Don't clear MCG_STATUS here because it's only defined for
853 * exceptions.
854 */
855
856 sync_core();
3f2f0680 857
8b38937b 858 return error_seen;
b79109c3 859}
ea149b36 860EXPORT_SYMBOL_GPL(machine_check_poll);
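/*
 * Within this file, machine_check_poll() is driven by the periodic timer
 * (mce_timer_fn) and, with MCP_UC, by __mcheck_cpu_init_generic() to pick up
 * errors left over from before the last reset; the CMCI interrupt handler in
 * mce/intel.c reuses it as well.
 */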
b79109c3 861
862/*
863 * Do a quick check if any of the events requires a panic.
864 * This decides if we keep the events around or clear them.
865 */
866static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
867 struct pt_regs *regs)
bd19a5e6 868{
17fea54b 869 char *tmp;
1f74c8a6 870 int i;
bd19a5e6 871
c7d314f3 872 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
d9d73fcc 873 m->status = mce_rdmsrl(msr_ops.status(i));
874 if (!(m->status & MCI_STATUS_VAL))
875 continue;
876
877 __set_bit(i, validp);
878 if (quirk_no_way_out)
879 quirk_no_way_out(i, m, regs);
17fea54b 880
e350b540 881 m->bank = i;
17fea54b 882 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
1f74c8a6 883 mce_read_aux(m, i);
17fea54b 884 *msg = tmp;
1f74c8a6 885 return 1;
17fea54b 886 }
bd19a5e6 887 }
1f74c8a6 888 return 0;
889}
890
891/*
892 * Variable to establish order between CPUs while scanning.
893 * Each CPU spins initially until executing is equal its number.
894 */
895static atomic_t mce_executing;
896
897/*
898 * Defines order of CPUs on entry. First CPU becomes Monarch.
899 */
900static atomic_t mce_callin;
901
902/*
903 * Check if a timeout waiting for other CPUs happened.
904 */
6c80f87e 905static int mce_timed_out(u64 *t, const char *msg)
906{
907 /*
908 * The others already did panic for some reason.
909 * Bail out like in a timeout.
910 * rmb() to tell the compiler that system_state
911 * might have been modified by someone else.
912 */
913 rmb();
c7c9b392 914 if (atomic_read(&mce_panicked))
3c079792 915 wait_for_panic();
84c2559d 916 if (!mca_cfg.monarch_timeout)
917 goto out;
918 if ((s64)*t < SPINUNIT) {
716079f6 919 if (mca_cfg.tolerant <= 1)
6c80f87e 920 mce_panic(msg, NULL, NULL);
921 cpu_missing = 1;
922 return 1;
923 }
924 *t -= SPINUNIT;
925out:
926 touch_nmi_watchdog();
927 return 0;
928}
929
930/*
931 * The Monarch's reign. The Monarch is the CPU who entered
932 * the machine check handler first. It waits for the others to
933 * raise the exception too and then grades them. When any
934 * error is fatal panic. Only then let the others continue.
935 *
936 * The other CPUs entering the MCE handler will be controlled by the
937 * Monarch. They are called Subjects.
938 *
939 * This way we prevent any potential data corruption in an unrecoverable case
940 * and also makes sure all CPUs' errors are always examined.
941 *
680b6cfd 942 * Also this detects the case of a machine check event coming from outer
943 * space (not detected by any CPUs). In this case some external agent wants
944 * us to shut down, so panic too.
945 *
946 * The other CPUs might still decide to panic if the handler happens
947 * in an unrecoverable place, but in this case the system is in a semi-stable
948 * state and won't corrupt anything by itself. It's ok to let the others
949 * continue for a bit first.
950 *
951 * All the spin loops have timeouts; when a timeout happens a CPU
952 * typically elects itself to be Monarch.
953 */
954static void mce_reign(void)
955{
956 int cpu;
957 struct mce *m = NULL;
958 int global_worst = 0;
959 char *msg = NULL;
960 char *nmsg = NULL;
961
962 /*
963 * This CPU is the Monarch and the other CPUs have run
964 * through their handlers.
965 * Grade the severity of the errors of all the CPUs.
966 */
967 for_each_possible_cpu(cpu) {
968 int severity = mce_severity(&per_cpu(mces_seen, cpu),
969 mca_cfg.tolerant,
e3480271 970 &nmsg, true);
971 if (severity > global_worst) {
972 msg = nmsg;
973 global_worst = severity;
974 m = &per_cpu(mces_seen, cpu);
975 }
976 }
977
978 /*
979 * Cannot recover? Panic here then.
980 * This dumps all the mces in the log buffer and stops the
981 * other CPUs.
982 */
d203f0b8 983 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
8af7043a 984 mce_panic("Fatal machine check", m, msg);
985
986 /*
987 * For UC somewhere we let the CPU who detects it handle it.
988 * Also must let continue the others, otherwise the handling
989 * CPU could deadlock on a lock.
990 */
991
992 /*
993 * No machine check event found. Must be some external
994 * source or one CPU is hung. Panic.
995 */
d203f0b8 996 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
8af7043a 997 mce_panic("Fatal machine check from unknown source", NULL, NULL);
998
999 /*
1000 * Now clear all the mces_seen so that they don't reappear on
1001 * the next mce.
1002 */
1003 for_each_possible_cpu(cpu)
1004 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
1005}
1006
1007static atomic_t global_nwo;
1008
1009/*
1010 * Start of Monarch synchronization. This waits until all CPUs have
1011 * entered the exception handler and then determines if any of them
1012 * saw a fatal event that requires panic. Then it executes them
1013 * in the entry order.
1014 * TBD double check parallel CPU hotunplug
1015 */
7fb06fc9 1016static int mce_start(int *no_way_out)
3c079792 1017{
7fb06fc9 1018 int order;
3c079792 1019 int cpus = num_online_cpus();
84c2559d 1020 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
3c079792 1021
1022 if (!timeout)
1023 return -1;
3c079792 1024
7fb06fc9 1025 atomic_add(*no_way_out, &global_nwo);
184e1fdf 1026 /*
1027 * Rely on the implied barrier below, such that global_nwo
1028 * is updated before mce_callin.
184e1fdf 1029 */
a95436e4 1030 order = atomic_inc_return(&mce_callin);
1031
1032 /*
1033 * Wait for everyone.
1034 */
1035 while (atomic_read(&mce_callin) != cpus) {
1036 if (mce_timed_out(&timeout,
1037 "Timeout: Not all CPUs entered broadcast exception handler")) {
3c079792 1038 atomic_set(&global_nwo, 0);
7fb06fc9 1039 return -1;
1040 }
1041 ndelay(SPINUNIT);
1042 }
1043
1044 /*
1045 * mce_callin should be read before global_nwo
1046 */
1047 smp_rmb();
3c079792 1048
1049 if (order == 1) {
1050 /*
1051 * Monarch: Starts executing now, the others wait.
1052 */
3c079792 1053 atomic_set(&mce_executing, 1);
1054 } else {
1055 /*
1056 * Subject: Now start the scanning loop one by one in
1057 * the original callin order.
1058 * This way when there are any shared banks it will be
1059 * only seen by one CPU before cleared, avoiding duplicates.
1060 */
1061 while (atomic_read(&mce_executing) < order) {
1062 if (mce_timed_out(&timeout,
1063 "Timeout: Subject CPUs unable to finish machine check processing")) {
1064 atomic_set(&global_nwo, 0);
1065 return -1;
1066 }
1067 ndelay(SPINUNIT);
1068 }
1069 }
1070
1071 /*
7fb06fc9 1072 * Cache the global no_way_out state.
3c079792 1073 */
1074 *no_way_out = atomic_read(&global_nwo);
1075
1076 return order;
1077}
1078
1079/*
1080 * Synchronize between CPUs after main scanning loop.
1081 * This invokes the bulk of the Monarch processing.
1082 */
5fdb493d 1083static noinstr int mce_end(int order)
3c079792 1084{
84c2559d 1085 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1086 int ret = -1;
1087
1088 /* Allow instrumentation around external facilities. */
1089 instrumentation_begin();
3c079792
AK
1090
1091 if (!timeout)
1092 goto reset;
1093 if (order < 0)
1094 goto reset;
1095
1096 /*
1097 * Allow others to run.
1098 */
1099 atomic_inc(&mce_executing);
1100
1101 if (order == 1) {
1102 /* CHECKME: Can this race with a parallel hotplug? */
1103 int cpus = num_online_cpus();
1104
1105 /*
1106 * Monarch: Wait for everyone to go through their scanning
1107 * loops.
1108 */
1109 while (atomic_read(&mce_executing) <= cpus) {
1110 if (mce_timed_out(&timeout,
1111 "Timeout: Monarch CPU unable to finish machine check processing"))
1112 goto reset;
1113 ndelay(SPINUNIT);
1114 }
1115
1116 mce_reign();
1117 barrier();
1118 ret = 0;
1119 } else {
1120 /*
1121 * Subject: Wait for Monarch to finish.
1122 */
1123 while (atomic_read(&mce_executing) != 0) {
1124 if (mce_timed_out(&timeout,
1125 "Timeout: Monarch CPU did not finish machine check processing"))
1126 goto reset;
1127 ndelay(SPINUNIT);
1128 }
1129
1130 /*
1131 * Don't reset anything. That's done by the Monarch.
1132 */
1133 ret = 0;
1134 goto out;
1135 }
1136
1137 /*
1138 * Reset all global state.
1139 */
1140reset:
1141 atomic_set(&global_nwo, 0);
1142 atomic_set(&mce_callin, 0);
1143 barrier();
1144
1145 /*
1146 * Let others run again.
1147 */
1148 atomic_set(&mce_executing, 0);
1149
1150out:
1151 instrumentation_end();
1152
1153 return ret;
1154}
1155
1156static void mce_clear_state(unsigned long *toclear)
1157{
1158 int i;
1159
c7d314f3 1160 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
3c079792 1161 if (test_bit(i, toclear))
d9d73fcc 1162 mce_wrmsrl(msr_ops.status(i), 0);
3c079792
AK
1163 }
1164}
1165
1166static int do_memory_failure(struct mce *m)
1167{
1168 int flags = MF_ACTION_REQUIRED;
1169 int ret;
1170
1171 pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1172 if (!(m->mcgstatus & MCG_STATUS_RIPV))
1173 flags |= MF_MUST_KILL;
83b57531 1174 ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1175 if (ret)
1176 pr_err("Memory error not recovered");
fd0e786d 1177 else
68da5df9 1178 set_mce_nospec(m->addr >> PAGE_SHIFT, whole_page(m));
1179 return ret;
1180}
1181
1182
1183/*
1184 * Cases where we avoid rendezvous handler timeout:
1185 * 1) If this CPU is offline.
1186 *
1187 * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1188 * skip those CPUs which remain looping in the 1st kernel - see
1189 * crash_nmi_callback().
1190 *
1191 * Note: there still is a small window between kexec-ing and the new,
1192 * kdump kernel establishing a new #MC handler where a broadcasted MCE
1193 * might not get handled properly.
1194 */
1195static bool __mc_check_crashing_cpu(int cpu)
1196{
1197 if (cpu_is_offline(cpu) ||
1198 (crashing_cpu != -1 && crashing_cpu != cpu)) {
1199 u64 mcgstatus;
1200
1201 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1202 if (mcgstatus & MCG_STATUS_RIPV) {
1203 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1204 return true;
1205 }
1206 }
1207 return false;
1208}
1209
1210static void __mc_scan_banks(struct mce *m, struct mce *final,
1211 unsigned long *toclear, unsigned long *valid_banks,
1212 int no_way_out, int *worst)
1213{
b4914508 1214 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1215 struct mca_config *cfg = &mca_cfg;
1216 int severity, i;
1217
c7d314f3 1218 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1219 __clear_bit(i, toclear);
1220 if (!test_bit(i, valid_banks))
1221 continue;
d5c84ef2 1222
1223 if (!mce_banks[i].ctl)
1224 continue;
1225
1226 m->misc = 0;
1227 m->addr = 0;
1228 m->bank = i;
1229
1230 m->status = mce_rdmsrl(msr_ops.status(i));
d5c84ef2 1231 if (!(m->status & MCI_STATUS_VAL))
1232 continue;
1233
1234 /*
1235 * Corrected or non-signaled errors are handled by
1236 * machine_check_poll(). Leave them alone, unless this panics.
1237 */
1238 if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1239 !no_way_out)
1240 continue;
1241
d5c84ef2 1242 /* Set taint even when machine check was not enabled. */
1243 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1244
1245 severity = mce_severity(m, cfg->tolerant, NULL, true);
1246
1247 /*
1248 * When machine check was for corrected/deferred handler don't
d5c84ef2 1249 * touch, unless we're panicking.
1250 */
1251 if ((severity == MCE_KEEP_SEVERITY ||
1252 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1253 continue;
d5c84ef2 1254
f35565e3 1255 __set_bit(i, toclear);
1256
1257 /* Machine check event was not enabled. Clear, but ignore. */
1258 if (severity == MCE_NO_SEVERITY)
f35565e3 1259 continue;
1260
1261 mce_read_aux(m, i);
1262
1263 /* assuming valid severity level != 0 */
1264 m->severity = severity;
1265
1266 mce_log(m);
1267
1268 if (severity > *worst) {
1269 *final = *m;
1270 *worst = severity;
1271 }
1272 }
1273
1274 /* mce_clear_state will clear *final, save locally for use later */
1275 *m = *final;
1276}
1277
1278/*
1279 * The actual machine check handler. This only handles real
1280 * exceptions when something got corrupted coming in through int 18.
1281 *
1282 * This is executed in NMI context not subject to normal locking rules. This
1283 * implies that most kernel services cannot be safely used. Don't even
1284 * think about putting a printk in there!
1285 *
1286 * On Intel systems this is entered on all CPUs in parallel through
1287 * MCE broadcast. However some CPUs might be broken beyond repair,
1288 * so be always careful when synchronizing with others.
1da177e4 1289 */
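/*
 * Rough flow of the handler below: gather global state, use mce_no_way_out()
 * to check whether recovery is possible at all, rendezvous with the other
 * CPUs via mce_start()/mce_end() unless this is a local MCE, scan the banks
 * in __mc_scan_banks(), and finally either panic or hand recoverable
 * user-mode faults to memory_failure().
 */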
e9eee03e 1290void do_machine_check(struct pt_regs *regs, long error_code)
1da177e4 1291{
1292 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1293 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1462594b 1294 struct mca_config *cfg = &mca_cfg;
1295 int cpu = smp_processor_id();
1296 char *msg = "Unknown";
3c079792 1297 struct mce m, *final;
3c079792 1298 int worst = 0;
fead35c6 1299
1300 /*
1301 * Establish sequential order between the CPUs entering the machine
1302 * check handler.
1303 */
fead35c6 1304 int order = -1;
d3d6923c 1305
1306 /*
1307 * If no_way_out gets set, there is no safe way to recover from this
d203f0b8 1308 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
1309 */
1310 int no_way_out = 0;
d3d6923c 1311
1312 /*
1313 * If kill_it gets set, there might be a way to recover from this
1314 * error.
1315 */
1316 int kill_it = 0;
1317
1318 /*
1319 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1320 * on Intel.
1321 */
1322 int lmce = 1;
1da177e4 1323
1324 if (__mc_check_crashing_cpu(cpu))
1325 return;
d90167a9 1326
8c84014f 1327 ist_enter(regs);
95927475 1328
c6ae41e7 1329 this_cpu_inc(mce_exception_count);
01ca79f1 1330
b8325c5b 1331 mce_gather_info(&m, regs);
669c00f0 1332 m.tsc = rdtsc();
b5f2fa4e 1333
89cbc767 1334 final = this_cpu_ptr(&mces_seen);
1335 *final = m;
1336
95022b8c 1337 memset(valid_banks, 0, sizeof(valid_banks));
61b0fccd 1338 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
680b6cfd 1339
1340 barrier();
1341
ed7290d0 1342 /*
1343 * When no restart IP might need to kill or panic.
1344 * Assume the worst for now, but if we find the
1345 * severity is MCE_AR_SEVERITY we have other options.
1346 */
1347 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1348 kill_it = 1;
1349
3c079792 1350 /*
1351 * Check if this MCE is signaled to only this logical processor,
1352 * on Intel only.
3c079792 1353 */
1354 if (m.cpuvendor == X86_VENDOR_INTEL)
1355 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1356
1357 /*
1358 * Local machine check may already know that we have to panic.
1359 * Broadcast machine check begins rendezvous in mce_start()
fead35c6
YG
1360 * Go through all banks in exclusion of the other CPUs. This way we
1361 * don't report duplicated events on shared banks because the first one
40c36e27 1362 * to see it will clear it.
fead35c6 1363 */
1364 if (lmce) {
1365 if (no_way_out)
1366 mce_panic("Fatal local machine check", &m, msg);
1367 } else {
243d657e 1368 order = mce_start(&no_way_out);
40c36e27 1369 }
243d657e 1370
f35565e3 1371 __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
a8c321fb 1372
1373 if (!no_way_out)
1374 mce_clear_state(toclear);
1375
e9eee03e 1376 /*
1377 * Do most of the synchronization with other CPUs.
1378 * When there's any problem use only local no_way_out state.
e9eee03e 1379 */
243d657e 1380 if (!lmce) {
1381 if (mce_end(order) < 0) {
1382 if (!no_way_out)
1383 no_way_out = worst >= MCE_PANIC_SEVERITY;
1384 }
1385 } else {
1386 /*
1387 * If there was a fatal machine check we should have
1388 * already called mce_panic earlier in this function.
1389 * Since we re-read the banks, we might have found
1390 * something new. Check again to see if we found a
1391 * fatal error. We call "mce_severity()" again to
1392 * make sure we have the right "msg".
243d657e 1393 */
1394 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1395 mce_severity(&m, cfg->tolerant, &msg, true);
1396 mce_panic("Local fatal machine check!", &m, msg);
1397 }
243d657e 1398 }
1399
1400 /*
1401 * If tolerant is at an insane level we drop requests to kill
1402 * processes and continue even when there is no way out.
bd78432c 1403 */
1404 if (cfg->tolerant == 3)
1405 kill_it = 0;
1406 else if (no_way_out)
1407 mce_panic("Fatal machine check on current CPU", &m, msg);
e02e68d3 1408
3c079792 1409 if (worst > 0)
1410 irq_work_queue(&mce_irq_work);
1411
5f8c1a54 1412 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
45deca7d 1413
88921be3 1414 sync_core();
d4812e16 1415
1416 if (worst != MCE_AR_SEVERITY && !kill_it)
1417 goto out_ist;
d4812e16 1418
1419 /* Fault was in user mode and we need to take some action */
1420 if ((m.cs & 3) == 3) {
1421 ist_begin_non_atomic(regs);
1422 local_irq_enable();
1423
1424 if (kill_it || do_memory_failure(&m))
3cf5d076 1425 force_sig(SIGBUS);
1426 local_irq_disable();
1427 ist_end_non_atomic();
1428 } else {
81fd9c18 1429 if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
b2f9d678 1430 mce_panic("Failed kernel mode recovery", &m, NULL);
d4812e16 1431 }
1432
1433out_ist:
8c84014f 1434 ist_exit(regs);
1da177e4 1435}
ea149b36 1436EXPORT_SYMBOL_GPL(do_machine_check);
1da177e4 1437
cd42f4a3 1438#ifndef CONFIG_MEMORY_FAILURE
83b57531 1439int memory_failure(unsigned long pfn, int flags)
9b1beaf2 1440{
1441 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1442 BUG_ON(flags & MF_ACTION_REQUIRED);
1443 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1444 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1445 pfn);
1446
1447 return 0;
9b1beaf2 1448}
cd42f4a3 1449#endif
9b1beaf2 1450
1da177e4 1451/*
1452 * Periodic polling timer for "silent" machine check errors. If the
1453 * poller finds an MCE, poll 2x faster. When the poller finds no more
1454 * errors, poll 2x slower (up to check_interval seconds).
1da177e4 1455 */
3f2f0680 1456static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
e9eee03e 1457
82f7af09 1458static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
52d168e2 1459static DEFINE_PER_CPU(struct timer_list, mce_timer);
1da177e4 1460
1461static unsigned long mce_adjust_timer_default(unsigned long interval)
1462{
1463 return interval;
1464}
1465
3f2f0680 1466static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
55babd8f 1467
0becc0ae 1468static void __start_timer(struct timer_list *t, unsigned long interval)
27f6c573 1469{
1470 unsigned long when = jiffies + interval;
1471 unsigned long flags;
27f6c573 1472
3f2f0680 1473 local_irq_save(flags);
27f6c573 1474
1475 if (!timer_pending(t) || time_before(when, t->expires))
1476 mod_timer(t, round_jiffies(when));
1477
1478 local_irq_restore(flags);
1479}
1480
92bb6cb1 1481static void mce_timer_fn(struct timer_list *t)
1da177e4 1482{
92bb6cb1 1483 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
82f7af09 1484 unsigned long iv;
52d168e2 1485
92bb6cb1 1486 WARN_ON(cpu_t != t);
1487
1488 iv = __this_cpu_read(mce_next_interval);
52d168e2 1489
89cbc767 1490 if (mce_available(this_cpu_ptr(&cpu_info))) {
54467353 1491 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1492
1493 if (mce_intel_cmci_poll()) {
1494 iv = mce_adjust_timer(iv);
1495 goto done;
1496 }
e9eee03e 1497 }
1498
1499 /*
1500 * Alert userspace if needed. If we logged an MCE, reduce the polling
1501 * interval, otherwise increase the polling interval.
1da177e4 1502 */
3f2f0680 1503 if (mce_notify_irq())
958fb3c5 1504 iv = max(iv / 2, (unsigned long) HZ/100);
3f2f0680 1505 else
82f7af09 1506 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1507
1508done:
82f7af09 1509 __this_cpu_write(mce_next_interval, iv);
0becc0ae 1510 __start_timer(t, iv);
55babd8f 1511}
e02e68d3 1512
1513/*
1514 * Ensure that the timer is firing in @interval from now.
1515 */
1516void mce_timer_kick(unsigned long interval)
1517{
89cbc767 1518 struct timer_list *t = this_cpu_ptr(&mce_timer);
1519 unsigned long iv = __this_cpu_read(mce_next_interval);
1520
0becc0ae 1521 __start_timer(t, interval);
3f2f0680 1522
1523 if (interval < iv)
1524 __this_cpu_write(mce_next_interval, interval);
1525}
1526
1527/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1528static void mce_timer_delete_all(void)
1529{
1530 int cpu;
1531
1532 for_each_online_cpu(cpu)
1533 del_timer_sync(&per_cpu(mce_timer, cpu));
1534}
1535
e02e68d3 1536/*
1537 * Notify the user(s) about new machine check events.
1538 * Can be called from interrupt context, but not from machine check/NMI
1539 * context.
e02e68d3 1540 */
9ff36ee9 1541int mce_notify_irq(void)
e02e68d3 1542{
1543 /* Not more than two messages every minute */
1544 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1545
1020bcbc 1546 if (test_and_clear_bit(0, &mce_need_notify)) {
5de97c9f 1547 mce_work_trigger();
e02e68d3 1548
8457c84d 1549 if (__ratelimit(&ratelimit))
a2d7b0d4 1550 pr_info(HW_ERR "Machine check events logged\n");
1551
1552 return 1;
1da177e4 1553 }
1554 return 0;
1555}
9ff36ee9 1556EXPORT_SYMBOL_GPL(mce_notify_irq);
8a336b0a 1557
b4914508 1558static void __mcheck_cpu_mce_banks_init(void)
cebe1820 1559{
b4914508 1560 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
c7d314f3 1561 u8 n_banks = this_cpu_read(mce_num_banks);
1562 int i;
1563
c7d314f3 1564 for (i = 0; i < n_banks; i++) {
cebe1820 1565 struct mce_bank *b = &mce_banks[i];
11868a2d 1566
1567 /*
1568 * Init them all, __mcheck_cpu_apply_quirks() is going to apply
1569 * the required vendor quirks before
1570 * __mcheck_cpu_init_clear_banks() does the final bank setup.
1571 */
1572 b->ctl = -1ULL;
1573 b->init = 1;
1574 }
1575}
1576
d88203d1 1577/*
1578 * Initialize Machine Checks for a CPU.
1579 */
b4914508 1580static void __mcheck_cpu_cap_init(void)
1da177e4 1581{
e9eee03e 1582 u64 cap;
006c0770 1583 u8 b;
1584
1585 rdmsrl(MSR_IA32_MCG_CAP, cap);
1586
1587 b = cap & MCG_BANKCNT_MASK;
1588
1589 if (b > MAX_NR_BANKS) {
1590 pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
1591 smp_processor_id(), MAX_NR_BANKS, b);
0d7482e3 1592 b = MAX_NR_BANKS;
c7d314f3 1593 }
0d7482e3 1594
c7d314f3 1595 this_cpu_write(mce_num_banks, b);
d203f0b8 1596
b4914508 1597 __mcheck_cpu_mce_banks_init();
0d7482e3 1598
94ad8474 1599 /* Use accurate RIP reporting if available. */
01c6680a 1600 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
84c2559d 1601 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1da177e4 1602
ed7290d0 1603 if (cap & MCG_SER_P)
09933946 1604 mca_cfg.ser = 1;
0d7482e3
AK
1605}
1606
5e09954a 1607static void __mcheck_cpu_init_generic(void)
0d7482e3 1608{
84c2559d 1609 enum mcp_flags m_fl = 0;
e9eee03e 1610 mce_banks_t all_banks;
0d7482e3 1611 u64 cap;
0d7482e3 1612
1613 if (!mca_cfg.bootlog)
1614 m_fl = MCP_DONTLOG;
1615
1616 /*
1617 * Log the machine checks left over from the previous reset.
1618 */
ee031c31 1619 bitmap_fill(all_banks, MAX_NR_BANKS);
84c2559d 1620 machine_check_poll(MCP_UC | m_fl, &all_banks);
1da177e4 1621
375074cc 1622 cr4_set_bits(X86_CR4_MCE);
1da177e4 1623
0d7482e3 1624 rdmsrl(MSR_IA32_MCG_CAP, cap);
1625 if (cap & MCG_CTL_P)
1626 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1627}
1628
1629static void __mcheck_cpu_init_clear_banks(void)
1630{
b4914508 1631 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
bb91f8c0 1632 int i;
1da177e4 1633
c7d314f3 1634 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 1635 struct mce_bank *b = &mce_banks[i];
11868a2d 1636
cebe1820 1637 if (!b->init)
06b7a7a5 1638 continue;
1639 wrmsrl(msr_ops.ctl(i), b->ctl);
1640 wrmsrl(msr_ops.status(i), 0);
d88203d1 1641 }
1642}
1643
1644/*
1645 * Do a final check to see if there are any unused/RAZ banks.
1646 *
1647 * This must be done after the banks have been initialized and any quirks have
1648 * been applied.
1649 *
1650 * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs.
1651 * Otherwise, a user who disables a bank will not be able to re-enable it
1652 * without a system reboot.
1653 */
1654static void __mcheck_cpu_check_banks(void)
1655{
1656 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1657 u64 msrval;
1658 int i;
1659
1660 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1661 struct mce_bank *b = &mce_banks[i];
1662
1663 if (!b->init)
1664 continue;
1665
1666 rdmsrl(msr_ops.ctl(i), msrval);
1667 b->init = !!msrval;
1668 }
1669}
1670
1671/*
1672 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1673 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1674 * Vol 3B Table 15-20). But this confuses both the code that determines
1675 * whether the machine check occurred in kernel or user mode, and also
1676 * the severity assessment code. Pretend that EIPV was set, and take the
1677 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1678 */
1679static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1680{
1681 if (bank != 0)
1682 return;
1683 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1684 return;
1685 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1686 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1687 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1688 MCACOD)) !=
1689 (MCI_STATUS_UC|MCI_STATUS_EN|
1690 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1691 MCI_STATUS_AR|MCACOD_INSTR))
1692 return;
1693
1694 m->mcgstatus |= MCG_STATUS_EIPV;
1695 m->ip = regs->ip;
1696 m->cs = regs->cs;
1697}
1698
1da177e4 1699/* Add per CPU specific workarounds here */
148f9bb8 1700static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
d88203d1 1701{
b4914508 1702 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1703 struct mca_config *cfg = &mca_cfg;
1704
e412cd25 1705 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
c767a54b 1706 pr_info("unknown CPU type - not enabling MCE support\n");
1707 return -EOPNOTSUPP;
1708 }
1709
1da177e4 1710 /* This should be disabled by the BIOS, but isn't always */
911f6a7b 1711 if (c->x86_vendor == X86_VENDOR_AMD) {
c7d314f3 1712 if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
1713 /*
1714 * disable GART TBL walk error reporting, which
1715 * trips off incorrectly with the IOMMU & 3ware
1716 * & Cerberus:
1717 */
cebe1820 1718 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
e9eee03e 1719 }
6057077f 1720 if (c->x86 < 0x11 && cfg->bootlog < 0) {
1721 /*
1722 * Lots of broken BIOS around that don't clear them
1723 * by default and leave crap in there. Don't log:
1724 */
84c2559d 1725 cfg->bootlog = 0;
e9eee03e 1726 }
1727 /*
1728 * Various K7s with broken bank 0 around. Always disable
1729 * by default.
1730 */
c7d314f3 1731 if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
cebe1820 1732 mce_banks[0].ctl = 0;
575203b4 1733
1734 /*
1735 * overflow_recov is supported for F15h Models 00h-0fh
1736 * even though we don't have a CPUID bit for it.
1737 */
1738 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1739 mce_flags.overflow_recov = 1;
1740
1da177e4 1741 }
e583538f 1742
1743 if (c->x86_vendor == X86_VENDOR_INTEL) {
1744 /*
1745 * SDM documents that on family 6 bank 0 should not be written
1746 * because it aliases to another special BIOS controlled
1747 * register.
1748 * But it's not aliased anymore on model 0x1a+
1749 * Don't ignore bank 0 completely because there could be a
1750 * valid event later, merely don't write CTL0.
1751 */
1752
c7d314f3 1753 if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
cebe1820 1754 mce_banks[0].init = 0;
1755
1756 /*
1757 * All newer Intel systems support MCE broadcasting. Enable
1758 * synchronization with a one second timeout.
1759 */
1760 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1761 cfg->monarch_timeout < 0)
1762 cfg->monarch_timeout = USEC_PER_SEC;
c7f6fa44 1763
1764 /*
1765 * There are also broken BIOSes on some Pentium M and
1766 * earlier systems:
1767 */
1768 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1769 cfg->bootlog = 0;
1770
1771 if (c->x86 == 6 && c->x86_model == 45)
1772 quirk_no_way_out = quirk_sandybridge_ifu;
06b7a7a5 1773 }
1774 if (cfg->monarch_timeout < 0)
1775 cfg->monarch_timeout = 0;
1776 if (cfg->bootlog != 0)
7af19e4a 1777 cfg->panic_timeout = 30;
1778
1779 return 0;
d88203d1 1780}
1da177e4 1781
148f9bb8 1782static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1783{
1784 if (c->x86 != 5)
1785 return 0;
1786
1787 switch (c->x86_vendor) {
1788 case X86_VENDOR_INTEL:
c6978369 1789 intel_p5_mcheck_init(c);
3a97fc34 1790 return 1;
4efc0670
AK
1791 break;
1792 case X86_VENDOR_CENTAUR:
1793 winchip_mcheck_init(c);
3a97fc34 1794 return 1;
4efc0670 1795 break;
dc34bdd2
BP
1796 default:
1797 return 0;
4efc0670 1798 }
3a97fc34
HS
1799
1800 return 0;
4efc0670
AK
1801}
1802
5204bf17
YG
1803/*
1804 * Init basic CPU features needed for early decoding of MCEs.
1805 */
1806static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1da177e4 1807{
ac78bd72 1808 if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
14cddfd5
YG
1809 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1810 mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
1811 mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
d9d73fcc 1812
d9d73fcc
YG
1813 if (mce_flags.smca) {
1814 msr_ops.ctl = smca_ctl_reg;
1815 msr_ops.status = smca_status_reg;
1816 msr_ops.addr = smca_addr_reg;
1817 msr_ops.misc = smca_misc_reg;
1818 }
5204bf17
YG
1819 }
1820}
c7f54d21 1821
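/* Centaur/VIA setup: pick a default monarch timeout where broadcast MCE is supported. */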
13e85822
DW
1822static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1823{
1824 struct mca_config *cfg = &mca_cfg;
1825
1826 /*
1827 * All newer Centaur CPUs support MCE broadcasting. Enable
1828 * synchronization with a one second timeout.
1829 */
1830 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1831 c->x86 > 6) {
1832 if (cfg->monarch_timeout < 0)
1833 cfg->monarch_timeout = USEC_PER_SEC;
1834 }
1835}
1836
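/* Dispatch vendor-specific MCE feature initialization for this CPU. */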
5204bf17
YG
1837static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1838{
1839 switch (c->x86_vendor) {
1840 case X86_VENDOR_INTEL:
1841 mce_intel_feature_init(c);
1842 mce_adjust_timer = cmci_intel_adjust_timer;
1843 break;
c7f54d21 1844
5204bf17
YG
1845 case X86_VENDOR_AMD: {
1846 mce_amd_feature_init(c);
89b831ef 1847 break;
7559e13f 1848 }
ac78bd72
PW
1849
1850 case X86_VENDOR_HYGON:
1851 mce_hygon_feature_init(c);
1852 break;
1853
13e85822
DW
1854 case X86_VENDOR_CENTAUR:
1855 mce_centaur_feature_init(c);
1856 break;
7559e13f 1857
1da177e4
LT
1858 default:
1859 break;
1860 }
1861}
1862
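/* Undo vendor-specific MCE opt-ins; currently only Intel needs this. */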
8838eb6c
AR
1863static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1864{
1865 switch (c->x86_vendor) {
1866 case X86_VENDOR_INTEL:
1867 mce_intel_feature_clear(c);
1868 break;
1869 default:
1870 break;
1871 }
1872}
1873
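/* Arm the per-CPU polling timer unless corrected-error handling is disabled. */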
0becc0ae 1874static void mce_start_timer(struct timer_list *t)
52d168e2 1875{
4f75d841 1876 unsigned long iv = check_interval * HZ;
bc09effa 1877
7af19e4a 1878 if (mca_cfg.ignore_ce || !iv)
62fdac59
HS
1879 return;
1880
0becc0ae
TG
1881 this_cpu_write(mce_next_interval, iv);
1882 __start_timer(t, iv);
52d168e2
AK
1883}
1884
39f152ff
SAS
1885static void __mcheck_cpu_setup_timer(void)
1886{
1887 struct timer_list *t = this_cpu_ptr(&mce_timer);
39f152ff 1888
92bb6cb1 1889 timer_setup(t, mce_timer_fn, TIMER_PINNED);
39f152ff
SAS
1890}
1891
26c3c283
TG
1892static void __mcheck_cpu_init_timer(void)
1893{
89cbc767 1894 struct timer_list *t = this_cpu_ptr(&mce_timer);
26c3c283 1895
92bb6cb1 1896 timer_setup(t, mce_timer_fn, TIMER_PINNED);
0becc0ae 1897 mce_start_timer(t);
26c3c283
TG
1898}
1899
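/* Give vendor code a chance to drop MCE records that should not be reported. */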
45d4b7b9
YG
1900bool filter_mce(struct mce *m)
1901{
71a84402
YG
1902 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1903 return amd_filter_mce(m);
1904
45d4b7b9
YG
1905 return false;
1906}
1907
9eda8cb3
AK
1908/* Handle unconfigured int18 (should never happen) */
1909static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1910{
c767a54b 1911 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
9eda8cb3
AK
1912 smp_processor_id());
1913}
1914
1915/* Call the installed machine check handler for this CPU setup. */
1916void (*machine_check_vector)(struct pt_regs *, long error_code) =
1917 unexpected_machine_check;
1918
6f41c34d
TG
1919dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1920{
1921 machine_check_vector(regs, error_code);
1922}
1923
d88203d1 1924/*
1da177e4 1925 * Called for each booted CPU to set up machine checks.
e9eee03e 1926 * Must be called with preempt off:
1da177e4 1927 */
148f9bb8 1928void mcheck_cpu_init(struct cpuinfo_x86 *c)
1da177e4 1929{
1462594b 1930 if (mca_cfg.disabled)
4efc0670
AK
1931 return;
1932
3a97fc34
HS
1933 if (__mcheck_cpu_ancient_init(c))
1934 return;
4efc0670 1935
5b4408fd 1936 if (!mce_available(c))
1da177e4
LT
1937 return;
1938
b4914508
YG
1939 __mcheck_cpu_cap_init();
1940
1941 if (__mcheck_cpu_apply_quirks(c) < 0) {
09933946 1942 mca_cfg.disabled = 1;
0d7482e3
AK
1943 return;
1944 }
0d7482e3 1945
648ed940 1946 if (mce_gen_pool_init()) {
09933946 1947 mca_cfg.disabled = 1;
648ed940
CG
1948 pr_emerg("Couldn't allocate MCE records pool!\n");
1949 return;
1950 }
1951
5d727926
AK
1952 machine_check_vector = do_machine_check;
1953
5204bf17 1954 __mcheck_cpu_init_early(c);
5e09954a
BP
1955 __mcheck_cpu_init_generic();
1956 __mcheck_cpu_init_vendor(c);
bb91f8c0 1957 __mcheck_cpu_init_clear_banks();
068b053d 1958 __mcheck_cpu_check_banks();
39f152ff 1959 __mcheck_cpu_setup_timer();
1da177e4
LT
1960}
1961
8838eb6c
AR
1962/*
 1963 * Called for each booted CPU to clear some machine check opt-ins
1964 */
1965void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1966{
1967 if (mca_cfg.disabled)
1968 return;
1969
1970 if (!mce_available(c))
1971 return;
1972
1973 /*
 1974 * A hook for clearing x86-generic settings could go here, e.g.:
 1975 * __mcheck_cpu_clear_generic(c);
1976 */
1977 __mcheck_cpu_clear_vendor(c);
1978
1da177e4
LT
1979}
1980
c3d1fb56
NR
1981static void __mce_disable_bank(void *arg)
1982{
1983 int bank = *((int *)arg);
89cbc767 1984 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
c3d1fb56
NR
1985 cmci_disable_bank(bank);
1986}
1987
1988void mce_disable_bank(int bank)
1989{
c7d314f3 1990 if (bank >= this_cpu_read(mce_num_banks)) {
c3d1fb56
NR
1991 pr_warn(FW_BUG
1992 "Ignoring request to disable invalid MCA bank %d.\n",
1993 bank);
1994 return;
1995 }
1996 set_bit(bank, mce_banks_ce_disabled);
1997 on_each_cpu(__mce_disable_bank, &bank, 1);
1998}
1999
13503fa9 2000/*
62fdac59
HS
2001 * mce=off Disables machine check
2002 * mce=no_cmci Disables CMCI
88d53867 2003 * mce=no_lmce Disables LMCE
62fdac59
HS
2004 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2005 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
3c079792
AK
2006 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2007 * monarchtimeout is how long to wait for other CPUs on machine
2008 * check, or 0 to not wait
6057077f
YG
 2009 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD
 2010 * Fam10h and older.
13503fa9 2011 * mce=nobootlog Don't log MCEs from before booting.
450cc201 2012 * mce=bios_cmci_threshold Don't program the CMCI threshold
3637efb0 2013 * mce=recovery Force-enable memcpy_mcsafe()
13503fa9 2014 */
1da177e4
LT
2015static int __init mcheck_enable(char *str)
2016{
d203f0b8
BP
2017 struct mca_config *cfg = &mca_cfg;
2018
e3346fc4 2019 if (*str == 0) {
4efc0670 2020 enable_p5_mce();
e3346fc4
BZ
2021 return 1;
2022 }
4efc0670
AK
2023 if (*str == '=')
2024 str++;
1da177e4 2025 if (!strcmp(str, "off"))
09933946 2026 cfg->disabled = 1;
62fdac59 2027 else if (!strcmp(str, "no_cmci"))
7af19e4a 2028 cfg->cmci_disabled = true;
88d53867 2029 else if (!strcmp(str, "no_lmce"))
09933946 2030 cfg->lmce_disabled = 1;
62fdac59 2031 else if (!strcmp(str, "dont_log_ce"))
d203f0b8 2032 cfg->dont_log_ce = true;
62fdac59 2033 else if (!strcmp(str, "ignore_ce"))
7af19e4a 2034 cfg->ignore_ce = true;
13503fa9 2035 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
84c2559d 2036 cfg->bootlog = (str[0] == 'b');
450cc201 2037 else if (!strcmp(str, "bios_cmci_threshold"))
09933946 2038 cfg->bios_cmci_threshold = 1;
0f68c088 2039 else if (!strcmp(str, "recovery"))
09933946 2040 cfg->recovery = 1;
3c079792 2041 else if (isdigit(str[0])) {
5c31b280 2042 if (get_option(&str, &cfg->tolerant) == 2)
84c2559d 2043 get_option(&str, &(cfg->monarch_timeout));
3c079792 2044 } else {
c767a54b 2045 pr_info("mce argument %s ignored. Please use /sys\n", str);
13503fa9
HS
2046 return 0;
2047 }
9b41046c 2048 return 1;
1da177e4 2049}
4efc0670 2050__setup("mce", mcheck_enable);
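/*
 * Illustrative examples (not part of the original source): with the parser
 * above, "mce=2,500000" sets tolerant=2 and monarch_timeout to 500000 us,
 * while "mce=nobootlog" suppresses logging of MCEs left over from before boot.
 */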
1da177e4 2051
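/* Early MCE init: thermal setup, decoder notifiers, severity table, work items. */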
a2202aa2 2052int __init mcheck_init(void)
b33a6363 2053{
a2202aa2 2054 mcheck_intel_therm_init();
011d8261 2055 mce_register_decode_chain(&first_nb);
eef4dfa0 2056 mce_register_decode_chain(&mce_srao_nb);
cd9c57ca 2057 mce_register_decode_chain(&mce_default_nb);
43eaa2a1 2058 mcheck_vendor_init_severity();
a2202aa2 2059
cff4c039 2060 INIT_WORK(&mce_work, mce_gen_pool_process);
061120ae
CG
2061 init_irq_work(&mce_irq_work, mce_irq_work_cb);
2062
b33a6363
BP
2063 return 0;
2064}
b33a6363 2065
d88203d1 2066/*
c7cece89 2067 * mce_syscore: PM support
d88203d1 2068 */
1da177e4 2069
973a2dd1
AK
2070/*
2071 * Disable machine checks on suspend and shutdown. We can't really handle
2072 * them later.
2073 */
6e06780a 2074static void mce_disable_error_reporting(void)
973a2dd1 2075{
b4914508 2076 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
973a2dd1
AK
2077 int i;
2078
c7d314f3 2079 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 2080 struct mce_bank *b = &mce_banks[i];
11868a2d 2081
cebe1820 2082 if (b->init)
d9d73fcc 2083 wrmsrl(msr_ops.ctl(i), 0);
06b7a7a5 2084 }
6e06780a
AR
2085 return;
2086}
2087
2088static void vendor_disable_error_reporting(void)
2089{
2090 /*
ac78bd72
PW
2091 * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs
2092 * are socket-wide.
6e06780a
AR
2093 * Disabling them for just a single offlined CPU is bad, since it will
2094 * inhibit reporting for all shared resources on the socket like the
2095 * last level cache (LLC), the integrated memory controller (iMC), etc.
2096 */
ec338382 2097 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
ac78bd72 2098 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
ec338382 2099 boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
6e06780a
AR
2100 return;
2101
2102 mce_disable_error_reporting();
973a2dd1
AK
2103}
2104
c7cece89 2105static int mce_syscore_suspend(void)
973a2dd1 2106{
6e06780a
AR
2107 vendor_disable_error_reporting();
2108 return 0;
973a2dd1
AK
2109}
2110
c7cece89 2111static void mce_syscore_shutdown(void)
973a2dd1 2112{
6e06780a 2113 vendor_disable_error_reporting();
973a2dd1
AK
2114}
2115
e9eee03e
IM
2116/*
2117 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2118 * Only one CPU is active at this time, the others get re-added later using
2119 * CPU hotplug:
2120 */
c7cece89 2121static void mce_syscore_resume(void)
1da177e4 2122{
5e09954a 2123 __mcheck_cpu_init_generic();
89cbc767 2124 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
bb91f8c0 2125 __mcheck_cpu_init_clear_banks();
1da177e4
LT
2126}
2127
f3c6ea1b 2128static struct syscore_ops mce_syscore_ops = {
c7cece89
HS
2129 .suspend = mce_syscore_suspend,
2130 .shutdown = mce_syscore_shutdown,
2131 .resume = mce_syscore_resume,
f3c6ea1b
RW
2132};
2133
c7cece89 2134/*
8a25a2fd 2135 * mce_device: Sysfs support
c7cece89
HS
2136 */
2137
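/* Per-CPU callback used by mce_restart(): re-run generic, bank and timer init. */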
52d168e2
AK
2138static void mce_cpu_restart(void *data)
2139{
89cbc767 2140 if (!mce_available(raw_cpu_ptr(&cpu_info)))
33edbf02 2141 return;
5e09954a 2142 __mcheck_cpu_init_generic();
bb91f8c0 2143 __mcheck_cpu_init_clear_banks();
5e09954a 2144 __mcheck_cpu_init_timer();
52d168e2
AK
2145}
2146
1da177e4 2147/* Reinit MCEs after user configuration changes */
d88203d1
TG
2148static void mce_restart(void)
2149{
9aaef96f 2150 mce_timer_delete_all();
52d168e2 2151 on_each_cpu(mce_cpu_restart, NULL, 1);
1da177e4
LT
2152}
2153
9af43b54 2154/* Toggle features for corrected errors */
9aaef96f 2155static void mce_disable_cmci(void *data)
9af43b54 2156{
89cbc767 2157 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54 2158 return;
9af43b54
HS
2159 cmci_clear();
2160}
2161
2162static void mce_enable_ce(void *all)
2163{
89cbc767 2164 if (!mce_available(raw_cpu_ptr(&cpu_info)))
9af43b54
HS
2165 return;
2166 cmci_reenable();
2167 cmci_recheck();
2168 if (all)
5e09954a 2169 __mcheck_cpu_init_timer();
9af43b54
HS
2170}
2171
8a25a2fd 2172static struct bus_type mce_subsys = {
e9eee03e 2173 .name = "machinecheck",
8a25a2fd 2174 .dev_name = "machinecheck",
1da177e4
LT
2175};
2176
d6126ef5 2177DEFINE_PER_CPU(struct device *, mce_device);
e9eee03e 2178
b4914508 2179static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
cebe1820 2180{
b4914508 2181 return container_of(attr, struct mce_bank_dev, attr);
cebe1820 2182}
0d7482e3 2183
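/* sysfs show/store handlers for the shared per-bank "bankN" control attributes. */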
8a25a2fd 2184static ssize_t show_bank(struct device *s, struct device_attribute *attr,
0d7482e3
AK
2185 char *buf)
2186{
b4914508
YG
2187 u8 bank = attr_to_bank(attr)->bank;
2188 struct mce_bank *b;
2189
c7d314f3 2190 if (bank >= per_cpu(mce_num_banks, s->id))
b4914508
YG
2191 return -EINVAL;
2192
2193 b = &per_cpu(mce_banks_array, s->id)[bank];
2194
068b053d
YG
2195 if (!b->init)
2196 return -ENODEV;
2197
b4914508 2198 return sprintf(buf, "%llx\n", b->ctl);
0d7482e3
AK
2199}
2200
8a25a2fd 2201static ssize_t set_bank(struct device *s, struct device_attribute *attr,
9319cec8 2202 const char *buf, size_t size)
0d7482e3 2203{
b4914508
YG
2204 u8 bank = attr_to_bank(attr)->bank;
2205 struct mce_bank *b;
9319cec8 2206 u64 new;
e9eee03e 2207
164109e3 2208 if (kstrtou64(buf, 0, &new) < 0)
0d7482e3 2209 return -EINVAL;
e9eee03e 2210
c7d314f3 2211 if (bank >= per_cpu(mce_num_banks, s->id))
b4914508
YG
2212 return -EINVAL;
2213
2214 b = &per_cpu(mce_banks_array, s->id)[bank];
2215
068b053d
YG
2216 if (!b->init)
2217 return -ENODEV;
2218
b4914508 2219 b->ctl = new;
0d7482e3 2220 mce_restart();
e9eee03e 2221
9319cec8 2222 return size;
0d7482e3 2223}
a98f0dd3 2224
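/* sysfs store handler: toggle all corrected-error handling at run time. */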
8a25a2fd
KS
2225static ssize_t set_ignore_ce(struct device *s,
2226 struct device_attribute *attr,
9af43b54
HS
2227 const char *buf, size_t size)
2228{
2229 u64 new;
2230
164109e3 2231 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2232 return -EINVAL;
2233
b3b7c479 2234 mutex_lock(&mce_sysfs_mutex);
7af19e4a 2235 if (mca_cfg.ignore_ce ^ !!new) {
9af43b54
HS
2236 if (new) {
2237 /* disable ce features */
9aaef96f
HS
2238 mce_timer_delete_all();
2239 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2240 mca_cfg.ignore_ce = true;
9af43b54
HS
2241 } else {
2242 /* enable ce features */
7af19e4a 2243 mca_cfg.ignore_ce = false;
9af43b54
HS
2244 on_each_cpu(mce_enable_ce, (void *)1, 1);
2245 }
2246 }
b3b7c479
SH
2247 mutex_unlock(&mce_sysfs_mutex);
2248
9af43b54
HS
2249 return size;
2250}
2251
8a25a2fd
KS
2252static ssize_t set_cmci_disabled(struct device *s,
2253 struct device_attribute *attr,
9af43b54
HS
2254 const char *buf, size_t size)
2255{
2256 u64 new;
2257
164109e3 2258 if (kstrtou64(buf, 0, &new) < 0)
9af43b54
HS
2259 return -EINVAL;
2260
b3b7c479 2261 mutex_lock(&mce_sysfs_mutex);
7af19e4a 2262 if (mca_cfg.cmci_disabled ^ !!new) {
9af43b54
HS
2263 if (new) {
2264 /* disable cmci */
9aaef96f 2265 on_each_cpu(mce_disable_cmci, NULL, 1);
7af19e4a 2266 mca_cfg.cmci_disabled = true;
9af43b54
HS
2267 } else {
2268 /* enable cmci */
7af19e4a 2269 mca_cfg.cmci_disabled = false;
9af43b54
HS
2270 on_each_cpu(mce_enable_ce, NULL, 1);
2271 }
2272 }
b3b7c479
SH
2273 mutex_unlock(&mce_sysfs_mutex);
2274
9af43b54
HS
2275 return size;
2276}
2277
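/* Store a new check_interval and restart the MCE timers so it takes effect. */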
8a25a2fd
KS
2278static ssize_t store_int_with_restart(struct device *s,
2279 struct device_attribute *attr,
b56f642d
AK
2280 const char *buf, size_t size)
2281{
b3b7c479
SH
2282 unsigned long old_check_interval = check_interval;
2283 ssize_t ret = device_store_ulong(s, attr, buf, size);
2284
2285 if (check_interval == old_check_interval)
2286 return ret;
2287
b3b7c479 2288 mutex_lock(&mce_sysfs_mutex);
b56f642d 2289 mce_restart();
b3b7c479
SH
2290 mutex_unlock(&mce_sysfs_mutex);
2291
b56f642d
AK
2292 return ret;
2293}
2294
d203f0b8 2295static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
84c2559d 2296static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
d203f0b8 2297static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
e9eee03e 2298
8a25a2fd
KS
2299static struct dev_ext_attribute dev_attr_check_interval = {
2300 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
b56f642d
AK
2301 &check_interval
2302};
e9eee03e 2303
8a25a2fd 2304static struct dev_ext_attribute dev_attr_ignore_ce = {
7af19e4a
BP
2305 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2306 &mca_cfg.ignore_ce
9af43b54
HS
2307};
2308
8a25a2fd 2309static struct dev_ext_attribute dev_attr_cmci_disabled = {
7af19e4a
BP
2310 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2311 &mca_cfg.cmci_disabled
9af43b54
HS
2312};
2313
8a25a2fd
KS
2314static struct device_attribute *mce_device_attrs[] = {
2315 &dev_attr_tolerant.attr,
2316 &dev_attr_check_interval.attr,
5de97c9f 2317#ifdef CONFIG_X86_MCELOG_LEGACY
8a25a2fd 2318 &dev_attr_trigger,
5de97c9f 2319#endif
8a25a2fd
KS
2320 &dev_attr_monarch_timeout.attr,
2321 &dev_attr_dont_log_ce.attr,
2322 &dev_attr_ignore_ce.attr,
2323 &dev_attr_cmci_disabled.attr,
a98f0dd3
AK
2324 NULL
2325};
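/*
 * Note: since mce_subsys is named "machinecheck", the attributes above are
 * exposed under /sys/devices/system/machinecheck/machinecheck<cpu>/.
 */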
1da177e4 2326
8a25a2fd 2327static cpumask_var_t mce_device_initialized;
bae19fe0 2328
e032d807
GKH
2329static void mce_device_release(struct device *dev)
2330{
2331 kfree(dev);
2332}
2333
b4914508 2334/* Per CPU device init. All of the CPUs still share the same bank device: */
148f9bb8 2335static int mce_device_create(unsigned int cpu)
1da177e4 2336{
e032d807 2337 struct device *dev;
1da177e4 2338 int err;
b1f49f95 2339 int i, j;
92cb7612 2340
90367556 2341 if (!mce_available(&boot_cpu_data))
91c6d400
AK
2342 return -EIO;
2343
7f34b935
SAS
2344 dev = per_cpu(mce_device, cpu);
2345 if (dev)
2346 return 0;
2347
0e96f31e 2348 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
e032d807
GKH
2349 if (!dev)
2350 return -ENOMEM;
8a25a2fd
KS
2351 dev->id = cpu;
2352 dev->bus = &mce_subsys;
e032d807 2353 dev->release = &mce_device_release;
91c6d400 2354
8a25a2fd 2355 err = device_register(dev);
853d9b18
LK
2356 if (err) {
2357 put_device(dev);
d435d862 2358 return err;
853d9b18 2359 }
d435d862 2360
8a25a2fd
KS
2361 for (i = 0; mce_device_attrs[i]; i++) {
2362 err = device_create_file(dev, mce_device_attrs[i]);
d435d862
AM
2363 if (err)
2364 goto error;
2365 }
c7d314f3 2366 for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
b4914508 2367 err = device_create_file(dev, &mce_bank_devs[j].attr);
0d7482e3
AK
2368 if (err)
2369 goto error2;
2370 }
8a25a2fd 2371 cpumask_set_cpu(cpu, mce_device_initialized);
d6126ef5 2372 per_cpu(mce_device, cpu) = dev;
91c6d400 2373
d435d862 2374 return 0;
0d7482e3 2375error2:
b1f49f95 2376 while (--j >= 0)
b4914508 2377 device_remove_file(dev, &mce_bank_devs[j].attr);
d435d862 2378error:
cb491fca 2379 while (--i >= 0)
8a25a2fd 2380 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2381
8a25a2fd 2382 device_unregister(dev);
d435d862 2383
91c6d400
AK
2384 return err;
2385}
2386
148f9bb8 2387static void mce_device_remove(unsigned int cpu)
91c6d400 2388{
d6126ef5 2389 struct device *dev = per_cpu(mce_device, cpu);
73ca5358
SL
2390 int i;
2391
8a25a2fd 2392 if (!cpumask_test_cpu(cpu, mce_device_initialized))
bae19fe0
AH
2393 return;
2394
8a25a2fd
KS
2395 for (i = 0; mce_device_attrs[i]; i++)
2396 device_remove_file(dev, mce_device_attrs[i]);
cb491fca 2397
c7d314f3 2398 for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
b4914508 2399 device_remove_file(dev, &mce_bank_devs[i].attr);
cb491fca 2400
8a25a2fd
KS
2401 device_unregister(dev);
2402 cpumask_clear_cpu(cpu, mce_device_initialized);
d6126ef5 2403 per_cpu(mce_device, cpu) = NULL;
91c6d400 2404}
91c6d400 2405
d6b75584 2406/* Make sure there are no machine checks on offlined CPUs. */
39f152ff 2407static void mce_disable_cpu(void)
d6b75584 2408{
89cbc767 2409 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2410 return;
767df1bd 2411
39f152ff 2412 if (!cpuhp_tasks_frozen)
88ccbedd 2413 cmci_clear();
11868a2d 2414
6e06780a 2415 vendor_disable_error_reporting();
d6b75584
AK
2416}
2417
39f152ff 2418static void mce_reenable_cpu(void)
d6b75584 2419{
b4914508 2420 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
e9eee03e 2421 int i;
d6b75584 2422
89cbc767 2423 if (!mce_available(raw_cpu_ptr(&cpu_info)))
d6b75584 2424 return;
e9eee03e 2425
39f152ff 2426 if (!cpuhp_tasks_frozen)
88ccbedd 2427 cmci_reenable();
c7d314f3 2428 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
cebe1820 2429 struct mce_bank *b = &mce_banks[i];
11868a2d 2430
cebe1820 2431 if (b->init)
d9d73fcc 2432 wrmsrl(msr_ops.ctl(i), b->ctl);
06b7a7a5 2433 }
d6b75584
AK
2434}
2435
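/*
 * CPU hotplug callbacks: keep CMCI bank ownership, the per-CPU timer and the
 * sysfs devices in sync as CPUs go offline and come back online.
 */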
0e285d36 2436static int mce_cpu_dead(unsigned int cpu)
91c6d400 2437{
0e285d36 2438 mce_intel_hcpu_update(cpu);
91c6d400 2439
0e285d36
SAS
2440 /* intentionally ignoring frozen here */
2441 if (!cpuhp_tasks_frozen)
2442 cmci_rediscover();
2443 return 0;
91c6d400
AK
2444}
2445
8c0eeac8 2446static int mce_cpu_online(unsigned int cpu)
91c6d400 2447{
0becc0ae 2448 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8 2449 int ret;
91c6d400 2450
8c0eeac8 2451 mce_device_create(cpu);
38356c1f 2452
8c0eeac8
SAS
2453 ret = mce_threshold_create_device(cpu);
2454 if (ret) {
2455 mce_device_remove(cpu);
2456 return ret;
1a65f970 2457 }
8c0eeac8 2458 mce_reenable_cpu();
0becc0ae 2459 mce_start_timer(t);
8c0eeac8 2460 return 0;
91c6d400
AK
2461}
2462
8c0eeac8
SAS
2463static int mce_cpu_pre_down(unsigned int cpu)
2464{
0becc0ae 2465 struct timer_list *t = this_cpu_ptr(&mce_timer);
8c0eeac8
SAS
2466
2467 mce_disable_cpu();
2468 del_timer_sync(t);
2469 mce_threshold_remove_device(cpu);
2470 mce_device_remove(cpu);
2471 return 0;
2472}
91c6d400 2473
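/* Set up the shared bank0..bankN sysfs attribute objects used by every CPU device. */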
cebe1820 2474static __init void mce_init_banks(void)
0d7482e3
AK
2475{
2476 int i;
2477
b4914508
YG
2478 for (i = 0; i < MAX_NR_BANKS; i++) {
2479 struct mce_bank_dev *b = &mce_bank_devs[i];
8a25a2fd 2480 struct device_attribute *a = &b->attr;
e9eee03e 2481
b4914508
YG
2482 b->bank = i;
2483
a07e4156 2484 sysfs_attr_init(&a->attr);
cebe1820
AK
2485 a->attr.name = b->attrname;
2486 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
e9eee03e
IM
2487
2488 a->attr.mode = 0644;
2489 a->show = show_bank;
2490 a->store = set_bank;
0d7482e3 2491 }
0d7482e3
AK
2492}
2493
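/* Late init: register the machinecheck subsystem, CPU hotplug hooks and syscore ops. */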
5e09954a 2494static __init int mcheck_init_device(void)
91c6d400
AK
2495{
2496 int err;
91c6d400 2497
c65e774f
KS
2498 /*
2499 * Check if we have a spare virtual bit. This will only become
2500 * a problem if/when we move beyond 5-level page tables.
2501 */
2502 MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2503
9c15a24b
MS
2504 if (!mce_available(&boot_cpu_data)) {
2505 err = -EIO;
2506 goto err_out;
2507 }
0d7482e3 2508
9c15a24b
MS
2509 if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2510 err = -ENOMEM;
2511 goto err_out;
2512 }
996867d0 2513
cebe1820 2514 mce_init_banks();
0d7482e3 2515
8a25a2fd 2516 err = subsys_system_register(&mce_subsys, NULL);
d435d862 2517 if (err)
9c15a24b 2518 goto err_out_mem;
91c6d400 2519
0e285d36
SAS
2520 err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2521 mce_cpu_dead);
2522 if (err)
2523 goto err_out_mem;
91c6d400 2524
8c0eeac8
SAS
2525 err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2526 mce_cpu_online, mce_cpu_pre_down);
2527 if (err < 0)
0e285d36 2528 goto err_out_online;
93b62c3c 2529
9c15a24b
MS
2530 register_syscore_ops(&mce_syscore_ops);
2531
9c15a24b
MS
2532 return 0;
2533
0e285d36
SAS
2534err_out_online:
2535 cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
9c15a24b
MS
2536
2537err_out_mem:
2538 free_cpumask_var(mce_device_initialized);
2539
2540err_out:
5de97c9f 2541 pr_err("Unable to init MCE device (rc: %d)\n", err);
e9eee03e 2542
1da177e4 2543 return err;
1da177e4 2544}
cef12ee5 2545device_initcall_sync(mcheck_init_device);
a988d334 2546
d7c3c9a6
AK
2547/*
2548 * Old style boot options parsing. Only for compatibility.
2549 */
2550static int __init mcheck_disable(char *str)
2551{
09933946 2552 mca_cfg.disabled = 1;
d7c3c9a6
AK
2553 return 1;
2554}
2555__setup("nomce", mcheck_disable);
a988d334 2556
5be9ed25
HY
2557#ifdef CONFIG_DEBUG_FS
2558struct dentry *mce_get_debugfs_dir(void)
a988d334 2559{
5be9ed25 2560 static struct dentry *dmce;
a988d334 2561
5be9ed25
HY
2562 if (!dmce)
2563 dmce = debugfs_create_dir("mce", NULL);
a988d334 2564
5be9ed25
HY
2565 return dmce;
2566}
a988d334 2567
bf783f9f
HY
2568static void mce_reset(void)
2569{
2570 cpu_missing = 0;
c7c9b392 2571 atomic_set(&mce_fake_panicked, 0);
bf783f9f
HY
2572 atomic_set(&mce_executing, 0);
2573 atomic_set(&mce_callin, 0);
2574 atomic_set(&global_nwo, 0);
2575}
a988d334 2576
bf783f9f
HY
2577static int fake_panic_get(void *data, u64 *val)
2578{
2579 *val = fake_panic;
2580 return 0;
a988d334
IM
2581}
2582
bf783f9f 2583static int fake_panic_set(void *data, u64 val)
a988d334 2584{
bf783f9f
HY
2585 mce_reset();
2586 fake_panic = val;
2587 return 0;
a988d334 2588}
a988d334 2589
28156d76
Y
2590DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
2591 "%llu\n");
d7c3c9a6 2592
6e4f929e 2593static void __init mcheck_debugfs_init(void)
d7c3c9a6 2594{
6e4f929e 2595 struct dentry *dmce;
bf783f9f
HY
2596
2597 dmce = mce_get_debugfs_dir();
6e4f929e
GKH
2598 debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
2599 &fake_panic_fops);
d7c3c9a6 2600}
fd4cf79f 2601#else
6e4f929e 2602static void __init mcheck_debugfs_init(void) { }
5be9ed25 2603#endif
fd4cf79f 2604
3637efb0
TL
2605DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2606EXPORT_SYMBOL_GPL(mcsafe_key);
2607
fd4cf79f
CG
2608static int __init mcheck_late_init(void)
2609{
3637efb0
TL
2610 if (mca_cfg.recovery)
2611 static_branch_inc(&mcsafe_key);
2612
fd4cf79f 2613 mcheck_debugfs_init();
011d8261 2614 cec_init();
fd4cf79f
CG
2615
2616 /*
2617 * Flush out everything that has been logged during early boot, now that
2618 * everything has been initialized (workqueues, decoders, ...).
2619 */
2620 mce_schedule_work();
2621
2622 return 0;
2623}
2624late_initcall(mcheck_late_init);