/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

/*
 * To support more than 128 would need to escape the predefined
 * Linux defined extended banks first.
 */
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 *   0: always panic on uncorrected errors, log corrected errors
 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 *   3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}

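/*
 * A note on the ordering in mce_log() above: a slot is reserved by
 * advancing mcelog.next with cmpxchg(), the record is copied in, and
 * only then is ->finished set behind a wmb().  A reader that observes
 * ->finished == 1 is therefore guaranteed to see the complete record.
 */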
static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

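/*
 * mce_panic() above runs under oops_begin(), dumps every logged record
 * whose TSC is newer than the start of the current exception, and prints
 * the triggering record separately only if it never made it into the log.
 */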
static int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */

		mce_log(&m);
		add_taint(TAINT_MACHINE_CHECK);

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}
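
/*
 * Note on the MSR addressing above: the architectural machine check
 * bank registers come in groups of four (CTL, STATUS, ADDR, MISC per
 * bank), which is why bank i is reached as MSR_IA32_MC0_<REG> + i*4.
 */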

/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE.  If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, there might be a way to recover from this
	 * error.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Errors that are not uncorrected (i.e. corrected errors)
		 * are handled by machine_check_poll. Leave them alone.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble.  If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done.  Try to kill as little as possible.  If we can kill just
	 * one task, do that.  If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS.  Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

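/*
 * do_machine_check() above deliberately leaves corrected (non-UC)
 * errors sitting in their banks: those belong to machine_check_poll(),
 * which will log and clear them later.  The toclear bitmap makes sure
 * only banks consumed by the exception handler are cleared here.
 */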
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP);

	/*
	 * Alert userspace if needed.  If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	if (mce_notify_user()) {
		next_interval = max(next_interval/2, HZ/100);
	} else {
		next_interval = min(next_interval * 2,
				(int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + next_interval;
	add_timer(t);
}

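/*
 * With the clamping above, the effective poll period floats between
 * HZ/100 jiffies (roughly 10ms) and check_interval seconds, halving
 * whenever an interval logged an event and doubling when one did not.
 */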
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		static unsigned long last_print;
		unsigned long now = jiffies;

		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (time_after_eq(now, last_print + (check_interval*HZ))) {
			last_print = now;
			printk(KERN_INFO "Machine check events logged\n");
		}

		return 1;
	}
	return 0;
}
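
/*
 * The trigger program is run through the workqueue above because
 * call_usermodehelper() cannot be invoked from interrupt context,
 * while mce_notify_user() itself may be.
 */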

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}
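
/*
 * The MCG_CAP decoding in mce_cap_init() above: bits 7:0 give the bank
 * count, bit 9 (MCG_EXT_P) advertises extended state registers, and
 * bits 23:16 their count; MCG_EIP is only present when at least nine
 * extended registers exist, hence the >= 9 check.
 */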

static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	machine_check_poll(MCP_UC);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);

	/* data race harmless because everyone sets to the same value */
	if (!next_interval)
		next_interval = check_interval * HZ;
	/* check_interval == 0 means polling is disabled */
	if (!next_interval)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies_relative(jiffies + next_interval);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}

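/*
 * Ordering in mcheck_init() above matters: capabilities must be read
 * before quirks can patch the bank control values, the hardware is only
 * switched on (mce_init) once both are done, and the poll timer comes
 * last so it never runs against uninitialized banks.
 */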
/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

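/*
 * mce_read() above drains the log in two phases: entries below
 * mcelog.next are copied out and the index is atomically reset with
 * cmpxchg() (looping in case new records arrived meanwhile), then,
 * after synchronize_sched(), a second pass collects stragglers whose
 * TSC shows they were written before the per-CPU TSC snapshot.
 */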
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

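/*
 * Userspace consumption, in outline (this is how mcelog(8) is expected
 * to use the device): open /dev/mcelog, query the record size with the
 * MCE_GET_RECORD_LEN ioctl, then issue read()s of at least MCE_LOG_LEN
 * records; a read returns and clears the accumulated log, and poll()
 * can be used to wait for new events.
 */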
/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys\n", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce=", mcheck_enable);

/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	next_interval = check_interval * HZ;
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

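/*
 * ACCESSOR(check_interval, check_interval, mce_restart()) further down
 * expands to show_check_interval()/set_check_interval() plus a sysdev
 * attribute named attr_check_interval, so a write to the sysfs file
 * also triggers mce_restart().
 */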
static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	/* the attribute's offset in bank_attrs[] is the bank number */
	u64 b = bank[attr - bank_attrs];
	return sprintf(buf, "%Lx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);
	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}

static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;
	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	/* strchr() may find nothing; check the pointer, not *p */
	if (p)
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};

static cpumask_t mce_device_initialized = CPU_MASK_NONE;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[j]);
		if (err)
			goto error2;
	}
	cpu_set(cpu, mce_device_initialized);

	return 0;
error2:
	/* unwind the bank files created so far, then fall through
	   to also remove the mce_attributes files */
	while (--j >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[j]);
	}
error:
	while (--i >= 0) {
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	}
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpu_isset(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpu_clear(cpu, mce_device_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuexit mce_disable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void __cpuexit mce_reenable_cpu(void *h)
{
	int i;

	if (!mce_available(&current_cpu_data))
		return;
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, NULL, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		t->expires = round_jiffies_relative(jiffies + next_interval);
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

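/*
 * In the hotplug callback above, CPU_DOWN_PREPARE stops the polling
 * timer and zeroes the bank controls on the outgoing CPU (it is still
 * online, so smp_call_function_single() works); CPU_DOWN_FAILED undoes
 * both.  The sysfs device itself is only torn down at CPU_DEAD.
 */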
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];
		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);