/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/ratelimit.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227

atomic_t mce_entry;

static int mce_dont_init;

/*
 * Tolerant levels:
 * 0: always panic on uncorrected errors, log corrected errors
 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
 * 3: never panic or SIGBUS, log all errors (for testing only)
 */
static int tolerant = 1;
static int banks;
static u64 *bank;
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/* MCA banks polled by the periodic polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
	memset(m, 0, sizeof(struct mce));
	m->cpu = smp_processor_id();
	rdtscll(m->tsc);
}

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It
 * also keeps MCEs separate from ordinary kernel messages to avoid bogus
 * bug reports.
 */

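/*
 * Publication protocol (a summary of mce_log() below): a writer reserves a
 * free slot by advancing mcelog.next with cmpxchg(), copies its record in,
 * and only then sets ->finished, with wmb() barriers ordering the stores.
 * Readers treat ->finished as the "record is valid" flag, so they never see
 * a torn entry; slots still marked finished are leftovers that have not yet
 * been consumed and are skipped during reservation.
 */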
static struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;

	atomic_inc(&mce_events);
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		for (;;) {
			/* When the buffer fills up discard new entries.
			   Assume that the earlier errors are the more
			   interesting ones. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->ip) {
		printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->ip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->ip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %llx ", m->tsc);
	if (m->addr)
		printk("ADDR %llx ", m->addr);
	if (m->misc)
		printk("MISC %llx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG "Run through mcelog --ascii to decode "
	       "and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;

	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;

		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	panic(msg);
}

int mce_available(struct cpuinfo_x86 *c)
{
	if (mce_dont_init)
		return 0;
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

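/*
 * MCG_STATUS bits used below (as defined by the x86 MCA architecture):
 * RIPV set means the interrupted context in pt_regs can safely be
 * restarted, EIPV set means the saved IP is the very instruction that
 * raised the machine check. When neither applies, the IP is recorded as 0.
 */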
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->ip = regs->ip;
		m->cs = regs->cs;
	} else {
		m->ip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->ip);
		m->cs = 0;
	}
}

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 */
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
	struct mce m;
	int i;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	for (i = 0; i < banks; i++) {
		if (!bank[i] || !test_bit(i, *b))
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		barrier();
		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if (!(m.status & MCI_STATUS_VAL))
			continue;

		/*
		 * Uncorrected events are handled by the exception handler
		 * when it is enabled. But when the exception is disabled log
		 * everything.
		 *
		 * TBD do the same check for MCI_STATUS_EN here?
		 */
		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
			continue;

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		if (!(flags & MCP_TIMESTAMP))
			m.tsc = 0;
		/*
		 * Don't get the IP here because it's unlikely to
		 * have anything to do with the actual error location.
		 */
		if (!(flags & MCP_DONTLOG)) {
			mce_log(&m);
			add_taint(TAINT_MACHINE_CHECK);
		}

		/*
		 * Clear state for this bank.
		 */
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}

	/*
	 * Don't clear MCG_STATUS here because it's only defined for
	 * exceptions.
	 */
}

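/*
 * Callers of machine_check_poll(): the periodic timer below polls with
 * MCP_TIMESTAMP against the per-CPU mce_poll_banks mask, while mce_init()
 * does a one-shot MCP_UC poll of all banks to pick up events left over
 * from before the last reset.
 */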
/*
 * The actual machine check handler. This only handles real
 * exceptions when something got corrupted coming in through int 18.
 *
 * This is executed in NMI context not subject to normal locking rules. This
 * implies that most kernel services cannot be safely used. Don't even
 * think about putting a printk in there!
 */
void do_machine_check(struct pt_regs *regs, long error_code)
{
	struct mce m, panicm;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;
	/*
	 * If no_way_out gets set, there is no safe way to recover from this
	 * MCE. If tolerant is cranked up, we'll try anyway.
	 */
	int no_way_out = 0;
	/*
	 * If kill_it gets set, we might be able to recover from this error
	 * by killing the affected task instead of panicking.
	 */
	int kill_it = 0;
	DECLARE_BITMAP(toclear, MAX_NR_BANKS);

	atomic_inc(&mce_entry);

	if (notify_die(DIE_NMI, "machine check", regs, error_code,
		       18, SIGKILL) == NOTIFY_STOP)
		goto out2;
	if (!banks)
		goto out2;

	mce_setup(&m);

	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	/* if the restart IP is not valid, we're done for */
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		no_way_out = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		__clear_bit(i, toclear);
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		/*
		 * Corrected errors are handled by machine_check_poll().
		 * Leave them alone here.
		 */
		if ((m.status & MCI_STATUS_UC) == 0)
			continue;

		/*
		 * Set taint even when machine check was not enabled.
		 */
		add_taint(TAINT_MACHINE_CHECK);

		__set_bit(i, toclear);

		if (m.status & MCI_STATUS_EN) {
			/* if PCC was set, there's no way out */
			no_way_out |= !!(m.status & MCI_STATUS_PCC);
			/*
			 * If this error was uncorrectable and there was
			 * an overflow, we're in trouble. If no overflow,
			 * we might get away with just killing a task.
			 */
			if (m.status & MCI_STATUS_UC) {
				if (tolerant < 1 || m.status & MCI_STATUS_OVER)
					no_way_out = 1;
				kill_it = 1;
			}
		} else {
			/*
			 * Machine check event was not enabled. Clear, but
			 * ignore.
			 */
			continue;
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}
	}

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;

	/*
	 * If we have decided that we just CAN'T continue, and the user
	 * has not set tolerant to an insane level, give up and die.
	 */
	if (no_way_out && tolerant < 3)
		mce_panic("Machine check", &panicm, mcestart);

	/*
	 * If the error seems to be unrecoverable, something should be
	 * done. Try to kill as little as possible. If we can kill just
	 * one task, do that. If the user has set the tolerance very
	 * high, don't try to do anything at all.
	 */
	if (kill_it && tolerant < 3) {
		int user_space = 0;

		/*
		 * If the EIPV bit is set, it means the saved IP is the
		 * instruction which caused the MCE.
		 */
		if (m.mcgstatus & MCG_STATUS_EIPV)
			user_space = panicm.ip && (panicm.cs & 3);

		/*
		 * If we know that the error was in user space, send a
		 * SIGBUS. Otherwise, panic if tolerance is low.
		 *
		 * force_sig() takes an awful lot of locks and has a slight
		 * risk of deadlocking.
		 */
		if (user_space) {
			force_sig(SIGBUS, current);
		} else if (panic_on_oops || tolerant < 2) {
			mce_panic("Uncorrected machine check",
				  &panicm, mcestart);
		}
	}

	/* notify userspace ASAP */
	set_thread_flag(TIF_MCE_NOTIFY);

	/* the last thing we do is clear state */
	for (i = 0; i < banks; i++) {
		if (test_bit(i, toclear))
			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}

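/*
 * Worked example of the decisions above at the default tolerant=1: an
 * enabled uncorrected error with PCC or OVER set yields no_way_out and an
 * immediate panic; an uncorrected error without overflow only sets
 * kill_it, so if EIPV shows it hit user space the offending task gets a
 * SIGBUS, otherwise the machine still panics (tolerant < 2 or
 * panic_on_oops).
 */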
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) MSR.
 */
void mce_log_therm_throt_event(__u64 status)
{
	struct mce m;

	mce_setup(&m);
	m.bank = MCE_THERMAL_BANK;
	m.status = status;
	mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

static void mcheck_timer(unsigned long data)
{
	struct timer_list *t = &per_cpu(mce_timer, data);
	int *n;

	WARN_ON(smp_processor_id() != data);

	if (mce_available(&current_cpu_data))
		machine_check_poll(MCP_TIMESTAMP,
				   &__get_cpu_var(mce_poll_banks));

	/*
	 * Alert userspace if needed. If we logged an MCE, reduce the
	 * polling interval, otherwise increase the polling interval.
	 */
	n = &__get_cpu_var(next_interval);
	if (mce_notify_user()) {
		*n = max(*n/2, HZ/100);
	} else {
		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
	}

	t->expires = jiffies + *n;
	add_timer(t);
}

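/*
 * Example of the adaptation above, assuming HZ=1000 and the default
 * check_interval of 300 seconds: while events keep being logged the
 * per-CPU interval halves from 300s toward the HZ/100 (10ms) floor, and
 * once the logs stay quiet it doubles back up to the 300s ceiling.
 */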
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/*
 * Notify the user(s) about new machine check events.
 * Can be called from interrupt context, but not from machine check/NMI
 * context.
 */
int mce_notify_user(void)
{
	/* Not more than two messages every minute */
	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

	clear_thread_flag(TIF_MCE_NOTIFY);
	if (test_and_clear_bit(0, &notify_user)) {
		wake_up_interruptible(&mce_wait);

		/*
		 * There is no risk of missing notifications because
		 * work_pending is always cleared before the function is
		 * executed.
		 */
		if (trigger[0] && !work_pending(&mce_trigger_work))
			schedule_work(&mce_trigger_work);

		if (__ratelimit(&ratelimit))
			printk(KERN_INFO "Machine check events logged\n");

		return 1;
	}
	return 0;
}

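/*
 * The trigger is a user supplied helper program run through
 * call_usermodehelper() whenever new events are logged. A sketch of arming
 * it from user space, assuming the sysfs layout registered further down
 * (the helper path is hypothetical):
 *
 *	echo /usr/sbin/my-mce-handler > \
 *		/sys/devices/system/machinecheck/machinecheck0/trigger
 */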
/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
	/* IDLE_END should be safe - interrupts are back on */
	if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
		mce_notify_user();

	return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
	.notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
	idle_notifier_register(&mce_idle_notifier);
	return 0;
}
__initcall(periodic_mcheck_init);

/*
 * Initialize Machine Checks for a CPU.
 */
static int mce_cap_init(void)
{
	u64 cap;
	unsigned b;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	b = cap & 0xff;
	if (b > MAX_NR_BANKS) {
		printk(KERN_WARNING
		       "MCE: Using only %u machine check banks out of %u\n",
		       MAX_NR_BANKS, b);
		b = MAX_NR_BANKS;
	}

	/* Don't support asymmetric configurations today */
	WARN_ON(banks != 0 && b != banks);
	banks = b;
	if (!bank) {
		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
		if (!bank)
			return -ENOMEM;
		memset(bank, 0xff, banks * sizeof(u64));
	}

	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	return 0;
}

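/*
 * MCG_CAP layout relied on above: bits 7:0 hold the bank count, bit 9
 * (MCG_EXT_P) advertises the extended state block, and bits 23:16 give the
 * number of extended registers. MCG_EIP is trusted only when at least nine
 * extended registers are implemented.
 */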
static void mce_init(void *dummy)
{
	u64 cap;
	int i;
	mce_banks_t all_banks;

	/*
	 * Log the machine checks left over from the previous reset.
	 */
	bitmap_fill(all_banks, MAX_NR_BANKS);
	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);

	set_in_cr4(X86_CR4_MCE);

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD) {
		if (c->x86 == 15 && banks > 4)
			/* disable GART TBL walk error reporting, which trips off
			   incorrectly with the IOMMU & 3ware & Cerberus. */
			clear_bit(10, (unsigned long *)&bank[4]);
		if (c->x86 <= 17 && mce_bootlog < 0)
			/* Lots of broken BIOS around that don't clear them
			   by default and leave crap in there. Don't log. */
			mce_bootlog = 0;
	}
}

static void mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

static void mce_init_timer(void)
{
	struct timer_list *t = &__get_cpu_var(mce_timer);
	int *n = &__get_cpu_var(next_interval);

	*n = check_interval * HZ;
	if (!*n)
		return;
	setup_timer(t, mcheck_timer, smp_processor_id());
	t->expires = round_jiffies(jiffies + *n);
	add_timer(t);
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	if (!mce_available(c))
		return;

	if (mce_cap_init() < 0) {
		mce_dont_init = 1;
		return;
	}
	mce_cpu_quirks(c);

	mce_init(NULL);
	mce_cpu_features(c);
	mce_init_timer();
}

/*
 * Character device to read and clear the MCE log.
 */

static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
	lock_kernel();
	spin_lock(&mce_state_lock);

	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
		spin_unlock(&mce_state_lock);
		unlock_kernel();
		return -EBUSY;
	}

	if (file->f_flags & O_EXCL)
		open_exclu = 1;
	open_count++;

	spin_unlock(&mce_state_lock);
	unlock_kernel();

	return nonseekable_open(inode, file);
}

static int mce_release(struct inode *inode, struct file *file)
{
	spin_lock(&mce_state_lock);

	open_count--;
	open_exclu = 0;

	spin_unlock(&mce_state_lock);

	return 0;
}

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;

	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
			loff_t *off)
{
	unsigned long *cpu_tsc;
	static DEFINE_MUTEX(mce_read_mutex);
	unsigned prev, next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_read_mutex);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		mutex_unlock(&mce_read_mutex);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;

			while (!mcelog.entry[i].finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(mcelog.entry + i, 0,
					       sizeof(struct mce));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			err |= copy_to_user(buf, mcelog.entry + i,
					    sizeof(struct mce));
			buf += sizeof(struct mce);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i,
					    sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	mutex_unlock(&mce_read_mutex);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

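/*
 * Reader-side notes on mce_read(): a read() must be sized for the full log
 * (MCE_LOG_LEN records) or it fails with -EINVAL; entries whose ->finished
 * flag is not yet set get roughly two jiffies for the in-flight mce_log()
 * to complete before the slot is zeroed and skipped.
 */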
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
	poll_wait(file, &mce_wait, wait);
	if (rcu_dereference(mcelog.next))
		return POLLIN | POLLRDNORM;
	return 0;
}

static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;

		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

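/*
 * A minimal user-space sketch of the ioctl interface (the fd and error
 * handling are elided, and the device node is assumed to be /dev/mcelog):
 *
 *	int reclen, loglen;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &reclen);	/· sizeof(struct mce) ·/
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);	/· MCE_LOG_LEN ·/
 *
 * (The '·' stands in for '*' so the comment does not terminate early.)
 */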
static const struct file_operations mce_chrdev_ops = {
	.open = mce_open,
	.release = mce_release,
	.read = mce_read,
	.poll = mce_poll,
	.unlocked_ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}
__setup("nomce", mcheck_disable);

/*
 * mce=off disables machine check
 * mce=TOLERANCELEVEL (number, see above)
 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
 * mce=nobootlog Don't log MCEs from before booting.
 */
static int __init mcheck_enable(char *str)
{
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = (str[0] == 'b');
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else {
		printk(KERN_INFO "mce= argument %s ignored. Please use /sys\n",
		       str);
		return 0;
	}
	return 1;
}
__setup("mce=", mcheck_enable);

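/*
 * Boot parameter examples: "mce=off" disables machine checks entirely,
 * "mce=2" raises the tolerance level so uncorrected errors prefer SIGBUS
 * over panic, and "mce=nobootlog" suppresses logging of events left over
 * from before booting.
 */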
/*
 * Sysfs support
 */

/*
 * Disable machine checks on suspend and shutdown. We can't really handle
 * them later.
 */
static int mce_disable(void)
{
	int i;

	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
	return 0;
}

static int mce_suspend(struct sys_device *dev, pm_message_t state)
{
	return mce_disable();
}

static int mce_shutdown(struct sys_device *dev)
{
	return mce_disable();
}

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	mce_cpu_features(&current_cpu_data);
	return 0;
}

static void mce_cpu_restart(void *data)
{
	del_timer_sync(&__get_cpu_var(mce_timer));
	if (mce_available(&current_cpu_data))
		mce_init(NULL);
	mce_init_timer();
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	on_each_cpu(mce_cpu_restart, NULL, 1);
}

static struct sysdev_class mce_sysclass = {
	.suspend = mce_suspend,
	.shutdown = mce_shutdown,
	.resume = mce_resume,
	.name = "machinecheck",
};

DEFINE_PER_CPU(struct sys_device, device_mce);
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;

/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s,		\
				     struct sysdev_attribute *attr,	\
				     char *buf) {			\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    struct sysdev_attribute *attr,	\
				    const char *buf, size_t siz) {	\
		char *end;						\
		unsigned long new = simple_strtoul(buf, &end, 0);	\
		if (end == buf) return -EINVAL;				\
		var = new;						\
		start;							\
		return end-buf;						\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

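/*
 * For example, ACCESSOR(check_interval, check_interval, mce_restart())
 * below expands to show_check_interval()/set_check_interval() plus a 0644
 * SYSDEV_ATTR, so writing the sysfs file updates the variable and then
 * reruns mce_restart().
 */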
static struct sysdev_attribute *bank_attrs;

static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
			 char *buf)
{
	u64 b = bank[attr - bank_attrs];

	return sprintf(buf, "%llx\n", b);
}

static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
			const char *buf, size_t siz)
{
	char *end;
	u64 new = simple_strtoull(buf, &end, 0);

	if (end == buf)
		return -EINVAL;
	bank[attr - bank_attrs] = new;
	mce_restart();
	return end-buf;
}

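/*
 * Each bankN file exposes the MCi_CTL enable mask for one bank. A sketch
 * of masking every error source in bank 4, assuming the sysfs layout
 * registered below:
 *
 *	echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * The new mask is written to the hardware by the mce_restart() call.
 */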
static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			    char *buf)
{
	strcpy(buf, trigger);
	strcat(buf, "\n");
	return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
			   const char *buf, size_t siz)
{
	char *p;
	int len;

	strncpy(trigger, buf, sizeof(trigger));
	trigger[sizeof(trigger)-1] = 0;
	len = strlen(trigger);
	p = strchr(trigger, '\n');
	/* strip a trailing newline, if any (test the pointer itself, not
	   *p, to avoid a NULL dereference when no newline is present) */
	if (p)
		*p = 0;
	return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
ACCESSOR(check_interval, check_interval, mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
	NULL
};

static cpumask_var_t mce_device_initialized;

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));
	if (err)
		return err;

	for (i = 0; mce_attributes[i]; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 mce_attributes[i]);
		if (err)
			goto error;
	}
	for (i = 0; i < banks; i++) {
		err = sysdev_create_file(&per_cpu(device_mce, cpu),
					 &bank_attrs[i]);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);

	return 0;
error2:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(device_mce, cpu), &bank_attrs[i]);
	/*
	 * Also unwind the mce_attributes files created above; the loop left
	 * i below zero, so recount before falling through.
	 */
	for (i = 0; mce_attributes[i]; i++)
		;
error:
	while (--i >= 0)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));

	return err;
}

static __cpuinit void mce_remove_device(unsigned int cpu)
{
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_attributes[i]; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   mce_attributes[i]);
	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   &bank_attrs[i]);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	cpumask_clear_cpu(cpu, mce_device_initialized);
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
}

static void mce_reenable_cpu(void *h)
{
	int i;
	unsigned long action = *(unsigned long *)h;

	if (!mce_available(&current_cpu_data))
		return;
	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++)
		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_create_device(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_remove_device(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		/* use the target CPU's interval, not the calling CPU's */
		t->expires = round_jiffies(jiffies +
					   per_cpu(next_interval, cpu));
		add_timer_on(t, cpu);
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

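/*
 * Hotplug flow above: the sysfs device comes up on CPU_ONLINE and is torn
 * down on CPU_DEAD, while CPU_DOWN_PREPARE parks the polling timer and
 * quiesces the banks over IPI so nothing fires on a dying CPU;
 * CPU_DOWN_FAILED undoes both.
 */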
static __init int mce_init_banks(void)
{
	int i;

	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
			     GFP_KERNEL);
	if (!bank_attrs)
		return -ENOMEM;

	for (i = 0; i < banks; i++) {
		struct sysdev_attribute *a = &bank_attrs[i];

		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
		if (!a->attr.name)
			goto nomem;
		a->attr.mode = 0644;
		a->show = show_bank;
		a->store = set_bank;
	}
	return 0;

nomem:
	while (--i >= 0)
		kfree(bank_attrs[i].attr.name);
	kfree(bank_attrs);
	bank_attrs = NULL;
	return -ENOMEM;
}

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	err = mce_init_banks();
	if (err)
		return err;

	err = sysdev_class_register(&mce_sysclass);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_create_device(i);
		if (err)
			return err;
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);