/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/ctype.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#include <asm/smp.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6

atomic_t mce_entry;

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
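/* Per-bank MCi_CTL value written to the hardware by mce_init();
   ~0UL enables reporting of every error type in a bank.  Exposed for
   tuning through the bankNctl sysfs attributes below. */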
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long console_logged;
static int notify_user;
static int rip_msr;
static int mce_bootlog = 1;

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks.  It
 * also keeps MCEs separate from ordinary kernel messages to avoid bogus
 * bug reports.
 */

struct mce_log mcelog = {
	MCE_LOG_SIGNATURE,
	MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
	unsigned next, entry;
	mce->finished = 0;
	wmb();
	for (;;) {
		entry = rcu_dereference(mcelog.next);
		/* The rmb forces the compiler to reload next in each
		   iteration */
		rmb();
		for (;;) {
			/* When the buffer fills up discard new entries. Assume
			   that the earlier errors are the more interesting. */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW, &mcelog.flags);
				return;
			}
			/* Old left over entry. Skip. */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	mcelog.entry[entry].finished = 1;
	wmb();

	if (!test_and_set_bit(0, &console_logged))
		notify_user = 1;
}

static void print_mce(struct mce *m)
{
	printk(KERN_EMERG "\n"
	       KERN_EMERG "HARDWARE ERROR\n"
	       KERN_EMERG
	       "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
	       m->cpu, m->mcgstatus, m->bank, m->status);
	if (m->rip) {
		printk(KERN_EMERG
		       "RIP%s %02x:<%016Lx> ",
		       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
		       m->cs, m->rip);
		if (m->cs == __KERNEL_CS)
			print_symbol("{%s}", m->rip);
		printk("\n");
	}
	printk(KERN_EMERG "TSC %Lx ", m->tsc);
	if (m->addr)
		printk("ADDR %Lx ", m->addr);
	if (m->misc)
		printk("MISC %Lx ", m->misc);
	printk("\n");
	printk(KERN_EMERG "This is not a software problem!\n");
	printk(KERN_EMERG
	       "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
	int i;
	oops_begin();
	for (i = 0; i < MCE_LOG_LEN; i++) {
		unsigned long tsc = mcelog.entry[i].tsc;
		if (time_before(tsc, start))
			continue;
		print_mce(&mcelog.entry[i]);
		if (backup && mcelog.entry[i].tsc == backup->tsc)
			backup = NULL;
	}
	if (backup)
		print_mce(backup);
	if (tolerant >= 3)
		printk("Fake panic: %s\n", msg);
	else
		panic(msg);
}

static int mce_available(struct cpuinfo_x86 *c)
{
	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
	if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
		m->rip = regs->rip;
		m->cs = regs->cs;
	} else {
		m->rip = 0;
		m->cs = 0;
	}
	if (rip_msr) {
		/* Assume the RIP in the MSR is exact. Is this true? */
		m->mcgstatus |= MCG_STATUS_EIPV;
		rdmsrl(rip_msr, m->rip);
		m->cs = 0;
	}
}

/*
 * The actual machine check handler
 */

void do_machine_check(struct pt_regs * regs, long error_code)
{
	struct mce m, panicm;
	int nowayout = (tolerant < 1);
	int kill_it = 0;
	u64 mcestart = 0;
	int i;
	int panicm_found = 0;

	atomic_inc(&mce_entry);

	if (regs)
		notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
	if (!banks)
		goto out2;

	memset(&m, 0, sizeof(struct mce));
	m.cpu = smp_processor_id();
	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;

	rdtscll(mcestart);
	barrier();

	for (i = 0; i < banks; i++) {
		if (!bank[i])
			continue;

		m.misc = 0;
		m.addr = 0;
		m.bank = i;
		m.tsc = 0;

		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
		if ((m.status & MCI_STATUS_VAL) == 0)
			continue;

		if (m.status & MCI_STATUS_EN) {
			/* In theory _OVER could be a nowayout too, but
			   assume any overflowed errors were not fatal. */
			nowayout |= !!(m.status & MCI_STATUS_PCC);
			kill_it |= !!(m.status & MCI_STATUS_UC);
		}

		if (m.status & MCI_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
		if (m.status & MCI_STATUS_ADDRV)
			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

		mce_get_rip(&m, regs);
		if (error_code >= 0)
			rdtscll(m.tsc);
		wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
		if (error_code != -2)
			mce_log(&m);

		/* Did this bank cause the exception? */
		/* Assume that the bank with uncorrectable errors did it,
		   and that there is only a single one. */
		if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
			panicm = m;
			panicm_found = 1;
		}

		add_taint(TAINT_MACHINE_CHECK);
	}

	/* Never do anything final in the polling timer */
	if (!regs)
		goto out;

	/* If we didn't find an uncorrectable error, pick
	   the last one (shouldn't happen, just being safe). */
	if (!panicm_found)
		panicm = m;
	if (nowayout)
		mce_panic("Machine check", &panicm, mcestart);
	if (kill_it) {
		int user_space = 0;

		if (m.mcgstatus & MCG_STATUS_RIPV)
			user_space = panicm.rip && (panicm.cs & 3);

		/* When the machine was in user space and the CPU didn't get
		   confused it's normally not necessary to panic, unless you
		   are paranoid (tolerant == 0)

		   RED-PEN could be more tolerant for MCEs in idle,
		   but most likely they occur at boot anyways, where
		   it is best to just halt the machine. */
		if ((!user_space && (panic_on_oops || tolerant < 2)) ||
		    (unsigned)current->pid <= 1)
			mce_panic("Uncorrected machine check", &panicm, mcestart);

		/* do_exit takes an awful lot of locks and has a
		   slight risk of deadlocking. If you don't want that
		   don't set tolerant >= 2 */
		if (tolerant < 3)
			do_exit(SIGBUS);
	}

 out:
	/* Last thing done in the machine check exception to clear state. */
	wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
	atomic_dec(&mce_entry);
}
276 | ||
277 | #ifdef CONFIG_X86_MCE_INTEL | |
278 | /*** | |
279 | * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog | |
280 | * @cpu: The CPU on which the event occured. | |
281 | * @status: Event status information | |
282 | * | |
283 | * This function should be called by the thermal interrupt after the | |
284 | * event has been processed and the decision was made to log the event | |
285 | * further. | |
286 | * | |
287 | * The status parameter will be saved to the 'status' field of 'struct mce' | |
288 | * and historically has been the register value of the | |
289 | * MSR_IA32_THERMAL_STATUS (Intel) msr. | |
290 | */ | |
291 | void mce_log_therm_throt_event(unsigned int cpu, __u64 status) | |
292 | { | |
293 | struct mce m; | |
294 | ||
295 | memset(&m, 0, sizeof(m)); | |
296 | m.cpu = cpu; | |
297 | m.bank = MCE_THERMAL_BANK; | |
298 | m.status = status; | |
299 | rdtscll(m.tsc); | |
300 | mce_log(&m); | |
301 | } | |
302 | #endif /* CONFIG_X86_MCE_INTEL */ | |
303 | ||
304 | /* | |
305 | * Periodic polling timer for "silent" machine check errors. | |
306 | */ | |
307 | ||
308 | static int check_interval = 5 * 60; /* 5 minutes */ | |
309 | static void mcheck_timer(void *data); | |
310 | static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); | |
311 | ||
312 | static void mcheck_check_cpu(void *info) | |
313 | { | |
314 | if (mce_available(¤t_cpu_data)) | |
315 | do_machine_check(NULL, 0); | |
316 | } | |
317 | ||
318 | static void mcheck_timer(void *data) | |
319 | { | |
320 | on_each_cpu(mcheck_check_cpu, NULL, 1, 1); | |
321 | schedule_delayed_work(&mcheck_work, check_interval * HZ); | |
322 | ||
323 | /* | |
324 | * It's ok to read stale data here for notify_user and | |
325 | * console_logged as we'll simply get the updated versions | |
326 | * on the next mcheck_timer execution and atomic operations | |
327 | * on console_logged act as synchronization for notify_user | |
328 | * writes. | |
329 | */ | |
330 | if (notify_user && console_logged) { | |
331 | notify_user = 0; | |
332 | clear_bit(0, &console_logged); | |
333 | printk(KERN_INFO "Machine check events logged\n"); | |
334 | } | |
335 | } | |
336 | ||
337 | ||
338 | static __init int periodic_mcheck_init(void) | |
339 | { | |
340 | if (check_interval) | |
341 | schedule_delayed_work(&mcheck_work, check_interval*HZ); | |
342 | return 0; | |
343 | } | |
344 | __initcall(periodic_mcheck_init); | |
345 | ||
346 | ||
/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
	u64 cap;
	int i;

	rdmsrl(MSR_IA32_MCG_CAP, cap);
	banks = cap & 0xff;
	if (banks > NR_BANKS) {
		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
		banks = NR_BANKS;
	}
	/* Use accurate RIP reporting if available. */
	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
		rip_msr = MSR_IA32_MCG_EIP;

	/* Log the machine checks left over from the previous reset.
	   This also clears all registers */
	do_machine_check(NULL, mce_bootlog ? -1 : -2);

	set_in_cr4(X86_CR4_MCE);

	if (cap & MCG_CTL_P)
		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

	for (i = 0; i < banks; i++) {
		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
	}
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
	/* This should be disabled by the BIOS, but isn't always */
	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
		/* disable GART TBL walk error reporting, which trips off
		   incorrectly with the IOMMU & 3ware & Cerberus. */
		clear_bit(10, &bank[4]);
		/* Lots of broken BIOSes around that don't clear these
		   registers by default and leave junk in there.
		   Don't log the leftovers. */
		mce_bootlog = 0;
	}
}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
	switch (c->x86_vendor) {
	case X86_VENDOR_INTEL:
		mce_intel_feature_init(c);
		break;
	case X86_VENDOR_AMD:
		mce_amd_feature_init(c);
		break;
	default:
		break;
	}
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
	static cpumask_t mce_cpus = CPU_MASK_NONE;

	mce_cpu_quirks(c);

	if (mce_dont_init ||
	    cpu_test_and_set(smp_processor_id(), mce_cpus) ||
	    !mce_available(c))
		return;

	mce_init(NULL);
	mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

static void collect_tscs(void *data)
{
	unsigned long *cpu_tsc = (unsigned long *)data;
	rdtscll(cpu_tsc[smp_processor_id()]);
}

static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
	unsigned long *cpu_tsc;
	static DECLARE_MUTEX(mce_read_sem);
	unsigned next;
	char __user *buf = ubuf;
	int i, err;

	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	down(&mce_read_sem);
	next = rcu_dereference(mcelog.next);

	/* Only supports full reads right now */
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
		up(&mce_read_sem);
		kfree(cpu_tsc);
		return -EINVAL;
	}

	err = 0;
	for (i = 0; i < next; i++) {
		unsigned long start = jiffies;
		while (!mcelog.entry[i].finished) {
			if (!time_before(jiffies, start + 2)) {
				/* Writer never finished; drop the stale
				   slot instead of spinning on it forever. */
				memset(mcelog.entry + i, 0, sizeof(struct mce));
				goto timeout;
			}
			cpu_relax();
		}
		smp_rmb();
		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
		buf += sizeof(struct mce);
 timeout:
		;
	}

	memset(mcelog.entry, 0, next * sizeof(struct mce));
	mcelog.next = 0;

	synchronize_sched();

	/* Collect entries that were still getting written before the synchronize. */

	on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
	for (i = next; i < MCE_LOG_LEN; i++) {
		if (mcelog.entry[i].finished &&
		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
			err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
			smp_rmb();
			buf += sizeof(struct mce);
			memset(&mcelog.entry[i], 0, sizeof(struct mce));
		}
	}
	up(&mce_read_sem);
	kfree(cpu_tsc);
	return err ? -EFAULT : buf - ubuf;
}

static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
	int __user *p = (int __user *)arg;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	switch (cmd) {
	case MCE_GET_RECORD_LEN:
		return put_user(sizeof(struct mce), p);
	case MCE_GET_LOG_LEN:
		return put_user(MCE_LOG_LEN, p);
	case MCE_GETCLEAR_FLAGS: {
		unsigned flags;
		do {
			flags = mcelog.flags;
		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
		return put_user(flags, p);
	}
	default:
		return -ENOTTY;
	}
}

static struct file_operations mce_chrdev_ops = {
	.read = mce_read,
	.ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
	MISC_MCELOG_MINOR,
	"mcelog",
	&mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
	mce_dont_init = 1;
	return 1;
}

/* mce=off disables machine check. Note you can reenable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		mce_dont_init = 1;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		mce_bootlog = str[0] == 'b';
	else if (isdigit(str[0]))
		get_option(&str, &tolerant);
	else
		printk("mce= argument %s ignored. Please use /sys", str);
	return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get readded later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
	mce_init(NULL);
	return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
	if (check_interval)
		cancel_delayed_work(&mcheck_work);
	/* Timer race is harmless here */
	on_each_cpu(mce_init, NULL, 1, 1);
	if (check_interval)
		schedule_delayed_work(&mcheck_work, check_interval*HZ);
}

static struct sysdev_class mce_sysclass = {
	.resume = mce_resume,
	set_kset_name("machinecheck"),
};

DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
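/* ACCESSOR(name, var, hook) generates show_<name>/set_<name> sysfs
   methods that read and write 'var' as an unsigned long and run 'hook'
   after every successful write. */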
#define ACCESSOR(name, var, start) \
	static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
		return sprintf(buf, "%lx\n", (unsigned long)var); \
	} \
	static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
		char *end; \
		unsigned long new = simple_strtoul(buf, &end, 0); \
		if (end == buf) return -EINVAL; \
		var = new; \
		start; \
		return end-buf; \
	} \
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

ACCESSOR(bank0ctl, bank[0], mce_restart())
ACCESSOR(bank1ctl, bank[1], mce_restart())
ACCESSOR(bank2ctl, bank[2], mce_restart())
ACCESSOR(bank3ctl, bank[3], mce_restart())
ACCESSOR(bank4ctl, bank[4], mce_restart())
ACCESSOR(bank5ctl, bank[5], mce_restart())
static struct sysdev_attribute *bank_attributes[NR_BANKS] = {
	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl };
ACCESSOR(tolerant, tolerant, )
ACCESSOR(check_interval, check_interval, mce_restart())

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
	int err;
	int i;
	if (!mce_available(&cpu_data[cpu]))
		return -EIO;

	per_cpu(device_mce, cpu).id = cpu;
	per_cpu(device_mce, cpu).cls = &mce_sysclass;

	err = sysdev_register(&per_cpu(device_mce, cpu));

	if (!err) {
		for (i = 0; i < banks; i++)
			sysdev_create_file(&per_cpu(device_mce, cpu),
					   bank_attributes[i]);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_tolerant);
		sysdev_create_file(&per_cpu(device_mce, cpu), &attr_check_interval);
	}
	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static void mce_remove_device(unsigned int cpu)
{
	int i;

	for (i = 0; i < banks; i++)
		sysdev_remove_file(&per_cpu(device_mce, cpu),
				   bank_attributes[i]);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_tolerant);
	sysdev_remove_file(&per_cpu(device_mce, cpu), &attr_check_interval);
	sysdev_unregister(&per_cpu(device_mce, cpu));
	memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mce_create_device(cpu);
		break;
	case CPU_DEAD:
		mce_remove_device(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
	.notifier_call = mce_cpu_callback,
};
#endif

static __init int mce_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;
	err = sysdev_class_register(&mce_sysclass);

	for_each_online_cpu(i) {
		mce_create_device(i);
	}

	register_hotcpu_notifier(&mce_cpu_notifier);
	misc_register(&mce_log_device);
	return err;
}

device_initcall(mce_init_device);