arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/smp.h>
  39 #include <linux/fs.h>
  40 #include <linux/mm.h>
  41 #include <linux/debugfs.h>
  42 #include <linux/irq_work.h>
  43 #include <linux/export.h>
  44
  45 #include <asm/processor.h>
  46 #include <asm/traps.h>
  47 #include <asm/tlbflush.h>
  48 #include <asm/mce.h>
  49 #include <asm/msr.h>
  50
  51 #include "mce-internal.h"
  52
  53 static DEFINE_MUTEX(mce_chrdev_read_mutex);
  54
  55 #define rcu_dereference_check_mce(p) \
  56         rcu_dereference_index_check((p), \
  57                               rcu_read_lock_sched_held() || \
  58                               lockdep_is_held(&mce_chrdev_read_mutex))
  59
  60 #define CREATE_TRACE_POINTS
  61 #include <trace/events/mce.h>
  62
  63 #define SPINUNIT                100     /* 100ns */
  64
  65 DEFINE_PER_CPU(unsigned, mce_exception_count);
  66
  67 struct mce_bank *mce_banks __read_mostly;
  68 struct mce_vendor_flags mce_flags __read_mostly;
  69
  70 struct mca_config mca_cfg __read_mostly = {
  71         .bootlog  = -1,
  72         /*
  73          * Tolerant levels:
  74          * 0: always panic on uncorrected errors, log corrected errors
  75          * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  76          * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  77          * 3: never panic or SIGBUS, log all errors (for testing only)
  78          */
  79         .tolerant = 1,
  80         .monarch_timeout = -1
  81 };
  82
  83 /* User mode helper program triggered by machine check event */
  84 static unsigned long            mce_need_notify;
  85 static char                     mce_helper[128];
  86 static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  87
  88 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  89
  90 static DEFINE_PER_CPU(struct mce, mces_seen);
  91 static int                      cpu_missing;
  92
  93 /*
  94  * MCA banks polled by the period polling timer for corrected events.
  95  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  96  */
  97 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  98         [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  99 };
 100
 101 /*
 102  * MCA banks controlled through firmware first for corrected errors.
 103  * This is a global list of banks for which we won't enable CMCI and we
 104  * won't poll. Firmware controls these banks and is responsible for
 105  * reporting corrected errors through GHES. Uncorrected/recoverable
 106  * errors are still notified through a machine check.
 107  */
 108 mce_banks_t mce_banks_ce_disabled;
 109
 110 static DEFINE_PER_CPU(struct work_struct, mce_work);
 111
 112 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 113
 114 /*
 115  * CPU/chipset specific EDAC code can register a notifier call here to print
 116  * MCE errors in a human-readable form.
 117  */
 118 static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 119
 120 /* Do initial initialization of a struct mce */
 121 void mce_setup(struct mce *m)
 122 {
 123         memset(m, 0, sizeof(struct mce));
 124         m->cpu = m->extcpu = smp_processor_id();
 125         rdtscll(m->tsc);
 126         /* We hope get_seconds stays lockless */
 127         m->time = get_seconds();
 128         m->cpuvendor = boot_cpu_data.x86_vendor;
 129         m->cpuid = cpuid_eax(1);
 130         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 131         m->apicid = cpu_data(m->extcpu).initial_apicid;
 132         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 133 }
 134
 135 DEFINE_PER_CPU(struct mce, injectm);
 136 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 137
 138 /*
 139  * Lockless MCE logging infrastructure.
 140  * This avoids deadlocks on printk locks without having to break locks. Also
 141  * separate MCEs from kernel messages to avoid bogus bug reports.
 142  */
 143
 144 static struct mce_log mcelog = {
 145         .signature      = MCE_LOG_SIGNATURE,
 146         .len            = MCE_LOG_LEN,
 147         .recordlen      = sizeof(struct mce),
 148 };
 149
 150 void mce_log(struct mce *mce)
 151 {
 152         unsigned next, entry;
 153
 154         /* Emit the trace record: */
 155         trace_mce_record(mce);
 156
 157         atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 158
 159         mce->finished = 0;
 160         wmb();
 161         for (;;) {
 162                 entry = rcu_dereference_check_mce(mcelog.next);
 163                 for (;;) {
 164
 165                         /*
 166                          * When the buffer fills up discard new entries.
 167                          * Assume that the earlier errors are the more
 168                          * interesting ones:
 169                          */
 170                         if (entry >= MCE_LOG_LEN) {
 171                                 set_bit(MCE_OVERFLOW,
 172                                         (unsigned long *)&mcelog.flags);
 173                                 return;
 174                         }
 175                         /* Old left over entry. Skip: */
 176                         if (mcelog.entry[entry].finished) {
 177                                 entry++;
 178                                 continue;
 179                         }
 180                         break;
 181                 }
 182                 smp_rmb();
 183                 next = entry + 1;
 184                 if (cmpxchg(&mcelog.next, entry, next) == entry)
 185                         break;
 186         }
 187         memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 188         wmb();
 189         mcelog.entry[entry].finished = 1;
 190         wmb();
 191
 192         mce->finished = 1;
 193         set_bit(0, &mce_need_notify);
 194 }
 195
 196 static void drain_mcelog_buffer(void)
 197 {
 198         unsigned int next, i, prev = 0;
 199
 200         next = ACCESS_ONCE(mcelog.next);
 201
 202         do {
 203                 struct mce *m;
 204
 205                 /* drain what was logged during boot */
 206                 for (i = prev; i < next; i++) {
 207                         unsigned long start = jiffies;
 208                         unsigned retries = 1;
 209
 210                         m = &mcelog.entry[i];
 211
 212                         while (!m->finished) {
 213                                 if (time_after_eq(jiffies, start + 2*retries))
 214                                         retries++;
 215
 216                                 cpu_relax();
 217
 218                                 if (!m->finished && retries >= 4) {
 219                                         pr_err("skipping error being logged currently!\n");
 220                                         break;
 221                                 }
 222                         }
 223                         smp_rmb();
 224                         atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 225                 }
 226
 227                 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
 228                 prev = next;
 229                 next = cmpxchg(&mcelog.next, prev, 0);
 230         } while (next != prev);
 231 }
 232
 233
 234 void mce_register_decode_chain(struct notifier_block *nb)
 235 {
 236         atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
 237         drain_mcelog_buffer();
 238 }
 239 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 240
 241 void mce_unregister_decode_chain(struct notifier_block *nb)
 242 {
 243         atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 244 }
 245 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 246
 247 static void print_mce(struct mce *m)
 248 {
 249         int ret = 0;
 250
 251         pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 252                m->extcpu, m->mcgstatus, m->bank, m->status);
 253
 254         if (m->ip) {
 255                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 256                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 257                                 m->cs, m->ip);
 258
 259                 if (m->cs == __KERNEL_CS)
 260                         print_symbol("{%s}", m->ip);
 261                 pr_cont("\n");
 262         }
 263
 264         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 265         if (m->addr)
 266                 pr_cont("ADDR %llx ", m->addr);
 267         if (m->misc)
 268                 pr_cont("MISC %llx ", m->misc);
 269
 270         pr_cont("\n");
 271         /*
 272          * Note this output is parsed by external tools and old fields
 273          * should not be changed.
 274          */
 275         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 276                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 277                 cpu_data(m->extcpu).microcode);
 278
 279         /*
 280          * Print out human-readable details about the MCE error,
 281          * (if the CPU has an implementation for that)
 282          */
 283         ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 284         if (ret == NOTIFY_STOP)
 285                 return;
 286
 287         pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 288 }
 289
 290 #define PANIC_TIMEOUT 5 /* 5 seconds */
 291
 292 static atomic_t mce_panicked;
 293
 294 static int fake_panic;
 295 static atomic_t mce_fake_panicked;
 296
 297 /* Panic in progress. Enable interrupts and wait for final IPI */
 298 static void wait_for_panic(void)
 299 {
 300         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 301
 302         preempt_disable();
 303         local_irq_enable();
 304         while (timeout-- > 0)
 305                 udelay(1);
 306         if (panic_timeout == 0)
 307                 panic_timeout = mca_cfg.panic_timeout;
 308         panic("Panicing machine check CPU died");
 309 }
 310
 311 static void mce_panic(const char *msg, struct mce *final, char *exp)
 312 {
 313         int i, apei_err = 0;
 314
 315         if (!fake_panic) {
 316                 /*
 317                  * Make sure only one CPU runs in machine check panic
 318                  */
 319                 if (atomic_inc_return(&mce_panicked) > 1)
 320                         wait_for_panic();
 321                 barrier();
 322
 323                 bust_spinlocks(1);
 324                 console_verbose();
 325         } else {
 326                 /* Don't log too much for fake panic */
 327                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 328                         return;
 329         }
 330         /* First print corrected ones that are still unlogged */
 331         for (i = 0; i < MCE_LOG_LEN; i++) {
 332                 struct mce *m = &mcelog.entry[i];
 333                 if (!(m->status & MCI_STATUS_VAL))
 334                         continue;
 335                 if (!(m->status & MCI_STATUS_UC)) {
 336                         print_mce(m);
 337                         if (!apei_err)
 338                                 apei_err = apei_write_mce(m);
 339                 }
 340         }
 341         /* Now print uncorrected but with the final one last */
 342         for (i = 0; i < MCE_LOG_LEN; i++) {
 343                 struct mce *m = &mcelog.entry[i];
 344                 if (!(m->status & MCI_STATUS_VAL))
 345                         continue;
 346                 if (!(m->status & MCI_STATUS_UC))
 347                         continue;
 348                 if (!final || memcmp(m, final, sizeof(struct mce))) {
 349                         print_mce(m);
 350                         if (!apei_err)
 351                                 apei_err = apei_write_mce(m);
 352                 }
 353         }
 354         if (final) {
 355                 print_mce(final);
 356                 if (!apei_err)
 357                         apei_err = apei_write_mce(final);
 358         }
 359         if (cpu_missing)
 360                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 361         if (exp)
 362                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 363         if (!fake_panic) {
 364                 if (panic_timeout == 0)
 365                         panic_timeout = mca_cfg.panic_timeout;
 366                 panic(msg);
 367         } else
 368                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 369 }
 370
 371 /* Support code for software error injection */
 372
 373 static int msr_to_offset(u32 msr)
 374 {
 375         unsigned bank = __this_cpu_read(injectm.bank);
 376
 377         if (msr == mca_cfg.rip_msr)
 378                 return offsetof(struct mce, ip);
 379         if (msr == MSR_IA32_MCx_STATUS(bank))
 380                 return offsetof(struct mce, status);
 381         if (msr == MSR_IA32_MCx_ADDR(bank))
 382                 return offsetof(struct mce, addr);
 383         if (msr == MSR_IA32_MCx_MISC(bank))
 384                 return offsetof(struct mce, misc);
 385         if (msr == MSR_IA32_MCG_STATUS)
 386                 return offsetof(struct mce, mcgstatus);
 387         return -1;
 388 }
 389
 390 /* MSR access wrappers used for error injection */
 391 static u64 mce_rdmsrl(u32 msr)
 392 {
 393         u64 v;
 394
 395         if (__this_cpu_read(injectm.finished)) {
 396                 int offset = msr_to_offset(msr);
 397
 398                 if (offset < 0)
 399                         return 0;
 400                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 401         }
 402
 403         if (rdmsrl_safe(msr, &v)) {
 404                 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 405                 /*
 406                  * Return zero in case the access faulted. This should
 407                  * not happen normally but can happen if the CPU does
 408                  * something weird, or if the code is buggy.
 409                  */
 410                 v = 0;
 411         }
 412
 413         return v;
 414 }
 415
 416 static void mce_wrmsrl(u32 msr, u64 v)
 417 {
 418         if (__this_cpu_read(injectm.finished)) {
 419                 int offset = msr_to_offset(msr);
 420
 421                 if (offset >= 0)
 422                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 423                 return;
 424         }
 425         wrmsrl(msr, v);
 426 }
 427
 428 /*
 429  * Collect all global (w.r.t. this processor) status about this machine
 430  * check into our "mce" struct so that we can use it later to assess
 431  * the severity of the problem as we read per-bank specific details.
 432  */
 433 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 434 {
 435         mce_setup(m);
 436
 437         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 438         if (regs) {
 439                 /*
 440                  * Get the address of the instruction at the time of
 441                  * the machine check error.
 442                  */
 443                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 444                         m->ip = regs->ip;
 445                         m->cs = regs->cs;
 446
 447                         /*
 448                          * When in VM86 mode make the cs look like ring 3
 449                          * always. This is a lie, but it's better than passing
 450                          * the additional vm86 bit around everywhere.
 451                          */
 452                         if (v8086_mode(regs))
 453                                 m->cs |= 3;
 454                 }
 455                 /* Use accurate RIP reporting if available. */
 456                 if (mca_cfg.rip_msr)
 457                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 458         }
 459 }
 460
 461 /*
 462  * Simple lockless ring to communicate PFNs from the exception handler with the
 463  * process context work function. This is vastly simplified because there's
 464  * only a single reader and a single writer.
 465  */
 466 #define MCE_RING_SIZE 16        /* we use one entry less */
 467
 468 struct mce_ring {
 469         unsigned short start;
 470         unsigned short end;
 471         unsigned long ring[MCE_RING_SIZE];
 472 };
 473 static DEFINE_PER_CPU(struct mce_ring, mce_ring);
 474
 475 /* Runs with CPU affinity in workqueue */
 476 static int mce_ring_empty(void)
 477 {
 478         struct mce_ring *r = this_cpu_ptr(&mce_ring);
 479
 480         return r->start == r->end;
 481 }
 482
 483 static int mce_ring_get(unsigned long *pfn)
 484 {
 485         struct mce_ring *r;
 486         int ret = 0;
 487
 488         *pfn = 0;
 489         get_cpu();
 490         r = this_cpu_ptr(&mce_ring);
 491         if (r->start == r->end)
 492                 goto out;
 493         *pfn = r->ring[r->start];
 494         r->start = (r->start + 1) % MCE_RING_SIZE;
 495         ret = 1;
 496 out:
 497         put_cpu();
 498         return ret;
 499 }
 500
 501 /* Always runs in MCE context with preempt off */
 502 static int mce_ring_add(unsigned long pfn)
 503 {
 504         struct mce_ring *r = this_cpu_ptr(&mce_ring);
 505         unsigned next;
 506
 507         next = (r->end + 1) % MCE_RING_SIZE;
 508         if (next == r->start)
 509                 return -1;
 510         r->ring[r->end] = pfn;
 511         wmb();
 512         r->end = next;
 513         return 0;
 514 }
 515
 516 int mce_available(struct cpuinfo_x86 *c)
 517 {
 518         if (mca_cfg.disabled)
 519                 return 0;
 520         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 521 }
 522
 523 static void mce_schedule_work(void)
 524 {
 525         if (!mce_ring_empty())
 526                 schedule_work(this_cpu_ptr(&mce_work));
 527 }
 528
 529 static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 530
 531 static void mce_irq_work_cb(struct irq_work *entry)
 532 {
 533         mce_notify_irq();
 534         mce_schedule_work();
 535 }
 536
 537 static void mce_report_event(struct pt_regs *regs)
 538 {
 539         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 540                 mce_notify_irq();
 541                 /*
 542                  * Triggering the work queue here is just an insurance
 543                  * policy in case the syscall exit notify handler
 544                  * doesn't run soon enough or ends up running on the
 545                  * wrong CPU (can happen when audit sleeps)
 546                  */
 547                 mce_schedule_work();
 548                 return;
 549         }
 550
 551         irq_work_queue(this_cpu_ptr(&mce_irq_work));
 552 }
 553
 554 /*
 555  * Read ADDR and MISC registers.
 556  */
 557 static void mce_read_aux(struct mce *m, int i)
 558 {
 559         if (m->status & MCI_STATUS_MISCV)
 560                 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 561         if (m->status & MCI_STATUS_ADDRV) {
 562                 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 563
 564                 /*
 565                  * Mask the reported address by the reported granularity.
 566                  */
 567                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 568                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 569                         m->addr >>= shift;
 570                         m->addr <<= shift;
 571                 }
 572         }
 573 }
 574
 575 static bool memory_error(struct mce *m)
 576 {
 577         struct cpuinfo_x86 *c = &boot_cpu_data;
 578
 579         if (c->x86_vendor == X86_VENDOR_AMD) {
 580                 /*
 581                  * coming soon
 582                  */
 583                 return false;
 584         } else if (c->x86_vendor == X86_VENDOR_INTEL) {
 585                 /*
 586                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 587                  *
 588                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 589                  * indicating a memory error. Bit 8 is used for indicating a
 590                  * cache hierarchy error. The combination of bit 2 and bit 3
 591                  * is used for indicating a `generic' cache hierarchy error
 592                  * But we can't just blindly check the above bits, because if
 593                  * bit 11 is set, then it is a bus/interconnect error - and
 594                  * either way the above bits just gives more detail on what
 595                  * bus/interconnect error happened. Note that bit 12 can be
 596                  * ignored, as it's the "filter" bit.
 597                  */
 598                 return (m->status & 0xef80) == BIT(7) ||
 599                        (m->status & 0xef00) == BIT(8) ||
 600                        (m->status & 0xeffc) == 0xc;
 601         }
 602
 603         return false;
 604 }
 605
 606 DEFINE_PER_CPU(unsigned, mce_poll_count);
 607
 608 /*
 609  * Poll for corrected events or events that happened before reset.
 610  * Those are just logged through /dev/mcelog.
 611  *
 612  * This is executed in standard interrupt context.
 613  *
 614  * Note: spec recommends to panic for fatal unsignalled
 615  * errors here. However this would be quite problematic --
 616  * we would need to reimplement the Monarch handling and
 617  * it would mess up the exclusion between exception handler
 618  * and poll hander -- * so we skip this for now.
 619  * These cases should not happen anyways, or only when the CPU
 620  * is already totally * confused. In this case it's likely it will
 621  * not fully execute the machine check handler either.
 622  */
 623 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 624 {
 625         bool error_logged = false;
 626         struct mce m;
 627         int severity;
 628         int i;
 629
 630         this_cpu_inc(mce_poll_count);
 631
 632         mce_gather_info(&m, NULL);
 633
 634         for (i = 0; i < mca_cfg.banks; i++) {
 635                 if (!mce_banks[i].ctl || !test_bit(i, *b))
 636                         continue;
 637
 638                 m.misc = 0;
 639                 m.addr = 0;
 640                 m.bank = i;
 641                 m.tsc = 0;
 642
 643                 barrier();
 644                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 645                 if (!(m.status & MCI_STATUS_VAL))
 646                         continue;
 647
 648
 649                 /*
 650                  * Uncorrected or signalled events are handled by the exception
 651                  * handler when it is enabled, so don't process those here.
 652                  *
 653                  * TBD do the same check for MCI_STATUS_EN here?
 654                  */
 655                 if (!(flags & MCP_UC) &&
 656                     (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 657                         continue;
 658
 659                 mce_read_aux(&m, i);
 660
 661                 if (!(flags & MCP_TIMESTAMP))
 662                         m.tsc = 0;
 663
 664                 severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 665
 666                 /*
 667                  * In the cases where we don't have a valid address after all,
 668                  * do not add it into the ring buffer.
 669                  */
 670                 if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 671                         if (m.status & MCI_STATUS_ADDRV) {
 672                                 mce_ring_add(m.addr >> PAGE_SHIFT);
 673                                 mce_schedule_work();
 674                         }
 675                 }
 676
 677                 /*
 678                  * Don't get the IP here because it's unlikely to
 679                  * have anything to do with the actual error location.
 680                  */
 681                 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
 682                         error_logged = true;
 683                         mce_log(&m);
 684                 }
 685
 686                 /*
 687                  * Clear state for this bank.
 688                  */
 689                 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 690         }
 691
 692         /*
 693          * Don't clear MCG_STATUS here because it's only defined for
 694          * exceptions.
 695          */
 696
 697         sync_core();
 698
 699         return error_logged;
 700 }
 701 EXPORT_SYMBOL_GPL(machine_check_poll);
 702
 703 /*
 704  * Do a quick check if any of the events requires a panic.
 705  * This decides if we keep the events around or clear them.
 706  */
 707 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 708                           struct pt_regs *regs)
 709 {
 710         int i, ret = 0;
 711         char *tmp;
 712
 713         for (i = 0; i < mca_cfg.banks; i++) {
 714                 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 715                 if (m->status & MCI_STATUS_VAL) {
 716                         __set_bit(i, validp);
 717                         if (quirk_no_way_out)
 718                                 quirk_no_way_out(i, m, regs);
 719                 }
 720
 721                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 722                         *msg = tmp;
 723                         ret = 1;
 724                 }
 725         }
 726         return ret;
 727 }
 728
 729 /*
 730  * Variable to establish order between CPUs while scanning.
 731  * Each CPU spins initially until executing is equal its number.
 732  */
 733 static atomic_t mce_executing;
 734
 735 /*
 736  * Defines order of CPUs on entry. First CPU becomes Monarch.
 737  */
 738 static atomic_t mce_callin;
 739
 740 /*
 741  * Check if a timeout waiting for other CPUs happened.
 742  */
 743 static int mce_timed_out(u64 *t, const char *msg)
 744 {
 745         /*
 746          * The others already did panic for some reason.
 747          * Bail out like in a timeout.
 748          * rmb() to tell the compiler that system_state
 749          * might have been modified by someone else.
 750          */
 751         rmb();
 752         if (atomic_read(&mce_panicked))
 753                 wait_for_panic();
 754         if (!mca_cfg.monarch_timeout)
 755                 goto out;
 756         if ((s64)*t < SPINUNIT) {
 757                 if (mca_cfg.tolerant <= 1)
 758                         mce_panic(msg, NULL, NULL);
 759                 cpu_missing = 1;
 760                 return 1;
 761         }
 762         *t -= SPINUNIT;
 763 out:
 764         touch_nmi_watchdog();
 765         return 0;
 766 }
 767
 768 /*
 769  * The Monarch's reign.  The Monarch is the CPU who entered
 770  * the machine check handler first. It waits for the others to
 771  * raise the exception too and then grades them. When any
 772  * error is fatal panic. Only then let the others continue.
 773  *
 774  * The other CPUs entering the MCE handler will be controlled by the
 775  * Monarch. They are called Subjects.
 776  *
 777  * This way we prevent any potential data corruption in a unrecoverable case
 778  * and also makes sure always all CPU's errors are examined.
 779  *
 780  * Also this detects the case of a machine check event coming from outer
 781  * space (not detected by any CPUs) In this case some external agent wants
 782  * us to shut down, so panic too.
 783  *
 784  * The other CPUs might still decide to panic if the handler happens
 785  * in a unrecoverable place, but in this case the system is in a semi-stable
 786  * state and won't corrupt anything by itself. It's ok to let the others
 787  * continue for a bit first.
 788  *
 789  * All the spin loops have timeouts; when a timeout happens a CPU
 790  * typically elects itself to be Monarch.
 791  */
 792 static void mce_reign(void)
 793 {
 794         int cpu;
 795         struct mce *m = NULL;
 796         int global_worst = 0;
 797         char *msg = NULL;
 798         char *nmsg = NULL;
 799
 800         /*
 801          * This CPU is the Monarch and the other CPUs have run
 802          * through their handlers.
 803          * Grade the severity of the errors of all the CPUs.
 804          */
 805         for_each_possible_cpu(cpu) {
 806                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 807                                             mca_cfg.tolerant,
 808                                             &nmsg, true);
 809                 if (severity > global_worst) {
 810                         msg = nmsg;
 811                         global_worst = severity;
 812                         m = &per_cpu(mces_seen, cpu);
 813                 }
 814         }
 815
 816         /*
 817          * Cannot recover? Panic here then.
 818          * This dumps all the mces in the log buffer and stops the
 819          * other CPUs.
 820          */
 821         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 822                 mce_panic("Fatal machine check", m, msg);
 823
 824         /*
 825          * For UC somewhere we let the CPU who detects it handle it.
 826          * Also must let continue the others, otherwise the handling
 827          * CPU could deadlock on a lock.
 828          */
 829
 830         /*
 831          * No machine check event found. Must be some external
 832          * source or one CPU is hung. Panic.
 833          */
 834         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 835                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 836
 837         /*
 838          * Now clear all the mces_seen so that they don't reappear on
 839          * the next mce.
 840          */
 841         for_each_possible_cpu(cpu)
 842                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 843 }
 844
/* Sum of the no_way_out values contributed by the CPUs in mce_start(). */
static atomic_t global_nwo;

/*
 * Start of Monarch synchronization. This waits until all CPUs have
 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires panic. Then it executes them
 * in the entry order.
 * TBD double check parallel CPU hotunplug
 *
 * @no_way_out: on entry, this CPU's local no_way_out value, which is added
 *              to global_nwo; on success it is overwritten with the value
 *              of global_nwo read after the rendezvous. It is left
 *              untouched on the failure paths.
 *
 * Returns this CPU's call-in order (1 is the Monarch), or -1 when
 * mca_cfg.monarch_timeout is 0 or one of the waits is reported as timed
 * out by mce_timed_out().
 */
static int mce_start(int *no_way_out)
{
        int order;
        int cpus = num_online_cpus();
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        /* A zero timeout means no rendezvous is attempted at all. */
        if (!timeout)
                return -1;

        atomic_add(*no_way_out, &global_nwo);
        /*
         * global_nwo should be updated before mce_callin
         */
        smp_wmb();
        /* The value returned by the increment is this CPU's entry order. */
        order = atomic_inc_return(&mce_callin);

        /*
         * Wait for everyone.
         */
        while (atomic_read(&mce_callin) != cpus) {
                if (mce_timed_out(&timeout,
                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
                        /* Give up: drop the shared no_way_out state. */
                        atomic_set(&global_nwo, 0);
                        return -1;
                }
                ndelay(SPINUNIT);
        }

        /*
         * mce_callin should be read before global_nwo
         */
        smp_rmb();

        if (order == 1) {
                /*
                 * Monarch: Starts executing now, the others wait.
                 */
                atomic_set(&mce_executing, 1);
        } else {
                /*
                 * Subject: Now start the scanning loop one by one in
                 * the original callin order.
                 * This way when there are any shared banks it will be
                 * only seen by one CPU before cleared, avoiding duplicates.
                 */
                /* Spin until mce_executing has reached this CPU's order. */
                while (atomic_read(&mce_executing) < order) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
                                atomic_set(&global_nwo, 0);
                                return -1;
                        }
                        ndelay(SPINUNIT);
                }
        }

        /*
         * Cache the global no_way_out state.
         */
        *no_way_out = atomic_read(&global_nwo);

        return order;
}
 916
/*
 * Synchronize between CPUs after main scanning loop.
 * This invokes the bulk of the Monarch processing.
 *
 * @order: value returned by mce_start() on this CPU; 1 identifies the
 *         Monarch, a negative value means the entry rendezvous failed.
 *
 * Returns 0 when the exit rendezvous completed and -1 otherwise (zero
 * monarch_timeout, negative @order, or a wait reported as timed out by
 * mce_timed_out()). Every -1 path, as well as the Monarch's success path,
 * resets global_nwo, mce_callin and mce_executing to 0.
 */
static int mce_end(int order)
{
        int ret = -1;
        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

        if (!timeout)
                goto reset;
        if (order < 0)
                goto reset;

        /*
         * Allow others to run.
         */
        atomic_inc(&mce_executing);

        if (order == 1) {
                /* CHECKME: Can this race with a parallel hotplug? */
                int cpus = num_online_cpus();

                /*
                 * Monarch: Wait for everyone to go through their scanning
                 * loops.
                 *
                 * mce_start() set mce_executing to 1 for the Monarch and
                 * every CPU increments it once above, so it only exceeds
                 * cpus after all of them have passed through here.
                 */
                while (atomic_read(&mce_executing) <= cpus) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU unable to finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                mce_reign();
                barrier();
                ret = 0;
        } else {
                /*
                 * Subject: Wait for Monarch to finish.
                 */
                /* The Monarch signals completion by zeroing mce_executing below. */
                while (atomic_read(&mce_executing) != 0) {
                        if (mce_timed_out(&timeout,
                                          "Timeout: Monarch CPU did not finish machine check processing"))
                                goto reset;
                        ndelay(SPINUNIT);
                }

                /*
                 * Don't reset anything. That's done by the Monarch.
                 */
                return 0;
        }

        /*
         * Reset all global state.
         */
reset:
        atomic_set(&global_nwo, 0);
        atomic_set(&mce_callin, 0);
        barrier();

        /*
         * Let others run again.
         */
        atomic_set(&mce_executing, 0);
        return ret;
}
 985
 986 /*
 987  * Check if the address reported by the CPU is in a format we can parse.
 988  * It would be possible to add code for most other cases, but all would
 989  * be somewhat complicated (e.g. segment offset would require an instruction
 990  * parser). So only support physical addresses up to page granuality for now.
 991  */
 992 static int mce_usable_address(struct mce *m)
 993 {
 994         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 995                 return 0;
 996         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 997                 return 0;
 998         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 999                 return 0;
1000         return 1;
1001 }
1002
1003 static void mce_clear_state(unsigned long *toclear)
1004 {
1005         int i;
1006
1007         for (i = 0; i < mca_cfg.banks; i++) {
1008                 if (test_bit(i, toclear))
1009                         mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1010         }
1011 }
1012
1013 /*
1014  * The actual machine check handler. This only handles real
1015  * exceptions when something got corrupted coming in through int 18.
1016  *
1017  * This is executed in NMI context not subject to normal locking rules. This
1018  * implies that most kernel services cannot be safely used. Don't even
1019  * think about putting a printk in there!
1020  *
1021  * On Intel systems this is entered on all CPUs in parallel through
1022  * MCE broadcast. However some CPUs might be broken beyond repair,
1023  * so be always careful when synchronizing with others.
1024  */
1025 void do_machine_check(struct pt_regs *regs, long error_code)
1026 {
1027         struct mca_config *cfg = &mca_cfg;
1028         struct mce m, *final;
1029         enum ctx_state prev_state;
1030         int i;
1031         int worst = 0;
1032         int severity;
1033         /*
1034          * Establish sequential order between the CPUs entering the machine
1035          * check handler.
1036          */
1037         int order;
1038         /*
1039          * If no_way_out gets set, there is no safe way to recover from this
1040          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1041          */
1042         int no_way_out = 0;
1043         /*
1044          * If kill_it gets set, there might be a way to recover from this
1045          * error.
1046          */
1047         int kill_it = 0;
1048         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1049         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1050         char *msg = "Unknown";
1051         u64 recover_paddr = ~0ull;
1052         int flags = MF_ACTION_REQUIRED;
1053         int lmce = 0;
1054
1055         prev_state = ist_enter(regs);
1056
1057         this_cpu_inc(mce_exception_count);
1058
1059         if (!cfg->banks)
1060                 goto out;
1061
1062         mce_gather_info(&m, regs);
1063
1064         final = this_cpu_ptr(&mces_seen);
1065         *final = m;
1066
1067         memset(valid_banks, 0, sizeof(valid_banks));
1068         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1069
1070         barrier();
1071
1072         /*
1073          * When no restart IP might need to kill or panic.
1074          * Assume the worst for now, but if we find the
1075          * severity is MCE_AR_SEVERITY we have other options.
1076          */
1077         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1078                 kill_it = 1;
1079
1080         /*
1081          * Check if this MCE is signaled to only this logical processor
1082          */
1083         if (m.mcgstatus & MCG_STATUS_LMCES)
1084                 lmce = 1;
1085         else {
1086                 /*
1087                  * Go through all the banks in exclusion of the other CPUs.
1088                  * This way we don't report duplicated events on shared banks
1089                  * because the first one to see it will clear it.
1090                  * If this is a Local MCE, then no need to perform rendezvous.
1091                  */
1092                 order = mce_start(&no_way_out);
1093         }
1094
1095         for (i = 0; i < cfg->banks; i++) {
1096                 __clear_bit(i, toclear);
1097                 if (!test_bit(i, valid_banks))
1098                         continue;
1099                 if (!mce_banks[i].ctl)
1100                         continue;
1101
1102                 m.misc = 0;
1103                 m.addr = 0;
1104                 m.bank = i;
1105
1106                 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1107                 if ((m.status & MCI_STATUS_VAL) == 0)
1108                         continue;
1109
1110                 /*
1111                  * Non uncorrected or non signaled errors are handled by
1112                  * machine_check_poll. Leave them alone, unless this panics.
1113                  */
1114                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1115                         !no_way_out)
1116                         continue;
1117
1118                 /*
1119                  * Set taint even when machine check was not enabled.
1120                  */
1121                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1122
1123                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1124
1125                 /*
1126                  * When machine check was for corrected/deferred handler don't
1127                  * touch, unless we're panicing.
1128                  */
1129                 if ((severity == MCE_KEEP_SEVERITY ||
1130                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1131                         continue;
1132                 __set_bit(i, toclear);
1133                 if (severity == MCE_NO_SEVERITY) {
1134                         /*
1135                          * Machine check event was not enabled. Clear, but
1136                          * ignore.
1137                          */
1138                         continue;
1139                 }
1140
1141                 mce_read_aux(&m, i);
1142
1143                 /*
1144                  * Action optional error. Queue address for later processing.
1145                  * When the ring overflows we just ignore the AO error.
1146                  * RED-PEN add some logging mechanism when
1147                  * usable_address or mce_add_ring fails.
1148                  * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
1149                  */
1150                 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1151                         mce_ring_add(m.addr >> PAGE_SHIFT);
1152
1153                 mce_log(&m);
1154
1155                 if (severity > worst) {
1156                         *final = m;
1157                         worst = severity;
1158                 }
1159         }
1160
1161         /* mce_clear_state will clear *final, save locally for use later */
1162         m = *final;
1163
1164         if (!no_way_out)
1165                 mce_clear_state(toclear);
1166
1167         /*
1168          * Do most of the synchronization with other CPUs.
1169          * When there's any problem use only local no_way_out state.
1170          */
1171         if (!lmce) {
1172                 if (mce_end(order) < 0)
1173                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1174         } else {
1175                 /*
1176                  * Local MCE skipped calling mce_reign()
1177                  * If we found a fatal error, we need to panic here.
1178                  */
1179                  if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1180                         mce_panic("Machine check from unknown source",
1181                                 NULL, NULL);
1182         }
1183
1184         /*
1185          * At insane "tolerant" levels we take no action. Otherwise
1186          * we only die if we have no other choice. For less serious
1187          * issues we try to recover, or limit damage to the current
1188          * process.
1189          */
1190         if (cfg->tolerant < 3) {
1191                 if (no_way_out)
1192                         mce_panic("Fatal machine check on current CPU", &m, msg);
1193                 if (worst == MCE_AR_SEVERITY) {
1194                         recover_paddr = m.addr;
1195                         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1196                                 flags |= MF_MUST_KILL;
1197                 } else if (kill_it) {
1198                         force_sig(SIGBUS, current);
1199                 }
1200         }
1201
1202         if (worst > 0)
1203                 mce_report_event(regs);
1204         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1205 out:
1206         sync_core();
1207
1208         if (recover_paddr == ~0ull)
1209                 goto done;
1210
1211         pr_err("Uncorrected hardware memory error in user-access at %llx",
1212                  recover_paddr);
1213         /*
1214          * We must call memory_failure() here even if the current process is
1215          * doomed. We still need to mark the page as poisoned and alert any
1216          * other users of the page.
1217          */
1218         ist_begin_non_atomic(regs);
1219         local_irq_enable();
1220         if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1221                 pr_err("Memory error not recovered");
1222                 force_sig(SIGBUS, current);
1223         }
1224         local_irq_disable();
1225         ist_end_non_atomic();
1226 done:
1227         ist_exit(regs, prev_state);
1228 }
1229 EXPORT_SYMBOL_GPL(do_machine_check);
1230
#ifndef CONFIG_MEMORY_FAILURE
/*
 * Fallback used when the kernel is built without CONFIG_MEMORY_FAILURE.
 * It performs no recovery: it only logs that the error in page @pfn was
 * ignored and returns 0. @vector is not used.
 *
 * Hits BUG_ON() if @flags contains MF_ACTION_REQUIRED.
 */
int memory_failure(unsigned long pfn, int vector, int flags)
{
        /* mce_severity() should not hand us an ACTION_REQUIRED error */
        BUG_ON(flags & MF_ACTION_REQUIRED);
        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
               pfn);

        return 0;
}
#endif
1243
1244 /*
1245  * Action optional processing happens here (picking up
1246  * from the list of faulting pages that do_machine_check()
1247  * placed into the "ring").
1248  */
1249 static void mce_process_work(struct work_struct *dummy)
1250 {
1251         unsigned long pfn;
1252
1253         while (mce_ring_get(&pfn))
1254                 memory_failure(pfn, MCE_VECTOR, 0);
1255 }
1256
#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 *
 * The record is prepared with mce_setup(), tagged with MCE_THERMAL_BANK
 * as its bank and handed to mce_log().
 */
void mce_log_therm_throt_event(__u64 status)
{
        struct mce m;

        /* NOTE(review): presumably fills in the generic record fields - confirm */
        mce_setup(&m);
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */
1281
/*
 * Periodic polling timer for "silent" machine check errors.  If the
 * poller finds an MCE, poll 2x faster.  When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 *
 * mce_timer_fn() never lets the interval drop below HZ/100 jiffies.
 */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

/* Per-CPU polling state: the current interval and the timer itself. */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
1291
/* Default interval hook: hands the polling interval back unchanged. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
        unsigned long unchanged = interval;

        return unchanged;
}

/*
 * Interval adjustment hook that mce_timer_fn() calls through. It points at
 * the default above unless __mcheck_cpu_init_vendor() replaces it.
 */
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
        mce_adjust_timer_default;
1298
1299 static void __restart_timer(struct timer_list *t, unsigned long interval)
1300 {
1301         unsigned long when = jiffies + interval;
1302         unsigned long flags;
1303
1304         local_irq_save(flags);
1305
1306         if (timer_pending(t)) {
1307                 if (time_before(when, t->expires))
1308                         mod_timer_pinned(t, when);
1309         } else {
1310                 t->expires = round_jiffies(when);
1311                 add_timer_on(t, smp_processor_id());
1312         }
1313
1314         local_irq_restore(flags);
1315 }
1316
1317 static void mce_timer_fn(unsigned long data)
1318 {
1319         struct timer_list *t = this_cpu_ptr(&mce_timer);
1320         int cpu = smp_processor_id();
1321         unsigned long iv;
1322
1323         WARN_ON(cpu != data);
1324
1325         iv = __this_cpu_read(mce_next_interval);
1326
1327         if (mce_available(this_cpu_ptr(&cpu_info))) {
1328                 machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1329
1330                 if (mce_intel_cmci_poll()) {
1331                         iv = mce_adjust_timer(iv);
1332                         goto done;
1333                 }
1334         }
1335
1336         /*
1337          * Alert userspace if needed. If we logged an MCE, reduce the polling
1338          * interval, otherwise increase the polling interval.
1339          */
1340         if (mce_notify_irq())
1341                 iv = max(iv / 2, (unsigned long) HZ/100);
1342         else
1343                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1344
1345 done:
1346         __this_cpu_write(mce_next_interval, iv);
1347         __restart_timer(t, iv);
1348 }
1349
1350 /*
1351  * Ensure that the timer is firing in @interval from now.
1352  */
1353 void mce_timer_kick(unsigned long interval)
1354 {
1355         struct timer_list *t = this_cpu_ptr(&mce_timer);
1356         unsigned long iv = __this_cpu_read(mce_next_interval);
1357
1358         __restart_timer(t, interval);
1359
1360         if (interval < iv)
1361                 __this_cpu_write(mce_next_interval, interval);
1362 }
1363
1364 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1365 static void mce_timer_delete_all(void)
1366 {
1367         int cpu;
1368
1369         for_each_online_cpu(cpu)
1370                 del_timer_sync(&per_cpu(mce_timer, cpu));
1371 }
1372
/*
 * Work handler: runs the user mode helper named by mce_helper with the
 * arguments in mce_helper_argv, without waiting for it (UMH_NO_WAIT).
 * @work is not used.
 */
static void mce_do_trigger(struct work_struct *work)
{
        call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

/* Work item that mce_notify_irq() schedules to launch the helper. */
static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1379
1380 /*
1381  * Notify the user(s) about new machine check events.
1382  * Can be called from interrupt context, but not from machine check/NMI
1383  * context.
1384  */
1385 int mce_notify_irq(void)
1386 {
1387         /* Not more than two messages every minute */
1388         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1389
1390         if (test_and_clear_bit(0, &mce_need_notify)) {
1391                 /* wake processes polling /dev/mcelog */
1392                 wake_up_interruptible(&mce_chrdev_wait);
1393
1394                 if (mce_helper[0])
1395                         schedule_work(&mce_trigger_work);
1396
1397                 if (__ratelimit(&ratelimit))
1398                         pr_info(HW_ERR "Machine check events logged\n");
1399
1400                 return 1;
1401         }
1402         return 0;
1403 }
1404 EXPORT_SYMBOL_GPL(mce_notify_irq);
1405
1406 static int __mcheck_cpu_mce_banks_init(void)
1407 {
1408         int i;
1409         u8 num_banks = mca_cfg.banks;
1410
1411         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1412         if (!mce_banks)
1413                 return -ENOMEM;
1414
1415         for (i = 0; i < num_banks; i++) {
1416                 struct mce_bank *b = &mce_banks[i];
1417
1418                 b->ctl = -1ULL;
1419                 b->init = 1;
1420         }
1421         return 0;
1422 }
1423
1424 /*
1425  * Initialize Machine Checks for a CPU.
1426  */
1427 static int __mcheck_cpu_cap_init(void)
1428 {
1429         unsigned b;
1430         u64 cap;
1431
1432         rdmsrl(MSR_IA32_MCG_CAP, cap);
1433
1434         b = cap & MCG_BANKCNT_MASK;
1435         if (!mca_cfg.banks)
1436                 pr_info("CPU supports %d MCE banks\n", b);
1437
1438         if (b > MAX_NR_BANKS) {
1439                 pr_warn("Using only %u machine check banks out of %u\n",
1440                         MAX_NR_BANKS, b);
1441                 b = MAX_NR_BANKS;
1442         }
1443
1444         /* Don't support asymmetric configurations today */
1445         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1446         mca_cfg.banks = b;
1447
1448         if (!mce_banks) {
1449                 int err = __mcheck_cpu_mce_banks_init();
1450
1451                 if (err)
1452                         return err;
1453         }
1454
1455         /* Use accurate RIP reporting if available. */
1456         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1457                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1458
1459         if (cap & MCG_SER_P)
1460                 mca_cfg.ser = true;
1461
1462         return 0;
1463 }
1464
1465 static void __mcheck_cpu_init_generic(void)
1466 {
1467         enum mcp_flags m_fl = 0;
1468         mce_banks_t all_banks;
1469         u64 cap;
1470         int i;
1471
1472         if (!mca_cfg.bootlog)
1473                 m_fl = MCP_DONTLOG;
1474
1475         /*
1476          * Log the machine checks left over from the previous reset.
1477          */
1478         bitmap_fill(all_banks, MAX_NR_BANKS);
1479         machine_check_poll(MCP_UC | m_fl, &all_banks);
1480
1481         cr4_set_bits(X86_CR4_MCE);
1482
1483         rdmsrl(MSR_IA32_MCG_CAP, cap);
1484         if (cap & MCG_CTL_P)
1485                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1486
1487         for (i = 0; i < mca_cfg.banks; i++) {
1488                 struct mce_bank *b = &mce_banks[i];
1489
1490                 if (!b->init)
1491                         continue;
1492                 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1493                 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1494         }
1495 }
1496
1497 /*
1498  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1499  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1500  * Vol 3B Table 15-20). But this confuses both the code that determines
1501  * whether the machine check occurred in kernel or user mode, and also
1502  * the severity assessment code. Pretend that EIPV was set, and take the
1503  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1504  */
1505 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1506 {
1507         if (bank != 0)
1508                 return;
1509         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1510                 return;
1511         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1512                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1513                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1514                           MCACOD)) !=
1515                          (MCI_STATUS_UC|MCI_STATUS_EN|
1516                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1517                           MCI_STATUS_AR|MCACOD_INSTR))
1518                 return;
1519
1520         m->mcgstatus |= MCG_STATUS_EIPV;
1521         m->ip = regs->ip;
1522         m->cs = regs->cs;
1523 }
1524
/*
 * Add per CPU specific workarounds here
 *
 * Adjusts mca_cfg, mce_banks[], mce_flags and, on some AMD models, MSRs
 * according to the vendor, family and model found in @c.
 *
 * Returns -EOPNOTSUPP for X86_VENDOR_UNKNOWN and 0 otherwise.
 */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
        struct mca_config *cfg = &mca_cfg;

        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
                pr_info("unknown CPU type - not enabling MCE support\n");
                return -EOPNOTSUPP;
        }

        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD) {
                if (c->x86 == 15 && cfg->banks > 4) {
                        /*
                         * disable GART TBL walk error reporting, which
                         * trips off incorrectly with the IOMMU & 3ware
                         * & Cerberus:
                         */
                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
                }
                /* NOTE(review): bootlog < 0 presumably means "not set explicitly" - confirm */
                if (c->x86 <= 17 && cfg->bootlog < 0) {
                        /*
                         * Lots of broken BIOS around that don't clear them
                         * by default and leave crap in there. Don't log:
                         */
                        cfg->bootlog = 0;
                }
                /*
                 * Various K7s with broken bank 0 around. Always disable
                 * by default.
                 */
                if (c->x86 == 6 && cfg->banks > 0)
                        mce_banks[0].ctl = 0;

                /*
                 * overflow_recov is supported for F15h Models 00h-0fh
                 * even though we don't have a CPUID bit for it.
                 */
                if (c->x86 == 0x15 && c->x86_model <= 0xf)
                        mce_flags.overflow_recov = 1;

                /*
                 * Turn off MC4_MISC thresholding banks on those models since
                 * they're not supported there.
                 */
                if (c->x86 == 0x15 &&
                    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
                        int i;
                        u64 hwcr;
                        bool need_toggle;
                        u32 msrs[] = {
                                0x00000413, /* MC4_MISC0 */
                                0xc0000408, /* MC4_MISC1 */
                        };

                        rdmsrl(MSR_K7_HWCR, hwcr);

                        /* McStatusWrEn has to be set */
                        need_toggle = !(hwcr & BIT(18));

                        /* Set it temporarily if it was not already set. */
                        if (need_toggle)
                                wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

                        /* Clear CntP bit safely */
                        for (i = 0; i < ARRAY_SIZE(msrs); i++)
                                msr_clear_bit(msrs[i], 62);

                        /* restore old settings */
                        if (need_toggle)
                                wrmsrl(MSR_K7_HWCR, hwcr);
                }
        }

        if (c->x86_vendor == X86_VENDOR_INTEL) {
                /*
                 * SDM documents that on family 6 bank 0 should not be written
                 * because it aliases to another special BIOS controlled
                 * register.
                 * But it's not aliased anymore on model 0x1a+
                 * Don't ignore bank 0 completely because there could be a
                 * valid event later, merely don't write CTL0.
                 */

                if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
                        mce_banks[0].init = 0;

                /*
                 * All newer Intel systems support MCE broadcasting. Enable
                 * synchronization with a one second timeout.
                 */
                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
                        cfg->monarch_timeout < 0)
                        cfg->monarch_timeout = USEC_PER_SEC;

                /*
                 * There are also broken BIOSes on some Pentium M and
                 * earlier systems:
                 */
                if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
                        cfg->bootlog = 0;

                /* Family 6 model 45 gets the Sandy Bridge IFU quirk. */
                if (c->x86 == 6 && c->x86_model == 45)
                        quirk_no_way_out = quirk_sandybridge_ifu;
        }
        /* A timeout that is still negative here becomes 0 (no rendezvous). */
        if (cfg->monarch_timeout < 0)
                cfg->monarch_timeout = 0;
        /* NOTE(review): panic_timeout is presumably in seconds - confirm */
        if (cfg->bootlog != 0)
                cfg->panic_timeout = 30;

        return 0;
}
1636
1637 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1638 {
1639         if (c->x86 != 5)
1640                 return 0;
1641
1642         switch (c->x86_vendor) {
1643         case X86_VENDOR_INTEL:
1644                 intel_p5_mcheck_init(c);
1645                 return 1;
1646                 break;
1647         case X86_VENDOR_CENTAUR:
1648                 winchip_mcheck_init(c);
1649                 return 1;
1650                 break;
1651         }
1652
1653         return 0;
1654 }
1655
1656 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1657 {
1658         switch (c->x86_vendor) {
1659         case X86_VENDOR_INTEL:
1660                 mce_intel_feature_init(c);
1661                 mce_adjust_timer = cmci_intel_adjust_timer;
1662                 break;
1663
1664         case X86_VENDOR_AMD: {
1665                 u32 ebx = cpuid_ebx(0x80000007);
1666
1667                 mce_amd_feature_init(c);
1668                 mce_flags.overflow_recov = !!(ebx & BIT(0));
1669                 mce_flags.succor         = !!(ebx & BIT(1));
1670                 break;
1671                 }
1672
1673         default:
1674                 break;
1675         }
1676 }
1677
1678 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1679 {
1680         unsigned long iv = check_interval * HZ;
1681
1682         if (mca_cfg.ignore_ce || !iv)
1683                 return;
1684
1685         per_cpu(mce_next_interval, cpu) = iv;
1686
1687         t->expires = round_jiffies(jiffies + iv);
1688         add_timer_on(t, cpu);
1689 }
1690
/*
 * Set up the calling CPU's polling timer with mce_timer_fn() as callback
 * and the CPU number as its argument, then start it via mce_start_timer().
 */
static void __mcheck_cpu_init_timer(void)
{
        struct timer_list *t = this_cpu_ptr(&mce_timer);
        unsigned int cpu = smp_processor_id();

        setup_timer(t, mce_timer_fn, cpu);
        mce_start_timer(cpu, t);
}
1699
/*
 * Handle unconfigured int18 (should never happen).
 * Only logs the CPU number; @regs and @error_code are not used.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
               smp_processor_id());
}

/*
 * Call the installed machine check handler for this CPU setup.
 * Starts out as unexpected_machine_check(); mcheck_cpu_init() switches it
 * to do_machine_check() once setup succeeds.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
                                                unexpected_machine_check;
1710
1711 /*
1712  * Called for each booted CPU to set up machine checks.
1713  * Must be called with preempt off:
1714  */
1715 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1716 {
1717         if (mca_cfg.disabled)
1718                 return;
1719
1720         if (__mcheck_cpu_ancient_init(c))
1721                 return;
1722
1723         if (!mce_available(c))
1724                 return;
1725
1726         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1727                 mca_cfg.disabled = true;
1728                 return;
1729         }
1730
1731         machine_check_vector = do_machine_check;
1732
1733         __mcheck_cpu_init_generic();
1734         __mcheck_cpu_init_vendor(c);
1735         __mcheck_cpu_init_timer();
1736         INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
1737         init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
1738 }
1739
1740 /*
1741  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1742  */
1743
/*
 * Open-state bookkeeping for /dev/mcelog. mce_chrdev_open() and
 * mce_chrdev_release() update both counters under mce_chrdev_state_lock.
 */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;       /* #times opened */
static int mce_chrdev_open_exclu;       /* already open exclusive? */
1747
1748 static int mce_chrdev_open(struct inode *inode, struct file *file)
1749 {
1750         spin_lock(&mce_chrdev_state_lock);
1751
1752         if (mce_chrdev_open_exclu ||
1753             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1754                 spin_unlock(&mce_chrdev_state_lock);
1755
1756                 return -EBUSY;
1757         }
1758
1759         if (file->f_flags & O_EXCL)
1760                 mce_chrdev_open_exclu = 1;
1761         mce_chrdev_open_count++;
1762
1763         spin_unlock(&mce_chrdev_state_lock);
1764
1765         return nonseekable_open(inode, file);
1766 }
1767
1768 static int mce_chrdev_release(struct inode *inode, struct file *file)
1769 {
1770         spin_lock(&mce_chrdev_state_lock);
1771
1772         mce_chrdev_open_count--;
1773         mce_chrdev_open_exclu = 0;
1774
1775         spin_unlock(&mce_chrdev_state_lock);
1776
1777         return 0;
1778 }
1779
/*
 * Cross-CPU callback: store the executing CPU's TSC reading in the slot
 * of @data (an unsigned long array) indexed by that CPU's number.
 */
static void collect_tscs(void *data)
{
        unsigned long *tsc_by_cpu = data;
        unsigned int cpu = smp_processor_id();

        rdtscll(tsc_by_cpu[cpu]);
}
1786
/*
 * Set to 1 by __mce_read_apei() once apei_read_mce() reports an error or no
 * more records, or once clearing a record fails. mce_chrdev_read() and
 * mce_chrdev_poll() skip the APEI path while it is set.
 */
static int mce_apei_read_done;
1788
1789 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1790 static int __mce_read_apei(char __user **ubuf, size_t usize)
1791 {
1792         int rc;
1793         u64 record_id;
1794         struct mce m;
1795
1796         if (usize < sizeof(struct mce))
1797                 return -EINVAL;
1798
1799         rc = apei_read_mce(&m, &record_id);
1800         /* Error or no more MCE record */
1801         if (rc <= 0) {
1802                 mce_apei_read_done = 1;
1803                 /*
1804                  * When ERST is disabled, mce_chrdev_read() should return
1805                  * "no record" instead of "no device."
1806                  */
1807                 if (rc == -ENODEV)
1808                         return 0;
1809                 return rc;
1810         }
1811         rc = -EFAULT;
1812         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1813                 return rc;
1814         /*
1815          * In fact, we should have cleared the record after that has
1816          * been flushed to the disk or sent to network in
1817          * /sbin/mcelog, but we have no interface to support that now,
1818          * so just clear it to avoid duplication.
1819          */
1820         rc = apei_clear_mce(record_id);
1821         if (rc) {
1822                 mce_apei_read_done = 1;
1823                 return rc;
1824         }
1825         *ubuf += sizeof(struct mce);
1826
1827         return 0;
1828 }
1829
/*
 * read() handler for /dev/mcelog.
 *
 * While mce_apei_read_done is clear, the call first tries
 * __mce_read_apei(); if that fails or delivers a record, the call returns
 * immediately with that result. Otherwise it requires *@off == 0 and a
 * buffer of at least MCE_LOG_LEN records, copies the entries of mcelog to
 * @ubuf, zeroes them, and resets mcelog.next.
 *
 * Returns the number of bytes copied to @ubuf, or a negative errno:
 * -ENOMEM (scratch allocation), -EINVAL (offset or buffer size), -EFAULT
 * (a copy to user space failed), or an error from __mce_read_apei().
 */
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
                                size_t usize, loff_t *off)
{
        char __user *buf = ubuf;
        unsigned long *cpu_tsc;
        unsigned prev, next;
        int i, err;

        /* One TSC slot per CPU id; filled in by collect_tscs() further down. */
        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        mutex_lock(&mce_chrdev_read_mutex);

        if (!mce_apei_read_done) {
                err = __mce_read_apei(&buf, usize);
                /* Return on error, or when buf moved because a record was copied. */
                if (err || buf != ubuf)
                        goto out;
        }

        next = rcu_dereference_check_mce(mcelog.next);

        /* Only supports full reads right now */
        err = -EINVAL;
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
                goto out;

        err = 0;
        prev = 0;
        do {
                /* Copy out entries in [prev, next). */
                for (i = prev; i < next; i++) {
                        unsigned long start = jiffies;
                        struct mce *m = &mcelog.entry[i];

                        /*
                         * Wait for the entry's finished flag; once two
                         * jiffies have passed, zero the entry and skip it.
                         */
                        while (!m->finished) {
                                if (time_after_eq(jiffies, start + 2)) {
                                        memset(m, 0, sizeof(*m));
                                        goto timeout;
                                }
                                cpu_relax();
                        }
                        /* Read barrier between seeing finished and reading the entry. */
                        smp_rmb();
                        /* Nonzero results accumulate; turned into -EFAULT below. */
                        err |= copy_to_user(buf, m, sizeof(*m));
                        buf += sizeof(*m);
timeout:
                        ;
                }

                memset(mcelog.entry + prev, 0,
                       (next - prev) * sizeof(struct mce));
                prev = next;
                /*
                 * Reset mcelog.next to 0 only if it still equals the index
                 * consumed so far; if it moved, loop again over the new range.
                 */
                next = cmpxchg(&mcelog.next, prev, 0);
        } while (next != prev);

        synchronize_sched();

        /*
         * Collect entries that were still getting written before the
         * synchronize.
         */
        on_each_cpu(collect_tscs, cpu_tsc, 1);

        for (i = next; i < MCE_LOG_LEN; i++) {
                struct mce *m = &mcelog.entry[i];

                /*
                 * Take only finished entries whose tsc is below the value
                 * just sampled on the CPU recorded in the entry.
                 */
                if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
                        err |= copy_to_user(buf, m, sizeof(*m));
                        smp_rmb();
                        buf += sizeof(*m);
                        memset(m, 0, sizeof(*m));
                }
        }

        if (err)
                err = -EFAULT;

out:
        mutex_unlock(&mce_chrdev_read_mutex);
        kfree(cpu_tsc);

        /* On success the result is the byte count written to the user buffer. */
        return err ? err : buf - ubuf;
}
1912
1913 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1914 {
1915         poll_wait(file, &mce_chrdev_wait, wait);
1916         if (rcu_access_index(mcelog.next))
1917                 return POLLIN | POLLRDNORM;
1918         if (!mce_apei_read_done && apei_check_mce())
1919                 return POLLIN | POLLRDNORM;
1920         return 0;
1921 }
1922
1923 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1924                                 unsigned long arg)
1925 {
1926         int __user *p = (int __user *)arg;
1927
1928         if (!capable(CAP_SYS_ADMIN))
1929                 return -EPERM;
1930
1931         switch (cmd) {
1932         case MCE_GET_RECORD_LEN:
1933                 return put_user(sizeof(struct mce), p);
1934         case MCE_GET_LOG_LEN:
1935                 return put_user(MCE_LOG_LEN, p);
1936         case MCE_GETCLEAR_FLAGS: {
1937                 unsigned flags;
1938
1939                 do {
1940                         flags = mcelog.flags;
1941                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1942
1943                 return put_user(flags, p);
1944         }
1945         default:
1946                 return -ENOTTY;
1947         }
1948 }
1949
/*
 * Optional write handler for /dev/mcelog. Installed through
 * register_mce_write_callback(); mce_chrdev_write() forwards to it when set.
 */
static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
                            size_t usize, loff_t *off);
1952
/*
 * Install @fn as the write handler used by mce_chrdev_write(). The previous
 * handler, if any, is simply overwritten.
 */
void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
                             const char __user *ubuf,
                             size_t usize, loff_t *off))
{
        mce_write = fn;
}
EXPORT_SYMBOL_GPL(register_mce_write_callback);
1960
1961 ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1962                          size_t usize, loff_t *off)
1963 {
1964         if (mce_write)
1965                 return mce_write(filp, ubuf, usize, off);
1966         else
1967                 return -EINVAL;
1968 }
1969
1970 static const struct file_operations mce_chrdev_ops = {
1971         .open                   = mce_chrdev_open,
1972         .release                = mce_chrdev_release,
1973         .read                   = mce_chrdev_read,
1974         .write                  = mce_chrdev_write,
1975         .poll                   = mce_chrdev_poll,
1976         .unlocked_ioctl         = mce_chrdev_ioctl,
1977         .llseek                 = no_llseek,
1978 };
1979
1980 static struct miscdevice mce_chrdev_device = {
1981         MISC_MCELOG_MINOR,
1982         "mcelog",
1983         &mce_chrdev_ops,
1984 };
1985
1986 static void __mce_disable_bank(void *arg)
1987 {
1988         int bank = *((int *)arg);
1989         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1990         cmci_disable_bank(bank);
1991 }
1992
1993 void mce_disable_bank(int bank)
1994 {
1995         if (bank >= mca_cfg.banks) {
1996                 pr_warn(FW_BUG
1997                         "Ignoring request to disable invalid MCA bank %d.\n",
1998                         bank);
1999                 return;
2000         }
2001         set_bit(bank, mce_banks_ce_disabled);
2002         on_each_cpu(__mce_disable_bank, &bank, 1);
2003 }
2004
2005 /*
2006  * mce=off Disables machine check
2007  * mce=no_cmci Disables CMCI
2008  * mce=no_lmce Disables LMCE
2009  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2010  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2011  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2012  *      monarchtimeout is how long to wait for other CPUs on machine
2013  *      check, or 0 to not wait
2014  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2015  * mce=nobootlog Don't log MCEs from before booting.
2016  * mce=bios_cmci_threshold Don't program the CMCI threshold
2017  */
2018 static int __init mcheck_enable(char *str)
2019 {
2020         struct mca_config *cfg = &mca_cfg;
2021
2022         if (*str == 0) {
2023                 enable_p5_mce();
2024                 return 1;
2025         }
2026         if (*str == '=')
2027                 str++;
2028         if (!strcmp(str, "off"))
2029                 cfg->disabled = true;
2030         else if (!strcmp(str, "no_cmci"))
2031                 cfg->cmci_disabled = true;
2032         else if (!strcmp(str, "no_lmce"))
2033                 cfg->lmce_disabled = true;
2034         else if (!strcmp(str, "dont_log_ce"))
2035                 cfg->dont_log_ce = true;
2036         else if (!strcmp(str, "ignore_ce"))
2037                 cfg->ignore_ce = true;
2038         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2039                 cfg->bootlog = (str[0] == 'b');
2040         else if (!strcmp(str, "bios_cmci_threshold"))
2041                 cfg->bios_cmci_threshold = true;
2042         else if (isdigit(str[0])) {
2043                 if (get_option(&str, &cfg->tolerant) == 2)
2044                         get_option(&str, &(cfg->monarch_timeout));
2045         } else {
2046                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2047                 return 0;
2048         }
2049         return 1;
2050 }
2051 __setup("mce", mcheck_enable);
2052
/*
 * Init-time setup: runs mcheck_intel_therm_init() and
 * mcheck_vendor_init_severity(). Always returns 0.
 */
int __init mcheck_init(void)
{
        mcheck_intel_therm_init();
        mcheck_vendor_init_severity();

        return 0;
}
2060
2061 /*
2062  * mce_syscore: PM support
2063  */
2064
2065 /*
2066  * Disable machine checks on suspend and shutdown. We can't really handle
2067  * them later.
2068  */
2069 static int mce_disable_error_reporting(void)
2070 {
2071         int i;
2072
2073         for (i = 0; i < mca_cfg.banks; i++) {
2074                 struct mce_bank *b = &mce_banks[i];
2075
2076                 if (b->init)
2077                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2078         }
2079         return 0;
2080 }
2081
/* syscore suspend hook: turn off error reporting and return its result. */
static int mce_syscore_suspend(void)
{
        return mce_disable_error_reporting();
}
2086
/* syscore shutdown hook: turn off error reporting; the result is unused. */
static void mce_syscore_shutdown(void)
{
        mce_disable_error_reporting();
}
2091
2092 /*
2093  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2094  * Only one CPU is active at this time, the others get re-added later using
2095  * CPU hotplug:
2096  */
2097 static void mce_syscore_resume(void)
2098 {
2099         __mcheck_cpu_init_generic();
2100         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2101 }
2102
2103 static struct syscore_ops mce_syscore_ops = {
2104         .suspend        = mce_syscore_suspend,
2105         .shutdown       = mce_syscore_shutdown,
2106         .resume         = mce_syscore_resume,
2107 };
2108
2109 /*
2110  * mce_device: Sysfs support
2111  */
2112
2113 static void mce_cpu_restart(void *data)
2114 {
2115         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2116                 return;
2117         __mcheck_cpu_init_generic();
2118         __mcheck_cpu_init_timer();
2119 }
2120
/*
 * Reinit MCEs after user configuration changes: delete all MCE timers,
 * then run mce_cpu_restart() on every CPU and wait for it to finish.
 */
static void mce_restart(void)
{
        mce_timer_delete_all();
        on_each_cpu(mce_cpu_restart, NULL, 1);
}
2127
2128 /* Toggle features for corrected errors */
2129 static void mce_disable_cmci(void *data)
2130 {
2131         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2132                 return;
2133         cmci_clear();
2134 }
2135
2136 static void mce_enable_ce(void *all)
2137 {
2138         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2139                 return;
2140         cmci_reenable();
2141         cmci_recheck();
2142         if (all)
2143                 __mcheck_cpu_init_timer();
2144 }
2145
2146 static struct bus_type mce_subsys = {
2147         .name           = "machinecheck",
2148         .dev_name       = "machinecheck",
2149 };
2150
/*
 * Per-CPU pointer to the device created by mce_device_create(); reset to
 * NULL by mce_device_remove().
 */
DEFINE_PER_CPU(struct device *, mce_device);

/*
 * Optional hook. When non-NULL, mce_cpu_callback() invokes it with the
 * notifier action and CPU number on CPU_ONLINE and CPU_DEAD.
 */
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2154
/* Map a bank's embedded device_attribute back to its struct mce_bank. */
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
        return container_of(attr, struct mce_bank, attr);
}
2159
2160 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2161                          char *buf)
2162 {
2163         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2164 }
2165
2166 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2167                         const char *buf, size_t size)
2168 {
2169         u64 new;
2170
2171         if (kstrtou64(buf, 0, &new) < 0)
2172                 return -EINVAL;
2173
2174         attr_to_bank(attr)->ctl = new;
2175         mce_restart();
2176
2177         return size;
2178 }
2179
2180 static ssize_t
2181 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2182 {
2183         strcpy(buf, mce_helper);
2184         strcat(buf, "\n");
2185         return strlen(mce_helper) + 1;
2186 }
2187
2188 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2189                                 const char *buf, size_t siz)
2190 {
2191         char *p;
2192
2193         strncpy(mce_helper, buf, sizeof(mce_helper));
2194         mce_helper[sizeof(mce_helper)-1] = 0;
2195         p = strchr(mce_helper, '\n');
2196
2197         if (p)
2198                 *p = 0;
2199
2200         return strlen(mce_helper) + !!p;
2201 }
2202
2203 static ssize_t set_ignore_ce(struct device *s,
2204                              struct device_attribute *attr,
2205                              const char *buf, size_t size)
2206 {
2207         u64 new;
2208
2209         if (kstrtou64(buf, 0, &new) < 0)
2210                 return -EINVAL;
2211
2212         if (mca_cfg.ignore_ce ^ !!new) {
2213                 if (new) {
2214                         /* disable ce features */
2215                         mce_timer_delete_all();
2216                         on_each_cpu(mce_disable_cmci, NULL, 1);
2217                         mca_cfg.ignore_ce = true;
2218                 } else {
2219                         /* enable ce features */
2220                         mca_cfg.ignore_ce = false;
2221                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2222                 }
2223         }
2224         return size;
2225 }
2226
2227 static ssize_t set_cmci_disabled(struct device *s,
2228                                  struct device_attribute *attr,
2229                                  const char *buf, size_t size)
2230 {
2231         u64 new;
2232
2233         if (kstrtou64(buf, 0, &new) < 0)
2234                 return -EINVAL;
2235
2236         if (mca_cfg.cmci_disabled ^ !!new) {
2237                 if (new) {
2238                         /* disable cmci */
2239                         on_each_cpu(mce_disable_cmci, NULL, 1);
2240                         mca_cfg.cmci_disabled = true;
2241                 } else {
2242                         /* enable cmci */
2243                         mca_cfg.cmci_disabled = false;
2244                         on_each_cpu(mce_enable_ce, NULL, 1);
2245                 }
2246         }
2247         return size;
2248 }
2249
2250 static ssize_t store_int_with_restart(struct device *s,
2251                                       struct device_attribute *attr,
2252                                       const char *buf, size_t size)
2253 {
2254         ssize_t ret = device_store_int(s, attr, buf, size);
2255         mce_restart();
2256         return ret;
2257 }
2258
/*
 * sysfs attributes created on every per-CPU MCE device. All are mode 0644.
 * "trigger" uses the show_trigger()/set_trigger() handlers; the INT/BOOL
 * attributes are bound directly to the named mca_cfg fields.
 */
static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

/* check_interval: stores go through store_int_with_restart(). */
static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

/* ignore_ce: stores go through set_ignore_ce(). */
static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

/* cmci_disabled: stores go through set_cmci_disabled(). */
static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};
2278
/*
 * NULL-terminated list of the attributes that mce_device_create() adds to,
 * and mce_device_remove() removes from, each per-CPU device.
 */
static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        &dev_attr_trigger,
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};
2289
/*
 * CPUs whose MCE device was fully created: set by mce_device_create(),
 * cleared by mce_device_remove(). Allocated in mcheck_init_device().
 */
static cpumask_var_t mce_device_initialized;
2291
/*
 * Release callback installed on devices made by mce_device_create():
 * frees the struct device that was allocated there with kzalloc().
 */
static void mce_device_release(struct device *dev)
{
        kfree(dev);
}
2296
2297 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2298 static int mce_device_create(unsigned int cpu)
2299 {
2300         struct device *dev;
2301         int err;
2302         int i, j;
2303
2304         if (!mce_available(&boot_cpu_data))
2305                 return -EIO;
2306
2307         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2308         if (!dev)
2309                 return -ENOMEM;
2310         dev->id  = cpu;
2311         dev->bus = &mce_subsys;
2312         dev->release = &mce_device_release;
2313
2314         err = device_register(dev);
2315         if (err) {
2316                 put_device(dev);
2317                 return err;
2318         }
2319
2320         for (i = 0; mce_device_attrs[i]; i++) {
2321                 err = device_create_file(dev, mce_device_attrs[i]);
2322                 if (err)
2323                         goto error;
2324         }
2325         for (j = 0; j < mca_cfg.banks; j++) {
2326                 err = device_create_file(dev, &mce_banks[j].attr);
2327                 if (err)
2328                         goto error2;
2329         }
2330         cpumask_set_cpu(cpu, mce_device_initialized);
2331         per_cpu(mce_device, cpu) = dev;
2332
2333         return 0;
2334 error2:
2335         while (--j >= 0)
2336                 device_remove_file(dev, &mce_banks[j].attr);
2337 error:
2338         while (--i >= 0)
2339                 device_remove_file(dev, mce_device_attrs[i]);
2340
2341         device_unregister(dev);
2342
2343         return err;
2344 }
2345
2346 static void mce_device_remove(unsigned int cpu)
2347 {
2348         struct device *dev = per_cpu(mce_device, cpu);
2349         int i;
2350
2351         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2352                 return;
2353
2354         for (i = 0; mce_device_attrs[i]; i++)
2355                 device_remove_file(dev, mce_device_attrs[i]);
2356
2357         for (i = 0; i < mca_cfg.banks; i++)
2358                 device_remove_file(dev, &mce_banks[i].attr);
2359
2360         device_unregister(dev);
2361         cpumask_clear_cpu(cpu, mce_device_initialized);
2362         per_cpu(mce_device, cpu) = NULL;
2363 }
2364
2365 /* Make sure there are no machine checks on offlined CPUs. */
2366 static void mce_disable_cpu(void *h)
2367 {
2368         unsigned long action = *(unsigned long *)h;
2369         int i;
2370
2371         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2372                 return;
2373
2374         if (!(action & CPU_TASKS_FROZEN))
2375                 cmci_clear();
2376         for (i = 0; i < mca_cfg.banks; i++) {
2377                 struct mce_bank *b = &mce_banks[i];
2378
2379                 if (b->init)
2380                         wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2381         }
2382 }
2383
2384 static void mce_reenable_cpu(void *h)
2385 {
2386         unsigned long action = *(unsigned long *)h;
2387         int i;
2388
2389         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2390                 return;
2391
2392         if (!(action & CPU_TASKS_FROZEN))
2393                 cmci_reenable();
2394         for (i = 0; i < mca_cfg.banks; i++) {
2395                 struct mce_bank *b = &mce_banks[i];
2396
2397                 if (b->init)
2398                         wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2399         }
2400 }
2401
2402 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2403 static int
2404 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2405 {
2406         unsigned int cpu = (unsigned long)hcpu;
2407         struct timer_list *t = &per_cpu(mce_timer, cpu);
2408
2409         switch (action & ~CPU_TASKS_FROZEN) {
2410         case CPU_ONLINE:
2411                 mce_device_create(cpu);
2412                 if (threshold_cpu_callback)
2413                         threshold_cpu_callback(action, cpu);
2414                 break;
2415         case CPU_DEAD:
2416                 if (threshold_cpu_callback)
2417                         threshold_cpu_callback(action, cpu);
2418                 mce_device_remove(cpu);
2419                 mce_intel_hcpu_update(cpu);
2420
2421                 /* intentionally ignoring frozen here */
2422                 if (!(action & CPU_TASKS_FROZEN))
2423                         cmci_rediscover();
2424                 break;
2425         case CPU_DOWN_PREPARE:
2426                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2427                 del_timer_sync(t);
2428                 break;
2429         case CPU_DOWN_FAILED:
2430                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2431                 mce_start_timer(cpu, t);
2432                 break;
2433         }
2434
2435         return NOTIFY_OK;
2436 }
2437
/* Hotplug notifier block; registered from mcheck_init_device(). */
static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};
2441
2442 static __init void mce_init_banks(void)
2443 {
2444         int i;
2445
2446         for (i = 0; i < mca_cfg.banks; i++) {
2447                 struct mce_bank *b = &mce_banks[i];
2448                 struct device_attribute *a = &b->attr;
2449
2450                 sysfs_attr_init(&a->attr);
2451                 a->attr.name    = b->attrname;
2452                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2453
2454                 a->attr.mode    = 0644;
2455                 a->show         = show_bank;
2456                 a->store        = set_bank;
2457         }
2458 }
2459
2460 static __init int mcheck_init_device(void)
2461 {
2462         int err;
2463         int i = 0;
2464
2465         if (!mce_available(&boot_cpu_data)) {
2466                 err = -EIO;
2467                 goto err_out;
2468         }
2469
2470         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2471                 err = -ENOMEM;
2472                 goto err_out;
2473         }
2474
2475         mce_init_banks();
2476
2477         err = subsys_system_register(&mce_subsys, NULL);
2478         if (err)
2479                 goto err_out_mem;
2480
2481         cpu_notifier_register_begin();
2482         for_each_online_cpu(i) {
2483                 err = mce_device_create(i);
2484                 if (err) {
2485                         /*
2486                          * Register notifier anyway (and do not unreg it) so
2487                          * that we don't leave undeleted timers, see notifier
2488                          * callback above.
2489                          */
2490                         __register_hotcpu_notifier(&mce_cpu_notifier);
2491                         cpu_notifier_register_done();
2492                         goto err_device_create;
2493                 }
2494         }
2495
2496         __register_hotcpu_notifier(&mce_cpu_notifier);
2497         cpu_notifier_register_done();
2498
2499         register_syscore_ops(&mce_syscore_ops);
2500
2501         /* register character device /dev/mcelog */
2502         err = misc_register(&mce_chrdev_device);
2503         if (err)
2504                 goto err_register;
2505
2506         return 0;
2507
2508 err_register:
2509         unregister_syscore_ops(&mce_syscore_ops);
2510
2511 err_device_create:
2512         /*
2513          * We didn't keep track of which devices were created above, but
2514          * even if we had, the set of online cpus might have changed.
2515          * Play safe and remove for every possible cpu, since
2516          * mce_device_remove() will do the right thing.
2517          */
2518         for_each_possible_cpu(i)
2519                 mce_device_remove(i);
2520
2521 err_out_mem:
2522         free_cpumask_var(mce_device_initialized);
2523
2524 err_out:
2525         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2526
2527         return err;
2528 }
2529 device_initcall_sync(mcheck_init_device);
2530
/*
 * Old style boot options parsing. Only for compatibility.
 *
 * Handler for the "nomce" boot parameter: sets mca_cfg.disabled and
 * returns 1. @str is not examined.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);
2540
2541 #ifdef CONFIG_DEBUG_FS
/*
 * Return the "mce" debugfs directory, creating it at the debugfs root on
 * first use. The dentry is cached in a function-local static; if creation
 * returned NULL, the next call tries again.
 */
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (dmce)
                return dmce;

        dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}
2551
/*
 * Zero cpu_missing and the mce_fake_panicked, mce_executing, mce_callin
 * and global_nwo atomics. Called from fake_panic_set().
 */
static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_panicked, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}
2560
/* debugfs getter: report the current fake_panic value. Always returns 0. */
static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}
2566
/*
 * debugfs setter: run mce_reset() first, then store @val in fake_panic.
 * Always returns 0.
 */
static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}
2573
/* File operations for the "fake_panic" debugfs file, formatted as "%llu\n". */
DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");
2576
2577 static int __init mcheck_debugfs_init(void)
2578 {
2579         struct dentry *dmce, *ffake_panic;
2580
2581         dmce = mce_get_debugfs_dir();
2582         if (!dmce)
2583                 return -ENOMEM;
2584         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2585                                           &fake_panic_fops);
2586         if (!ffake_panic)
2587                 return -ENOMEM;
2588
2589         return 0;
2590 }
2591 late_initcall(mcheck_debugfs_init);
2592 #endif