arch/x86/kernel/cpu/mcheck/mce.c

   1 /*
   2  * Machine check handler.
   3  *
   4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5  * Rest from unknown author(s).
   6  * 2004 Andi Kleen. Rewrote most of it.
   7  * Copyright 2008 Intel Corporation
   8  * Author: Andi Kleen
   9  */
  10
  11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13 #include <linux/thread_info.h>
  14 #include <linux/capability.h>
  15 #include <linux/miscdevice.h>
  16 #include <linux/ratelimit.h>
  17 #include <linux/kallsyms.h>
  18 #include <linux/rcupdate.h>
  19 #include <linux/kobject.h>
  20 #include <linux/uaccess.h>
  21 #include <linux/kdebug.h>
  22 #include <linux/kernel.h>
  23 #include <linux/percpu.h>
  24 #include <linux/string.h>
  25 #include <linux/device.h>
  26 #include <linux/syscore_ops.h>
  27 #include <linux/delay.h>
  28 #include <linux/ctype.h>
  29 #include <linux/sched.h>
  30 #include <linux/sysfs.h>
  31 #include <linux/types.h>
  32 #include <linux/slab.h>
  33 #include <linux/init.h>
  34 #include <linux/kmod.h>
  35 #include <linux/poll.h>
  36 #include <linux/nmi.h>
  37 #include <linux/cpu.h>
  38 #include <linux/smp.h>
  39 #include <linux/fs.h>
  40 #include <linux/mm.h>
  41 #include <linux/debugfs.h>
  42 #include <linux/irq_work.h>
  43 #include <linux/export.h>
  44 #include <linux/jump_label.h>
  45
  46 #include <asm/intel-family.h>
  47 #include <asm/processor.h>
  48 #include <asm/traps.h>
  49 #include <asm/tlbflush.h>
  50 #include <asm/mce.h>
  51 #include <asm/msr.h>
  52
  53 #include "mce-internal.h"
  54
/*
 * Taken by mce_inject_log() around its call to mce_log(); holding it is one
 * of the two conditions accepted by mce_log_get_idx_check() below.
 */
static DEFINE_MUTEX(mce_chrdev_read_mutex);

/* Checked by mce_default_notifier(): a positive value suppresses printing. */
static int mce_chrdev_open_count;       /* #times opened */

/*
 * Load-acquire the log index @p.
 *
 * With lockdep enabled this warns unless the caller is either inside an
 * RCU-sched read-side section or holds mce_chrdev_read_mutex.
 */
#define mce_log_get_idx_check(p) \
({ \
        RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
                         !lockdep_is_held(&mce_chrdev_read_mutex), \
                         "suspicious mce_log_get_idx_check() usage"); \
        smp_load_acquire(&(p)); \
})
  66
  67 #define CREATE_TRACE_POINTS
  68 #include <trace/events/mce.h>
  69
/*
 * Granularity of the rendezvous spin loops: mce_timed_out() subtracts this
 * from the remaining timeout on every call.
 */
#define SPINUNIT                100     /* 100ns */

/* Per-CPU counter; presumably bumped by the exception handler (not in view). */
DEFINE_PER_CPU(unsigned, mce_exception_count);

/* Per-bank state, indexed by bank number (see the .ctl test in machine_check_poll()). */
struct mce_bank *mce_banks __read_mostly;
/* Vendor feature flags; .smca selects the SMCA-specific paths in this file. */
struct mce_vendor_flags mce_flags __read_mostly;

/* Global machine check configuration and its defaults. */
struct mca_config mca_cfg __read_mostly = {
        .bootlog  = -1,
        /*
         * Tolerant levels:
         * 0: always panic on uncorrected errors, log corrected errors
         * 1: panic or SIGBUS on uncorrected errors, log corrected errors
         * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
         * 3: never panic or SIGBUS, log all errors (for testing only)
         */
        .tolerant = 1,
        .monarch_timeout = -1
};
  89
/* User mode helper program triggered by machine check event */
static unsigned long            mce_need_notify;        /* bit 0 is set by mce_log() */
static char                     mce_helper[128];
static char                     *mce_helper_argv[2] = { mce_helper, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);

/* Per-CPU record graded and then cleared by the Monarch in mce_reign(). */
static DEFINE_PER_CPU(struct mce, mces_seen);
/* Set by mce_timed_out() when a rendezvous wait expires; reported by mce_panic(). */
static int                      cpu_missing;
  99
/*
 * MCA banks polled by the periodic polling timer for corrected events.
 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 *
 * The initializer sets every bit, so all banks start out as polled.
 */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};

/*
 * MCA banks controlled through firmware first for corrected errors.
 * This is a global list of banks for which we won't enable CMCI and we
 * won't poll. Firmware controls these banks and is responsible for
 * reporting corrected errors through GHES. Uncorrected/recoverable
 * errors are still notified through a machine check.
 */
mce_banks_t mce_banks_ce_disabled;
 116
/* Work item scheduled by mce_schedule_work() when the gen pool is not empty. */
static struct work_struct mce_work;
/* irq_work queued from mce_log() and mce_report_event(); runs mce_irq_work_cb(). */
static struct irq_work mce_irq_work;

/*
 * Optional hook: when non-NULL, mce_no_way_out() calls it for every bank
 * whose status has MCI_STATUS_VAL set.
 */
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);

/*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 127
 128 /* Do initial initialization of a struct mce */
 129 void mce_setup(struct mce *m)
 130 {
 131         memset(m, 0, sizeof(struct mce));
 132         m->cpu = m->extcpu = smp_processor_id();
 133         /* We hope get_seconds stays lockless */
 134         m->time = get_seconds();
 135         m->cpuvendor = boot_cpu_data.x86_vendor;
 136         m->cpuid = cpuid_eax(1);
 137         m->socketid = cpu_data(m->extcpu).phys_proc_id;
 138         m->apicid = cpu_data(m->extcpu).initial_apicid;
 139         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 140
 141         if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
 142                 rdmsrl(MSR_PPIN, m->ppin);
 143 }
 144
/*
 * Per-CPU injected record. While its .finished field is non-zero,
 * mce_rdmsrl()/mce_wrmsrl() read and write this structure instead of the
 * real MSRs.
 */
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 147
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

/* The log buffer itself; mce_log() claims slots in .entry[] via .next. */
static struct mce_log mcelog = {
        .signature      = MCE_LOG_SIGNATURE,
        .len            = MCE_LOG_LEN,
        .recordlen      = sizeof(struct mce),
};
 159
/*
 * Record @mce: emit the trace event, add it to the gen pool (queueing
 * mce_irq_work when that succeeds) and copy it into the mcelog buffer.
 *
 * A slot is claimed without taking a lock: starting at mcelog.next, skip
 * entries already marked finished, then try to advance mcelog.next past the
 * chosen slot with cmpxchg(); if another writer got there first, start over.
 * When no slot is left the record is dropped and MCE_OVERFLOW is set.
 */
void mce_log(struct mce *mce)
{
        unsigned next, entry;

        /* Emit the trace record: */
        trace_mce_record(mce);

        /* mce_gen_pool_add() returning zero is treated as success here. */
        if (!mce_gen_pool_add(mce))
                irq_work_queue(&mce_irq_work);

        wmb();
        for (;;) {
                entry = mce_log_get_idx_check(mcelog.next);
                for (;;) {

                        /*
                         * When the buffer fills up discard new entries.
                         * Assume that the earlier errors are the more
                         * interesting ones:
                         */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW,
                                        (unsigned long *)&mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip: */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                /* Claim the slot; retry from the top if mcelog.next moved. */
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        /* Copy the record first, then mark it finished, with barriers between. */
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &mce_need_notify);
}
 204
/*
 * Log @m through mce_log() while holding mce_chrdev_read_mutex, which
 * satisfies the lockdep condition in mce_log_get_idx_check().
 */
void mce_inject_log(struct mce *m)
{
        mutex_lock(&mce_chrdev_read_mutex);
        mce_log(m);
        mutex_unlock(&mce_chrdev_read_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);
 212
/* Forward declaration; the definition follows srao_decode_notifier(). */
static struct notifier_block mce_srao_nb;

/*
 * Number of notifiers registered through mce_register_decode_chain();
 * mce_default_notifier() consults it to decide whether to print.
 */
static atomic_t num_notifiers;
 216
/*
 * Add @nb to x86_mce_decoder_chain and count it in num_notifiers.
 *
 * Warns when @nb's priority lies strictly between MCE_PRIO_LOWEST and
 * MCE_PRIO_EDAC; the notifier is registered regardless.
 */
void mce_register_decode_chain(struct notifier_block *nb)
{
        atomic_inc(&num_notifiers);

        WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);

        blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 226
/* Remove @nb from x86_mce_decoder_chain and drop it from num_notifiers. */
void mce_unregister_decode_chain(struct notifier_block *nb)
{
        atomic_dec(&num_notifiers);

        blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 234
/* Return the MSR_IA32_MCx_CTL MSR number for @bank. */
static inline u32 ctl_reg(int bank)
{
        return MSR_IA32_MCx_CTL(bank);
}

/* Return the MSR_IA32_MCx_STATUS MSR number for @bank. */
static inline u32 status_reg(int bank)
{
        return MSR_IA32_MCx_STATUS(bank);
}

/* Return the MSR_IA32_MCx_ADDR MSR number for @bank. */
static inline u32 addr_reg(int bank)
{
        return MSR_IA32_MCx_ADDR(bank);
}

/* Return the MSR_IA32_MCx_MISC MSR number for @bank. */
static inline u32 misc_reg(int bank)
{
        return MSR_IA32_MCx_MISC(bank);
}
 254
/* Return the MSR_AMD64_SMCA_MCx_CTL MSR number for @bank. */
static inline u32 smca_ctl_reg(int bank)
{
        return MSR_AMD64_SMCA_MCx_CTL(bank);
}

/* Return the MSR_AMD64_SMCA_MCx_STATUS MSR number for @bank. */
static inline u32 smca_status_reg(int bank)
{
        return MSR_AMD64_SMCA_MCx_STATUS(bank);
}

/* Return the MSR_AMD64_SMCA_MCx_ADDR MSR number for @bank. */
static inline u32 smca_addr_reg(int bank)
{
        return MSR_AMD64_SMCA_MCx_ADDR(bank);
}

/* Return the MSR_AMD64_SMCA_MCx_MISC MSR number for @bank. */
static inline u32 smca_misc_reg(int bank)
{
        return MSR_AMD64_SMCA_MCx_MISC(bank);
}
 274
/*
 * Bank register lookup table used throughout this file (msr_ops.status(),
 * .addr(), .misc()). It starts out pointing at the MSR_IA32_MCx_* helpers.
 * NOTE(review): presumably switched to the smca_*_reg() helpers on SMCA
 * systems by code outside this view - confirm.
 */
struct mca_msr_regs msr_ops = {
        .ctl    = ctl_reg,
        .status = status_reg,
        .addr   = addr_reg,
        .misc   = misc_reg
};
 281
 282 static void __print_mce(struct mce *m)
 283 {
 284         pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 285                  m->extcpu,
 286                  (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 287                  m->mcgstatus, m->bank, m->status);
 288
 289         if (m->ip) {
 290                 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 291                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 292                         m->cs, m->ip);
 293
 294                 if (m->cs == __KERNEL_CS)
 295                         print_symbol("{%s}", m->ip);
 296                 pr_cont("\n");
 297         }
 298
 299         pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 300         if (m->addr)
 301                 pr_cont("ADDR %llx ", m->addr);
 302         if (m->misc)
 303                 pr_cont("MISC %llx ", m->misc);
 304
 305         if (mce_flags.smca) {
 306                 if (m->synd)
 307                         pr_cont("SYND %llx ", m->synd);
 308                 if (m->ipid)
 309                         pr_cont("IPID %llx ", m->ipid);
 310         }
 311
 312         pr_cont("\n");
 313         /*
 314          * Note this output is parsed by external tools and old fields
 315          * should not be changed.
 316          */
 317         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 318                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 319                 cpu_data(m->extcpu).microcode);
 320 }
 321
/*
 * Print @m with __print_mce() and follow it with a rate-limited hint on how
 * to decode the output.
 */
static void print_mce(struct mce *m)
{
        __print_mce(m);
        pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
 327
/* How long wait_for_panic() spins before panicking on its own. */
#define PANIC_TIMEOUT 5 /* 5 seconds */

/* Incremented by every CPU entering a real mce_panic(); only the first proceeds. */
static atomic_t mce_panicked;

/* Non-zero makes mce_panic() log "Fake kernel panic" instead of calling panic(). */
static int fake_panic;
/* Same role as mce_panicked, for the fake panic path. */
static atomic_t mce_fake_panicked;
 334
 335 /* Panic in progress. Enable interrupts and wait for final IPI */
 336 static void wait_for_panic(void)
 337 {
 338         long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 339
 340         preempt_disable();
 341         local_irq_enable();
 342         while (timeout-- > 0)
 343                 udelay(1);
 344         if (panic_timeout == 0)
 345                 panic_timeout = mca_cfg.panic_timeout;
 346         panic("Panicing machine check CPU died");
 347 }
 348
 349 static void mce_panic(const char *msg, struct mce *final, char *exp)
 350 {
 351         int apei_err = 0;
 352         struct llist_node *pending;
 353         struct mce_evt_llist *l;
 354
 355         if (!fake_panic) {
 356                 /*
 357                  * Make sure only one CPU runs in machine check panic
 358                  */
 359                 if (atomic_inc_return(&mce_panicked) > 1)
 360                         wait_for_panic();
 361                 barrier();
 362
 363                 bust_spinlocks(1);
 364                 console_verbose();
 365         } else {
 366                 /* Don't log too much for fake panic */
 367                 if (atomic_inc_return(&mce_fake_panicked) > 1)
 368                         return;
 369         }
 370         pending = mce_gen_pool_prepare_records();
 371         /* First print corrected ones that are still unlogged */
 372         llist_for_each_entry(l, pending, llnode) {
 373                 struct mce *m = &l->mce;
 374                 if (!(m->status & MCI_STATUS_UC)) {
 375                         print_mce(m);
 376                         if (!apei_err)
 377                                 apei_err = apei_write_mce(m);
 378                 }
 379         }
 380         /* Now print uncorrected but with the final one last */
 381         llist_for_each_entry(l, pending, llnode) {
 382                 struct mce *m = &l->mce;
 383                 if (!(m->status & MCI_STATUS_UC))
 384                         continue;
 385                 if (!final || mce_cmp(m, final)) {
 386                         print_mce(m);
 387                         if (!apei_err)
 388                                 apei_err = apei_write_mce(m);
 389                 }
 390         }
 391         if (final) {
 392                 print_mce(final);
 393                 if (!apei_err)
 394                         apei_err = apei_write_mce(final);
 395         }
 396         if (cpu_missing)
 397                 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 398         if (exp)
 399                 pr_emerg(HW_ERR "Machine check: %s\n", exp);
 400         if (!fake_panic) {
 401                 if (panic_timeout == 0)
 402                         panic_timeout = mca_cfg.panic_timeout;
 403                 panic(msg);
 404         } else
 405                 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 406 }
 407
 408 /* Support code for software error injection */
 409
 410 static int msr_to_offset(u32 msr)
 411 {
 412         unsigned bank = __this_cpu_read(injectm.bank);
 413
 414         if (msr == mca_cfg.rip_msr)
 415                 return offsetof(struct mce, ip);
 416         if (msr == msr_ops.status(bank))
 417                 return offsetof(struct mce, status);
 418         if (msr == msr_ops.addr(bank))
 419                 return offsetof(struct mce, addr);
 420         if (msr == msr_ops.misc(bank))
 421                 return offsetof(struct mce, misc);
 422         if (msr == MSR_IA32_MCG_STATUS)
 423                 return offsetof(struct mce, mcgstatus);
 424         return -1;
 425 }
 426
 427 /* MSR access wrappers used for error injection */
 428 static u64 mce_rdmsrl(u32 msr)
 429 {
 430         u64 v;
 431
 432         if (__this_cpu_read(injectm.finished)) {
 433                 int offset = msr_to_offset(msr);
 434
 435                 if (offset < 0)
 436                         return 0;
 437                 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 438         }
 439
 440         if (rdmsrl_safe(msr, &v)) {
 441                 WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
 442                 /*
 443                  * Return zero in case the access faulted. This should
 444                  * not happen normally but can happen if the CPU does
 445                  * something weird, or if the code is buggy.
 446                  */
 447                 v = 0;
 448         }
 449
 450         return v;
 451 }
 452
 453 static void mce_wrmsrl(u32 msr, u64 v)
 454 {
 455         if (__this_cpu_read(injectm.finished)) {
 456                 int offset = msr_to_offset(msr);
 457
 458                 if (offset >= 0)
 459                         *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 460                 return;
 461         }
 462         wrmsrl(msr, v);
 463 }
 464
 465 /*
 466  * Collect all global (w.r.t. this processor) status about this machine
 467  * check into our "mce" struct so that we can use it later to assess
 468  * the severity of the problem as we read per-bank specific details.
 469  */
 470 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 471 {
 472         mce_setup(m);
 473
 474         m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 475         if (regs) {
 476                 /*
 477                  * Get the address of the instruction at the time of
 478                  * the machine check error.
 479                  */
 480                 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 481                         m->ip = regs->ip;
 482                         m->cs = regs->cs;
 483
 484                         /*
 485                          * When in VM86 mode make the cs look like ring 3
 486                          * always. This is a lie, but it's better than passing
 487                          * the additional vm86 bit around everywhere.
 488                          */
 489                         if (v8086_mode(regs))
 490                                 m->cs |= 3;
 491                 }
 492                 /* Use accurate RIP reporting if available. */
 493                 if (mca_cfg.rip_msr)
 494                         m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 495         }
 496 }
 497
 498 int mce_available(struct cpuinfo_x86 *c)
 499 {
 500         if (mca_cfg.disabled)
 501                 return 0;
 502         return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 503 }
 504
 505 static void mce_schedule_work(void)
 506 {
 507         if (!mce_gen_pool_empty())
 508                 schedule_work(&mce_work);
 509 }
 510
/*
 * irq_work callback: run the notification and schedule the work item that
 * callers could not trigger directly.
 */
static void mce_irq_work_cb(struct irq_work *entry)
{
        mce_notify_irq();
        mce_schedule_work();
}
 516
 517 static void mce_report_event(struct pt_regs *regs)
 518 {
 519         if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 520                 mce_notify_irq();
 521                 /*
 522                  * Triggering the work queue here is just an insurance
 523                  * policy in case the syscall exit notify handler
 524                  * doesn't run soon enough or ends up running on the
 525                  * wrong CPU (can happen when audit sleeps)
 526                  */
 527                 mce_schedule_work();
 528                 return;
 529         }
 530
 531         irq_work_queue(&mce_irq_work);
 532 }
 533
 534 /*
 535  * Check if the address reported by the CPU is in a format we can parse.
 536  * It would be possible to add code for most other cases, but all would
 537  * be somewhat complicated (e.g. segment offset would require an instruction
 538  * parser). So only support physical addresses up to page granuality for now.
 539  */
 540 static int mce_usable_address(struct mce *m)
 541 {
 542         if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 543                 return 0;
 544
 545         /* Checks after this one are Intel-specific: */
 546         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 547                 return 1;
 548
 549         if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 550                 return 0;
 551         if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 552                 return 0;
 553         return 1;
 554 }
 555
 556 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 557                                 void *data)
 558 {
 559         struct mce *mce = (struct mce *)data;
 560         unsigned long pfn;
 561
 562         if (!mce)
 563                 return NOTIFY_DONE;
 564
 565         if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 566                 pfn = mce->addr >> PAGE_SHIFT;
 567                 memory_failure(pfn, MCE_VECTOR, 0);
 568         }
 569
 570         return NOTIFY_OK;
 571 }
/* Notifier block that runs srao_decode_notifier() at MCE_PRIO_SRAO. */
static struct notifier_block mce_srao_nb = {
        .notifier_call  = srao_decode_notifier,
        .priority       = MCE_PRIO_SRAO,
};
 576
 577 static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 578                                 void *data)
 579 {
 580         struct mce *m = (struct mce *)data;
 581
 582         if (!m)
 583                 return NOTIFY_DONE;
 584
 585         /*
 586          * Run the default notifier if we have only the SRAO
 587          * notifier and us registered.
 588          */
 589         if (atomic_read(&num_notifiers) > 2)
 590                 return NOTIFY_DONE;
 591
 592         /* Don't print when mcelog is running */
 593         if (mce_chrdev_open_count > 0)
 594                 return NOTIFY_DONE;
 595
 596         __print_mce(m);
 597
 598         return NOTIFY_DONE;
 599 }
 600
/* Notifier block for the fallback printer, mce_default_notifier(). */
static struct notifier_block mce_default_nb = {
        .notifier_call  = mce_default_notifier,
        /* lowest prio, we want it to run last. */
        .priority       = MCE_PRIO_LOWEST,
};
 606
 607 /*
 608  * Read ADDR and MISC registers.
 609  */
 610 static void mce_read_aux(struct mce *m, int i)
 611 {
 612         if (m->status & MCI_STATUS_MISCV)
 613                 m->misc = mce_rdmsrl(msr_ops.misc(i));
 614
 615         if (m->status & MCI_STATUS_ADDRV) {
 616                 m->addr = mce_rdmsrl(msr_ops.addr(i));
 617
 618                 /*
 619                  * Mask the reported address by the reported granularity.
 620                  */
 621                 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 622                         u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 623                         m->addr >>= shift;
 624                         m->addr <<= shift;
 625                 }
 626
 627                 /*
 628                  * Extract [55:<lsb>] where lsb is the least significant
 629                  * *valid* bit of the address bits.
 630                  */
 631                 if (mce_flags.smca) {
 632                         u8 lsb = (m->addr >> 56) & 0x3f;
 633
 634                         m->addr &= GENMASK_ULL(55, lsb);
 635                 }
 636         }
 637
 638         if (mce_flags.smca) {
 639                 m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 640
 641                 if (m->status & MCI_STATUS_SYNDV)
 642                         m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 643         }
 644 }
 645
 646 static bool memory_error(struct mce *m)
 647 {
 648         struct cpuinfo_x86 *c = &boot_cpu_data;
 649
 650         if (c->x86_vendor == X86_VENDOR_AMD) {
 651                 /* ErrCodeExt[20:16] */
 652                 u8 xec = (m->status >> 16) & 0x1f;
 653
 654                 return (xec == 0x0 || xec == 0x8);
 655         } else if (c->x86_vendor == X86_VENDOR_INTEL) {
 656                 /*
 657                  * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 658                  *
 659                  * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 660                  * indicating a memory error. Bit 8 is used for indicating a
 661                  * cache hierarchy error. The combination of bit 2 and bit 3
 662                  * is used for indicating a `generic' cache hierarchy error
 663                  * But we can't just blindly check the above bits, because if
 664                  * bit 11 is set, then it is a bus/interconnect error - and
 665                  * either way the above bits just gives more detail on what
 666                  * bus/interconnect error happened. Note that bit 12 can be
 667                  * ignored, as it's the "filter" bit.
 668                  */
 669                 return (m->status & 0xef80) == BIT(7) ||
 670                        (m->status & 0xef00) == BIT(8) ||
 671                        (m->status & 0xeffc) == 0xc;
 672         }
 673
 674         return false;
 675 }
 676
/* Per-CPU count of machine_check_poll() invocations. */
DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll for corrected events or events that happened before reset.
 * Those are just logged through /dev/mcelog.
 *
 * This is executed in standard interrupt context.
 *
 * Note: spec recommends to panic for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between exception handler
 * and poll handler -- so we skip this for now.
 * These cases should not happen anyways, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
 *
 * @flags: MCP_* bits; this function looks at MCP_TIMESTAMP, MCP_UC and
 *         MCP_DONTLOG.
 * @b:     bitmap of the banks to examine.
 *
 * Returns true if at least one bank held an event that was processed.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
        bool error_seen = false;
        struct mce m;
        int severity;
        int i;

        this_cpu_inc(mce_poll_count);

        /* No pt_regs here, so no instruction pointer is recorded. */
        mce_gather_info(&m, NULL);

        if (flags & MCP_TIMESTAMP)
                m.tsc = rdtsc();

        for (i = 0; i < mca_cfg.banks; i++) {
                /* Skip banks with a zero ctl value or not selected in @b. */
                if (!mce_banks[i].ctl || !test_bit(i, *b))
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;

                barrier();
                m.status = mce_rdmsrl(msr_ops.status(i));
                if (!(m.status & MCI_STATUS_VAL))
                        continue;

                /*
                 * Uncorrected or signalled events are handled by the exception
                 * handler when it is enabled, so don't process those here.
                 *
                 * TBD do the same check for MCI_STATUS_EN here?
                 */
                if (!(flags & MCP_UC) &&
                    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
                        continue;

                error_seen = true;

                mce_read_aux(&m, i);

                severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

                /*
                 * The severity is stored in the record only for deferred
                 * memory errors that come with a valid address.
                 */
                if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
                        if (m.status & MCI_STATUS_ADDRV)
                                m.severity = severity;

                /*
                 * Don't get the IP here because it's unlikely to
                 * have anything to do with the actual error location.
                 */
                if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
                        mce_log(&m);
                else if (mce_usable_address(&m)) {
                        /*
                         * Although we skipped logging this, we still want
                         * to take action. Add to the pool so the registered
                         * notifiers will see it.
                         */
                        if (!mce_gen_pool_add(&m))
                                mce_schedule_work();
                }

                /*
                 * Clear state for this bank.
                 */
                mce_wrmsrl(msr_ops.status(i), 0);
        }

        /*
         * Don't clear MCG_STATUS here because it's only defined for
         * exceptions.
         */

        sync_core();

        return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
 773
 774 /*
 775  * Do a quick check if any of the events requires a panic.
 776  * This decides if we keep the events around or clear them.
 777  */
 778 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 779                           struct pt_regs *regs)
 780 {
 781         int i, ret = 0;
 782         char *tmp;
 783
 784         for (i = 0; i < mca_cfg.banks; i++) {
 785                 m->status = mce_rdmsrl(msr_ops.status(i));
 786                 if (m->status & MCI_STATUS_VAL) {
 787                         __set_bit(i, validp);
 788                         if (quirk_no_way_out)
 789                                 quirk_no_way_out(i, m, regs);
 790                 }
 791
 792                 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 793                         *msg = tmp;
 794                         ret = 1;
 795                 }
 796         }
 797         return ret;
 798 }
 799
/*
 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing equals its own number.
 */
static atomic_t mce_executing;

/*
 * Defines order of CPUs on entry. First CPU becomes Monarch.
 * mce_start() increments it and waits until it reaches the online CPU count.
 */
static atomic_t mce_callin;
 810
 811 /*
 812  * Check if a timeout waiting for other CPUs happened.
 813  */
 814 static int mce_timed_out(u64 *t, const char *msg)
 815 {
 816         /*
 817          * The others already did panic for some reason.
 818          * Bail out like in a timeout.
 819          * rmb() to tell the compiler that system_state
 820          * might have been modified by someone else.
 821          */
 822         rmb();
 823         if (atomic_read(&mce_panicked))
 824                 wait_for_panic();
 825         if (!mca_cfg.monarch_timeout)
 826                 goto out;
 827         if ((s64)*t < SPINUNIT) {
 828                 if (mca_cfg.tolerant <= 1)
 829                         mce_panic(msg, NULL, NULL);
 830                 cpu_missing = 1;
 831                 return 1;
 832         }
 833         *t -= SPINUNIT;
 834 out:
 835         touch_nmi_watchdog();
 836         return 0;
 837 }
 838
 839 /*
 840  * The Monarch's reign.  The Monarch is the CPU who entered
 841  * the machine check handler first. It waits for the others to
 842  * raise the exception too and then grades them. When any
 843  * error is fatal panic. Only then let the others continue.
 844  *
 845  * The other CPUs entering the MCE handler will be controlled by the
 846  * Monarch. They are called Subjects.
 847  *
 848  * This way we prevent any potential data corruption in a unrecoverable case
 849  * and also makes sure always all CPU's errors are examined.
 850  *
 851  * Also this detects the case of a machine check event coming from outer
 852  * space (not detected by any CPUs) In this case some external agent wants
 853  * us to shut down, so panic too.
 854  *
 855  * The other CPUs might still decide to panic if the handler happens
 856  * in a unrecoverable place, but in this case the system is in a semi-stable
 857  * state and won't corrupt anything by itself. It's ok to let the others
 858  * continue for a bit first.
 859  *
 860  * All the spin loops have timeouts; when a timeout happens a CPU
 861  * typically elects itself to be Monarch.
 862  */
 863 static void mce_reign(void)
 864 {
 865         int cpu;
 866         struct mce *m = NULL;
 867         int global_worst = 0;
 868         char *msg = NULL;
 869         char *nmsg = NULL;
 870
 871         /*
 872          * This CPU is the Monarch and the other CPUs have run
 873          * through their handlers.
 874          * Grade the severity of the errors of all the CPUs.
 875          */
 876         for_each_possible_cpu(cpu) {
 877                 int severity = mce_severity(&per_cpu(mces_seen, cpu),
 878                                             mca_cfg.tolerant,
 879                                             &nmsg, true);
 880                 if (severity > global_worst) {
 881                         msg = nmsg;
 882                         global_worst = severity;
 883                         m = &per_cpu(mces_seen, cpu);
 884                 }
 885         }
 886
 887         /*
 888          * Cannot recover? Panic here then.
 889          * This dumps all the mces in the log buffer and stops the
 890          * other CPUs.
 891          */
 892         if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 893                 mce_panic("Fatal machine check", m, msg);
 894
 895         /*
 896          * For UC somewhere we let the CPU who detects it handle it.
 897          * Also must let continue the others, otherwise the handling
 898          * CPU could deadlock on a lock.
 899          */
 900
 901         /*
 902          * No machine check event found. Must be some external
 903          * source or one CPU is hung. Panic.
 904          */
 905         if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 906                 mce_panic("Fatal machine check from unknown source", NULL, NULL);
 907
 908         /*
 909          * Now clear all the mces_seen so that they don't reappear on
 910          * the next mce.
 911          */
 912         for_each_possible_cpu(cpu)
 913                 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 914 }
 915
 916 static atomic_t global_nwo;
 917
 918 /*
 919  * Start of Monarch synchronization. This waits until all CPUs have
 920  * entered the exception handler and then determines if any of them
 921  * saw a fatal event that requires panic. Then it executes them
 922  * in the entry order.
 923  * TBD double check parallel CPU hotunplug
 924  */
 925 static int mce_start(int *no_way_out)
 926 {
 927         int order;
 928         int cpus = num_online_cpus();
 929         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 930
 931         if (!timeout)
 932                 return -1;
 933
 934         atomic_add(*no_way_out, &global_nwo);
 935         /*
 936          * Rely on the implied barrier below, such that global_nwo
 937          * is updated before mce_callin.
 938          */
 939         order = atomic_inc_return(&mce_callin);
 940
 941         /*
 942          * Wait for everyone.
 943          */
 944         while (atomic_read(&mce_callin) != cpus) {
 945                 if (mce_timed_out(&timeout,
 946                                   "Timeout: Not all CPUs entered broadcast exception handler")) {
 947                         atomic_set(&global_nwo, 0);
 948                         return -1;
 949                 }
 950                 ndelay(SPINUNIT);
 951         }
 952
 953         /*
 954          * mce_callin should be read before global_nwo
 955          */
 956         smp_rmb();
 957
 958         if (order == 1) {
 959                 /*
 960                  * Monarch: Starts executing now, the others wait.
 961                  */
 962                 atomic_set(&mce_executing, 1);
 963         } else {
 964                 /*
 965                  * Subject: Now start the scanning loop one by one in
 966                  * the original callin order.
 967                  * This way when there are any shared banks it will be
 968                  * only seen by one CPU before cleared, avoiding duplicates.
 969                  */
 970                 while (atomic_read(&mce_executing) < order) {
 971                         if (mce_timed_out(&timeout,
 972                                           "Timeout: Subject CPUs unable to finish machine check processing")) {
 973                                 atomic_set(&global_nwo, 0);
 974                                 return -1;
 975                         }
 976                         ndelay(SPINUNIT);
 977                 }
 978         }
 979
 980         /*
 981          * Cache the global no_way_out state.
 982          */
 983         *no_way_out = atomic_read(&global_nwo);
 984
 985         return order;
 986 }
 987
 988 /*
 989  * Synchronize between CPUs after main scanning loop.
 990  * This invokes the bulk of the Monarch processing.
 991  */
 992 static int mce_end(int order)
 993 {
 994         int ret = -1;
 995         u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 996
 997         if (!timeout)
 998                 goto reset;
 999         if (order < 0)
1000                 goto reset;
1001
1002         /*
1003          * Allow others to run.
1004          */
1005         atomic_inc(&mce_executing);
1006
1007         if (order == 1) {
1008                 /* CHECKME: Can this race with a parallel hotplug? */
1009                 int cpus = num_online_cpus();
1010
1011                 /*
1012                  * Monarch: Wait for everyone to go through their scanning
1013                  * loops.
1014                  */
1015                 while (atomic_read(&mce_executing) <= cpus) {
1016                         if (mce_timed_out(&timeout,
1017                                           "Timeout: Monarch CPU unable to finish machine check processing"))
1018                                 goto reset;
1019                         ndelay(SPINUNIT);
1020                 }
1021
1022                 mce_reign();
1023                 barrier();
1024                 ret = 0;
1025         } else {
1026                 /*
1027                  * Subject: Wait for Monarch to finish.
1028                  */
1029                 while (atomic_read(&mce_executing) != 0) {
1030                         if (mce_timed_out(&timeout,
1031                                           "Timeout: Monarch CPU did not finish machine check processing"))
1032                                 goto reset;
1033                         ndelay(SPINUNIT);
1034                 }
1035
1036                 /*
1037                  * Don't reset anything. That's done by the Monarch.
1038                  */
1039                 return 0;
1040         }
1041
1042         /*
1043          * Reset all global state.
1044          */
1045 reset:
1046         atomic_set(&global_nwo, 0);
1047         atomic_set(&mce_callin, 0);
1048         barrier();
1049
1050         /*
1051          * Let others run again.
1052          */
1053         atomic_set(&mce_executing, 0);
1054         return ret;
1055 }
1056
1057 static void mce_clear_state(unsigned long *toclear)
1058 {
1059         int i;
1060
1061         for (i = 0; i < mca_cfg.banks; i++) {
1062                 if (test_bit(i, toclear))
1063                         mce_wrmsrl(msr_ops.status(i), 0);
1064         }
1065 }
1066
1067 static int do_memory_failure(struct mce *m)
1068 {
1069         int flags = MF_ACTION_REQUIRED;
1070         int ret;
1071
1072         pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1073         if (!(m->mcgstatus & MCG_STATUS_RIPV))
1074                 flags |= MF_MUST_KILL;
1075         ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
1076         if (ret)
1077                 pr_err("Memory error not recovered");
1078         return ret;
1079 }
1080
1081 /*
1082  * The actual machine check handler. This only handles real
1083  * exceptions when something got corrupted coming in through int 18.
1084  *
1085  * This is executed in NMI context not subject to normal locking rules. This
1086  * implies that most kernel services cannot be safely used. Don't even
1087  * think about putting a printk in there!
1088  *
1089  * On Intel systems this is entered on all CPUs in parallel through
1090  * MCE broadcast. However some CPUs might be broken beyond repair,
1091  * so be always careful when synchronizing with others.
1092  */
1093 void do_machine_check(struct pt_regs *regs, long error_code)
1094 {
1095         struct mca_config *cfg = &mca_cfg;
1096         struct mce m, *final;
1097         int i;
1098         int worst = 0;
1099         int severity;
1100
1101         /*
1102          * Establish sequential order between the CPUs entering the machine
1103          * check handler.
1104          */
1105         int order = -1;
1106         /*
1107          * If no_way_out gets set, there is no safe way to recover from this
1108          * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1109          */
1110         int no_way_out = 0;
1111         /*
1112          * If kill_it gets set, there might be a way to recover from this
1113          * error.
1114          */
1115         int kill_it = 0;
1116         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1117         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1118         char *msg = "Unknown";
1119
1120         /*
1121          * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1122          * on Intel.
1123          */
1124         int lmce = 1;
1125
1126         /* If this CPU is offline, just bail out. */
1127         if (cpu_is_offline(smp_processor_id())) {
1128                 u64 mcgstatus;
1129
1130                 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1131                 if (mcgstatus & MCG_STATUS_RIPV) {
1132                         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1133                         return;
1134                 }
1135         }
1136
1137         ist_enter(regs);
1138
1139         this_cpu_inc(mce_exception_count);
1140
1141         if (!cfg->banks)
1142                 goto out;
1143
1144         mce_gather_info(&m, regs);
1145         m.tsc = rdtsc();
1146
1147         final = this_cpu_ptr(&mces_seen);
1148         *final = m;
1149
1150         memset(valid_banks, 0, sizeof(valid_banks));
1151         no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1152
1153         barrier();
1154
1155         /*
1156          * When no restart IP might need to kill or panic.
1157          * Assume the worst for now, but if we find the
1158          * severity is MCE_AR_SEVERITY we have other options.
1159          */
1160         if (!(m.mcgstatus & MCG_STATUS_RIPV))
1161                 kill_it = 1;
1162
1163         /*
1164          * Check if this MCE is signaled to only this logical processor,
1165          * on Intel only.
1166          */
1167         if (m.cpuvendor == X86_VENDOR_INTEL)
1168                 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1169
1170         /*
1171          * Go through all banks in exclusion of the other CPUs. This way we
1172          * don't report duplicated events on shared banks because the first one
1173          * to see it will clear it. If this is a Local MCE, then no need to
1174          * perform rendezvous.
1175          */
1176         if (!lmce)
1177                 order = mce_start(&no_way_out);
1178
1179         for (i = 0; i < cfg->banks; i++) {
1180                 __clear_bit(i, toclear);
1181                 if (!test_bit(i, valid_banks))
1182                         continue;
1183                 if (!mce_banks[i].ctl)
1184                         continue;
1185
1186                 m.misc = 0;
1187                 m.addr = 0;
1188                 m.bank = i;
1189
1190                 m.status = mce_rdmsrl(msr_ops.status(i));
1191                 if ((m.status & MCI_STATUS_VAL) == 0)
1192                         continue;
1193
1194                 /*
1195                  * Non uncorrected or non signaled errors are handled by
1196                  * machine_check_poll. Leave them alone, unless this panics.
1197                  */
1198                 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1199                         !no_way_out)
1200                         continue;
1201
1202                 /*
1203                  * Set taint even when machine check was not enabled.
1204                  */
1205                 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1206
1207                 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1208
1209                 /*
1210                  * When machine check was for corrected/deferred handler don't
1211                  * touch, unless we're panicing.
1212                  */
1213                 if ((severity == MCE_KEEP_SEVERITY ||
1214                      severity == MCE_UCNA_SEVERITY) && !no_way_out)
1215                         continue;
1216                 __set_bit(i, toclear);
1217                 if (severity == MCE_NO_SEVERITY) {
1218                         /*
1219                          * Machine check event was not enabled. Clear, but
1220                          * ignore.
1221                          */
1222                         continue;
1223                 }
1224
1225                 mce_read_aux(&m, i);
1226
1227                 /* assuming valid severity level != 0 */
1228                 m.severity = severity;
1229
1230                 mce_log(&m);
1231
1232                 if (severity > worst) {
1233                         *final = m;
1234                         worst = severity;
1235                 }
1236         }
1237
1238         /* mce_clear_state will clear *final, save locally for use later */
1239         m = *final;
1240
1241         if (!no_way_out)
1242                 mce_clear_state(toclear);
1243
1244         /*
1245          * Do most of the synchronization with other CPUs.
1246          * When there's any problem use only local no_way_out state.
1247          */
1248         if (!lmce) {
1249                 if (mce_end(order) < 0)
1250                         no_way_out = worst >= MCE_PANIC_SEVERITY;
1251         } else {
1252                 /*
1253                  * Local MCE skipped calling mce_reign()
1254                  * If we found a fatal error, we need to panic here.
1255                  */
1256                  if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1257                         mce_panic("Machine check from unknown source",
1258                                 NULL, NULL);
1259         }
1260
1261         /*
1262          * If tolerant is at an insane level we drop requests to kill
1263          * processes and continue even when there is no way out.
1264          */
1265         if (cfg->tolerant == 3)
1266                 kill_it = 0;
1267         else if (no_way_out)
1268                 mce_panic("Fatal machine check on current CPU", &m, msg);
1269
1270         if (worst > 0)
1271                 mce_report_event(regs);
1272         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1273 out:
1274         sync_core();
1275
1276         if (worst != MCE_AR_SEVERITY && !kill_it)
1277                 goto out_ist;
1278
1279         /* Fault was in user mode and we need to take some action */
1280         if ((m.cs & 3) == 3) {
1281                 ist_begin_non_atomic(regs);
1282                 local_irq_enable();
1283
1284                 if (kill_it || do_memory_failure(&m))
1285                         force_sig(SIGBUS, current);
1286                 local_irq_disable();
1287                 ist_end_non_atomic();
1288         } else {
1289                 if (!fixup_exception(regs, X86_TRAP_MC))
1290                         mce_panic("Failed kernel mode recovery", &m, NULL);
1291         }
1292
1293 out_ist:
1294         ist_exit(regs);
1295 }
1296 EXPORT_SYMBOL_GPL(do_machine_check);
1297
1298 #ifndef CONFIG_MEMORY_FAILURE
1299 int memory_failure(unsigned long pfn, int vector, int flags)
1300 {
1301         /* mce_severity() should not hand us an ACTION_REQUIRED error */
1302         BUG_ON(flags & MF_ACTION_REQUIRED);
1303         pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1304                "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1305                pfn);
1306
1307         return 0;
1308 }
1309 #endif
1310
1311 /*
1312  * Periodic polling timer for "silent" machine check errors.  If the
1313  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1314  * errors, poll 2x slower (up to check_interval seconds).
1315  */
1316 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1317
1318 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1319 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1320
1321 static unsigned long mce_adjust_timer_default(unsigned long interval)
1322 {
1323         return interval;
1324 }
1325
1326 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1327
1328 static void __start_timer(struct timer_list *t, unsigned long interval)
1329 {
1330         unsigned long when = jiffies + interval;
1331         unsigned long flags;
1332
1333         local_irq_save(flags);
1334
1335         if (!timer_pending(t) || time_before(when, t->expires))
1336                 mod_timer(t, round_jiffies(when));
1337
1338         local_irq_restore(flags);
1339 }
1340
1341 static void mce_timer_fn(unsigned long data)
1342 {
1343         struct timer_list *t = this_cpu_ptr(&mce_timer);
1344         int cpu = smp_processor_id();
1345         unsigned long iv;
1346
1347         WARN_ON(cpu != data);
1348
1349         iv = __this_cpu_read(mce_next_interval);
1350
1351         if (mce_available(this_cpu_ptr(&cpu_info))) {
1352                 machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1353
1354                 if (mce_intel_cmci_poll()) {
1355                         iv = mce_adjust_timer(iv);
1356                         goto done;
1357                 }
1358         }
1359
1360         /*
1361          * Alert userspace if needed. If we logged an MCE, reduce the polling
1362          * interval, otherwise increase the polling interval.
1363          */
1364         if (mce_notify_irq())
1365                 iv = max(iv / 2, (unsigned long) HZ/100);
1366         else
1367                 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1368
1369 done:
1370         __this_cpu_write(mce_next_interval, iv);
1371         __start_timer(t, iv);
1372 }
1373
1374 /*
1375  * Ensure that the timer is firing in @interval from now.
1376  */
1377 void mce_timer_kick(unsigned long interval)
1378 {
1379         struct timer_list *t = this_cpu_ptr(&mce_timer);
1380         unsigned long iv = __this_cpu_read(mce_next_interval);
1381
1382         __start_timer(t, interval);
1383
1384         if (interval < iv)
1385                 __this_cpu_write(mce_next_interval, interval);
1386 }
1387
1388 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1389 static void mce_timer_delete_all(void)
1390 {
1391         int cpu;
1392
1393         for_each_online_cpu(cpu)
1394                 del_timer_sync(&per_cpu(mce_timer, cpu));
1395 }
1396
1397 static void mce_do_trigger(struct work_struct *work)
1398 {
1399         call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1400 }
1401
1402 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1403
1404 /*
1405  * Notify the user(s) about new machine check events.
1406  * Can be called from interrupt context, but not from machine check/NMI
1407  * context.
1408  */
1409 int mce_notify_irq(void)
1410 {
1411         /* Not more than two messages every minute */
1412         static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1413
1414         if (test_and_clear_bit(0, &mce_need_notify)) {
1415                 /* wake processes polling /dev/mcelog */
1416                 wake_up_interruptible(&mce_chrdev_wait);
1417
1418                 if (mce_helper[0])
1419                         schedule_work(&mce_trigger_work);
1420
1421                 if (__ratelimit(&ratelimit))
1422                         pr_info(HW_ERR "Machine check events logged\n");
1423
1424                 return 1;
1425         }
1426         return 0;
1427 }
1428 EXPORT_SYMBOL_GPL(mce_notify_irq);
1429
1430 static int __mcheck_cpu_mce_banks_init(void)
1431 {
1432         int i;
1433         u8 num_banks = mca_cfg.banks;
1434
1435         mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1436         if (!mce_banks)
1437                 return -ENOMEM;
1438
1439         for (i = 0; i < num_banks; i++) {
1440                 struct mce_bank *b = &mce_banks[i];
1441
1442                 b->ctl = -1ULL;
1443                 b->init = 1;
1444         }
1445         return 0;
1446 }
1447
1448 /*
1449  * Initialize Machine Checks for a CPU.
1450  */
1451 static int __mcheck_cpu_cap_init(void)
1452 {
1453         unsigned b;
1454         u64 cap;
1455
1456         rdmsrl(MSR_IA32_MCG_CAP, cap);
1457
1458         b = cap & MCG_BANKCNT_MASK;
1459         if (!mca_cfg.banks)
1460                 pr_info("CPU supports %d MCE banks\n", b);
1461
1462         if (b > MAX_NR_BANKS) {
1463                 pr_warn("Using only %u machine check banks out of %u\n",
1464                         MAX_NR_BANKS, b);
1465                 b = MAX_NR_BANKS;
1466         }
1467
1468         /* Don't support asymmetric configurations today */
1469         WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1470         mca_cfg.banks = b;
1471
1472         if (!mce_banks) {
1473                 int err = __mcheck_cpu_mce_banks_init();
1474
1475                 if (err)
1476                         return err;
1477         }
1478
1479         /* Use accurate RIP reporting if available. */
1480         if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1481                 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1482
1483         if (cap & MCG_SER_P)
1484                 mca_cfg.ser = true;
1485
1486         return 0;
1487 }
1488
1489 static void __mcheck_cpu_init_generic(void)
1490 {
1491         enum mcp_flags m_fl = 0;
1492         mce_banks_t all_banks;
1493         u64 cap;
1494
1495         if (!mca_cfg.bootlog)
1496                 m_fl = MCP_DONTLOG;
1497
1498         /*
1499          * Log the machine checks left over from the previous reset.
1500          */
1501         bitmap_fill(all_banks, MAX_NR_BANKS);
1502         machine_check_poll(MCP_UC | m_fl, &all_banks);
1503
1504         cr4_set_bits(X86_CR4_MCE);
1505
1506         rdmsrl(MSR_IA32_MCG_CAP, cap);
1507         if (cap & MCG_CTL_P)
1508                 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1509 }
1510
1511 static void __mcheck_cpu_init_clear_banks(void)
1512 {
1513         int i;
1514
1515         for (i = 0; i < mca_cfg.banks; i++) {
1516                 struct mce_bank *b = &mce_banks[i];
1517
1518                 if (!b->init)
1519                         continue;
1520                 wrmsrl(msr_ops.ctl(i), b->ctl);
1521                 wrmsrl(msr_ops.status(i), 0);
1522         }
1523 }
1524
1525 /*
1526  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1527  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1528  * Vol 3B Table 15-20). But this confuses both the code that determines
1529  * whether the machine check occurred in kernel or user mode, and also
1530  * the severity assessment code. Pretend that EIPV was set, and take the
1531  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1532  */
1533 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1534 {
1535         if (bank != 0)
1536                 return;
1537         if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1538                 return;
1539         if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1540                           MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1541                           MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1542                           MCACOD)) !=
1543                          (MCI_STATUS_UC|MCI_STATUS_EN|
1544                           MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1545                           MCI_STATUS_AR|MCACOD_INSTR))
1546                 return;
1547
1548         m->mcgstatus |= MCG_STATUS_EIPV;
1549         m->ip = regs->ip;
1550         m->cs = regs->cs;
1551 }
1552
1553 /* Add per CPU specific workarounds here */
1554 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1555 {
1556         struct mca_config *cfg = &mca_cfg;
1557
1558         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1559                 pr_info("unknown CPU type - not enabling MCE support\n");
1560                 return -EOPNOTSUPP;
1561         }
1562
1563         /* This should be disabled by the BIOS, but isn't always */
1564         if (c->x86_vendor == X86_VENDOR_AMD) {
1565                 if (c->x86 == 15 && cfg->banks > 4) {
1566                         /*
1567                          * disable GART TBL walk error reporting, which
1568                          * trips off incorrectly with the IOMMU & 3ware
1569                          * & Cerberus:
1570                          */
1571                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1572                 }
1573                 if (c->x86 < 17 && cfg->bootlog < 0) {
1574                         /*
1575                          * Lots of broken BIOS around that don't clear them
1576                          * by default and leave crap in there. Don't log:
1577                          */
1578                         cfg->bootlog = 0;
1579                 }
1580                 /*
1581                  * Various K7s with broken bank 0 around. Always disable
1582                  * by default.
1583                  */
1584                 if (c->x86 == 6 && cfg->banks > 0)
1585                         mce_banks[0].ctl = 0;
1586
1587                 /*
1588                  * overflow_recov is supported for F15h Models 00h-0fh
1589                  * even though we don't have a CPUID bit for it.
1590                  */
1591                 if (c->x86 == 0x15 && c->x86_model <= 0xf)
1592                         mce_flags.overflow_recov = 1;
1593
1594                 /*
1595                  * Turn off MC4_MISC thresholding banks on those models since
1596                  * they're not supported there.
1597                  */
1598                 if (c->x86 == 0x15 &&
1599                     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1600                         int i;
1601                         u64 hwcr;
1602                         bool need_toggle;
1603                         u32 msrs[] = {
1604                                 0x00000413, /* MC4_MISC0 */
1605                                 0xc0000408, /* MC4_MISC1 */
1606                         };
1607
1608                         rdmsrl(MSR_K7_HWCR, hwcr);
1609
1610                         /* McStatusWrEn has to be set */
1611                         need_toggle = !(hwcr & BIT(18));
1612
1613                         if (need_toggle)
1614                                 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1615
1616                         /* Clear CntP bit safely */
1617                         for (i = 0; i < ARRAY_SIZE(msrs); i++)
1618                                 msr_clear_bit(msrs[i], 62);
1619
1620                         /* restore old settings */
1621                         if (need_toggle)
1622                                 wrmsrl(MSR_K7_HWCR, hwcr);
1623                 }
1624         }
1625
1626         if (c->x86_vendor == X86_VENDOR_INTEL) {
1627                 /*
1628                  * SDM documents that on family 6 bank 0 should not be written
1629                  * because it aliases to another special BIOS controlled
1630                  * register.
1631                  * But it's not aliased anymore on model 0x1a+
1632                  * Don't ignore bank 0 completely because there could be a
1633                  * valid event later, merely don't write CTL0.
1634                  */
1635
1636                 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1637                         mce_banks[0].init = 0;
1638
1639                 /*
1640                  * All newer Intel systems support MCE broadcasting. Enable
1641                  * synchronization with a one second timeout.
1642                  */
1643                 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1644                         cfg->monarch_timeout < 0)
1645                         cfg->monarch_timeout = USEC_PER_SEC;
1646
1647                 /*
1648                  * There are also broken BIOSes on some Pentium M and
1649                  * earlier systems:
1650                  */
1651                 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1652                         cfg->bootlog = 0;
1653
1654                 if (c->x86 == 6 && c->x86_model == 45)
1655                         quirk_no_way_out = quirk_sandybridge_ifu;
1656         }
1657         if (cfg->monarch_timeout < 0)
1658                 cfg->monarch_timeout = 0;
1659         if (cfg->bootlog != 0)
1660                 cfg->panic_timeout = 30;
1661
1662         return 0;
1663 }
1664
1665 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1666 {
1667         if (c->x86 != 5)
1668                 return 0;
1669
1670         switch (c->x86_vendor) {
1671         case X86_VENDOR_INTEL:
1672                 intel_p5_mcheck_init(c);
1673                 return 1;
1674                 break;
1675         case X86_VENDOR_CENTAUR:
1676                 winchip_mcheck_init(c);
1677                 return 1;
1678                 break;
1679         default:
1680                 return 0;
1681         }
1682
1683         return 0;
1684 }
1685
1686 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1687 {
1688         switch (c->x86_vendor) {
1689         case X86_VENDOR_INTEL:
1690                 mce_intel_feature_init(c);
1691                 mce_adjust_timer = cmci_intel_adjust_timer;
1692                 break;
1693
1694         case X86_VENDOR_AMD: {
1695                 mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1696                 mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1697                 mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1698
1699                 /*
1700                  * Install proper ops for Scalable MCA enabled processors
1701                  */
1702                 if (mce_flags.smca) {
1703                         msr_ops.ctl     = smca_ctl_reg;
1704                         msr_ops.status  = smca_status_reg;
1705                         msr_ops.addr    = smca_addr_reg;
1706                         msr_ops.misc    = smca_misc_reg;
1707                 }
1708                 mce_amd_feature_init(c);
1709
1710                 break;
1711                 }
1712
1713         default:
1714                 break;
1715         }
1716 }
1717
1718 static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1719 {
1720         switch (c->x86_vendor) {
1721         case X86_VENDOR_INTEL:
1722                 mce_intel_feature_clear(c);
1723                 break;
1724         default:
1725                 break;
1726         }
1727 }
1728
1729 static void mce_start_timer(struct timer_list *t)
1730 {
1731         unsigned long iv = check_interval * HZ;
1732
1733         if (mca_cfg.ignore_ce || !iv)
1734                 return;
1735
1736         this_cpu_write(mce_next_interval, iv);
1737         __start_timer(t, iv);
1738 }
1739
1740 static void __mcheck_cpu_setup_timer(void)
1741 {
1742         struct timer_list *t = this_cpu_ptr(&mce_timer);
1743         unsigned int cpu = smp_processor_id();
1744
1745         setup_pinned_timer(t, mce_timer_fn, cpu);
1746 }
1747
1748 static void __mcheck_cpu_init_timer(void)
1749 {
1750         struct timer_list *t = this_cpu_ptr(&mce_timer);
1751         unsigned int cpu = smp_processor_id();
1752
1753         setup_pinned_timer(t, mce_timer_fn, cpu);
1754         mce_start_timer(t);
1755 }
1756
/*
 * Handler for an unconfigured int18 (should never happen): only report
 * which CPU took it.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	int cpu = smp_processor_id();

	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", cpu);
}

/*
 * The installed machine check handler for this CPU setup. Points at the
 * "unexpected" handler until mcheck_cpu_init() installs the real one.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
	unexpected_machine_check;
1767
1768 /*
1769  * Called for each booted CPU to set up machine checks.
1770  * Must be called with preempt off:
1771  */
1772 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1773 {
1774         if (mca_cfg.disabled)
1775                 return;
1776
1777         if (__mcheck_cpu_ancient_init(c))
1778                 return;
1779
1780         if (!mce_available(c))
1781                 return;
1782
1783         if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1784                 mca_cfg.disabled = true;
1785                 return;
1786         }
1787
1788         if (mce_gen_pool_init()) {
1789                 mca_cfg.disabled = true;
1790                 pr_emerg("Couldn't allocate MCE records pool!\n");
1791                 return;
1792         }
1793
1794         machine_check_vector = do_machine_check;
1795
1796         __mcheck_cpu_init_generic();
1797         __mcheck_cpu_init_vendor(c);
1798         __mcheck_cpu_init_clear_banks();
1799         __mcheck_cpu_setup_timer();
1800 }
1801
1802 /*
1803  * Called for each booted CPU to clear some machine checks opt-ins
1804  */
1805 void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1806 {
1807         if (mca_cfg.disabled)
1808                 return;
1809
1810         if (!mce_available(c))
1811                 return;
1812
1813         /*
1814          * Possibly to clear general settings generic to x86
1815          * __mcheck_cpu_clear_generic(c);
1816          */
1817         __mcheck_cpu_clear_vendor(c);
1818
1819 }
1820
1821 /*
1822  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1823  */
1824
/*
 * mce_chrdev_open() and mce_chrdev_release() take this lock around their
 * updates of mce_chrdev_open_count and mce_chrdev_open_exclu.
 */
1825 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1826 static int mce_chrdev_open_exclu;       /* already open exclusive? */
1827
1828 static int mce_chrdev_open(struct inode *inode, struct file *file)
1829 {
1830         spin_lock(&mce_chrdev_state_lock);
1831
1832         if (mce_chrdev_open_exclu ||
1833             (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1834                 spin_unlock(&mce_chrdev_state_lock);
1835
1836                 return -EBUSY;
1837         }
1838
1839         if (file->f_flags & O_EXCL)
1840                 mce_chrdev_open_exclu = 1;
1841         mce_chrdev_open_count++;
1842
1843         spin_unlock(&mce_chrdev_state_lock);
1844
1845         return nonseekable_open(inode, file);
1846 }
1847
1848 static int mce_chrdev_release(struct inode *inode, struct file *file)
1849 {
1850         spin_lock(&mce_chrdev_state_lock);
1851
1852         mce_chrdev_open_count--;
1853         mce_chrdev_open_exclu = 0;
1854
1855         spin_unlock(&mce_chrdev_state_lock);
1856
1857         return 0;
1858 }
1859
/*
 * Cross-CPU callback: store the current TSC of the executing CPU in the
 * slot for that CPU of the unsigned long array passed in @data.
 */
static void collect_tscs(void *data)
{
        unsigned long *tsc_by_cpu = data;

        tsc_by_cpu[smp_processor_id()] = rdtsc();
}
1866
1867 static int mce_apei_read_done;
1868
1869 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1870 static int __mce_read_apei(char __user **ubuf, size_t usize)
1871 {
1872         int rc;
1873         u64 record_id;
1874         struct mce m;
1875
1876         if (usize < sizeof(struct mce))
1877                 return -EINVAL;
1878
1879         rc = apei_read_mce(&m, &record_id);
1880         /* Error or no more MCE record */
1881         if (rc <= 0) {
1882                 mce_apei_read_done = 1;
1883                 /*
1884                  * When ERST is disabled, mce_chrdev_read() should return
1885                  * "no record" instead of "no device."
1886                  */
1887                 if (rc == -ENODEV)
1888                         return 0;
1889                 return rc;
1890         }
1891         rc = -EFAULT;
1892         if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1893                 return rc;
1894         /*
1895          * In fact, we should have cleared the record after that has
1896          * been flushed to the disk or sent to network in
1897          * /sbin/mcelog, but we have no interface to support that now,
1898          * so just clear it to avoid duplication.
1899          */
1900         rc = apei_clear_mce(record_id);
1901         if (rc) {
1902                 mce_apei_read_done = 1;
1903                 return rc;
1904         }
1905         *ubuf += sizeof(struct mce);
1906
1907         return 0;
1908 }
1909
/*
 * read() handler for the mcelog character device.
 *
 * While mce_apei_read_done is clear, a read first tries to return a single
 * record via __mce_read_apei() and stops there if that produced a record or
 * an error. Otherwise the in-memory mcelog is drained: the read must start
 * at offset 0 and the buffer must hold MCE_LOG_LEN records (-EINVAL if not).
 *
 * Returns the number of bytes copied to @ubuf, -ENOMEM if the per-CPU TSC
 * scratch array cannot be allocated, -EFAULT if any copy to user space
 * failed, or the error from __mce_read_apei().
 */
1910 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1911                                 size_t usize, loff_t *off)
1912 {
1913         char __user *buf = ubuf;
1914         unsigned long *cpu_tsc;
1915         unsigned prev, next;
1916         int i, err;
1917
             /* One TSC slot per possible CPU id, filled by collect_tscs(). */
1918         cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1919         if (!cpu_tsc)
1920                 return -ENOMEM;
1921
1922         mutex_lock(&mce_chrdev_read_mutex);
1923
1924         if (!mce_apei_read_done) {
1925                 err = __mce_read_apei(&buf, usize);
                     /* buf != ubuf means one APEI record was copied out. */
1926                 if (err || buf != ubuf)
1927                         goto out;
1928         }
1929
1930         next = mce_log_get_idx_check(mcelog.next);
1931
1932         /* Only supports full reads right now */
1933         err = -EINVAL;
1934         if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1935                 goto out;
1936
1937         err = 0;
1938         prev = 0;
             /*
              * Copy entries [prev, next), zero them, then try to reset
              * mcelog.next from next to 0. If the cmpxchg sees a different
              * value, more entries were added meanwhile: go around again
              * for the new range.
              */
1939         do {
1940                 for (i = prev; i < next; i++) {
1941                         unsigned long start = jiffies;
1942                         struct mce *m = &mcelog.entry[i];
1943
                             /*
                              * Wait for the entry to be marked finished;
                              * once jiffies has advanced by 2, zero the
                              * entry and skip it.
                              */
1944                         while (!m->finished) {
1945                                 if (time_after_eq(jiffies, start + 2)) {
1946                                         memset(m, 0, sizeof(*m));
1947                                         goto timeout;
1948                                 }
1949                                 cpu_relax();
1950                         }
1951                         smp_rmb();
                             /* Non-zero here is turned into -EFAULT below. */
1952                         err |= copy_to_user(buf, m, sizeof(*m));
1953                         buf += sizeof(*m);
1954 timeout:
1955                         ;
1956                 }
1957
1958                 memset(mcelog.entry + prev, 0,
1959                        (next - prev) * sizeof(struct mce));
1960                 prev = next;
1961                 next = cmpxchg(&mcelog.next, prev, 0);
1962         } while (next != prev);
1963
1964         synchronize_sched();
1965
1966         /*
1967          * Collect entries that were still getting written before the
1968          * synchronize.
1969          */
1970         on_each_cpu(collect_tscs, cpu_tsc, 1);
1971
             /*
              * Pick up finished entries from index next onwards whose tsc is
              * older than the TSC just sampled on the CPU that logged them.
              */
1972         for (i = next; i < MCE_LOG_LEN; i++) {
1973                 struct mce *m = &mcelog.entry[i];
1974
1975                 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1976                         err |= copy_to_user(buf, m, sizeof(*m));
1977                         smp_rmb();
1978                         buf += sizeof(*m);
1979                         memset(m, 0, sizeof(*m));
1980                 }
1981         }
1982
1983         if (err)
1984                 err = -EFAULT;
1985
1986 out:
1987         mutex_unlock(&mce_chrdev_read_mutex);
1988         kfree(cpu_tsc);
1989
             /* Bytes handed to user space, unless an error was recorded. */
1990         return err ? err : buf - ubuf;
1991 }
1992
1993 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1994 {
1995         poll_wait(file, &mce_chrdev_wait, wait);
1996         if (READ_ONCE(mcelog.next))
1997                 return POLLIN | POLLRDNORM;
1998         if (!mce_apei_read_done && apei_check_mce())
1999                 return POLLIN | POLLRDNORM;
2000         return 0;
2001 }
2002
2003 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
2004                                 unsigned long arg)
2005 {
2006         int __user *p = (int __user *)arg;
2007
2008         if (!capable(CAP_SYS_ADMIN))
2009                 return -EPERM;
2010
2011         switch (cmd) {
2012         case MCE_GET_RECORD_LEN:
2013                 return put_user(sizeof(struct mce), p);
2014         case MCE_GET_LOG_LEN:
2015                 return put_user(MCE_LOG_LEN, p);
2016         case MCE_GETCLEAR_FLAGS: {
2017                 unsigned flags;
2018
2019                 do {
2020                         flags = mcelog.flags;
2021                 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
2022
2023                 return put_user(flags, p);
2024         }
2025         default:
2026                 return -ENOTTY;
2027         }
2028 }
2029
/*
 * Optional handler that mce_chrdev_write() forwards to. It stays NULL
 * until register_mce_write_callback() installs one.
 */
2030 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
2031                             size_t usize, loff_t *off);
2032
/* Install @fn as the write handler; any previous value is overwritten. */
2033 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
2034                              const char __user *ubuf,
2035                              size_t usize, loff_t *off))
2036 {
2037         mce_write = fn;
2038 }
2039 EXPORT_SYMBOL_GPL(register_mce_write_callback);
2040
2041 static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
2042                                 size_t usize, loff_t *off)
2043 {
2044         if (mce_write)
2045                 return mce_write(filp, ubuf, usize, off);
2046         else
2047                 return -EINVAL;
2048 }
2049
/* File operations used by mce_chrdev_device; seeking is not supported. */
2050 static const struct file_operations mce_chrdev_ops = {
2051         .open                   = mce_chrdev_open,
2052         .release                = mce_chrdev_release,
2053         .read                   = mce_chrdev_read,
2054         .write                  = mce_chrdev_write,
2055         .poll                   = mce_chrdev_poll,
2056         .unlocked_ioctl         = mce_chrdev_ioctl,
2057         .llseek                 = no_llseek,
2058 };
2059
2060 static struct miscdevice mce_chrdev_device = {
2061         MISC_MCELOG_MINOR,
2062         "mcelog",
2063         &mce_chrdev_ops,
2064 };
2065
2066 static void __mce_disable_bank(void *arg)
2067 {
2068         int bank = *((int *)arg);
2069         __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
2070         cmci_disable_bank(bank);
2071 }
2072
2073 void mce_disable_bank(int bank)
2074 {
2075         if (bank >= mca_cfg.banks) {
2076                 pr_warn(FW_BUG
2077                         "Ignoring request to disable invalid MCA bank %d.\n",
2078                         bank);
2079                 return;
2080         }
2081         set_bit(bank, mce_banks_ce_disabled);
2082         on_each_cpu(__mce_disable_bank, &bank, 1);
2083 }
2084
2085 /*
2086  * mce=off Disables machine check
2087  * mce=no_cmci Disables CMCI
2088  * mce=no_lmce Disables LMCE
2089  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2090  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2091  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2092  *      monarchtimeout is how long to wait for other CPUs on machine
2093  *      check, or 0 to not wait
2094  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2095  * mce=nobootlog Don't log MCEs from before booting.
2096  * mce=bios_cmci_threshold Don't program the CMCI threshold
2097  * mce=recovery force enable memcpy_mcsafe()
2098  */
2099 static int __init mcheck_enable(char *str)
2100 {
2101         struct mca_config *cfg = &mca_cfg;
2102
2103         if (*str == 0) {
2104                 enable_p5_mce();
2105                 return 1;
2106         }
2107         if (*str == '=')
2108                 str++;
2109         if (!strcmp(str, "off"))
2110                 cfg->disabled = true;
2111         else if (!strcmp(str, "no_cmci"))
2112                 cfg->cmci_disabled = true;
2113         else if (!strcmp(str, "no_lmce"))
2114                 cfg->lmce_disabled = true;
2115         else if (!strcmp(str, "dont_log_ce"))
2116                 cfg->dont_log_ce = true;
2117         else if (!strcmp(str, "ignore_ce"))
2118                 cfg->ignore_ce = true;
2119         else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2120                 cfg->bootlog = (str[0] == 'b');
2121         else if (!strcmp(str, "bios_cmci_threshold"))
2122                 cfg->bios_cmci_threshold = true;
2123         else if (!strcmp(str, "recovery"))
2124                 cfg->recovery = true;
2125         else if (isdigit(str[0])) {
2126                 if (get_option(&str, &cfg->tolerant) == 2)
2127                         get_option(&str, &(cfg->monarch_timeout));
2128         } else {
2129                 pr_info("mce argument %s ignored. Please use /sys\n", str);
2130                 return 0;
2131         }
2132         return 1;
2133 }
2134 __setup("mce", mcheck_enable);
2135
/*
 * One-time init: calls mcheck_intel_therm_init(), registers the mce_srao_nb
 * and mce_default_nb notifiers on the decode chain, runs the vendor severity
 * init, and initializes mce_work and mce_irq_work. Always returns 0.
 */
2136 int __init mcheck_init(void)
2137 {
2138         mcheck_intel_therm_init();
2139         mce_register_decode_chain(&mce_srao_nb);
2140         mce_register_decode_chain(&mce_default_nb);
2141         mcheck_vendor_init_severity();
2142
2143         INIT_WORK(&mce_work, mce_gen_pool_process);
2144         init_irq_work(&mce_irq_work, mce_irq_work_cb);
2145
2146         return 0;
2147 }
2148
2149 /*
2150  * mce_syscore: PM support
2151  */
2152
2153 /*
2154  * Disable machine checks on suspend and shutdown. We can't really handle
2155  * them later.
2156  */
2157 static void mce_disable_error_reporting(void)
2158 {
2159         int i;
2160
2161         for (i = 0; i < mca_cfg.banks; i++) {
2162                 struct mce_bank *b = &mce_banks[i];
2163
2164                 if (b->init)
2165                         wrmsrl(msr_ops.ctl(i), 0);
2166         }
2167         return;
2168 }
2169
2170 static void vendor_disable_error_reporting(void)
2171 {
2172         /*
2173          * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2174          * Disabling them for just a single offlined CPU is bad, since it will
2175          * inhibit reporting for all shared resources on the socket like the
2176          * last level cache (LLC), the integrated memory controller (iMC), etc.
2177          */
2178         if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2179                 return;
2180
2181         mce_disable_error_reporting();
2182 }
2183
/* syscore suspend hook: disable error reporting (non-Intel only); returns 0. */
2184 static int mce_syscore_suspend(void)
2185 {
2186         vendor_disable_error_reporting();
2187         return 0;
2188 }
2189
/* syscore shutdown hook: disable error reporting (non-Intel only). */
2190 static void mce_syscore_shutdown(void)
2191 {
2192         vendor_disable_error_reporting();
2193 }
2194
2195 /*
2196  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2197  * Only one CPU is active at this time, the others get re-added later using
2198  * CPU hotplug:
2199  */
/*
 * Re-runs the generic, vendor and clear-banks init steps on the current CPU.
 * Unlike mcheck_cpu_init(), no timer is set up here.
 */
2200 static void mce_syscore_resume(void)
2201 {
2202         __mcheck_cpu_init_generic();
2203         __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2204         __mcheck_cpu_init_clear_banks();
2205 }
2206
/* Power-management callbacks, registered by mcheck_init_device(). */
2207 static struct syscore_ops mce_syscore_ops = {
2208         .suspend        = mce_syscore_suspend,
2209         .shutdown       = mce_syscore_shutdown,
2210         .resume         = mce_syscore_resume,
2211 };
2212
2213 /*
2214  * mce_device: Sysfs support
2215  */
2216
2217 static void mce_cpu_restart(void *data)
2218 {
2219         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2220                 return;
2221         __mcheck_cpu_init_generic();
2222         __mcheck_cpu_init_clear_banks();
2223         __mcheck_cpu_init_timer();
2224 }
2225
2226 /* Reinit MCEs after user configuration changes */
/*
 * Deletes all MCE timers first, then runs mce_cpu_restart() on every CPU;
 * on_each_cpu() is called with wait=1.
 */
2227 static void mce_restart(void)
2228 {
2229         mce_timer_delete_all();
2230         on_each_cpu(mce_cpu_restart, NULL, 1);
2231 }
2232
2233 /* Toggle features for corrected errors */
2234 static void mce_disable_cmci(void *data)
2235 {
2236         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2237                 return;
2238         cmci_clear();
2239 }
2240
2241 static void mce_enable_ce(void *all)
2242 {
2243         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2244                 return;
2245         cmci_reenable();
2246         cmci_recheck();
2247         if (all)
2248                 __mcheck_cpu_init_timer();
2249 }
2250
/* Bus that the per-CPU devices built by mce_device_create() are attached to. */
2251 static struct bus_type mce_subsys = {
2252         .name           = "machinecheck",
2253         .dev_name       = "machinecheck",
2254 };
2255
/* Per-CPU device pointer; set by mce_device_create(), NULLed by mce_device_remove(). */
2256 DEFINE_PER_CPU(struct device *, mce_device);
2257
/* Map a bank's embedded device_attribute back to its enclosing struct mce_bank. */
2258 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2259 {
2260         return container_of(attr, struct mce_bank, attr);
2261 }
2262
2263 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2264                          char *buf)
2265 {
2266         return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2267 }
2268
2269 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2270                         const char *buf, size_t size)
2271 {
2272         u64 new;
2273
2274         if (kstrtou64(buf, 0, &new) < 0)
2275                 return -EINVAL;
2276
2277         attr_to_bank(attr)->ctl = new;
2278         mce_restart();
2279
2280         return size;
2281 }
2282
2283 static ssize_t
2284 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2285 {
2286         strcpy(buf, mce_helper);
2287         strcat(buf, "\n");
2288         return strlen(mce_helper) + 1;
2289 }
2290
2291 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2292                                 const char *buf, size_t siz)
2293 {
2294         char *p;
2295
2296         strncpy(mce_helper, buf, sizeof(mce_helper));
2297         mce_helper[sizeof(mce_helper)-1] = 0;
2298         p = strchr(mce_helper, '\n');
2299
2300         if (p)
2301                 *p = 0;
2302
2303         return strlen(mce_helper) + !!p;
2304 }
2305
2306 static ssize_t set_ignore_ce(struct device *s,
2307                              struct device_attribute *attr,
2308                              const char *buf, size_t size)
2309 {
2310         u64 new;
2311
2312         if (kstrtou64(buf, 0, &new) < 0)
2313                 return -EINVAL;
2314
2315         if (mca_cfg.ignore_ce ^ !!new) {
2316                 if (new) {
2317                         /* disable ce features */
2318                         mce_timer_delete_all();
2319                         on_each_cpu(mce_disable_cmci, NULL, 1);
2320                         mca_cfg.ignore_ce = true;
2321                 } else {
2322                         /* enable ce features */
2323                         mca_cfg.ignore_ce = false;
2324                         on_each_cpu(mce_enable_ce, (void *)1, 1);
2325                 }
2326         }
2327         return size;
2328 }
2329
2330 static ssize_t set_cmci_disabled(struct device *s,
2331                                  struct device_attribute *attr,
2332                                  const char *buf, size_t size)
2333 {
2334         u64 new;
2335
2336         if (kstrtou64(buf, 0, &new) < 0)
2337                 return -EINVAL;
2338
2339         if (mca_cfg.cmci_disabled ^ !!new) {
2340                 if (new) {
2341                         /* disable cmci */
2342                         on_each_cpu(mce_disable_cmci, NULL, 1);
2343                         mca_cfg.cmci_disabled = true;
2344                 } else {
2345                         /* enable cmci */
2346                         mca_cfg.cmci_disabled = false;
2347                         on_each_cpu(mce_enable_ce, NULL, 1);
2348                 }
2349         }
2350         return size;
2351 }
2352
2353 static ssize_t store_int_with_restart(struct device *s,
2354                                       struct device_attribute *attr,
2355                                       const char *buf, size_t size)
2356 {
2357         ssize_t ret = device_store_int(s, attr, buf, size);
2358         mce_restart();
2359         return ret;
2360 }
2361
/*
 * sysfs attribute definitions for the per-CPU machinecheck device.
 * trigger, tolerant, monarch_timeout and dont_log_ce use the stock
 * DEVICE_*_ATTR helpers; the three below pair a stock show routine with a
 * custom store routine from this file.
 */
2362 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2363 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2364 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2365 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2366
/* Stores go through store_int_with_restart(), which calls mce_restart(). */
2367 static struct dev_ext_attribute dev_attr_check_interval = {
2368         __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2369         &check_interval
2370 };
2371
/* Stores go through set_ignore_ce(), which toggles timers and CMCI. */
2372 static struct dev_ext_attribute dev_attr_ignore_ce = {
2373         __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2374         &mca_cfg.ignore_ce
2375 };
2376
/* Stores go through set_cmci_disabled(), which toggles CMCI on all CPUs. */
2377 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2378         __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2379         &mca_cfg.cmci_disabled
2380 };
2381
/*
 * NULL-terminated list of attributes created for every per-CPU device by
 * mce_device_create() and removed by mce_device_remove().
 */
2382 static struct device_attribute *mce_device_attrs[] = {
2383         &dev_attr_tolerant.attr,
2384         &dev_attr_check_interval.attr,
2385         &dev_attr_trigger,
2386         &dev_attr_monarch_timeout.attr,
2387         &dev_attr_dont_log_ce.attr,
2388         &dev_attr_ignore_ce.attr,
2389         &dev_attr_cmci_disabled.attr,
2390         NULL
2391 };
2392
/* CPUs whose device was fully created; allocated in mcheck_init_device(). */
2393 static cpumask_var_t mce_device_initialized;
2394
/* Device release callback: frees the device allocated by mce_device_create(). */
2395 static void mce_device_release(struct device *dev)
2396 {
2397         kfree(dev);
2398 }
2399
2400 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
2401 static int mce_device_create(unsigned int cpu)
2402 {
2403         struct device *dev;
2404         int err;
2405         int i, j;
2406
2407         if (!mce_available(&boot_cpu_data))
2408                 return -EIO;
2409
2410         dev = per_cpu(mce_device, cpu);
2411         if (dev)
2412                 return 0;
2413
2414         dev = kzalloc(sizeof *dev, GFP_KERNEL);
2415         if (!dev)
2416                 return -ENOMEM;
2417         dev->id  = cpu;
2418         dev->bus = &mce_subsys;
2419         dev->release = &mce_device_release;
2420
2421         err = device_register(dev);
2422         if (err) {
2423                 put_device(dev);
2424                 return err;
2425         }
2426
2427         for (i = 0; mce_device_attrs[i]; i++) {
2428                 err = device_create_file(dev, mce_device_attrs[i]);
2429                 if (err)
2430                         goto error;
2431         }
2432         for (j = 0; j < mca_cfg.banks; j++) {
2433                 err = device_create_file(dev, &mce_banks[j].attr);
2434                 if (err)
2435                         goto error2;
2436         }
2437         cpumask_set_cpu(cpu, mce_device_initialized);
2438         per_cpu(mce_device, cpu) = dev;
2439
2440         return 0;
2441 error2:
2442         while (--j >= 0)
2443                 device_remove_file(dev, &mce_banks[j].attr);
2444 error:
2445         while (--i >= 0)
2446                 device_remove_file(dev, mce_device_attrs[i]);
2447
2448         device_unregister(dev);
2449
2450         return err;
2451 }
2452
2453 static void mce_device_remove(unsigned int cpu)
2454 {
2455         struct device *dev = per_cpu(mce_device, cpu);
2456         int i;
2457
2458         if (!cpumask_test_cpu(cpu, mce_device_initialized))
2459                 return;
2460
2461         for (i = 0; mce_device_attrs[i]; i++)
2462                 device_remove_file(dev, mce_device_attrs[i]);
2463
2464         for (i = 0; i < mca_cfg.banks; i++)
2465                 device_remove_file(dev, &mce_banks[i].attr);
2466
2467         device_unregister(dev);
2468         cpumask_clear_cpu(cpu, mce_device_initialized);
2469         per_cpu(mce_device, cpu) = NULL;
2470 }
2471
2472 /* Make sure there are no machine checks on offlined CPUs. */
2473 static void mce_disable_cpu(void)
2474 {
2475         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2476                 return;
2477
2478         if (!cpuhp_tasks_frozen)
2479                 cmci_clear();
2480
2481         vendor_disable_error_reporting();
2482 }
2483
2484 static void mce_reenable_cpu(void)
2485 {
2486         int i;
2487
2488         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2489                 return;
2490
2491         if (!cpuhp_tasks_frozen)
2492                 cmci_reenable();
2493         for (i = 0; i < mca_cfg.banks; i++) {
2494                 struct mce_bank *b = &mce_banks[i];
2495
2496                 if (b->init)
2497                         wrmsrl(msr_ops.ctl(i), b->ctl);
2498         }
2499 }
2500
2501 static int mce_cpu_dead(unsigned int cpu)
2502 {
2503         mce_intel_hcpu_update(cpu);
2504
2505         /* intentionally ignoring frozen here */
2506         if (!cpuhp_tasks_frozen)
2507                 cmci_rediscover();
2508         return 0;
2509 }
2510
2511 static int mce_cpu_online(unsigned int cpu)
2512 {
2513         struct timer_list *t = this_cpu_ptr(&mce_timer);
2514         int ret;
2515
2516         mce_device_create(cpu);
2517
2518         ret = mce_threshold_create_device(cpu);
2519         if (ret) {
2520                 mce_device_remove(cpu);
2521                 return ret;
2522         }
2523         mce_reenable_cpu();
2524         mce_start_timer(t);
2525         return 0;
2526 }
2527
2528 static int mce_cpu_pre_down(unsigned int cpu)
2529 {
2530         struct timer_list *t = this_cpu_ptr(&mce_timer);
2531
2532         mce_disable_cpu();
2533         del_timer_sync(t);
2534         mce_threshold_remove_device(cpu);
2535         mce_device_remove(cpu);
2536         return 0;
2537 }
2538
2539 static __init void mce_init_banks(void)
2540 {
2541         int i;
2542
2543         for (i = 0; i < mca_cfg.banks; i++) {
2544                 struct mce_bank *b = &mce_banks[i];
2545                 struct device_attribute *a = &b->attr;
2546
2547                 sysfs_attr_init(&a->attr);
2548                 a->attr.name    = b->attrname;
2549                 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2550
2551                 a->attr.mode    = 0644;
2552                 a->show         = show_bank;
2553                 a->store        = set_bank;
2554         }
2555 }
2556
/*
 * Initcall that sets up the user-visible side of MCE: the machinecheck
 * subsystem, the CPU hotplug callbacks, the syscore (PM) ops and the
 * /dev/mcelog misc device.
 *
 * Returns 0 on success. On failure an error is logged and a negative errno
 * is returned: -EIO when the boot CPU has no MCE support, -ENOMEM when the
 * cpumask cannot be allocated, or the error of the step that failed.
 */
2557 static __init int mcheck_init_device(void)
2558 {
2559         enum cpuhp_state hp_online;
2560         int err;
2561
2562         if (!mce_available(&boot_cpu_data)) {
2563                 err = -EIO;
2564                 goto err_out;
2565         }
2566
2567         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2568                 err = -ENOMEM;
2569                 goto err_out;
2570         }
2571
2572         mce_init_banks();
2573
2574         err = subsys_system_register(&mce_subsys, NULL);
2575         if (err)
2576                 goto err_out_mem;
2577
2578         err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2579                                 mce_cpu_dead);
2580         if (err)
2581                 goto err_out_mem;
2582
2583         err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2584                                 mce_cpu_online, mce_cpu_pre_down);
2585         if (err < 0)
2586                 goto err_out_online;
             /* A non-negative result is kept as the state to remove on unwind. */
2587         hp_online = err;
2588
2589         register_syscore_ops(&mce_syscore_ops);
2590
2591         /* register character device /dev/mcelog */
2592         err = misc_register(&mce_chrdev_device);
2593         if (err)
2594                 goto err_register;
2595
2596         return 0;
2597
     /* Unwind in reverse order of setup; each label falls through to the next. */
2598 err_register:
2599         unregister_syscore_ops(&mce_syscore_ops);
2600         cpuhp_remove_state(hp_online);
2601
2602 err_out_online:
2603         cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2604
2605 err_out_mem:
2606         free_cpumask_var(mce_device_initialized);
2607
2608 err_out:
2609         pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2610
2611         return err;
2612 }
2613 device_initcall_sync(mcheck_init_device);
2614
2615 /*
2616  * Old style boot options parsing. Only for compatibility.
2617  */
/* "nomce": sets mca_cfg.disabled; @str is ignored. Returns 1. */
2618 static int __init mcheck_disable(char *str)
2619 {
2620         mca_cfg.disabled = true;
2621         return 1;
2622 }
2623 __setup("nomce", mcheck_disable);
2624
2625 #ifdef CONFIG_DEBUG_FS
2626 struct dentry *mce_get_debugfs_dir(void)
2627 {
2628         static struct dentry *dmce;
2629
2630         if (!dmce)
2631                 dmce = debugfs_create_dir("mce", NULL);
2632
2633         return dmce;
2634 }
2635
/* Zero cpu_missing and the atomic counters listed below; called by fake_panic_set(). */
2636 static void mce_reset(void)
2637 {
2638         cpu_missing = 0;
2639         atomic_set(&mce_fake_panicked, 0);
2640         atomic_set(&mce_executing, 0);
2641         atomic_set(&mce_callin, 0);
2642         atomic_set(&global_nwo, 0);
2643 }
2644
/* debugfs getter: report the current fake_panic value in @val. Returns 0. */
2645 static int fake_panic_get(void *data, u64 *val)
2646 {
2647         *val = fake_panic;
2648         return 0;
2649 }
2650
/* debugfs setter: reset the MCE state via mce_reset(), then store @val. Returns 0. */
2651 static int fake_panic_set(void *data, u64 val)
2652 {
2653         mce_reset();
2654         fake_panic = val;
2655         return 0;
2656 }
2657
/* File operations for the "fake_panic" debugfs file; value formatted as "%llu\n". */
2658 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2659                         fake_panic_set, "%llu\n");
2660
2661 static int __init mcheck_debugfs_init(void)
2662 {
2663         struct dentry *dmce, *ffake_panic;
2664
2665         dmce = mce_get_debugfs_dir();
2666         if (!dmce)
2667                 return -ENOMEM;
2668         ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2669                                           &fake_panic_fops);
2670         if (!ffake_panic)
2671                 return -ENOMEM;
2672
2673         return 0;
2674 }
2675 #else
/* Stub used when CONFIG_DEBUG_FS is not set; always returns -EINVAL. */
2676 static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2677 #endif
2678
/* Static key, initially false; incremented by mcheck_late_init() when mca_cfg.recovery is set. */
2679 DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2680 EXPORT_SYMBOL_GPL(mcsafe_key);
2681
/*
 * Late initcall: enable mcsafe_key when recovery was requested, set up the
 * debugfs file (its return value is not checked) and schedule the MCE work.
 * Always returns 0.
 */
2682 static int __init mcheck_late_init(void)
2683 {
2684         if (mca_cfg.recovery)
2685                 static_branch_inc(&mcsafe_key);
2686
2687         mcheck_debugfs_init();
2688
2689         /*
2690          * Flush out everything that has been logged during early boot, now that
2691          * everything has been initialized (workqueues, decoders, ...).
2692          */
2693         mce_schedule_work();
2694
2695         return 0;
2696 }
2697 late_initcall(mcheck_late_init);