]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blob - drivers/edac/mce_amd.c
EDAC, MCE: Add support for F11h MCEs
[mirror_ubuntu-zesty-kernel.git] / drivers / edac / mce_amd.c
1 #include <linux/module.h>
2 #include <linux/slab.h>
3
4 #include "mce_amd.h"
5
/* Per-family decode callbacks; selected once at init time in mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask for the error-CPU bits in the NB status high dword (narrowed to 0x3 on F14h). */
static u8 nb_err_cpumask = 0xf;

/* NB GART TLB errors (bank 4, xec 0x5) are filtered out unless this is set. */
static bool report_gart_errors;
/* Optional hook an EDAC driver registers to further decode NB DRAM ECC errors. */
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
12
13 void amd_report_gart_errors(bool v)
14 {
15 report_gart_errors = v;
16 }
17 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18
19 void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
20 {
21 nb_bus_decoder = f;
22 }
23 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24
25 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
26 {
27 if (nb_bus_decoder) {
28 WARN_ON(nb_bus_decoder != f);
29
30 nb_bus_decoder = NULL;
31 }
32 }
33 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34
35 /*
36 * string representation for the different MCA reported error types, see F3x48
37 * or MSR0000_0411.
38 */
39
/* transaction type (TT field, MCA error code bits [3:2] for mem errors) */
const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level (LL field, bits [1:0]) */
const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type (RRRR field, bits [7:4]) */
const char *rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor (PP field, bus errors) */
const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout (T bit, bus errors) */
const char *to_msgs[] = { "no timeout", "timed out" };
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o (II field, bus errors) */
const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
EXPORT_SYMBOL_GPL(ii_msgs);
65
/*
 * F10h-specific NB error descriptions. Indexed as (xec - offset) in
 * f10h_nb_mce(), which maps the extended-error-code ranges 0xa-0xc,
 * 0xe and 0x1c-0x1f onto consecutive entries of this table — keep the
 * two in sync when adding entries.
 */
static const char *f10h_nb_mce_desc[] = {
	"HT link data error",
	"Protocol error (link, L3, probe filter, etc.)",
	"Parity error in NB-internal arrays",
	"Link Retry due to IO link transmission error",
	"L3 ECC data cache error",
	"ECC error in L3 cache tag",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
76
77 static bool f10h_dc_mce(u16 ec)
78 {
79 u8 r4 = (ec >> 4) & 0xf;
80 bool ret = false;
81
82 if (r4 == R4_GEN) {
83 pr_cont("during data scrub.\n");
84 return true;
85 }
86
87 if (MEM_ERROR(ec)) {
88 u8 ll = ec & 0x3;
89 ret = true;
90
91 if (ll == LL_L2)
92 pr_cont("during L1 linefill from L2.\n");
93 else if (ll == LL_L1)
94 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
95 else
96 ret = false;
97 }
98 return ret;
99 }
100
101 static bool k8_dc_mce(u16 ec)
102 {
103 if (BUS_ERROR(ec)) {
104 pr_cont("during system linefill.\n");
105 return true;
106 }
107
108 return f10h_dc_mce(ec);
109 }
110
/*
 * Decode an F14h data cache MCE. Returns true when the signature was
 * recognized and a message emitted, false otherwise.
 */
static bool f14h_dc_mce(u16 ec)
{
	u8 r4 = (ec >> 4) & 0xf;	/* memory transaction type */
	u8 ll = ec & 0x3;		/* cache level */
	u8 tt = (ec >> 2) & 0x3;	/* transaction type (mem errors) */
	u8 ii = tt;			/* bits [3:2] are II for bus errors, TT for mem errors */
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* Only data-type, L1-level mem signatures are valid here. */
		if (tt != TT_DATA || ll != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* Bus signatures must target mem or I/O at the "generic" level. */
		if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
165
166 static void amd_decode_dc_mce(struct mce *m)
167 {
168 u16 ec = m->status & 0xffff;
169 u8 xec = (m->status >> 16) & 0xf;
170
171 pr_emerg(HW_ERR "Data Cache Error: ");
172
173 /* TLB error signatures are the same across families */
174 if (TLB_ERROR(ec)) {
175 u8 tt = (ec >> 2) & 0x3;
176
177 if (tt == TT_DATA) {
178 pr_cont("%s TLB %s.\n", LL_MSG(ec),
179 (xec ? "multimatch" : "parity error"));
180 return;
181 }
182 else
183 goto wrong_dc_mce;
184 }
185
186 if (!fam_ops->dc_mce(ec))
187 goto wrong_dc_mce;
188
189 return;
190
191 wrong_dc_mce:
192 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
193 }
194
195 static bool k8_ic_mce(u16 ec)
196 {
197 u8 ll = ec & 0x3;
198 u8 r4 = (ec >> 4) & 0xf;
199 bool ret = true;
200
201 if (!MEM_ERROR(ec))
202 return false;
203
204 if (ll == 0x2)
205 pr_cont("during a linefill from L2.\n");
206 else if (ll == 0x1) {
207 switch (r4) {
208 case R4_IRD:
209 pr_cont("Parity error during data load.\n");
210 break;
211
212 case R4_EVICT:
213 pr_cont("Copyback Parity/Victim error.\n");
214 break;
215
216 case R4_SNOOP:
217 pr_cont("Tag Snoop error.\n");
218 break;
219
220 default:
221 ret = false;
222 break;
223 }
224 } else
225 ret = false;
226
227 return ret;
228 }
229
230 static bool f14h_ic_mce(u16 ec)
231 {
232 u8 ll = ec & 0x3;
233 u8 tt = (ec >> 2) & 0x3;
234 u8 r4 = (ec >> 4) & 0xf;
235 bool ret = true;
236
237 if (MEM_ERROR(ec)) {
238 if (tt != 0 || ll != 1)
239 ret = false;
240
241 if (r4 == R4_IRD)
242 pr_cont("Data/tag array parity error for a tag hit.\n");
243 else if (r4 == R4_SNOOP)
244 pr_cont("Tag error during snoop/victimization.\n");
245 else
246 ret = false;
247 }
248 return ret;
249 }
250
251 static void amd_decode_ic_mce(struct mce *m)
252 {
253 u16 ec = m->status & 0xffff;
254 u8 xec = (m->status >> 16) & 0xf;
255
256 pr_emerg(HW_ERR "Instruction Cache Error: ");
257
258 if (TLB_ERROR(ec))
259 pr_cont("%s TLB %s.\n", LL_MSG(ec),
260 (xec ? "multimatch" : "parity error"));
261 else if (BUS_ERROR(ec)) {
262 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58)));
263
264 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
265 } else if (fam_ops->ic_mce(ec))
266 ;
267 else
268 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
269 }
270
271 static void amd_decode_bu_mce(struct mce *m)
272 {
273 u32 ec = m->status & 0xffff;
274 u32 xec = (m->status >> 16) & 0xf;
275
276 pr_emerg(HW_ERR "Bus Unit Error");
277
278 if (xec == 0x1)
279 pr_cont(" in the write data buffers.\n");
280 else if (xec == 0x3)
281 pr_cont(" in the victim data buffers.\n");
282 else if (xec == 0x2 && MEM_ERROR(ec))
283 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
284 else if (xec == 0x0) {
285 if (TLB_ERROR(ec))
286 pr_cont(": %s error in a Page Descriptor Cache or "
287 "Guest TLB.\n", TT_MSG(ec));
288 else if (BUS_ERROR(ec))
289 pr_cont(": %s/ECC error in data read from NB: %s.\n",
290 RRRR_MSG(ec), PP_MSG(ec));
291 else if (MEM_ERROR(ec)) {
292 u8 rrrr = (ec >> 4) & 0xf;
293
294 if (rrrr >= 0x7)
295 pr_cont(": %s error during data copyback.\n",
296 RRRR_MSG(ec));
297 else if (rrrr <= 0x1)
298 pr_cont(": %s parity/ECC error during data "
299 "access from L2.\n", RRRR_MSG(ec));
300 else
301 goto wrong_bu_mce;
302 } else
303 goto wrong_bu_mce;
304 } else
305 goto wrong_bu_mce;
306
307 return;
308
309 wrong_bu_mce:
310 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
311 }
312
313 static void amd_decode_ls_mce(struct mce *m)
314 {
315 u16 ec = m->status & 0xffff;
316 u8 xec = (m->status >> 16) & 0xf;
317
318 if (boot_cpu_data.x86 == 0x14) {
319 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
320 " please report on LKML.\n");
321 return;
322 }
323
324 pr_emerg(HW_ERR "Load Store Error");
325
326 if (xec == 0x0) {
327 u8 r4 = (ec >> 4) & 0xf;
328
329 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
330 goto wrong_ls_mce;
331
332 pr_cont(" during %s.\n", RRRR_MSG(ec));
333 } else
334 goto wrong_ls_mce;
335
336 return;
337
338 wrong_ls_mce:
339 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
340 }
341
342 static bool k8_nb_mce(u16 ec, u8 xec)
343 {
344 bool ret = true;
345
346 switch (xec) {
347 case 0x1:
348 pr_cont("CRC error detected on HT link.\n");
349 break;
350
351 case 0x5:
352 pr_cont("Invalid GART PTE entry during GART table walk.\n");
353 break;
354
355 case 0x6:
356 pr_cont("Unsupported atomic RMW received from an IO link.\n");
357 break;
358
359 case 0x0:
360 case 0x8:
361 if (boot_cpu_data.x86 == 0x11)
362 return false;
363
364 pr_cont("DRAM ECC error detected on the NB.\n");
365 break;
366
367 case 0xd:
368 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
369 break;
370
371 default:
372 ret = false;
373 break;
374 }
375
376 return ret;
377 }
378
379 static bool f10h_nb_mce(u16 ec, u8 xec)
380 {
381 bool ret = true;
382 u8 offset = 0;
383
384 if (k8_nb_mce(ec, xec))
385 return true;
386
387 switch(xec) {
388 case 0xa ... 0xc:
389 offset = 10;
390 break;
391
392 case 0xe:
393 offset = 11;
394 break;
395
396 case 0xf:
397 if (TLB_ERROR(ec))
398 pr_cont("GART Table Walk data error.\n");
399 else if (BUS_ERROR(ec))
400 pr_cont("DMA Exclusion Vector Table Walk error.\n");
401 else
402 ret = false;
403
404 goto out;
405 break;
406
407 case 0x1c ... 0x1f:
408 offset = 24;
409 break;
410
411 default:
412 ret = false;
413
414 goto out;
415 break;
416 }
417
418 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
419
420 out:
421 return ret;
422 }
423
/*
 * No NB-specific signatures are decoded on F14h — always report the
 * error code as unrecognized so the caller prints the "Corrupted NB
 * MCE info?" fallback.
 */
static bool f14h_nb_mce(u16 ec, u8 xec)
{
	return false;
}
428
/*
 * Decode a northbridge (bank 4) MCE for @node_id and, for DRAM ECC
 * errors on K8/F10h, hand off to a registered EDAC decoder (see
 * amd_register_ecc_decoder()) for further decoding against @nbcfg.
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	u8 xec = (m->status >> 16) & 0x1f;	/* extended error code */
	u16 ec = m->status & 0xffff;		/* MCA error code */
	u32 nbsh = (u32)(m->status >> 32);	/* NB status, high dword */

	pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);

	/*
	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
	} else {
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		/*
		 * NOTE(review): this appends ", core: N" right after the
		 * "node %d: " colon above, yielding ": , core" in the log —
		 * looks like a cosmetic formatting glitch; confirm intended.
		 */
		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	/* Signatures fully identified by the extended error code alone. */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	/* Remaining signatures are family-specific. */
	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	/* DRAM ECC errors (xec 0x0/0x8) can be decoded further by EDAC. */
	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
490
491 static void amd_decode_fr_mce(struct mce *m)
492 {
493 if (boot_cpu_data.x86 == 0xf ||
494 boot_cpu_data.x86 == 0x11)
495 goto wrong_fr_mce;
496
497 /* we have only one error signature so match all fields at once. */
498 if ((m->status & 0xffff) == 0x0f0f) {
499 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
500 return;
501 }
502
503 wrong_fr_mce:
504 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
505 }
506
507 static inline void amd_decode_err_code(u16 ec)
508 {
509 if (TLB_ERROR(ec)) {
510 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
511 TT_MSG(ec), LL_MSG(ec));
512 } else if (MEM_ERROR(ec)) {
513 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
514 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
515 } else if (BUS_ERROR(ec)) {
516 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
517 "Participating Processor: %s\n",
518 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
519 PP_MSG(ec));
520 } else
521 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
522 }
523
524 /*
525 * Filter out unwanted MCE signatures here.
526 */
527 static bool amd_filter_mce(struct mce *m)
528 {
529 u8 xec = (m->status >> 16) & 0x1f;
530
531 /*
532 * NB GART TLB error reporting is disabled by default.
533 */
534 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
535 return true;
536
537 return false;
538 }
539
540 int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
541 {
542 struct mce *m = (struct mce *)data;
543 int node, ecc;
544
545 if (amd_filter_mce(m))
546 return NOTIFY_STOP;
547
548 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
549
550 pr_cont("%sorrected error, other errors lost: %s, "
551 "CPU context corrupt: %s",
552 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
553 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
554 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
555
556 /* do the two bits[14:13] together */
557 ecc = (m->status >> 45) & 0x3;
558 if (ecc)
559 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
560
561 pr_cont("\n");
562
563 switch (m->bank) {
564 case 0:
565 amd_decode_dc_mce(m);
566 break;
567
568 case 1:
569 amd_decode_ic_mce(m);
570 break;
571
572 case 2:
573 amd_decode_bu_mce(m);
574 break;
575
576 case 3:
577 amd_decode_ls_mce(m);
578 break;
579
580 case 4:
581 node = amd_get_nb_id(m->extcpu);
582 amd_decode_nb_mce(node, m, 0);
583 break;
584
585 case 5:
586 amd_decode_fr_mce(m);
587 break;
588
589 default:
590 break;
591 }
592
593 amd_decode_err_code(m->status & 0xffff);
594
595 return NOTIFY_STOP;
596 }
597 EXPORT_SYMBOL_GPL(amd_decode_mce);
598
/* Notifier hooked into the x86 MCE decoder chain at init time. */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call = amd_decode_mce,
};
602
603 static int __init mce_amd_init(void)
604 {
605 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
606 return 0;
607
608 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) &&
609 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
610 return 0;
611
612 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
613 if (!fam_ops)
614 return -ENOMEM;
615
616 switch (boot_cpu_data.x86) {
617 case 0xf:
618 fam_ops->dc_mce = k8_dc_mce;
619 fam_ops->ic_mce = k8_ic_mce;
620 fam_ops->nb_mce = k8_nb_mce;
621 break;
622
623 case 0x10:
624 fam_ops->dc_mce = f10h_dc_mce;
625 fam_ops->ic_mce = k8_ic_mce;
626 fam_ops->nb_mce = f10h_nb_mce;
627 break;
628
629 case 0x11:
630 fam_ops->dc_mce = k8_dc_mce;
631 fam_ops->ic_mce = k8_ic_mce;
632 fam_ops->nb_mce = f10h_nb_mce;
633 break;
634
635 case 0x14:
636 nb_err_cpumask = 0x3;
637 fam_ops->dc_mce = f14h_dc_mce;
638 fam_ops->ic_mce = f14h_ic_mce;
639 fam_ops->nb_mce = f14h_nb_mce;
640 break;
641
642 default:
643 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
644 boot_cpu_data.x86);
645 kfree(fam_ops);
646 return -EINVAL;
647 }
648
649 pr_info("MCE: In-kernel MCE decoding enabled.\n");
650
651 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
652
653 return 0;
654 }
655 early_initcall(mce_amd_init);
656
#ifdef MODULE
/* Detach from the MCE decoder chain and release the per-family ops. */
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif