/*
 * drivers/edac/mce_amd.c - decode AMD machine check (MCA) errors
 */
1 #include <linux/module.h>
2 #include <linux/slab.h>
3
4 #include "mce_amd.h"
5
/* Family-specific bank decode callbacks, allocated and filled in mce_amd_init(). */
static struct amd_decoder_ops *fam_ops;

/* Mask used to extract the extended error code from MCi_STATUS; widened per family. */
static u8 xec_mask = 0xf;

/* GART TLB errors are filtered out unless explicitly enabled (see amd_filter_mce()). */
static bool report_gart_errors;
/* Optional callback registered by an EDAC driver for NB DRAM ECC errors. */
static void (*nb_bus_decoder)(int node_id, struct mce *m);
12
/*
 * Enable/disable reporting of GART TLB errors, which are suppressed by
 * default in amd_filter_mce().
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18
/* Register a decoder to be called for NB DRAM ECC errors (see decode_mc4_mce()). */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24
/*
 * Unregister the DRAM ECC decoder. Warns if @f is not the currently
 * registered callback, but clears the registration regardless.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
{
	if (nb_bus_decoder) {
		WARN_ON(nb_bus_decoder != f);

		nb_bus_decoder = NULL;
	}
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34
/*
 * string representation for the different MCA reported error types, see F3x48
 * or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };

/* cache level */
static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
};

/* participating processor; non-static and exported for use by EDAC drivers */
const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
static const char * const to_msgs[] = { "no timeout", "timed out" };

/* memory or i/o */
static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };

/* internal error type */
static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
63
/*
 * Family 15h MC1 (instruction fetch) extended error descriptions.
 * NOT indexed 1:1 by xec: the xec space has gaps, which f15h_mc1_mce()
 * compensates for with offset arithmetic (see the markers below).
 */
static const char * const f15h_mc1_mce_desc[] = {
	"UC during a demand linefill from L2",
	"Parity error during data load from IC",
	"Parity error for IC valid bit",
	"Main tag parity error",
	"Parity error in prediction queue",
	"PFB data/address parity error",
	"Parity error in the branch status reg",
	"PFB promotion address error",
	"Tag error during probe/victimization",
	"Parity error for IC probe tag valid bit",
	"PFB non-cacheable bit parity error",
	"PFB valid bit parity error",			/* xec = 0xd */
	"Microcode Patch Buffer",			/* xec = 010 */
	"uop queue",
	"insn buffer",
	"predecode buffer",
	"fetch address FIFO",
	"dispatch uop queue"
};
84
/*
 * Family 15h MC2 (combined unit) extended error descriptions.
 * Indexed with offsets in f15h_mc2_mce(): xec 0x4..0xc map to [0..8],
 * xec 0x10..0x14 map to [9..13].
 */
static const char * const f15h_mc2_mce_desc[] = {
	"Fill ECC error on data fills",			/* xec = 0x4 */
	"Fill parity error on insn fills",
	"Prefetcher request FIFO parity error",
	"PRQ address parity error",
	"PRQ data parity error",
	"WCC Tag ECC error",
	"WCC Data ECC error",
	"WCB Data parity error",
	"VB Data ECC or parity error",
	"L2 Tag ECC error",				/* xec = 0x10 */
	"Hard L2 Tag ECC error",
	"Multiple hits on L2 tag",
	"XAB parity error",
	"PRB address parity error"
};
101
/*
 * MC4 (northbridge) extended error descriptions.
 * xec 0x0..0xe index directly; xec 0x1c..0x1f are indexed with an offset
 * of 13 in decode_mc4_mce() (0x1c maps to "L3 data cache ECC error").
 */
static const char * const mc4_mce_desc[] = {
	"DRAM ECC error detected on the NB",
	"CRC error detected on HT link",
	"Link-defined sync error packets detected on HT link",
	"HT Master abort",
	"HT Target abort",
	"Invalid GART PTE entry during GART table walk",
	"Unsupported atomic RMW received from an IO link",
	"Watchdog timeout due to lack of progress",
	"DRAM ECC error detected on the NB",
	"SVM DMA Exclusion Vector error",
	"HT data error detected on link",
	"Protocol error (link, L3, probe filter)",
	"NB internal arrays parity error",
	"DRAM addr/ctl signals parity error",
	"IO link transmission error",
	"L3 data cache ECC error",			/* xec = 0x1c */
	"L3 cache tag error",
	"L3 LRU parity bits error",
	"ECC Error in the Probe Filter directory"
};
123
/*
 * MC5 (execution unit) extended error descriptions, indexed by xec.
 * decode_mc5_mce() appends " parity error" for all entries except
 * xec 0x0 and 0xc, which are printed verbatim.
 */
static const char * const mc5_mce_desc[] = {
	"CPU Watchdog timer expire",
	"Wakeup array dest tag",
	"AG payload array",
	"EX payload array",
	"IDRF array",
	"Retire dispatch queue",
	"Mapper checkpoint array",
	"Physical register file EX0 port",
	"Physical register file EX1 port",
	"Physical register file AG0 port",
	"Physical register file AG1 port",
	"Flag register file",
	"DE error occurred",
	"Retire status queue"
};
140
/* MC6 (floating point unit) extended error descriptions, indexed by xec (0x0..0x5). */
static const char * const mc6_mce_desc[] = {
	"Hardware Assertion",
	"Free List",
	"Physical Register File",
	"Retire Queue",
	"Scheduler table",
	"Status Register File",
};
149
/* Scalable MCA error strings */

/*
 * Load-Store (LS) unit extended error code descriptions, indexed by xec.
 * xec 0x4 is explicitly rejected in decode_f17h_core_errors().
 */
static const char * const smca_ls_mce_desc[] = {
	"Load queue parity",
	"Store queue parity",
	"Miss address buffer payload parity",
	"L1 TLB parity",
	"Reserved",
	"DC tag error type 6",
	"DC tag error type 1",
	"Internal error type 1",
	"Internal error type 2",
	"Sys Read data error thread 0",
	"Sys read data error thread 1",
	"DC tag error type 2",
	"DC data error type 1 (poison consumption)",	/* was misspelled "comsumption" */
	"DC data error type 2",
	"DC data error type 3",
	"DC tag error type 4",
	"L2 TLB parity",
	"PDC parity error",
	"DC tag error type 3",
	"DC tag error type 5",
	"L2 fill data error",
};
174
/* Instruction Fetch (IF) unit extended error code descriptions, indexed by xec. */
static const char * const smca_if_mce_desc[] = {
	"microtag probe port parity error",
	"IC microtag or full tag multi-hit error",
	"IC full tag parity",
	"IC data array parity",
	"Decoupling queue phys addr parity error",
	"L0 ITLB parity error",
	"L1 ITLB parity error",
	"L2 ITLB parity error",
	"BPQ snoop parity on Thread 0",
	"BPQ snoop parity on Thread 1",
	"L1 BTB multi-match error",
	"L2 BTB multi-match error",
	"L2 Cache Response Poison error",
	"System Read Data error",
};
191
/* L2 cache unit extended error code descriptions, indexed by xec. */
static const char * const smca_l2_mce_desc[] = {
	"L2M tag multi-way-hit error",
	"L2M tag ECC error",
	"L2M data ECC error",
	"HW assert",
};
198
/* Decode (DE) unit extended error code descriptions, indexed by xec. */
static const char * const smca_de_mce_desc[] = {
	"uop cache tag parity error",
	"uop cache data parity error",
	"Insn buffer parity error",
	"uop queue parity error",
	"Insn dispatch queue parity error",
	"Fetch address FIFO parity",
	"Patch RAM data parity",
	"Patch RAM sequencer parity",
	"uop buffer parity"
};
210
/* Execution (EX) unit extended error code descriptions, indexed by xec. */
static const char * const smca_ex_mce_desc[] = {
	"Watchdog timeout error",
	"Phy register file parity",
	"Flag register file parity",
	"Immediate displacement register file parity",
	"Address generator payload parity",
	"EX payload parity",
	"Checkpoint queue parity",
	"Retire dispatch queue parity",
	"Retire status queue parity error",
	"Scheduling queue parity error",
	"Branch buffer queue parity error",
};
224
/* Floating Point (FP) unit extended error code descriptions, indexed by xec. */
static const char * const smca_fp_mce_desc[] = {
	"Physical register file parity",
	"Freelist parity error",
	"Schedule queue parity",
	"NSQ parity error",
	"Retire queue parity",
	"Status register file parity",
	"Hardware assertion",
};
234
/* L3 cache unit extended error code descriptions, indexed by xec. */
static const char * const smca_l3_mce_desc[] = {
	"Shadow tag macro ECC error",
	"Shadow tag macro multi-way-hit error",
	"L3M tag ECC error",
	"L3M tag multi-way-hit error",
	"L3M data ECC error",
	"XI parity, L3 fill done channel error",
	"L3 victim queue parity",
	"L3 HW assert",
};
245
/* Coherent Slave (CS) extended error code descriptions, indexed by xec. */
static const char * const smca_cs_mce_desc[] = {
	"Illegal request from transport layer",
	"Address violation",
	"Security violation",
	"Illegal response from transport layer",
	"Unexpected response",
	"Parity error on incoming request or probe response data",
	"Parity error on incoming read response data",
	"Atomic request parity",
	"ECC error on probe filter access",
};
257
/* Power/Interrupt/Etc. (PIE) extended error code descriptions, indexed by xec. */
static const char * const smca_pie_mce_desc[] = {
	"HW assert",
	"Internal PIE register security violation",
	"Error on GMI link",
	"Poison data written to internal PIE register",
};
264
/* Unified Memory Controller (UMC) extended error code descriptions, indexed by xec. */
static const char * const smca_umc_mce_desc[] = {
	"DRAM ECC error",
	"Data poison error on DRAM",
	"SDP parity error",
	"Advanced peripheral bus error",
	"Command/address parity error",
	"Write data CRC error",
};
273
/* Parameter Block (PB) extended error code descriptions, indexed by xec. */
static const char * const smca_pb_mce_desc[] = {
	"Parameter Block RAM ECC error",
};

/* Platform Security Processor (PSP) extended error code descriptions, indexed by xec. */
static const char * const smca_psp_mce_desc[] = {
	"PSP RAM ECC or parity error",
};

/* System Management Unit (SMU) extended error code descriptions, indexed by xec. */
static const char * const smca_smu_mce_desc[] = {
	"SMU RAM ECC or parity error",
};
285
286 static bool f12h_mc0_mce(u16 ec, u8 xec)
287 {
288 bool ret = false;
289
290 if (MEM_ERROR(ec)) {
291 u8 ll = LL(ec);
292 ret = true;
293
294 if (ll == LL_L2)
295 pr_cont("during L1 linefill from L2.\n");
296 else if (ll == LL_L1)
297 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
298 else
299 ret = false;
300 }
301 return ret;
302 }
303
304 static bool f10h_mc0_mce(u16 ec, u8 xec)
305 {
306 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
307 pr_cont("during data scrub.\n");
308 return true;
309 }
310 return f12h_mc0_mce(ec, xec);
311 }
312
313 static bool k8_mc0_mce(u16 ec, u8 xec)
314 {
315 if (BUS_ERROR(ec)) {
316 pr_cont("during system linefill.\n");
317 return true;
318 }
319
320 return f10h_mc0_mce(ec, xec);
321 }
322
/*
 * MC0 decoder for the "cat" families (0x14/0x16, see mce_amd_init()).
 * Classifies by error type first (memory vs. bus), then by the memory
 * transaction type r4. Returns true if a message was emitted.
 */
static bool cat_mc0_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);
	bool ret = true;

	if (MEM_ERROR(ec)) {

		/* only data-at-L1 memory error signatures are valid here */
		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
			return false;

		switch (r4) {
		case R4_DRD:
		case R4_DWR:
			pr_cont("Data/Tag parity error due to %s.\n",
				(r4 == R4_DRD ? "load/hw prf" : "store"));
			break;
		case R4_EVICT:
			pr_cont("Copyback parity error on a tag miss.\n");
			break;
		case R4_SNOOP:
			pr_cont("Tag parity error during snoop.\n");
			break;
		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		/* bus errors must target mem or IO at the LG cache level */
		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
			return false;

		pr_cont("System read data error on a ");

		switch (r4) {
		case R4_RD:
			pr_cont("TLB reload.\n");
			break;
		case R4_DWR:
			pr_cont("store.\n");
			break;
		case R4_DRD:
			pr_cont("load.\n");
			break;
		default:
			/* NOTE: partial line already printed at this point */
			ret = false;
		}
	} else {
		ret = false;
	}

	return ret;
}
374
/*
 * Family 15h MC0 decoder. Memory errors are distinguished by xec; bus
 * errors and internal errors have their own signatures. Returns true if
 * a message was emitted.
 */
static bool f15h_mc0_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (MEM_ERROR(ec)) {

		switch (xec) {
		case 0x0:
			pr_cont("Data Array access error.\n");
			break;

		case 0x1:
			pr_cont("UC error during a linefill from L2/NB.\n");
			break;

		case 0x2:
		case 0x11:
			pr_cont("STQ access error.\n");
			break;

		case 0x3:
			pr_cont("SCB access error.\n");
			break;

		case 0x10:
			pr_cont("Tag error.\n");
			break;

		case 0x12:
			pr_cont("LDQ access error.\n");
			break;

		default:
			ret = false;
		}
	} else if (BUS_ERROR(ec)) {

		if (!xec)
			pr_cont("System Read Data Error.\n");
		else
			pr_cont(" Internal error condition type %d.\n", xec);
	} else if (INT_ERROR(ec)) {
		/* xec values above 0x1f are not defined for this bank */
		if (xec <= 0x1f)
			pr_cont("Hardware Assert.\n");
		else
			ret = false;

	} else
		ret = false;

	return ret;
}
427
428 static void decode_mc0_mce(struct mce *m)
429 {
430 u16 ec = EC(m->status);
431 u8 xec = XEC(m->status, xec_mask);
432
433 pr_emerg(HW_ERR "MC0 Error: ");
434
435 /* TLB error signatures are the same across families */
436 if (TLB_ERROR(ec)) {
437 if (TT(ec) == TT_DATA) {
438 pr_cont("%s TLB %s.\n", LL_MSG(ec),
439 ((xec == 2) ? "locked miss"
440 : (xec ? "multimatch" : "parity")));
441 return;
442 }
443 } else if (fam_ops->mc0_mce(ec, xec))
444 ;
445 else
446 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
447 }
448
/*
 * K8 MC1 (instruction cache) decoder. Only memory errors are recognized;
 * L2-level errors are linefills, L1-level errors are classified by the
 * memory transaction type. Returns true if a message was emitted.
 */
static bool k8_mc1_mce(u16 ec, u8 xec)
{
	u8 ll = LL(ec);
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	if (ll == 0x2)
		pr_cont("during a linefill from L2.\n");
	else if (ll == 0x1) {
		switch (R4(ec)) {
		case R4_IRD:
			pr_cont("Parity error during data load.\n");
			break;

		case R4_EVICT:
			pr_cont("Copyback Parity/Victim error.\n");
			break;

		case R4_SNOOP:
			pr_cont("Tag Snoop error.\n");
			break;

		default:
			ret = false;
			break;
		}
	} else
		ret = false;

	return ret;
}
482
483 static bool cat_mc1_mce(u16 ec, u8 xec)
484 {
485 u8 r4 = R4(ec);
486 bool ret = true;
487
488 if (!MEM_ERROR(ec))
489 return false;
490
491 if (TT(ec) != TT_INSTR)
492 return false;
493
494 if (r4 == R4_IRD)
495 pr_cont("Data/tag array parity error for a tag hit.\n");
496 else if (r4 == R4_SNOOP)
497 pr_cont("Tag error during snoop/victimization.\n");
498 else if (xec == 0x0)
499 pr_cont("Tag parity error from victim castout.\n");
500 else if (xec == 0x2)
501 pr_cont("Microcode patch RAM parity error.\n");
502 else
503 ret = false;
504
505 return ret;
506 }
507
/*
 * Family 15h MC1 decoder. The xec space has gaps, so indices into
 * f15h_mc1_mce_desc[] are adjusted: 0xd maps to entry 0xb, and
 * 0x10..0x15 map to entries 0xc..0x11. Returns true if a message was
 * emitted.
 */
static bool f15h_mc1_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x0 ... 0xa:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
		break;

	case 0xd:
		/* skips the two undefined codes 0xb/0xc */
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
		break;

	case 0x10:
		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	case 0x11 ... 0x15:
		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
		break;

	default:
		ret = false;
	}
	return ret;
}
537
/*
 * Decode an MC1 (instruction cache) bank error. TLB, bus and internal
 * error signatures are family-independent; everything else goes through
 * the per-family mc1_mce() callback.
 */
static void decode_mc1_mce(struct mce *m)
{
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	pr_emerg(HW_ERR "MC1 Error: ");

	if (TLB_ERROR(ec))
		pr_cont("%s TLB %s.\n", LL_MSG(ec),
			(xec ? "multimatch" : "parity error"));
	else if (BUS_ERROR(ec)) {
		/*
		 * NOTE(review): on K8, status bit 58 appears to select the
		 * "system linefill" wording -- confirm against the BKDG.
		 */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
	} else if (INT_ERROR(ec)) {
		if (xec <= 0x3f)
			pr_cont("Hardware Assert.\n");
		else
			goto wrong_mc1_mce;
	} else if (fam_ops->mc1_mce(ec, xec))
		;
	else
		goto wrong_mc1_mce;

	return;

wrong_mc1_mce:
	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
}
567
/*
 * K8 MC2 (L2/bus unit) decoder. xec selects the buffer involved; only
 * xec == 0 is further classified by error type. Returns true if a
 * message was emitted.
 */
static bool k8_mc2_mce(u16 ec, u8 xec)
{
	bool ret = true;

	if (xec == 0x1)
		pr_cont(" in the write data buffers.\n");
	else if (xec == 0x3)
		pr_cont(" in the victim data buffers.\n");
	else if (xec == 0x2 && MEM_ERROR(ec))
		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
	else if (xec == 0x0) {
		if (TLB_ERROR(ec))
			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
				TT_MSG(ec));
		else if (BUS_ERROR(ec))
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				R4_MSG(ec), PP_MSG(ec));
		else if (MEM_ERROR(ec)) {
			u8 r4 = R4(ec);

			/* r4 >= 0x7 is copyback (EV/SNP); r4 <= 0x1 is GEN/RD */
			if (r4 >= 0x7)
				pr_cont(": %s error during data copyback.\n",
					R4_MSG(ec));
			else if (r4 <= 0x1)
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", R4_MSG(ec));
			else
				ret = false;
		} else
			ret = false;
	} else
		ret = false;

	return ret;
}
603
604 static bool f15h_mc2_mce(u16 ec, u8 xec)
605 {
606 bool ret = true;
607
608 if (TLB_ERROR(ec)) {
609 if (xec == 0x0)
610 pr_cont("Data parity TLB read error.\n");
611 else if (xec == 0x1)
612 pr_cont("Poison data provided for TLB fill.\n");
613 else
614 ret = false;
615 } else if (BUS_ERROR(ec)) {
616 if (xec > 2)
617 ret = false;
618
619 pr_cont("Error during attempted NB data read.\n");
620 } else if (MEM_ERROR(ec)) {
621 switch (xec) {
622 case 0x4 ... 0xc:
623 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
624 break;
625
626 case 0x10 ... 0x14:
627 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
628 break;
629
630 default:
631 ret = false;
632 }
633 } else if (INT_ERROR(ec)) {
634 if (xec <= 0x3f)
635 pr_cont("Hardware Assert.\n");
636 else
637 ret = false;
638 }
639
640 return ret;
641 }
642
/*
 * Family 16h MC2 (L2 cache) decoder. Only memory errors are recognized;
 * xec selects the structure and r4 refines the message. Returns true if
 * a message was emitted.
 */
static bool f16h_mc2_mce(u16 ec, u8 xec)
{
	u8 r4 = R4(ec);

	if (!MEM_ERROR(ec))
		return false;

	switch (xec) {
	case 0x04 ... 0x05:
		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
		break;

	case 0x09 ... 0x0b:
	case 0x0d ... 0x0f:
		pr_cont("ECC error in L2 tag (%s).\n",
			((r4 == R4_GEN)   ? "BankReq" :
			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
		break;

	case 0x10 ... 0x19:
	case 0x1b:
		pr_cont("ECC error in L2 data array (%s).\n",
			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
			((r4 == R4_GEN)   ? "Attr" :
			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
		break;

	case 0x1c ... 0x1d:
	case 0x1f:
		pr_cont("Parity error in L2 attribute bits (%s).\n",
			((r4 == R4_RD)  ? "Hit"  :
			((r4 == R4_GEN) ? "Attr" : "Fill")));
		break;

	default:
		return false;
	}

	return true;
}
683
684 static void decode_mc2_mce(struct mce *m)
685 {
686 u16 ec = EC(m->status);
687 u8 xec = XEC(m->status, xec_mask);
688
689 pr_emerg(HW_ERR "MC2 Error: ");
690
691 if (!fam_ops->mc2_mce(ec, xec))
692 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
693 }
694
695 static void decode_mc3_mce(struct mce *m)
696 {
697 u16 ec = EC(m->status);
698 u8 xec = XEC(m->status, xec_mask);
699
700 if (boot_cpu_data.x86 >= 0x14) {
701 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
702 " please report on LKML.\n");
703 return;
704 }
705
706 pr_emerg(HW_ERR "MC3 Error");
707
708 if (xec == 0x0) {
709 u8 r4 = R4(ec);
710
711 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
712 goto wrong_mc3_mce;
713
714 pr_cont(" during %s.\n", R4_MSG(ec));
715 } else
716 goto wrong_mc3_mce;
717
718 return;
719
720 wrong_mc3_mce:
721 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
722 }
723
/*
 * Decode an MC4 (northbridge) bank error. DRAM ECC errors (xec 0x0/0x8)
 * are additionally forwarded to the registered EDAC decoder. Indices
 * 0x1c..0x1f into mc4_mce_desc[] are reached with an offset of 13.
 */
static void decode_mc4_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	int node_id = amd_get_nb_id(m->extcpu);
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, 0x1f);
	u8 offset = 0;

	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);

	switch (xec) {
	case 0x0 ... 0xe:

		/* special handling for DRAM ECCs */
		if (xec == 0x0 || xec == 0x8) {
			/* no ECCs on F11h */
			if (c->x86 == 0x11)
				goto wrong_mc4_mce;

			pr_cont("%s.\n", mc4_mce_desc[xec]);

			if (nb_bus_decoder)
				nb_bus_decoder(node_id, m);
			return;
		}
		break;

	case 0xf:
		if (TLB_ERROR(ec))
			pr_cont("GART Table Walk data error.\n");
		else if (BUS_ERROR(ec))
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x19:
		/* compute units exist only on families 15h and 16h */
		if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
			pr_cont("Compute Unit Data Error.\n");
		else
			goto wrong_mc4_mce;
		return;

	case 0x1c ... 0x1f:
		/* maps 0x1c..0x1f onto desc entries 15..18 (L3 errors) */
		offset = 13;
		break;

	default:
		goto wrong_mc4_mce;
	}

	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
	return;

wrong_mc4_mce:
	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
}
781
/*
 * Decode an MC5 (execution unit) bank error. The bank does not exist on
 * families 0fh/11h. xec 0x0 and 0xc are printed verbatim; other codes up
 * to 0xd get " parity error" appended.
 */
static void decode_mc5_mce(struct mce *m)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
	u16 ec = EC(m->status);
	u8 xec = XEC(m->status, xec_mask);

	if (c->x86 == 0xf || c->x86 == 0x11)
		goto wrong_mc5_mce;

	pr_emerg(HW_ERR "MC5 Error: ");

	if (INT_ERROR(ec)) {
		if (xec <= 0x1f) {
			pr_cont("Hardware Assert.\n");
			return;
		} else
			goto wrong_mc5_mce;
	}

	if (xec == 0x0 || xec == 0xc)
		pr_cont("%s.\n", mc5_mce_desc[xec]);
	else if (xec <= 0xd)
		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
	else
		goto wrong_mc5_mce;

	return;

wrong_mc5_mce:
	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
}
813
814 static void decode_mc6_mce(struct mce *m)
815 {
816 u8 xec = XEC(m->status, xec_mask);
817
818 pr_emerg(HW_ERR "MC6 Error: ");
819
820 if (xec > 0x5)
821 goto wrong_mc6_mce;
822
823 pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
824 return;
825
826 wrong_mc6_mce:
827 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
828 }
829
/*
 * Decode SMCA errors reported by an F17h core bank. @mca_type selects
 * the description table; @xec indexes into it. LS xec 0x4 is a known
 * invalid code and rejected explicitly.
 */
static void decode_f17h_core_errors(const char *ip_name, u8 xec,
				    unsigned int mca_type)
{
	const char * const *error_desc_array;
	size_t len;

	pr_emerg(HW_ERR "%s Error: ", ip_name);

	switch (mca_type) {
	case SMCA_LS:
		error_desc_array = smca_ls_mce_desc;
		len = ARRAY_SIZE(smca_ls_mce_desc) - 1;

		if (xec == 0x4) {
			pr_cont("Unrecognized LS MCA error code.\n");
			return;
		}
		break;

	case SMCA_IF:
		error_desc_array = smca_if_mce_desc;
		len = ARRAY_SIZE(smca_if_mce_desc) - 1;
		break;

	case SMCA_L2_CACHE:
		error_desc_array = smca_l2_mce_desc;
		len = ARRAY_SIZE(smca_l2_mce_desc) - 1;
		break;

	case SMCA_DE:
		error_desc_array = smca_de_mce_desc;
		len = ARRAY_SIZE(smca_de_mce_desc) - 1;
		break;

	case SMCA_EX:
		error_desc_array = smca_ex_mce_desc;
		len = ARRAY_SIZE(smca_ex_mce_desc) - 1;
		break;

	case SMCA_FP:
		error_desc_array = smca_fp_mce_desc;
		len = ARRAY_SIZE(smca_fp_mce_desc) - 1;
		break;

	case SMCA_L3_CACHE:
		error_desc_array = smca_l3_mce_desc;
		len = ARRAY_SIZE(smca_l3_mce_desc) - 1;
		break;

	default:
		pr_cont("Corrupted MCA core error info.\n");
		return;
	}

	/* len is the last valid index, hence the strict > comparison */
	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			 amd_core_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}
892
/*
 * Decode SMCA errors reported by a Data Fabric bank (Coherent Slave or
 * PIE), indexing the matching description table by @xec.
 */
static void decode_df_errors(u8 xec, unsigned int mca_type)
{
	const char * const *error_desc_array;
	size_t len;

	pr_emerg(HW_ERR "Data Fabric Error: ");

	switch (mca_type) {
	case  SMCA_CS:
		error_desc_array = smca_cs_mce_desc;
		len = ARRAY_SIZE(smca_cs_mce_desc) - 1;
		break;

	case SMCA_PIE:
		error_desc_array = smca_pie_mce_desc;
		len = ARRAY_SIZE(smca_pie_mce_desc) - 1;
		break;

	default:
		pr_cont("Corrupted MCA Data Fabric info.\n");
		return;
	}

	/* len is the last valid index, hence the strict > comparison */
	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n",
			 amd_df_mcablock_names[mca_type]);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}
924
/* Decode errors according to Scalable MCA specification */
static void decode_smca_errors(struct mce *m)
{
	u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
	unsigned int hwid, mca_type, i;
	u8 xec = XEC(m->status, xec_mask);
	const char * const *error_desc_array;
	const char *ip_name;
	u32 low, high;
	size_t len;

	/* the IPID MSR identifies which IP block reported the error */
	if (rdmsr_safe(addr, &low, &high)) {
		pr_emerg(HW_ERR "Invalid IP block specified.\n");
		return;
	}

	hwid = high & MCI_IPID_HWID;
	mca_type = (high & MCI_IPID_MCATYPE) >> 16;

	pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);

	/*
	 * Based on hwid and mca_type values, decode errors from respective IPs.
	 * Note: mca_type values make sense only in the context of an hwid.
	 */
	for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
		if (amd_hwids[i].hwid == hwid)
			break;

	/* i doubles as the enum index when a match was found */
	switch (i) {
	case SMCA_F17H_CORE:
		ip_name = (mca_type == SMCA_L3_CACHE) ?
			  "L3 Cache" : "F17h Core";
		return decode_f17h_core_errors(ip_name, xec, mca_type);
		break;

	case SMCA_DF:
		return decode_df_errors(xec, mca_type);
		break;

	case SMCA_UMC:
		error_desc_array = smca_umc_mce_desc;
		len = ARRAY_SIZE(smca_umc_mce_desc) - 1;
		break;

	case SMCA_PB:
		error_desc_array = smca_pb_mce_desc;
		len = ARRAY_SIZE(smca_pb_mce_desc) - 1;
		break;

	case SMCA_PSP:
		error_desc_array = smca_psp_mce_desc;
		len = ARRAY_SIZE(smca_psp_mce_desc) - 1;
		break;

	case SMCA_SMU:
		error_desc_array = smca_smu_mce_desc;
		len = ARRAY_SIZE(smca_smu_mce_desc) - 1;
		break;

	default:
		pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
		return;
	}

	ip_name = amd_hwids[i].name;
	pr_emerg(HW_ERR "%s Error: ", ip_name);

	/* len is the last valid index, hence the strict > comparison */
	if (xec > len) {
		pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
		return;
	}

	pr_cont("%s.\n", error_desc_array[xec]);
}
1000
/*
 * Print the generic (family-independent) breakdown of the 16-bit MCA
 * error code: cache level, transaction type / mem-io, memory transaction
 * and participation/timeout info, depending on the error class.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec))
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
		pr_cont(", mem-tx: %s", R4_MSG(ec));

		if (BUS_ERROR(ec))
			pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	}

	pr_cont("\n");
}
1024
1025 /*
1026 * Filter out unwanted MCE signatures here.
1027 */
1028 static bool amd_filter_mce(struct mce *m)
1029 {
1030 u8 xec = (m->status >> 16) & 0x1f;
1031
1032 /*
1033 * NB GART TLB error reporting is disabled by default.
1034 */
1035 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
1036 return true;
1037
1038 return false;
1039 }
1040
1041 static const char *decode_error_status(struct mce *m)
1042 {
1043 if (m->status & MCI_STATUS_UC) {
1044 if (m->status & MCI_STATUS_PCC)
1045 return "System Fatal error.";
1046 if (m->mcgstatus & MCG_STATUS_RIPV)
1047 return "Uncorrected, software restartable error.";
1048 return "Uncorrected, software containable error.";
1049 }
1050
1051 if (m->status & MCI_STATUS_DEFERRED)
1052 return "Deferred error.";
1053
1054 return "Corrected error, no action required.";
1055 }
1056
/*
 * Notifier callback on the MCE decode chain: print a full human-readable
 * decode of one MCE record. Prints the severity summary, the status-bit
 * flags, the address/syndrome if valid, then dispatches to the SMCA or
 * per-bank legacy decoder, and finally the generic error-code breakdown.
 * Always returns NOTIFY_STOP.
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
	int ecc;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		c->x86, c->x86_model, c->x86_mask,
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"));

	/* Deferred/Poison bits exist from family 15h onwards */
	if (c->x86 >= 0x15)
		pr_cont("|%s|%s",
			((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
			((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));

		/* TCC is only meaningful when the bank runs in MCAX mode */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_errors(m);
		goto err_code;
	} else
		pr_cont("\n");

	/* fam_ops is NULL on unsupported families (see mce_amd_init()) */
	if (!fam_ops)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);

/* Registered on the MCE decode chain in mce_amd_init(). */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call	= amd_decode_mce,
};
1162
/*
 * Select the per-family decoder callbacks and the extended-error-code
 * mask, then hook into the MCE decode chain. Bails out (-EINVAL) on
 * unknown families and on family 17h without SMCA support.
 */
static int __init mce_amd_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;

	if (c->x86_vendor != X86_VENDOR_AMD)
		return -ENODEV;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (c->x86) {
	case 0xf:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x10:
		fam_ops->mc0_mce = f10h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x11:
		fam_ops->mc0_mce = k8_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x12:
		fam_ops->mc0_mce = f12h_mc0_mce;
		fam_ops->mc1_mce = k8_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x14:
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = k8_mc2_mce;
		break;

	case 0x15:
		/* model 0x60 uses a wider extended error code field */
		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;

		fam_ops->mc0_mce = f15h_mc0_mce;
		fam_ops->mc1_mce = f15h_mc1_mce;
		fam_ops->mc2_mce = f15h_mc2_mce;
		break;

	case 0x16:
		xec_mask = 0x1f;
		fam_ops->mc0_mce = cat_mc0_mce;
		fam_ops->mc1_mce = cat_mc1_mce;
		fam_ops->mc2_mce = f16h_mc2_mce;
		break;

	case 0x17:
		xec_mask = 0x3f;
		/* family 17h is decoded via the SMCA path only */
		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
			goto err_out;
		}
		break;

	default:
		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
		goto err_out;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	mce_register_decode_chain(&amd_mce_dec_nb);

	return 0;

err_out:
	kfree(fam_ops);
	fam_ops = NULL;
	return -EINVAL;
}
early_initcall(mce_amd_init);
1245
#ifdef MODULE
/* Undo mce_amd_init(): unhook from the decode chain and free fam_ops. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif