]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blame - drivers/edac/mce_amd.c
EDAC, MCE: Add support for F11h MCEs
[mirror_ubuntu-zesty-kernel.git] / drivers / edac / mce_amd.c
CommitLineData
b70ef010 1#include <linux/module.h>
888ab8e6
BP
2#include <linux/slab.h>
3
47ca08a4 4#include "mce_amd.h"
b52401ce 5
888ab8e6
BP
6static struct amd_decoder_ops *fam_ops;
7
5ce88f6e
BP
8static u8 nb_err_cpumask = 0xf;
9
549d042d 10static bool report_gart_errors;
7cfd4a87 11static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
549d042d
BP
12
13void amd_report_gart_errors(bool v)
14{
15 report_gart_errors = v;
16}
17EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18
7cfd4a87 19void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
549d042d
BP
20{
21 nb_bus_decoder = f;
22}
23EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24
7cfd4a87 25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
549d042d
BP
26{
27 if (nb_bus_decoder) {
28 WARN_ON(nb_bus_decoder != f);
29
30 nb_bus_decoder = NULL;
31 }
32}
33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34
b52401ce
DT
/*
 * String representations of the different MCA-reported error types; see F3x48
 * or MSR0000_0411.
 */
6337583d
BP
39
/* transaction type */
const char *tt_msgs[] = {
	"INSN",
	"DATA",
	"GEN",
	"RESV"
};
EXPORT_SYMBOL_GPL(tt_msgs);

/* cache level */
const char *ll_msgs[] = {
	"RESV",
	"L1",
	"L2",
	"L3/GEN"
};
EXPORT_SYMBOL_GPL(ll_msgs);

/* memory transaction type */
const char *rrrr_msgs[] = {
	"GEN",
	"RD",
	"WR",
	"DRD",
	"DWR",
	"IRD",
	"PRF",
	"EV",
	"SNP"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);

/* participating processor */
const char *pp_msgs[] = {
	"SRC",
	"RES",
	"OBS",
	"GEN"
};
EXPORT_SYMBOL_GPL(pp_msgs);

/* request timeout */
const char *to_msgs[] = {
	"no timeout",
	"timed out"
};
EXPORT_SYMBOL_GPL(to_msgs);

/* memory or i/o */
const char *ii_msgs[] = {
	"MEM",
	"RESV",
	"IO",
	"GEN"
};
EXPORT_SYMBOL_GPL(ii_msgs);
b52401ce 65
5ce88f6e
BP
/*
 * Descriptions printed by f10h_nb_mce(). The table is indexed by the extended
 * error code minus a per-range offset chosen in that function; the resulting
 * mapping is noted next to each entry.
 */
static const char *f10h_nb_mce_desc[] = {
	/* xec 0x0a */ "HT link data error",
	/* xec 0x0b */ "Protocol error (link, L3, probe filter, etc.)",
	/* xec 0x0c */ "Parity error in NB-internal arrays",
	/* xec 0x0e */ "Link Retry due to IO link transmission error",
	/* xec 0x1c */ "L3 ECC data cache error",
	/* xec 0x1d */ "ECC error in L3 cache tag",
	/* xec 0x1e */ "L3 LRU parity bits error",
	/* xec 0x1f */ "ECC Error in the Probe Filter directory"
};
549d042d 76
888ab8e6 77static bool f10h_dc_mce(u16 ec)
51966241 78{
888ab8e6
BP
79 u8 r4 = (ec >> 4) & 0xf;
80 bool ret = false;
51966241 81
888ab8e6
BP
82 if (r4 == R4_GEN) {
83 pr_cont("during data scrub.\n");
84 return true;
85 }
51966241 86
888ab8e6
BP
87 if (MEM_ERROR(ec)) {
88 u8 ll = ec & 0x3;
89 ret = true;
51966241 90
888ab8e6
BP
91 if (ll == LL_L2)
92 pr_cont("during L1 linefill from L2.\n");
93 else if (ll == LL_L1)
94 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
95 else
96 ret = false;
97 }
98 return ret;
99}
51966241 100
888ab8e6
BP
101static bool k8_dc_mce(u16 ec)
102{
103 if (BUS_ERROR(ec)) {
104 pr_cont("during system linefill.\n");
105 return true;
106 }
51966241 107
888ab8e6
BP
108 return f10h_dc_mce(ec);
109}
110
111static bool f14h_dc_mce(u16 ec)
112{
113 u8 r4 = (ec >> 4) & 0xf;
114 u8 ll = ec & 0x3;
115 u8 tt = (ec >> 2) & 0x3;
116 u8 ii = tt;
117 bool ret = true;
118
119 if (MEM_ERROR(ec)) {
120
121 if (tt != TT_DATA || ll != LL_L1)
122 return false;
123
124 switch (r4) {
125 case R4_DRD:
126 case R4_DWR:
127 pr_cont("Data/Tag parity error due to %s.\n",
128 (r4 == R4_DRD ? "load/hw prf" : "store"));
129 break;
130 case R4_EVICT:
131 pr_cont("Copyback parity error on a tag miss.\n");
132 break;
133 case R4_SNOOP:
134 pr_cont("Tag parity error during snoop.\n");
135 break;
136 default:
137 ret = false;
138 }
139 } else if (BUS_ERROR(ec)) {
140
141 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
142 return false;
143
144 pr_cont("System read data error on a ");
145
146 switch (r4) {
147 case R4_RD:
148 pr_cont("TLB reload.\n");
149 break;
150 case R4_DWR:
151 pr_cont("store.\n");
152 break;
153 case R4_DRD:
154 pr_cont("load.\n");
155 break;
156 default:
157 ret = false;
158 }
159 } else {
160 ret = false;
161 }
162
163 return ret;
164}
165
166static void amd_decode_dc_mce(struct mce *m)
167{
168 u16 ec = m->status & 0xffff;
169 u8 xec = (m->status >> 16) & 0xf;
170
171 pr_emerg(HW_ERR "Data Cache Error: ");
172
173 /* TLB error signatures are the same across families */
174 if (TLB_ERROR(ec)) {
175 u8 tt = (ec >> 2) & 0x3;
176
177 if (tt == TT_DATA) {
178 pr_cont("%s TLB %s.\n", LL_MSG(ec),
179 (xec ? "multimatch" : "parity error"));
180 return;
181 }
51966241
BP
182 else
183 goto wrong_dc_mce;
888ab8e6
BP
184 }
185
186 if (!fam_ops->dc_mce(ec))
51966241
BP
187 goto wrong_dc_mce;
188
189 return;
190
191wrong_dc_mce:
c9f281fd 192 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
51966241
BP
193}
194
dd53bce4 195static bool k8_ic_mce(u16 ec)
ab5535e7 196{
dd53bce4
BP
197 u8 ll = ec & 0x3;
198 u8 r4 = (ec >> 4) & 0xf;
199 bool ret = true;
ab5535e7 200
dd53bce4
BP
201 if (!MEM_ERROR(ec))
202 return false;
ab5535e7 203
dd53bce4
BP
204 if (ll == 0x2)
205 pr_cont("during a linefill from L2.\n");
206 else if (ll == 0x1) {
207 switch (r4) {
208 case R4_IRD:
209 pr_cont("Parity error during data load.\n");
210 break;
ab5535e7 211
dd53bce4
BP
212 case R4_EVICT:
213 pr_cont("Copyback Parity/Victim error.\n");
214 break;
215
216 case R4_SNOOP:
217 pr_cont("Tag Snoop error.\n");
218 break;
219
220 default:
221 ret = false;
222 break;
223 }
ab5535e7 224 } else
dd53bce4 225 ret = false;
ab5535e7 226
dd53bce4
BP
227 return ret;
228}
229
230static bool f14h_ic_mce(u16 ec)
231{
232 u8 ll = ec & 0x3;
233 u8 tt = (ec >> 2) & 0x3;
234 u8 r4 = (ec >> 4) & 0xf;
235 bool ret = true;
ab5535e7 236
dd53bce4
BP
237 if (MEM_ERROR(ec)) {
238 if (tt != 0 || ll != 1)
239 ret = false;
240
241 if (r4 == R4_IRD)
242 pr_cont("Data/tag array parity error for a tag hit.\n");
243 else if (r4 == R4_SNOOP)
244 pr_cont("Tag error during snoop/victimization.\n");
245 else
246 ret = false;
247 }
248 return ret;
249}
250
251static void amd_decode_ic_mce(struct mce *m)
252{
253 u16 ec = m->status & 0xffff;
254 u8 xec = (m->status >> 16) & 0xf;
255
256 pr_emerg(HW_ERR "Instruction Cache Error: ");
257
258 if (TLB_ERROR(ec))
259 pr_cont("%s TLB %s.\n", LL_MSG(ec),
260 (xec ? "multimatch" : "parity error"));
261 else if (BUS_ERROR(ec)) {
262 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58)));
263
264 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
265 } else if (fam_ops->ic_mce(ec))
266 ;
267 else
268 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
ab5535e7
BP
269}
270
7cfd4a87 271static void amd_decode_bu_mce(struct mce *m)
56cad2d6 272{
7cfd4a87
BP
273 u32 ec = m->status & 0xffff;
274 u32 xec = (m->status >> 16) & 0xf;
56cad2d6 275
c9f281fd 276 pr_emerg(HW_ERR "Bus Unit Error");
56cad2d6
BP
277
278 if (xec == 0x1)
279 pr_cont(" in the write data buffers.\n");
280 else if (xec == 0x3)
281 pr_cont(" in the victim data buffers.\n");
282 else if (xec == 0x2 && MEM_ERROR(ec))
283 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
284 else if (xec == 0x0) {
285 if (TLB_ERROR(ec))
286 pr_cont(": %s error in a Page Descriptor Cache or "
287 "Guest TLB.\n", TT_MSG(ec));
288 else if (BUS_ERROR(ec))
289 pr_cont(": %s/ECC error in data read from NB: %s.\n",
290 RRRR_MSG(ec), PP_MSG(ec));
291 else if (MEM_ERROR(ec)) {
292 u8 rrrr = (ec >> 4) & 0xf;
293
294 if (rrrr >= 0x7)
295 pr_cont(": %s error during data copyback.\n",
296 RRRR_MSG(ec));
297 else if (rrrr <= 0x1)
298 pr_cont(": %s parity/ECC error during data "
299 "access from L2.\n", RRRR_MSG(ec));
300 else
301 goto wrong_bu_mce;
302 } else
303 goto wrong_bu_mce;
304 } else
305 goto wrong_bu_mce;
306
307 return;
308
309wrong_bu_mce:
c9f281fd 310 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
56cad2d6
BP
311}
312
7cfd4a87 313static void amd_decode_ls_mce(struct mce *m)
f9350efd 314{
ded50623
BP
315 u16 ec = m->status & 0xffff;
316 u8 xec = (m->status >> 16) & 0xf;
317
318 if (boot_cpu_data.x86 == 0x14) {
319 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
320 " please report on LKML.\n");
321 return;
322 }
f9350efd 323
c9f281fd 324 pr_emerg(HW_ERR "Load Store Error");
f9350efd
BP
325
326 if (xec == 0x0) {
ded50623 327 u8 r4 = (ec >> 4) & 0xf;
f9350efd 328
ded50623 329 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
f9350efd
BP
330 goto wrong_ls_mce;
331
332 pr_cont(" during %s.\n", RRRR_MSG(ec));
ded50623
BP
333 } else
334 goto wrong_ls_mce;
335
f9350efd
BP
336 return;
337
338wrong_ls_mce:
c9f281fd 339 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
f9350efd
BP
340}
341
5ce88f6e
BP
342static bool k8_nb_mce(u16 ec, u8 xec)
343{
344 bool ret = true;
345
346 switch (xec) {
347 case 0x1:
348 pr_cont("CRC error detected on HT link.\n");
349 break;
350
351 case 0x5:
352 pr_cont("Invalid GART PTE entry during GART table walk.\n");
353 break;
354
355 case 0x6:
356 pr_cont("Unsupported atomic RMW received from an IO link.\n");
357 break;
358
359 case 0x0:
360 case 0x8:
f0157b3a
BP
361 if (boot_cpu_data.x86 == 0x11)
362 return false;
363
5ce88f6e
BP
364 pr_cont("DRAM ECC error detected on the NB.\n");
365 break;
366
367 case 0xd:
368 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
369 break;
370
371 default:
372 ret = false;
373 break;
374 }
375
376 return ret;
377}
378
379static bool f10h_nb_mce(u16 ec, u8 xec)
380{
381 bool ret = true;
382 u8 offset = 0;
383
384 if (k8_nb_mce(ec, xec))
385 return true;
386
387 switch(xec) {
388 case 0xa ... 0xc:
389 offset = 10;
390 break;
391
392 case 0xe:
393 offset = 11;
394 break;
395
396 case 0xf:
397 if (TLB_ERROR(ec))
398 pr_cont("GART Table Walk data error.\n");
399 else if (BUS_ERROR(ec))
400 pr_cont("DMA Exclusion Vector Table Walk error.\n");
401 else
402 ret = false;
403
404 goto out;
405 break;
406
407 case 0x1c ... 0x1f:
408 offset = 24;
409 break;
410
411 default:
412 ret = false;
413
414 goto out;
415 break;
416 }
417
418 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
419
420out:
421 return ret;
422}
423
424static bool f14h_nb_mce(u16 ec, u8 xec)
425{
426 return false;
427}
428
7cfd4a87 429void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
549d042d 430{
5ce88f6e
BP
431 u8 xec = (m->status >> 16) & 0x1f;
432 u16 ec = m->status & 0xffff;
7cfd4a87 433 u32 nbsh = (u32)(m->status >> 32);
256f7276 434
5ce88f6e 435 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
549d042d
BP
436
437 /*
438 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
439 * value encoding has changed so interpret those differently
440 */
441 if ((boot_cpu_data.x86 == 0x10) &&
cec7924f 442 (boot_cpu_data.x86_model > 7)) {
7cfd4a87 443 if (nbsh & K8_NBSH_ERR_CPU_VAL)
5ce88f6e 444 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
549d042d 445 } else {
5ce88f6e 446 u8 assoc_cpus = nbsh & nb_err_cpumask;
5b89d2f9
BP
447
448 if (assoc_cpus > 0)
449 pr_cont(", core: %d", fls(assoc_cpus) - 1);
5ce88f6e 450 }
5b89d2f9 451
5ce88f6e
BP
452 switch (xec) {
453 case 0x2:
454 pr_cont("Sync error (sync packets on HT link detected).\n");
455 return;
456
457 case 0x3:
458 pr_cont("HT Master abort.\n");
459 return;
460
461 case 0x4:
462 pr_cont("HT Target abort.\n");
463 return;
464
465 case 0x7:
466 pr_cont("NB Watchdog timeout.\n");
467 return;
468
469 case 0x9:
470 pr_cont("SVM DMA Exclusion Vector error.\n");
471 return;
472
473 default:
474 break;
549d042d
BP
475 }
476
5ce88f6e
BP
477 if (!fam_ops->nb_mce(ec, xec))
478 goto wrong_nb_mce;
479
480 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
481 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
482 nb_bus_decoder(node_id, m, nbcfg);
d93cc222 483
5ce88f6e
BP
484 return;
485
486wrong_nb_mce:
487 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
d93cc222
BP
488}
489EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
490
7cfd4a87 491static void amd_decode_fr_mce(struct mce *m)
53bd5fed 492{
f0157b3a
BP
493 if (boot_cpu_data.x86 == 0xf ||
494 boot_cpu_data.x86 == 0x11)
fe4ea262
BP
495 goto wrong_fr_mce;
496
53bd5fed 497 /* we have only one error signature so match all fields at once. */
fe4ea262
BP
498 if ((m->status & 0xffff) == 0x0f0f) {
499 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
500 return;
501 }
502
503wrong_fr_mce:
504 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
53bd5fed
BP
505}
506
6337583d 507static inline void amd_decode_err_code(u16 ec)
d93cc222 508{
549d042d 509 if (TLB_ERROR(ec)) {
c9f281fd 510 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
549d042d
BP
511 TT_MSG(ec), LL_MSG(ec));
512 } else if (MEM_ERROR(ec)) {
6337583d 513 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
549d042d
BP
514 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
515 } else if (BUS_ERROR(ec)) {
6337583d 516 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
d93cc222
BP
517 "Participating Processor: %s\n",
518 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
519 PP_MSG(ec));
520 } else
c9f281fd 521 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
549d042d 522}
549d042d 523
5ce88f6e
BP
524/*
525 * Filter out unwanted MCE signatures here.
526 */
527static bool amd_filter_mce(struct mce *m)
528{
529 u8 xec = (m->status >> 16) & 0x1f;
530
531 /*
532 * NB GART TLB error reporting is disabled by default.
533 */
534 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
535 return true;
536
537 return false;
538}
539
9cdeb404 540int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
549d042d 541{
fb253195 542 struct mce *m = (struct mce *)data;
b69b29de 543 int node, ecc;
549d042d 544
5ce88f6e
BP
545 if (amd_filter_mce(m))
546 return NOTIFY_STOP;
547
c9f281fd 548 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
549d042d 549
37b7370a 550 pr_cont("%sorrected error, other errors lost: %s, "
b69b29de
BP
551 "CPU context corrupt: %s",
552 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
37b7370a 553 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
b69b29de 554 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
549d042d 555
b69b29de 556 /* do the two bits[14:13] together */
35d824b2 557 ecc = (m->status >> 45) & 0x3;
b69b29de
BP
558 if (ecc)
559 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
560
561 pr_cont("\n");
562
51966241
BP
563 switch (m->bank) {
564 case 0:
7cfd4a87 565 amd_decode_dc_mce(m);
51966241 566 break;
d93cc222 567
ab5535e7 568 case 1:
7cfd4a87 569 amd_decode_ic_mce(m);
ab5535e7
BP
570 break;
571
56cad2d6 572 case 2:
7cfd4a87 573 amd_decode_bu_mce(m);
56cad2d6
BP
574 break;
575
f9350efd 576 case 3:
7cfd4a87 577 amd_decode_ls_mce(m);
f9350efd
BP
578 break;
579
51966241 580 case 4:
7cfd4a87
BP
581 node = amd_get_nb_id(m->extcpu);
582 amd_decode_nb_mce(node, m, 0);
51966241
BP
583 break;
584
53bd5fed 585 case 5:
7cfd4a87 586 amd_decode_fr_mce(m);
53bd5fed
BP
587 break;
588
51966241
BP
589 default:
590 break;
b69b29de 591 }
51966241
BP
592
593 amd_decode_err_code(m->status & 0xffff);
fb253195
BP
594
595 return NOTIFY_STOP;
549d042d 596}
9cdeb404 597EXPORT_SYMBOL_GPL(amd_decode_mce);
f436f8bb 598
fb253195
BP
599static struct notifier_block amd_mce_dec_nb = {
600 .notifier_call = amd_decode_mce,
601};
602
f436f8bb
IM
603static int __init mce_amd_init(void)
604{
e045c291
BP
605 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
606 return 0;
607
f0157b3a 608 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) &&
9530d608 609 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
e045c291
BP
610 return 0;
611
888ab8e6
BP
612 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
613 if (!fam_ops)
614 return -ENOMEM;
615
616 switch (boot_cpu_data.x86) {
617 case 0xf:
618 fam_ops->dc_mce = k8_dc_mce;
dd53bce4 619 fam_ops->ic_mce = k8_ic_mce;
5ce88f6e 620 fam_ops->nb_mce = k8_nb_mce;
888ab8e6
BP
621 break;
622
623 case 0x10:
624 fam_ops->dc_mce = f10h_dc_mce;
dd53bce4 625 fam_ops->ic_mce = k8_ic_mce;
5ce88f6e 626 fam_ops->nb_mce = f10h_nb_mce;
888ab8e6
BP
627 break;
628
f0157b3a
BP
629 case 0x11:
630 fam_ops->dc_mce = k8_dc_mce;
631 fam_ops->ic_mce = k8_ic_mce;
632 fam_ops->nb_mce = f10h_nb_mce;
633 break;
634
888ab8e6 635 case 0x14:
5ce88f6e 636 nb_err_cpumask = 0x3;
888ab8e6 637 fam_ops->dc_mce = f14h_dc_mce;
dd53bce4 638 fam_ops->ic_mce = f14h_ic_mce;
5ce88f6e 639 fam_ops->nb_mce = f14h_nb_mce;
888ab8e6
BP
640 break;
641
642 default:
643 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
644 boot_cpu_data.x86);
645 kfree(fam_ops);
646 return -EINVAL;
647 }
648
9530d608
BP
649 pr_info("MCE: In-kernel MCE decoding enabled.\n");
650
e045c291 651 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
f436f8bb
IM
652
653 return 0;
654}
655early_initcall(mce_amd_init);
0d18b2e3
BP
656
#ifdef MODULE
/*
 * Module teardown: take the decoder off x86_mce_decoder_chain first, then
 * release the per-family callback table it was using.
 */
static void __exit mce_amd_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
	kfree(fam_ops);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif