]>
Commit | Line | Data |
---|---|---|
1da60899 FG |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Qiuxu Zhuo <qiuxu.zhuo@intel.com> | |
3 | Date: Wed, 13 Sep 2017 18:42:14 +0800 | |
4 | Subject: [PATCH] EDAC, sb_edac: Don't create a second memory controller if HA1 | |
5 | is not present | |
6 | MIME-Version: 1.0 | |
7 | Content-Type: text/plain; charset=UTF-8 | |
8 | Content-Transfer-Encoding: 8bit | |
9 | ||
10 | Yi Zhang reported the following failure on a 2-socket Haswell (E5-2603v3) | |
11 | server (DELL PowerEdge 730xd): | |
12 | ||
13 | EDAC sbridge: Some needed devices are missing | |
14 | EDAC MC: Removed device 0 for sb_edac.c Haswell SrcID#0_Ha#0: DEV 0000:7f:12.0 | |
15 | EDAC MC: Removed device 1 for sb_edac.c Haswell SrcID#1_Ha#0: DEV 0000:ff:12.0 | |
16 | EDAC sbridge: Couldn't find mci handler | |
17 | EDAC sbridge: Couldn't find mci handler | |
18 | EDAC sbridge: Failed to register device with error -19. | |
19 | ||
20 | The refactored sb_edac driver creates the IMC1 (the 2nd memory | |
21 | controller) if any IMC1 device is present. In this case only | |
22 | HA1_TA of IMC1 was present, but the driver expected to find | |
23 | HA1/HA1_TM/HA1_TAD[0-3] devices too, leading to the above failure. | |
24 | ||
25 | The document [1] says the 'E5-2603 v3' CPU has 4 memory channels max. Yi | |
26 | Zhang inserted one DIMM per channel for each CPU, and ran a random error | |
27 | address injection test with this patch: | |
28 | ||
29 | 4024 addresses fell in TOLM hole area | |
30 | 12715 addresses fell in CPU_SrcID#0_Ha#0_Chan#0_DIMM#0 | |
31 | 12774 addresses fell in CPU_SrcID#0_Ha#0_Chan#1_DIMM#0 | |
32 | 12798 addresses fell in CPU_SrcID#0_Ha#0_Chan#2_DIMM#0 | |
33 | 12913 addresses fell in CPU_SrcID#0_Ha#0_Chan#3_DIMM#0 | |
34 | 12674 addresses fell in CPU_SrcID#1_Ha#0_Chan#0_DIMM#0 | |
35 | 12686 addresses fell in CPU_SrcID#1_Ha#0_Chan#1_DIMM#0 | |
36 | 12882 addresses fell in CPU_SrcID#1_Ha#0_Chan#2_DIMM#0 | |
37 | 12934 addresses fell in CPU_SrcID#1_Ha#0_Chan#3_DIMM#0 | |
38 | 106400 addresses were injected in total. | |
39 | ||
40 | The test results show that all 4 channels on each CPU belong to IMC0, so | |
41 | the server really has only one IMC per CPU. | |
42 | ||
43 | The first page of chapter 2 of the datasheet [2] also says that the | |
44 | 'E5-2600 v3' implements either one or two IMCs. For CPUs with one IMC, | |
45 | IMC1 is not used and should be ignored. | |
46 | ||
47 | Thus, do not create a second memory controller if the key HA1 is absent. | |
48 | ||
49 | [1] http://ark.intel.com/products/83349/Intel-Xeon-Processor-E5-2603-v3-15M-Cache-1_60-GHz | |
50 | [2] https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf | |
51 | ||
52 | Reported-and-tested-by: Yi Zhang <yizhan@redhat.com> | |
53 | Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com> | |
54 | Cc: Tony Luck <tony.luck@intel.com> | |
55 | Cc: linux-edac <linux-edac@vger.kernel.org> | |
56 | Fixes: e2f747b1f42a ("EDAC, sb_edac: Assign EDAC memory controller per h/w controller") | |
57 | Link: http://lkml.kernel.org/r/20170913104214.7325-1-qiuxu.zhuo@intel.com | |
58 | [ Massage commit message. ] | |
59 | Signed-off-by: Borislav Petkov <bp@suse.de> | |
60 | (cherry picked from commit 15cc3ae001873845b5d842e212478a6570c7d938) | |
61 | Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com> | |
62 | --- | |
63 | drivers/edac/sb_edac.c | 9 ++++++++- | |
64 | 1 file changed, 8 insertions(+), 1 deletion(-) | |
65 | ||
66 | diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c | |
67 | index 80d860cb0746..7a3b201d51df 100644 | |
68 | --- a/drivers/edac/sb_edac.c | |
69 | +++ b/drivers/edac/sb_edac.c | |
70 | @@ -455,6 +455,7 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = { | |
71 | static const struct pci_id_descr pci_dev_descr_ibridge[] = { | |
72 | /* Processor Home Agent */ | |
73 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0, 0, IMC0) }, | |
74 | + { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) }, | |
75 | ||
76 | /* Memory controller */ | |
77 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA, 0, IMC0) }, | |
78 | @@ -465,7 +466,6 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = { | |
79 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3, 0, IMC0) }, | |
80 | ||
81 | /* Optional, mode 2HA */ | |
82 | - { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, 1, IMC1) }, | |
83 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TA, 1, IMC1) }, | |
84 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS, 1, IMC1) }, | |
85 | { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1, IMC1) }, | |
86 | @@ -2260,6 +2260,13 @@ static int sbridge_get_onedevice(struct pci_dev **prev, | |
87 | next_imc: | |
88 | sbridge_dev = get_sbridge_dev(bus, dev_descr->dom, multi_bus, sbridge_dev); | |
89 | if (!sbridge_dev) { | |
90 | + /* If the HA1 wasn't found, don't create EDAC second memory controller */ | |
91 | + if (dev_descr->dom == IMC1 && devno != 1) { | |
92 | + edac_dbg(0, "Skip IMC1: %04x:%04x (since HA1 was absent)\n", | |
93 | + PCI_VENDOR_ID_INTEL, dev_descr->dev_id); | |
94 | + pci_dev_put(pdev); | |
95 | + return 0; | |
96 | + } | |
97 | ||
98 | if (dev_descr->dom == SOCK) | |
99 | goto out_imc; | |
100 | -- | |
101 | 2.14.2 | |
102 |