2 * Copyright 2019 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
24 #include "amdgpu_ras.h"
/*
 * amdgpu_umc_ras_late_init - late RAS setup for the UMC block.
 *
 * What the visible lines do, in order:
 *  - prepare a ras_fs_if whose sysfs_name is "umc_err_count" and a
 *    ras_ih_if whose callback is amdgpu_umc_process_ras_data_cb;
 *  - if adev->umc.ras_if is not set yet, allocate a struct ras_common_if
 *    with kmalloc(GFP_KERNEL), test the result, and fill in block
 *    (AMDGPU_RAS_BLOCK__UMC), type (AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE),
 *    sub_block_index (0) and name ("umc");
 *  - copy *adev->umc.ras_if into both ih_info.head and fs_info.head;
 *  - call amdgpu_ras_late_init() and store its result in r;
 *  - when amdgpu_ras_is_supported() reports the block as supported, take a
 *    reference on adev->gmc.ecc_irq through amdgpu_irq_get();
 *  - call the optional adev->umc.funcs->err_cnt_init() hook when both the
 *    funcs table and the hook are non-NULL;
 *  - call amdgpu_ras_late_fini(), kfree() adev->umc.ras_if and reset it to
 *    NULL.
 *
 * NOTE(review): this extract is missing source lines (braces, the
 * declaration of r, return statements, any goto labels and the remaining
 * arguments of amdgpu_ras_late_init). The late_fini/kfree sequence at the
 * end is presumably an error-unwind path, but the control flow that reaches
 * it is not visible here - confirm against the complete file.
 *
 * NOTE(review): the object comes from kmalloc(), which does not zero
 * memory; only the four fields listed above are assigned in the visible
 * lines. Confirm the structure has no other fields that are read later.
 *
 * NOTE(review): strcpy() into ras_if->name is unbounded; the size of the
 * name field is not visible here - confirm "umc" fits.
 */
26 int amdgpu_umc_ras_late_init(struct amdgpu_device
*adev
)
29 struct ras_fs_if fs_info
= {
30 .sysfs_name
= "umc_err_count",
32 struct ras_ih_if ih_info
= {
33 .cb
= amdgpu_umc_process_ras_data_cb
,
36 if (!adev
->umc
.ras_if
) {
38 kmalloc(sizeof(struct ras_common_if
), GFP_KERNEL
);
39 if (!adev
->umc
.ras_if
)
41 adev
->umc
.ras_if
->block
= AMDGPU_RAS_BLOCK__UMC
;
42 adev
->umc
.ras_if
->type
= AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE
;
43 adev
->umc
.ras_if
->sub_block_index
= 0;
44 strcpy(adev
->umc
.ras_if
->name
, "umc");
46 ih_info
.head
= fs_info
.head
= *adev
->umc
.ras_if
;
48 r
= amdgpu_ras_late_init(adev
, adev
->umc
.ras_if
,
53 if (amdgpu_ras_is_supported(adev
, adev
->umc
.ras_if
->block
)) {
54 r
= amdgpu_irq_get(adev
, &adev
->gmc
.ecc_irq
, 0);
62 /* ras init of specific umc version */
63 if (adev
->umc
.funcs
&& adev
->umc
.funcs
->err_cnt_init
)
64 adev
->umc
.funcs
->err_cnt_init(adev
);
69 amdgpu_ras_late_fini(adev
, adev
->umc
.ras_if
, &ih_info
);
71 kfree(adev
->umc
.ras_if
);
72 adev
->umc
.ras_if
= NULL
;
/*
 * amdgpu_umc_ras_fini - tear down the UMC RAS state set up at late init.
 *
 * The visible lines check amdgpu_ras_is_supported() for
 * AMDGPU_RAS_BLOCK__UMC (combined via && with a second condition that is
 * not present in this extract), take a local copy of the adev->umc.ras_if
 * pointer, build a ras_ih_if whose callback is
 * amdgpu_umc_process_ras_data_cb, and pass both to amdgpu_ras_late_fini().
 *
 * NOTE(review): source lines are missing from this extract (the rest of
 * the condition, the remaining ih_info initialisers, braces and anything
 * that follows the late_fini call, such as freeing ras_if) - confirm
 * against the complete file before relying on this description.
 */
76 void amdgpu_umc_ras_fini(struct amdgpu_device
*adev
)
78 if (amdgpu_ras_is_supported(adev
, AMDGPU_RAS_BLOCK__UMC
) &&
80 struct ras_common_if
*ras_if
= adev
->umc
.ras_if
;
81 struct ras_ih_if ih_info
= {
83 .cb
= amdgpu_umc_process_ras_data_cb
,
86 amdgpu_ras_late_fini(adev
, ras_if
, &ih_info
);
/*
 * amdgpu_umc_process_ras_data_cb - RAS callback for UMC errors.
 *
 * @adev:             device the error was reported on
 * @ras_error_status: opaque pointer, cast to struct ras_err_data here
 * @entry:            interrupt vector entry (not referenced in the
 *                    visible lines)
 *
 * What the visible lines do, in order:
 *  - call kgd2kfd_set_sram_ecc_flag() on adev->kfd.dev;
 *  - call the optional umc.funcs->query_ras_error_count() hook;
 *  - when umc.funcs->query_ras_error_address is provided and
 *    umc.max_ras_err_cnt_per_query is non-zero, kcalloc() an array of
 *    max_ras_err_cnt_per_query struct eeprom_table_record entries, warn
 *    (without bailing out) if err_data->err_addr is NULL, and call the
 *    address-query hook regardless, as the in-body comment explains;
 *  - when err_data->ue_count is non-zero, log the count, hand the recorded
 *    addresses to amdgpu_ras_add_bad_pages() if err_addr_cnt is non-zero
 *    (warning when that call returns non-zero), and call
 *    amdgpu_ras_reset_gpu();
 *  - kfree() err_data->err_addr and return AMDGPU_RAS_SUCCESS.
 *
 * NOTE(review): this extract is missing source lines (braces, the
 * assignment target of the kcalloc() call - presumably err_data->err_addr -
 * the argument of the dev_info() call and the terminators of two in-body
 * comments). Confirm the details against the complete file.
 *
 * NOTE(review): the dev_info() format uses "%ld"; the type of ue_count is
 * not visible here - confirm the specifier matches (an unsigned long would
 * want "%lu").
 *
 * NOTE(review): err_data->err_addr is freed but no visible line resets it
 * to NULL; confirm the caller never reads it after this callback returns.
 */
91 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device
*adev
,
92 void *ras_error_status
,
93 struct amdgpu_iv_entry
*entry
)
95 struct ras_err_data
*err_data
= (struct ras_err_data
*)ras_error_status
;
97 kgd2kfd_set_sram_ecc_flag(adev
->kfd
.dev
);
98 if (adev
->umc
.funcs
&&
99 adev
->umc
.funcs
->query_ras_error_count
)
100 adev
->umc
.funcs
->query_ras_error_count(adev
, ras_error_status
);
102 if (adev
->umc
.funcs
&&
103 adev
->umc
.funcs
->query_ras_error_address
&&
104 adev
->umc
.max_ras_err_cnt_per_query
) {
106 kcalloc(adev
->umc
.max_ras_err_cnt_per_query
,
107 sizeof(struct eeprom_table_record
), GFP_KERNEL
);
109 /* still call query_ras_error_address to clear error status
110 * even NOMEM error is encountered
112 if(!err_data
->err_addr
)
113 dev_warn(adev
->dev
, "Failed to alloc memory for "
114 "umc error address record!\n");
116 /* umc query_ras_error_address is also responsible for clearing
119 adev
->umc
.funcs
->query_ras_error_address(adev
, ras_error_status
);
122 /* only uncorrectable error needs gpu reset */
123 if (err_data
->ue_count
) {
124 dev_info(adev
->dev
, "%ld uncorrectable hardware errors "
125 "detected in UMC block\n",
128 if (err_data
->err_addr_cnt
&&
129 amdgpu_ras_add_bad_pages(adev
, err_data
->err_addr
,
130 err_data
->err_addr_cnt
))
131 dev_warn(adev
->dev
, "Failed to add ras bad page!\n");
133 amdgpu_ras_reset_gpu(adev
);
136 kfree(err_data
->err_addr
);
137 return AMDGPU_RAS_SUCCESS
;
/*
 * amdgpu_umc_process_ecc_irq - forward a UMC ECC interrupt to the RAS core.
 *
 * @adev:   device that raised the interrupt
 * @source: interrupt source (not referenced in the visible lines)
 * @entry:  interrupt vector entry (not referenced in the visible lines)
 *
 * The visible lines read adev->umc.ras_if, copy *ras_if into the head of a
 * local ras_dispatch_if and pass that structure to
 * amdgpu_ras_interrupt_dispatch().
 *
 * NOTE(review): source lines are missing from this extract (the ih_data
 * initialisers, any NULL check on ras_if before it is dereferenced, braces
 * and the return statement) - confirm against the complete file.
 */
140 int amdgpu_umc_process_ecc_irq(struct amdgpu_device
*adev
,
141 struct amdgpu_irq_src
*source
,
142 struct amdgpu_iv_entry
*entry
)
144 struct ras_common_if
*ras_if
= adev
->umc
.ras_if
;
145 struct ras_dispatch_if ih_data
= {
152 ih_data
.head
= *ras_if
;
154 amdgpu_ras_interrupt_dispatch(adev
, &ih_data
);