1 /*
2  * Copyright 2021 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 #include "umc_v6_7.h"
24 #include "amdgpu_ras.h"
25 #include "amdgpu_umc.h"
26 #include "amdgpu.h"
27 
28 #include "umc/umc_6_7_0_offset.h"
29 #include "umc/umc_6_7_0_sh_mask.h"
30 
31 const uint32_t
32 	umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
33 		{28, 20, 24, 16, 12, 4, 8, 0},
34 		{6, 30, 2, 26, 22, 14, 18, 10},
35 		{19, 11, 15, 7, 3, 27, 31, 23},
36 		{9, 1, 5, 29, 25, 17, 21, 13}
37 };
38 const uint32_t
39 	umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM][UMC_V6_7_CHANNEL_INSTANCE_NUM] = {
40 		{19, 11, 15, 7,	3, 27, 31, 23},
41 		{9, 1, 5, 29, 25, 17, 21, 13},
42 		{28, 20, 24, 16, 12, 4, 8, 0},
43 		{6, 30, 2, 26, 22, 14, 18, 10},
44 };
45 
/*
 * Return the register offset of one UMC channel.
 *
 * The (umc_inst, ch_inst) pair is first flattened into a single index and
 * then re-split in groups of four, because the register addresses are not
 * laid out linearly per UMC instance: the quotient is scaled by
 * UMC_V6_7_INST_DIST and the remainder by adev->umc.channel_offs.
 */
static inline uint32_t get_umc_v6_7_reg_offset(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	const uint32_t flat_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	const uint32_t chan_part = adev->umc.channel_offs * (flat_idx % 4);
	const uint32_t inst_part = UMC_V6_7_INST_DIST * (flat_idx / 4);

	return chan_part + inst_part;
}
59 
/*
 * Look up the channel index for a (umc_inst, ch_inst) pair in the flat
 * adev->umc.channel_idx_tbl array, which is indexed row-major with
 * adev->umc.channel_inst_num entries per UMC instance.
 */
static inline uint32_t get_umc_v6_7_channel_index(struct amdgpu_device *adev,
					      uint32_t umc_inst,
					      uint32_t ch_inst)
{
	uint32_t tbl_pos = umc_inst * adev->umc.channel_inst_num + ch_inst;

	return adev->umc.channel_idx_tbl[tbl_pos];
}
66 
/*
 * Log the MCA state of one UMC channel.
 *
 * @adev:           device used for register access and logging
 * @mc_umc_status:  MCUMC_STATUS value already obtained by the caller
 * @umc_reg_offset: channel register offset; it is added to each register
 *                  index and the sum is multiplied by 4 for the PCIE accessor
 *
 * STATUS is reported from the value passed in.  IPID, SYND and MISC0 are
 * read here, in that order, and each is logged only when non-zero.
 */
static void umc_v6_7_query_error_status_helper(struct amdgpu_device *adev,
						  uint64_t mc_umc_status, uint32_t umc_reg_offset)
{
	uint32_t reg_idx;
	uint64_t ipid, synd, misc0;

	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)
		dev_info(adev->dev, "Deferred error, no user action is needed.\n");

	if (mc_umc_status != 0)
		dev_info(adev->dev, "MCA STATUS 0x%llx, umc_reg_offset 0x%x\n", mc_umc_status, umc_reg_offset);

	/* IPID register */
	reg_idx = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_IPIDT0);
	ipid = RREG64_PCIE((reg_idx + umc_reg_offset) * 4);
	if (ipid != 0)
		dev_info(adev->dev, "MCA IPID 0x%llx, umc_reg_offset 0x%x\n", ipid, umc_reg_offset);

	/* SYND register */
	reg_idx = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_SYNDT0);
	synd = RREG64_PCIE((reg_idx + umc_reg_offset) * 4);
	if (synd != 0)
		dev_info(adev->dev, "MCA SYND 0x%llx, umc_reg_offset 0x%x\n", synd, umc_reg_offset);

	/* MISC0 register */
	reg_idx = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_MISC0T0);
	misc0 = RREG64_PCIE((reg_idx + umc_reg_offset) * 4);
	if (misc0 != 0)
		dev_info(adev->dev, "MCA MISC0 0x%llx, umc_reg_offset 0x%x\n", misc0, umc_reg_offset);
}
100 
/*
 * Count a correctable error for one channel using the cached ECC info table
 * in the RAS context instead of reading MCUMC_STATUS from hardware.
 *
 * @adev:        device
 * @umc_inst:    UMC instance
 * @ch_inst:     channel instance within the UMC
 * @error_count: incremented by one when the cached status has both Val and
 *               CECC set
 *
 * When an error is counted, the MCA state is logged, and if the RAS context
 * reports record_ce_addr_supported the translated physical address of the
 * error is logged as well.
 */
static void umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_inst, uint32_t ch_inst,
						   unsigned long *error_count)
{
	uint64_t mc_umc_status;
	uint64_t err_addr, soc_pa;
	uint32_t eccinfo_table_idx;
	uint32_t umc_reg_offset;
	uint32_t channel_index;
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	umc_reg_offset = get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);
	eccinfo_table_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;

	/*
	 * check for SRAM correctable error,
	 * MCUMC_STATUS is a 64 bit register
	 */
	mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) != 1)
		return;

	*error_count += 1;

	umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);

	if (!ras->umc_ecc.record_ce_addr_supported)
		return;

	channel_index = get_umc_v6_7_channel_index(adev, umc_inst, ch_inst);

	err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr;
	err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

	/* translate umc channel address to soc pa, 3 parts are included */
	soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
		 ADDR_OF_256B_BLOCK(channel_index) |
		 OFFSET_IN_256B_BLOCK(err_addr);

	/* The umc channel bits are not original values, they are hashed */
	SET_CHANNEL_HASH(channel_index, soc_pa);

	dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
}
142 
/*
 * Count an uncorrectable error for one channel using the cached ECC info
 * table in the RAS context.
 *
 * @error_count is incremented by one when the cached MCUMC_STATUS has Val
 * set together with at least one of Deferred, UECC, PCC, UC or TCC; the MCA
 * state is then logged.
 */
static void umc_v6_7_ecc_info_querry_uncorrectable_error_count(struct amdgpu_device *adev,
							  uint32_t umc_inst, uint32_t ch_inst,
						      unsigned long *error_count)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	uint32_t reg_off = get_umc_v6_7_reg_offset(adev, umc_inst, ch_inst);
	uint32_t tbl_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	uint64_t status = ras->umc_ecc.ecc[tbl_idx].mca_umc_status;
	bool ue_flagged;

	/* nothing to report unless the status is marked valid */
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1)
		return;

	ue_flagged =
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1;
	if (!ue_flagged)
		return;

	*error_count += 1;

	umc_v6_7_query_error_status_helper(adev, status, reg_off);
}
169 
umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)170 static void umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
171 					   void *ras_error_status)
172 {
173 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
174 
175 	uint32_t umc_inst        = 0;
176 	uint32_t ch_inst         = 0;
177 
178 	/*TODO: driver needs to toggle DF Cstate to ensure
179 	 * safe access of UMC registers. Will add the protection */
180 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
181 		umc_v6_7_ecc_info_query_correctable_error_count(adev,
182 						      umc_inst, ch_inst,
183 						      &(err_data->ce_count));
184 		umc_v6_7_ecc_info_querry_uncorrectable_error_count(adev,
185 						      umc_inst, ch_inst,
186 							  &(err_data->ue_count));
187 	}
188 }
189 
/*
 * Translate a UMC channel error address into SoC physical addresses and
 * record each of them.
 *
 * @adev:     device
 * @err_data: error record container passed on to
 *            amdgpu_umc_fill_error_record()
 * @err_addr: error address as extracted from MCUMC_ADDR
 * @ch_inst:  channel instance within the UMC
 * @umc_inst: UMC instance
 *
 * For every value of the [C4 C3 C2] column bits, two addresses are logged
 * and recorded: one with the R14 bit as computed and one with it inverted.
 */
static void umc_v6_7_convert_error_address(struct amdgpu_device *adev,
					struct ras_err_data *err_data, uint64_t err_addr,
					uint32_t ch_inst, uint32_t umc_inst)
{
	uint32_t channel_index;
	uint64_t soc_pa, retired_page, column;

	channel_index = get_umc_v6_7_channel_index(adev, umc_inst, ch_inst);

	/* translate umc channel address to soc pa, 3 parts are included */
	soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
		 ADDR_OF_256B_BLOCK(channel_index) |
		 OFFSET_IN_256B_BLOCK(err_addr);

	/* The umc channel bits are not original values, they are hashed */
	SET_CHANNEL_HASH(channel_index, soc_pa);

	/* clear [C4 C3 C2] in soc physical address */
	soc_pa &= ~(0x7ULL << UMC_V6_7_PA_C2_BIT);

	/* loop for all possibilities of [C4 C3 C2] */
	for (column = 0; column < UMC_V6_7_NA_MAP_PA_NUM; column++) {
		retired_page = soc_pa | (column << UMC_V6_7_PA_C2_BIT);
		dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
		amdgpu_umc_fill_error_record(err_data, err_addr,
			retired_page, channel_index, umc_inst);

		/* flip the R14 bit and record that address as well */
		retired_page ^= (0x1ULL << UMC_V6_7_PA_R14_BIT);
		dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
		amdgpu_umc_fill_error_record(err_data, err_addr,
			retired_page, channel_index, umc_inst);
	}
}
224 
/*
 * Record the error address of one channel from the cached ECC info table.
 *
 * Nothing is done when the cached MCUMC_STATUS is zero or when
 * err_data->err_addr is not set.  Otherwise, if the status has both Val and
 * UECC set, the cached MCUMC_ADDR is converted and recorded.
 */
static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t ch_inst,
					 uint32_t umc_inst)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	uint32_t tbl_idx = umc_inst * adev->umc.channel_inst_num + ch_inst;
	uint64_t status = ras->umc_ecc.ecc[tbl_idx].mca_umc_status;
	uint64_t addr;

	if (status == 0 || !err_data->err_addr)
		return;

	/* only an uncorrectable error carries an address worth recording */
	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1 ||
	    REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) != 1)
		return;

	addr = ras->umc_ecc.ecc[tbl_idx].mca_umc_addr;
	addr = REG_GET_FIELD(addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

	umc_v6_7_convert_error_address(adev, err_data, addr, ch_inst, umc_inst);
}
254 
umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device * adev,void * ras_error_status)255 static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
256 					     void *ras_error_status)
257 {
258 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
259 
260 	uint32_t umc_inst        = 0;
261 	uint32_t ch_inst         = 0;
262 
263 	/*TODO: driver needs to toggle DF Cstate to ensure
264 	 * safe access of UMC resgisters. Will add the protection
265 	 * when firmware interface is ready */
266 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
267 		umc_v6_7_ecc_info_query_error_address(adev,
268 					     err_data,
269 					     ch_inst,
270 					     umc_inst);
271 	}
272 }
273 
/*
 * Read the correctable error count of one channel from hardware.
 *
 * @adev:           device
 * @umc_reg_offset: channel register offset, see get_umc_v6_7_reg_offset()
 * @error_count:    accumulator for the correctable error count
 * @ch_inst:        channel instance within the UMC
 * @umc_inst:       UMC instance
 *
 * The EccErrCnt counter is read once with chip select 0 and once with chip
 * select 1; each reading minus UMC_V6_7_CE_CNT_INIT is added to
 * @error_count.  In addition, if MCUMC_STATUS has both Val and CECC set,
 * one more error is counted, the MCA state is logged and the translated
 * physical address of the error is logged.
 */
static void umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset,
						   unsigned long *error_count,
						   uint32_t ch_inst,
						   uint32_t umc_inst)
{
	uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
	uint32_t ecc_err_cnt, ecc_err_cnt_addr;
	uint64_t mc_umc_status;
	uint32_t mc_umc_status_addr;
	uint32_t mc_umc_addrt0;
	uint32_t channel_index;
	uint64_t err_addr, soc_pa;

	/* UMC 6.7 registers */
	ecc_err_cnt_sel_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel);
	ecc_err_cnt_addr =
		SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt);
	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);

	/* select the lower chip and check the error count */
	ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 0);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/* select the higher chip and check the err counter */
	ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, 1);
	WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);

	ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
	*error_count +=
		(REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_EccErrCnt, EccErrCnt) -
		 UMC_V6_7_CE_CNT_INIT);

	/*
	 * check for SRAM correctable error,
	 * MCUMC_STATUS is a 64 bit register
	 */
	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
	if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1 ||
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) != 1)
		return;

	*error_count += 1;

	umc_v6_7_query_error_status_helper(adev, mc_umc_status, umc_reg_offset);

	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	channel_index = get_umc_v6_7_channel_index(adev, umc_inst, ch_inst);

	err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
	err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

	/* translate umc channel address to soc pa, 3 parts are included */
	soc_pa = ADDR_OF_8KB_BLOCK(err_addr) |
		 ADDR_OF_256B_BLOCK(channel_index) |
		 OFFSET_IN_256B_BLOCK(err_addr);

	/* The umc channel bits are not original values, they are hashed */
	SET_CHANNEL_HASH(channel_index, soc_pa);

	dev_info(adev->dev, "Error Address(PA): 0x%llx\n", soc_pa);
}
349 
/*
 * Count an uncorrectable error for one channel by reading MCUMC_STATUS from
 * hardware.
 *
 * @error_count is incremented by one when the status has Val set together
 * with at least one of Deferred, UECC, PCC, UC or TCC; the MCA state is then
 * logged.
 */
static void umc_v6_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
						      uint32_t umc_reg_offset,
						      unsigned long *error_count)
{
	uint32_t status_reg =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	uint64_t status;
	bool ue_flagged;

	/* fetch MCUMC_STATUS for this channel */
	status = RREG64_PCIE((status_reg + umc_reg_offset) * 4);

	if (REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) != 1)
		return;

	ue_flagged =
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
		REG_GET_FIELD(status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1;
	if (!ue_flagged)
		return;

	*error_count += 1;

	umc_v6_7_query_error_status_helper(adev, status, umc_reg_offset);
}
373 
/*
 * Reset the correctable error counters of one channel.
 *
 * For chip select 0 and then chip select 1: re-read EccErrCntSel, set
 * EccErrCntCsSel to the chip, write it back, and write
 * UMC_V6_7_CE_CNT_INIT to EccErrCnt.
 */
static void umc_v6_7_reset_error_count_per_channel(struct amdgpu_device *adev,
						   uint32_t umc_reg_offset)
{
	const uint32_t sel_reg =
		(SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCntSel) +
		 umc_reg_offset) * 4;
	const uint32_t cnt_reg =
		(SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccErrCnt) +
		 umc_reg_offset) * 4;
	uint32_t sel_val;
	uint32_t chip;

	/* chip 0 is the lower chip, chip 1 the higher one */
	for (chip = 0; chip <= 1; chip++) {
		/* route the counter to this chip */
		sel_val = RREG32_PCIE(sel_reg);
		sel_val = REG_SET_FIELD(sel_val, UMCCH0_0_EccErrCntSel,
					EccErrCntCsSel, chip);
		WREG32_PCIE(sel_reg, sel_val);

		/* put the counter back to its initial value */
		WREG32_PCIE(cnt_reg, UMC_V6_7_CE_CNT_INIT);
	}
}
413 
umc_v6_7_reset_error_count(struct amdgpu_device * adev)414 static void umc_v6_7_reset_error_count(struct amdgpu_device *adev)
415 {
416 	uint32_t umc_inst        = 0;
417 	uint32_t ch_inst         = 0;
418 	uint32_t umc_reg_offset  = 0;
419 
420 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
421 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
422 							 umc_inst,
423 							 ch_inst);
424 
425 		umc_v6_7_reset_error_count_per_channel(adev,
426 						       umc_reg_offset);
427 	}
428 }
429 
umc_v6_7_query_ras_error_count(struct amdgpu_device * adev,void * ras_error_status)430 static void umc_v6_7_query_ras_error_count(struct amdgpu_device *adev,
431 					   void *ras_error_status)
432 {
433 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
434 
435 	uint32_t umc_inst        = 0;
436 	uint32_t ch_inst         = 0;
437 	uint32_t umc_reg_offset  = 0;
438 
439 	/*TODO: driver needs to toggle DF Cstate to ensure
440 	 * safe access of UMC registers. Will add the protection */
441 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
442 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
443 							 umc_inst,
444 							 ch_inst);
445 		umc_v6_7_query_correctable_error_count(adev,
446 						       umc_reg_offset,
447 						       &(err_data->ce_count),
448 						       ch_inst, umc_inst);
449 		umc_v6_7_querry_uncorrectable_error_count(adev,
450 							  umc_reg_offset,
451 							  &(err_data->ue_count));
452 	}
453 
454 	umc_v6_7_reset_error_count(adev);
455 }
456 
/*
 * Record the error address of one channel by reading the MCA registers from
 * hardware, then clear MCUMC_STATUS.
 *
 * @adev:           device
 * @err_data:       error record container; no address is recorded when its
 *                  err_addr member is not set
 * @umc_reg_offset: channel register offset, see get_umc_v6_7_reg_offset()
 * @ch_inst:        channel instance within the UMC
 * @umc_inst:       UMC instance
 *
 * A status of zero returns immediately without touching the register.  Any
 * non-zero status is written back as zero before returning, whether or not
 * an address was recorded.
 */
static void umc_v6_7_query_error_address(struct amdgpu_device *adev,
					 struct ras_err_data *err_data,
					 uint32_t umc_reg_offset, uint32_t ch_inst,
					 uint32_t umc_inst)
{
	uint32_t mc_umc_status_addr, mc_umc_addrt0;
	uint64_t mc_umc_status, err_addr;

	mc_umc_status_addr =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
	mc_umc_addrt0 =
		SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);

	mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);

	if (mc_umc_status == 0)
		return;

	/* calculate error address if ue error is detected */
	if (err_data->err_addr &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
	    REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1) {
		err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
		err_addr =
			REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);

		umc_v6_7_convert_error_address(adev, err_data, err_addr,
					ch_inst, umc_inst);
	}

	/* clear umc status */
	WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
}
495 
umc_v6_7_query_ras_error_address(struct amdgpu_device * adev,void * ras_error_status)496 static void umc_v6_7_query_ras_error_address(struct amdgpu_device *adev,
497 					     void *ras_error_status)
498 {
499 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
500 
501 	uint32_t umc_inst        = 0;
502 	uint32_t ch_inst         = 0;
503 	uint32_t umc_reg_offset  = 0;
504 
505 	/*TODO: driver needs to toggle DF Cstate to ensure
506 	 * safe access of UMC resgisters. Will add the protection
507 	 * when firmware interface is ready */
508 	LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
509 		umc_reg_offset = get_umc_v6_7_reg_offset(adev,
510 							 umc_inst,
511 							 ch_inst);
512 		umc_v6_7_query_error_address(adev,
513 					     err_data,
514 					     umc_reg_offset, ch_inst,
515 					     umc_inst);
516 	}
517 }
518 
/*
 * Read the EccCtrl register of one channel and return its UCFatalEn field.
 */
static uint32_t umc_v6_7_query_ras_poison_mode_per_channel(
						struct amdgpu_device *adev,
						uint32_t umc_reg_offset)
{
	uint32_t ctrl_reg = SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_EccCtrl);
	uint32_t ctrl_val = RREG32_PCIE((ctrl_reg + umc_reg_offset) * 4);

	return REG_GET_FIELD(ctrl_val, UMCCH0_0_EccCtrl, UCFatalEn);
}
532 
umc_v6_7_query_ras_poison_mode(struct amdgpu_device * adev)533 static bool umc_v6_7_query_ras_poison_mode(struct amdgpu_device *adev)
534 {
535 	uint32_t umc_reg_offset  = 0;
536 
537 	/* Enabling fatal error in umc instance0 channel0 will be
538 	 * considered as fatal error mode
539 	 */
540 	umc_reg_offset = get_umc_v6_7_reg_offset(adev, 0, 0);
541 	return !umc_v6_7_query_ras_poison_mode_per_channel(adev, umc_reg_offset);
542 }
543 
544 const struct amdgpu_ras_block_hw_ops umc_v6_7_ras_hw_ops = {
545 	.query_ras_error_count = umc_v6_7_query_ras_error_count,
546 	.query_ras_error_address = umc_v6_7_query_ras_error_address,
547 };
548 
549 struct amdgpu_umc_ras umc_v6_7_ras = {
550 	.ras_block = {
551 		.hw_ops = &umc_v6_7_ras_hw_ops,
552 	},
553 	.query_ras_poison_mode = umc_v6_7_query_ras_poison_mode,
554 	.ecc_info_query_ras_error_count = umc_v6_7_ecc_info_query_ras_error_count,
555 	.ecc_info_query_ras_error_address = umc_v6_7_ecc_info_query_ras_error_address,
556 	.convert_ras_error_address = umc_v6_7_convert_error_address,
557 };
558