1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
9 static struct amd_decoder_ops *fam_ops;
10 
11 static u8 xec_mask	 = 0xf;
12 
13 static bool report_gart_errors;
14 static void (*decode_dram_ecc)(int node_id, struct mce *m);
15 
/*
 * Switch reporting of the GART errors filtered by ignore_mce() on (@v true)
 * or off (@v false).
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
21 
amd_register_ecc_decoder(void (* f)(int,struct mce *))22 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
23 {
24 	decode_dram_ecc = f;
25 }
26 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
27 
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))28 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
29 {
30 	if (decode_dram_ecc) {
31 		WARN_ON(decode_dram_ecc != f);
32 
33 		decode_dram_ecc = NULL;
34 	}
35 }
36 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37 
/*
 * String representations of the individual fields of an MCA error code, see
 * F3x48 or MSR0000_0411.
 */

/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};

/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
53 
/* participating processor (exported for use outside this file) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
57 
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};

/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};

/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
66 
/*
 * F15h MC1 descriptions. The table is packed, so f15h_mc1_mce() subtracts an
 * offset from the extended error code for the upper ranges; the code each
 * slot stands for is noted next to it.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",		/* xec = 0x0 */
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",		/* xec = 0xa */
	[11] = "PFB valid bit parity error",			/* xec = 0xd */
	[12] = "Microcode Patch Buffer",			/* xec = 0x10 */
	[13] = "uop queue",					/* xec = 0x11 */
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",				/* xec = 0x15 */
};
87 
/*
 * F15h MC2 memory-error descriptions. f15h_mc2_mce() maps extended error
 * codes 0x4-0xc onto slots 0-8 and 0x10-0x14 onto slots 9-13.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",			/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",			/* xec = 0xc */
	[9]  = "L2 Tag ECC error",				/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",			/* xec = 0x14 */
};
104 
/*
 * MC4 descriptions. decode_mc4_mce() indexes slots 0-14 directly with the
 * extended error code and reaches slots 15-18 as (xec - 13) for codes
 * 0x1c-0x1f.
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",		/* xec = 0x0 */
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",			/* xec = 0xe */
	[15] = "L3 data cache ECC error",			/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",	/* xec = 0x1f */
};
126 
/* MC5 descriptions, indexed by extended error code in decode_mc5_mce(). */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue",
};
143 
/* MC6 descriptions, indexed by extended error code in decode_mc6_mce(). */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
152 
/*
 * Scalable MCA error strings.
 *
 * Each table below is indexed by the extended error code and is reached
 * through smca_mce_descs[] in decode_smca_error().
 */
static const char * const smca_ls_mce_desc[] = {
	[0x00] = "Load queue parity error",
	[0x01] = "Store queue parity error",
	[0x02] = "Miss address buffer payload parity error",
	[0x03] = "Level 1 TLB parity error",
	[0x04] = "DC Tag error type 5",
	[0x05] = "DC Tag error type 6",
	[0x06] = "DC Tag error type 1",
	[0x07] = "Internal error type 1",
	[0x08] = "Internal error type 2",
	[0x09] = "System Read Data Error Thread 0",
	[0x0a] = "System Read Data Error Thread 1",
	[0x0b] = "DC Tag error type 2",
	[0x0c] = "DC Data error type 1 and poison consumption",
	[0x0d] = "DC Data error type 2",
	[0x0e] = "DC Data error type 3",
	[0x0f] = "DC Tag error type 4",
	[0x10] = "Level 2 TLB parity error",
	[0x11] = "PDC parity error",
	[0x12] = "DC Tag error type 3",
	[0x13] = "DC Tag error type 5",
	[0x14] = "L2 Fill Data error",
};
177 
/* SMCA_IF descriptions, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	[0x0] = "Op Cache Microtag Probe Port Parity Error",
	[0x1] = "IC Microtag or Full Tag Multi-hit Error",
	[0x2] = "IC Full Tag Parity Error",
	[0x3] = "IC Data Array Parity Error",
	[0x4] = "Decoupling Queue PhysAddr Parity Error",
	[0x5] = "L0 ITLB Parity Error",
	[0x6] = "L1 ITLB Parity Error",
	[0x7] = "L2 ITLB Parity Error",
	[0x8] = "BPQ Thread 0 Snoop Parity Error",
	[0x9] = "BPQ Thread 1 Snoop Parity Error",
	[0xa] = "L1 BTB Multi-Match Error",
	[0xb] = "L2 BTB Multi-Match Error",
	[0xc] = "L2 Cache Response Poison Error",
	[0xd] = "System Read Data Error",
};
194 
/* SMCA_L2_CACHE descriptions, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	[0x0] = "L2M Tag Multiple-Way-Hit error",
	[0x1] = "L2M Tag or State Array ECC Error",
	[0x2] = "L2M Data Array ECC Error",
	[0x3] = "Hardware Assert Error",
};
201 
/* SMCA_DE descriptions, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	[0x0] = "Micro-op cache tag parity error",
	[0x1] = "Micro-op cache data parity error",
	[0x2] = "Instruction buffer parity error",
	[0x3] = "Micro-op queue parity error",
	[0x4] = "Instruction dispatch queue parity error",
	[0x5] = "Fetch address FIFO parity error",
	[0x6] = "Patch RAM data parity error",
	[0x7] = "Patch RAM sequencer parity error",
	[0x8] = "Micro-op buffer parity error",
};
213 
/* SMCA_EX descriptions, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	[0x0] = "Watchdog Timeout error",
	[0x1] = "Physical register file parity error",
	[0x2] = "Flag register file parity error",
	[0x3] = "Immediate displacement register file parity error",
	[0x4] = "Address generator payload parity error",
	[0x5] = "EX payload parity error",
	[0x6] = "Checkpoint queue parity error",
	[0x7] = "Retire dispatch queue parity error",
	[0x8] = "Retire status queue parity error",
	[0x9] = "Scheduling queue parity error",
	[0xa] = "Branch buffer queue parity error",
	[0xb] = "Hardware Assertion error",
};
228 
/* SMCA_FP descriptions, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	[0x0] = "Physical register file (PRF) parity error",
	[0x1] = "Freelist (FL) parity error",
	[0x2] = "Schedule queue parity error",
	[0x3] = "NSQ parity error",
	[0x4] = "Retire queue (RQ) parity error",
	[0x5] = "Status register file (SRF) parity error",
	[0x6] = "Hardware assertion",
};
238 
/* SMCA_L3_CACHE descriptions, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	[0x0] = "Shadow Tag Macro ECC Error",
	[0x1] = "Shadow Tag Macro Multi-way-hit Error",
	[0x2] = "L3M Tag ECC Error",
	[0x3] = "L3M Tag Multi-way-hit Error",
	[0x4] = "L3M Data ECC Error",
	[0x5] = "SDP Parity Error or SystemReadDataError from XI",
	[0x6] = "L3 Victim Queue Parity Error",
	[0x7] = "L3 Hardware Assertion",
};
249 
/* SMCA_CS descriptions, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	[0x0] = "Illegal Request",
	[0x1] = "Address Violation",
	[0x2] = "Security Violation",
	[0x3] = "Illegal Response",
	[0x4] = "Unexpected Response",
	[0x5] = "Request or Probe Parity Error",
	[0x6] = "Read Response Parity Error",
	[0x7] = "Atomic Request Parity Error",
	[0x8] = "Probe Filter ECC Error",
};
261 
/* SMCA_CS_V2 descriptions, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	[0x0] = "Illegal Request",
	[0x1] = "Address Violation",
	[0x2] = "Security Violation",
	[0x3] = "Illegal Response",
	[0x4] = "Unexpected Response",
	[0x5] = "Request or Probe Parity Error",
	[0x6] = "Read Response Parity Error",
	[0x7] = "Atomic Request Parity Error",
	[0x8] = "SDP read response had no match in the CS queue",
	[0x9] = "Probe Filter Protocol Error",
	[0xa] = "Probe Filter ECC Error",
	[0xb] = "SDP read response had an unexpected RETRY error",
	[0xc] = "Counter overflow error",
	[0xd] = "Counter underflow error",
};
278 
/* SMCA_PIE descriptions, indexed by extended error code. */
static const char * const smca_pie_mce_desc[] = {
	[0x0] = "Hardware Assert",
	[0x1] = "Register security violation",
	[0x2] = "Link Error",
	[0x3] = "Poison data consumption",
	[0x4] = "A deferred error was detected in the DF",
};
286 
/*
 * SMCA_UMC descriptions, indexed by extended error code. For code 0,
 * decode_smca_error() additionally calls the registered DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	[0x0] = "DRAM ECC error",
	[0x1] = "Data poison error",
	[0x2] = "SDP parity error",
	[0x3] = "Advanced peripheral bus error",
	[0x4] = "Address/Command parity error",
	[0x5] = "Write data CRC error",
	[0x6] = "DCQ SRAM ECC error",
	[0x7] = "AES SRAM ECC error",
};
297 
/* SMCA_PB descriptions, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	[0x0] = "An ECC error in the Parameter Block RAM array",
};
301 
/* SMCA_PSP descriptions, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	[0x0] = "An ECC or parity error in a PSP RAM instance",
};
305 
/* SMCA_PSP_V2 descriptions, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	[0x00] = "High SRAM ECC or parity error",
	[0x01] = "Low SRAM ECC or parity error",
	[0x02] = "Instruction Cache Bank 0 ECC or parity error",
	[0x03] = "Instruction Cache Bank 1 ECC or parity error",
	[0x04] = "Instruction Tag Ram 0 parity error",
	[0x05] = "Instruction Tag Ram 1 parity error",
	[0x06] = "Data Cache Bank 0 ECC or parity error",
	[0x07] = "Data Cache Bank 1 ECC or parity error",
	[0x08] = "Data Cache Bank 2 ECC or parity error",
	[0x09] = "Data Cache Bank 3 ECC or parity error",
	[0x0a] = "Data Tag Bank 0 parity error",
	[0x0b] = "Data Tag Bank 1 parity error",
	[0x0c] = "Data Tag Bank 2 parity error",
	[0x0d] = "Data Tag Bank 3 parity error",
	[0x0e] = "Dirty Data Ram parity error",
	[0x0f] = "TLB Bank 0 parity error",
	[0x10] = "TLB Bank 1 parity error",
	[0x11] = "System Hub Read Buffer ECC or parity error",
};
326 
/* SMCA_SMU descriptions, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	[0x0] = "An ECC or parity error in an SMU RAM instance",
};
330 
/* SMCA_SMU_V2 descriptions, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	[0x0] = "High SRAM ECC or parity error",
	[0x1] = "Low SRAM ECC or parity error",
	[0x2] = "Data Cache Bank A ECC or parity error",
	[0x3] = "Data Cache Bank B ECC or parity error",
	[0x4] = "Data Tag Cache Bank A ECC or parity error",
	[0x5] = "Data Tag Cache Bank B ECC or parity error",
	[0x6] = "Instruction Cache Bank A ECC or parity error",
	[0x7] = "Instruction Cache Bank B ECC or parity error",
	[0x8] = "Instruction Tag Cache Bank A ECC or parity error",
	[0x9] = "Instruction Tag Cache Bank B ECC or parity error",
	[0xa] = "System Hub Read Buffer ECC or parity error",
};
344 
/* SMCA_MP5 descriptions, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	[0x0] = "High SRAM ECC or parity error",
	[0x1] = "Low SRAM ECC or parity error",
	[0x2] = "Data Cache Bank A ECC or parity error",
	[0x3] = "Data Cache Bank B ECC or parity error",
	[0x4] = "Data Tag Cache Bank A ECC or parity error",
	[0x5] = "Data Tag Cache Bank B ECC or parity error",
	[0x6] = "Instruction Cache Bank A ECC or parity error",
	[0x7] = "Instruction Cache Bank B ECC or parity error",
	[0x8] = "Instruction Tag Cache Bank A ECC or parity error",
	[0x9] = "Instruction Tag Cache Bank B ECC or parity error",
};
357 
/* SMCA_NBIO descriptions, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	[0x0] = "ECC or Parity error",
	[0x1] = "PCIE error",
	[0x2] = "SDP ErrEvent error",
	[0x3] = "SDP Egress Poison Error",
	[0x4] = "IOHC Internal Poison Error",
};
365 
/* SMCA_PCIE descriptions, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	[0x0] = "CCIX PER Message logging",
	[0x1] = "CCIX Read Response with Status: Non-Data Error",
	[0x2] = "CCIX Write Response with Status: Non-Data Error",
	[0x3] = "CCIX Read Response with Status: Data Error",
	[0x4] = "CCIX Non-okay write response with data error",
};
373 
374 struct smca_mce_desc {
375 	const char * const *descs;
376 	unsigned int num_descs;
377 };
378 
379 static struct smca_mce_desc smca_mce_descs[] = {
380 	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
381 	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
382 	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
383 	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
384 	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
385 	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
386 	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
387 	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
388 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
389 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
390 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
391 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
392 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
393 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
394 	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
395 	[SMCA_SMU_V2]	= { smca_smu2_mce_desc,	ARRAY_SIZE(smca_smu2_mce_desc)	},
396 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
397 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
398 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
399 };
400 
f12h_mc0_mce(u16 ec,u8 xec)401 static bool f12h_mc0_mce(u16 ec, u8 xec)
402 {
403 	bool ret = false;
404 
405 	if (MEM_ERROR(ec)) {
406 		u8 ll = LL(ec);
407 		ret = true;
408 
409 		if (ll == LL_L2)
410 			pr_cont("during L1 linefill from L2.\n");
411 		else if (ll == LL_L1)
412 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
413 		else
414 			ret = false;
415 	}
416 	return ret;
417 }
418 
f10h_mc0_mce(u16 ec,u8 xec)419 static bool f10h_mc0_mce(u16 ec, u8 xec)
420 {
421 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
422 		pr_cont("during data scrub.\n");
423 		return true;
424 	}
425 	return f12h_mc0_mce(ec, xec);
426 }
427 
k8_mc0_mce(u16 ec,u8 xec)428 static bool k8_mc0_mce(u16 ec, u8 xec)
429 {
430 	if (BUS_ERROR(ec)) {
431 		pr_cont("during system linefill.\n");
432 		return true;
433 	}
434 
435 	return f10h_mc0_mce(ec, xec);
436 }
437 
cat_mc0_mce(u16 ec,u8 xec)438 static bool cat_mc0_mce(u16 ec, u8 xec)
439 {
440 	u8 r4	 = R4(ec);
441 	bool ret = true;
442 
443 	if (MEM_ERROR(ec)) {
444 
445 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
446 			return false;
447 
448 		switch (r4) {
449 		case R4_DRD:
450 		case R4_DWR:
451 			pr_cont("Data/Tag parity error due to %s.\n",
452 				(r4 == R4_DRD ? "load/hw prf" : "store"));
453 			break;
454 		case R4_EVICT:
455 			pr_cont("Copyback parity error on a tag miss.\n");
456 			break;
457 		case R4_SNOOP:
458 			pr_cont("Tag parity error during snoop.\n");
459 			break;
460 		default:
461 			ret = false;
462 		}
463 	} else if (BUS_ERROR(ec)) {
464 
465 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
466 			return false;
467 
468 		pr_cont("System read data error on a ");
469 
470 		switch (r4) {
471 		case R4_RD:
472 			pr_cont("TLB reload.\n");
473 			break;
474 		case R4_DWR:
475 			pr_cont("store.\n");
476 			break;
477 		case R4_DRD:
478 			pr_cont("load.\n");
479 			break;
480 		default:
481 			ret = false;
482 		}
483 	} else {
484 		ret = false;
485 	}
486 
487 	return ret;
488 }
489 
f15h_mc0_mce(u16 ec,u8 xec)490 static bool f15h_mc0_mce(u16 ec, u8 xec)
491 {
492 	bool ret = true;
493 
494 	if (MEM_ERROR(ec)) {
495 
496 		switch (xec) {
497 		case 0x0:
498 			pr_cont("Data Array access error.\n");
499 			break;
500 
501 		case 0x1:
502 			pr_cont("UC error during a linefill from L2/NB.\n");
503 			break;
504 
505 		case 0x2:
506 		case 0x11:
507 			pr_cont("STQ access error.\n");
508 			break;
509 
510 		case 0x3:
511 			pr_cont("SCB access error.\n");
512 			break;
513 
514 		case 0x10:
515 			pr_cont("Tag error.\n");
516 			break;
517 
518 		case 0x12:
519 			pr_cont("LDQ access error.\n");
520 			break;
521 
522 		default:
523 			ret = false;
524 		}
525 	} else if (BUS_ERROR(ec)) {
526 
527 		if (!xec)
528 			pr_cont("System Read Data Error.\n");
529 		else
530 			pr_cont(" Internal error condition type %d.\n", xec);
531 	} else if (INT_ERROR(ec)) {
532 		if (xec <= 0x1f)
533 			pr_cont("Hardware Assert.\n");
534 		else
535 			ret = false;
536 
537 	} else
538 		ret = false;
539 
540 	return ret;
541 }
542 
decode_mc0_mce(struct mce * m)543 static void decode_mc0_mce(struct mce *m)
544 {
545 	u16 ec = EC(m->status);
546 	u8 xec = XEC(m->status, xec_mask);
547 
548 	pr_emerg(HW_ERR "MC0 Error: ");
549 
550 	/* TLB error signatures are the same across families */
551 	if (TLB_ERROR(ec)) {
552 		if (TT(ec) == TT_DATA) {
553 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
554 				((xec == 2) ? "locked miss"
555 					    : (xec ? "multimatch" : "parity")));
556 			return;
557 		}
558 	} else if (fam_ops->mc0_mce(ec, xec))
559 		;
560 	else
561 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
562 }
563 
k8_mc1_mce(u16 ec,u8 xec)564 static bool k8_mc1_mce(u16 ec, u8 xec)
565 {
566 	u8 ll	 = LL(ec);
567 	bool ret = true;
568 
569 	if (!MEM_ERROR(ec))
570 		return false;
571 
572 	if (ll == 0x2)
573 		pr_cont("during a linefill from L2.\n");
574 	else if (ll == 0x1) {
575 		switch (R4(ec)) {
576 		case R4_IRD:
577 			pr_cont("Parity error during data load.\n");
578 			break;
579 
580 		case R4_EVICT:
581 			pr_cont("Copyback Parity/Victim error.\n");
582 			break;
583 
584 		case R4_SNOOP:
585 			pr_cont("Tag Snoop error.\n");
586 			break;
587 
588 		default:
589 			ret = false;
590 			break;
591 		}
592 	} else
593 		ret = false;
594 
595 	return ret;
596 }
597 
cat_mc1_mce(u16 ec,u8 xec)598 static bool cat_mc1_mce(u16 ec, u8 xec)
599 {
600 	u8 r4    = R4(ec);
601 	bool ret = true;
602 
603 	if (!MEM_ERROR(ec))
604 		return false;
605 
606 	if (TT(ec) != TT_INSTR)
607 		return false;
608 
609 	if (r4 == R4_IRD)
610 		pr_cont("Data/tag array parity error for a tag hit.\n");
611 	else if (r4 == R4_SNOOP)
612 		pr_cont("Tag error during snoop/victimization.\n");
613 	else if (xec == 0x0)
614 		pr_cont("Tag parity error from victim castout.\n");
615 	else if (xec == 0x2)
616 		pr_cont("Microcode patch RAM parity error.\n");
617 	else
618 		ret = false;
619 
620 	return ret;
621 }
622 
f15h_mc1_mce(u16 ec,u8 xec)623 static bool f15h_mc1_mce(u16 ec, u8 xec)
624 {
625 	bool ret = true;
626 
627 	if (!MEM_ERROR(ec))
628 		return false;
629 
630 	switch (xec) {
631 	case 0x0 ... 0xa:
632 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
633 		break;
634 
635 	case 0xd:
636 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
637 		break;
638 
639 	case 0x10:
640 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
641 		break;
642 
643 	case 0x11 ... 0x15:
644 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
645 		break;
646 
647 	default:
648 		ret = false;
649 	}
650 	return ret;
651 }
652 
decode_mc1_mce(struct mce * m)653 static void decode_mc1_mce(struct mce *m)
654 {
655 	u16 ec = EC(m->status);
656 	u8 xec = XEC(m->status, xec_mask);
657 
658 	pr_emerg(HW_ERR "MC1 Error: ");
659 
660 	if (TLB_ERROR(ec))
661 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
662 			(xec ? "multimatch" : "parity error"));
663 	else if (BUS_ERROR(ec)) {
664 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
665 
666 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
667 	} else if (INT_ERROR(ec)) {
668 		if (xec <= 0x3f)
669 			pr_cont("Hardware Assert.\n");
670 		else
671 			goto wrong_mc1_mce;
672 	} else if (fam_ops->mc1_mce(ec, xec))
673 		;
674 	else
675 		goto wrong_mc1_mce;
676 
677 	return;
678 
679 wrong_mc1_mce:
680 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
681 }
682 
k8_mc2_mce(u16 ec,u8 xec)683 static bool k8_mc2_mce(u16 ec, u8 xec)
684 {
685 	bool ret = true;
686 
687 	if (xec == 0x1)
688 		pr_cont(" in the write data buffers.\n");
689 	else if (xec == 0x3)
690 		pr_cont(" in the victim data buffers.\n");
691 	else if (xec == 0x2 && MEM_ERROR(ec))
692 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
693 	else if (xec == 0x0) {
694 		if (TLB_ERROR(ec))
695 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
696 				TT_MSG(ec));
697 		else if (BUS_ERROR(ec))
698 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
699 				R4_MSG(ec), PP_MSG(ec));
700 		else if (MEM_ERROR(ec)) {
701 			u8 r4 = R4(ec);
702 
703 			if (r4 >= 0x7)
704 				pr_cont(": %s error during data copyback.\n",
705 					R4_MSG(ec));
706 			else if (r4 <= 0x1)
707 				pr_cont(": %s parity/ECC error during data "
708 					"access from L2.\n", R4_MSG(ec));
709 			else
710 				ret = false;
711 		} else
712 			ret = false;
713 	} else
714 		ret = false;
715 
716 	return ret;
717 }
718 
f15h_mc2_mce(u16 ec,u8 xec)719 static bool f15h_mc2_mce(u16 ec, u8 xec)
720 {
721 	bool ret = true;
722 
723 	if (TLB_ERROR(ec)) {
724 		if (xec == 0x0)
725 			pr_cont("Data parity TLB read error.\n");
726 		else if (xec == 0x1)
727 			pr_cont("Poison data provided for TLB fill.\n");
728 		else
729 			ret = false;
730 	} else if (BUS_ERROR(ec)) {
731 		if (xec > 2)
732 			ret = false;
733 
734 		pr_cont("Error during attempted NB data read.\n");
735 	} else if (MEM_ERROR(ec)) {
736 		switch (xec) {
737 		case 0x4 ... 0xc:
738 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
739 			break;
740 
741 		case 0x10 ... 0x14:
742 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
743 			break;
744 
745 		default:
746 			ret = false;
747 		}
748 	} else if (INT_ERROR(ec)) {
749 		if (xec <= 0x3f)
750 			pr_cont("Hardware Assert.\n");
751 		else
752 			ret = false;
753 	}
754 
755 	return ret;
756 }
757 
f16h_mc2_mce(u16 ec,u8 xec)758 static bool f16h_mc2_mce(u16 ec, u8 xec)
759 {
760 	u8 r4 = R4(ec);
761 
762 	if (!MEM_ERROR(ec))
763 		return false;
764 
765 	switch (xec) {
766 	case 0x04 ... 0x05:
767 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
768 		break;
769 
770 	case 0x09 ... 0x0b:
771 	case 0x0d ... 0x0f:
772 		pr_cont("ECC error in L2 tag (%s).\n",
773 			((r4 == R4_GEN)   ? "BankReq" :
774 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
775 		break;
776 
777 	case 0x10 ... 0x19:
778 	case 0x1b:
779 		pr_cont("ECC error in L2 data array (%s).\n",
780 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
781 			((r4 == R4_GEN)   ? "Attr" :
782 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
783 		break;
784 
785 	case 0x1c ... 0x1d:
786 	case 0x1f:
787 		pr_cont("Parity error in L2 attribute bits (%s).\n",
788 			((r4 == R4_RD)  ? "Hit"  :
789 			((r4 == R4_GEN) ? "Attr" : "Fill")));
790 		break;
791 
792 	default:
793 		return false;
794 	}
795 
796 	return true;
797 }
798 
decode_mc2_mce(struct mce * m)799 static void decode_mc2_mce(struct mce *m)
800 {
801 	u16 ec = EC(m->status);
802 	u8 xec = XEC(m->status, xec_mask);
803 
804 	pr_emerg(HW_ERR "MC2 Error: ");
805 
806 	if (!fam_ops->mc2_mce(ec, xec))
807 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
808 }
809 
decode_mc3_mce(struct mce * m)810 static void decode_mc3_mce(struct mce *m)
811 {
812 	u16 ec = EC(m->status);
813 	u8 xec = XEC(m->status, xec_mask);
814 
815 	if (boot_cpu_data.x86 >= 0x14) {
816 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
817 			 " please report on LKML.\n");
818 		return;
819 	}
820 
821 	pr_emerg(HW_ERR "MC3 Error");
822 
823 	if (xec == 0x0) {
824 		u8 r4 = R4(ec);
825 
826 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
827 			goto wrong_mc3_mce;
828 
829 		pr_cont(" during %s.\n", R4_MSG(ec));
830 	} else
831 		goto wrong_mc3_mce;
832 
833 	return;
834 
835  wrong_mc3_mce:
836 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
837 }
838 
decode_mc4_mce(struct mce * m)839 static void decode_mc4_mce(struct mce *m)
840 {
841 	unsigned int fam = x86_family(m->cpuid);
842 	int node_id = amd_get_nb_id(m->extcpu);
843 	u16 ec = EC(m->status);
844 	u8 xec = XEC(m->status, 0x1f);
845 	u8 offset = 0;
846 
847 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
848 
849 	switch (xec) {
850 	case 0x0 ... 0xe:
851 
852 		/* special handling for DRAM ECCs */
853 		if (xec == 0x0 || xec == 0x8) {
854 			/* no ECCs on F11h */
855 			if (fam == 0x11)
856 				goto wrong_mc4_mce;
857 
858 			pr_cont("%s.\n", mc4_mce_desc[xec]);
859 
860 			if (decode_dram_ecc)
861 				decode_dram_ecc(node_id, m);
862 			return;
863 		}
864 		break;
865 
866 	case 0xf:
867 		if (TLB_ERROR(ec))
868 			pr_cont("GART Table Walk data error.\n");
869 		else if (BUS_ERROR(ec))
870 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
871 		else
872 			goto wrong_mc4_mce;
873 		return;
874 
875 	case 0x19:
876 		if (fam == 0x15 || fam == 0x16)
877 			pr_cont("Compute Unit Data Error.\n");
878 		else
879 			goto wrong_mc4_mce;
880 		return;
881 
882 	case 0x1c ... 0x1f:
883 		offset = 13;
884 		break;
885 
886 	default:
887 		goto wrong_mc4_mce;
888 	}
889 
890 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
891 	return;
892 
893  wrong_mc4_mce:
894 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
895 }
896 
decode_mc5_mce(struct mce * m)897 static void decode_mc5_mce(struct mce *m)
898 {
899 	unsigned int fam = x86_family(m->cpuid);
900 	u16 ec = EC(m->status);
901 	u8 xec = XEC(m->status, xec_mask);
902 
903 	if (fam == 0xf || fam == 0x11)
904 		goto wrong_mc5_mce;
905 
906 	pr_emerg(HW_ERR "MC5 Error: ");
907 
908 	if (INT_ERROR(ec)) {
909 		if (xec <= 0x1f) {
910 			pr_cont("Hardware Assert.\n");
911 			return;
912 		} else
913 			goto wrong_mc5_mce;
914 	}
915 
916 	if (xec == 0x0 || xec == 0xc)
917 		pr_cont("%s.\n", mc5_mce_desc[xec]);
918 	else if (xec <= 0xd)
919 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
920 	else
921 		goto wrong_mc5_mce;
922 
923 	return;
924 
925  wrong_mc5_mce:
926 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
927 }
928 
decode_mc6_mce(struct mce * m)929 static void decode_mc6_mce(struct mce *m)
930 {
931 	u8 xec = XEC(m->status, xec_mask);
932 
933 	pr_emerg(HW_ERR "MC6 Error: ");
934 
935 	if (xec > 0x5)
936 		goto wrong_mc6_mce;
937 
938 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
939 	return;
940 
941  wrong_mc6_mce:
942 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
943 }
944 
945 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)946 static void decode_smca_error(struct mce *m)
947 {
948 	struct smca_hwid *hwid;
949 	enum smca_bank_types bank_type;
950 	const char *ip_name;
951 	u8 xec = XEC(m->status, xec_mask);
952 
953 	if (m->bank >= ARRAY_SIZE(smca_banks))
954 		return;
955 
956 	hwid = smca_banks[m->bank].hwid;
957 	if (!hwid)
958 		return;
959 
960 	bank_type = hwid->bank_type;
961 
962 	if (bank_type == SMCA_RESERVED) {
963 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
964 		return;
965 	}
966 
967 	ip_name = smca_get_long_name(bank_type);
968 
969 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
970 
971 	/* Only print the decode of valid error codes */
972 	if (xec < smca_mce_descs[bank_type].num_descs &&
973 			(hwid->xec_bitmap & BIT_ULL(xec))) {
974 		pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
975 	}
976 
977 	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
978 		decode_dram_ecc(cpu_to_node(m->extcpu), m);
979 }
980 
/*
 * Print the fields of the generic MCA error code @ec on one line.
 *
 * Internal errors only carry an internal error type. For all other classes
 * the cache level is printed, followed by the class-specific fields.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (BUS_ERROR(ec)) {
		pr_cont(", mem/io: %s", II_MSG(ec));
		pr_cont(", mem-tx: %s", R4_MSG(ec));
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	} else {
		pr_cont(", tx: %s", TT_MSG(ec));

		if (MEM_ERROR(ec))
			pr_cont(", mem-tx: %s", R4_MSG(ec));
	}

	pr_cont("\n");
}
1004 
1005 /*
1006  * Filter out unwanted MCE signatures here.
1007  */
ignore_mce(struct mce * m)1008 static bool ignore_mce(struct mce *m)
1009 {
1010 	/*
1011 	 * NB GART TLB error reporting is disabled by default.
1012 	 */
1013 	if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
1014 		return true;
1015 
1016 	return false;
1017 }
1018 
decode_error_status(struct mce * m)1019 static const char *decode_error_status(struct mce *m)
1020 {
1021 	if (m->status & MCI_STATUS_UC) {
1022 		if (m->status & MCI_STATUS_PCC)
1023 			return "System Fatal error.";
1024 		if (m->mcgstatus & MCG_STATUS_RIPV)
1025 			return "Uncorrected, software restartable error.";
1026 		return "Uncorrected, software containable error.";
1027 	}
1028 
1029 	if (m->status & MCI_STATUS_DEFERRED)
1030 		return "Deferred error, no action required.";
1031 
1032 	return "Corrected error, no action required.";
1033 }
1034 
1035 static int
amd_decode_mce(struct notifier_block * nb,unsigned long val,void * data)1036 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1037 {
1038 	struct mce *m = (struct mce *)data;
1039 	unsigned int fam = x86_family(m->cpuid);
1040 	int ecc;
1041 
1042 	if (ignore_mce(m))
1043 		return NOTIFY_STOP;
1044 
1045 	pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1046 
1047 	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1048 		m->extcpu,
1049 		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1050 		m->bank,
1051 		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
1052 		((m->status & MCI_STATUS_UC)	? "UE"	  :
1053 		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1054 		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
1055 		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
1056 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
1057 
1058 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1059 		u32 low, high;
1060 		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1061 
1062 		if (!rdmsr_safe(addr, &low, &high) &&
1063 		    (low & MCI_CONFIG_MCAX))
1064 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1065 
1066 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1067 	}
1068 
1069 	/* do the two bits[14:13] together */
1070 	ecc = (m->status >> 45) & 0x3;
1071 	if (ecc)
1072 		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1073 
1074 	if (fam >= 0x15) {
1075 		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1076 
1077 		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
1078 		if (fam != 0x15 || m->bank != 4)
1079 			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1080 	}
1081 
1082 	if (fam >= 0x17)
1083 		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1084 
1085 	pr_cont("]: 0x%016llx\n", m->status);
1086 
1087 	if (m->status & MCI_STATUS_ADDRV)
1088 		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1089 
1090 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1091 		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1092 
1093 		if (m->status & MCI_STATUS_SYNDV)
1094 			pr_cont(", Syndrome: 0x%016llx", m->synd);
1095 
1096 		pr_cont("\n");
1097 
1098 		decode_smca_error(m);
1099 		goto err_code;
1100 	}
1101 
1102 	if (m->tsc)
1103 		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1104 
1105 	if (!fam_ops)
1106 		goto err_code;
1107 
1108 	switch (m->bank) {
1109 	case 0:
1110 		decode_mc0_mce(m);
1111 		break;
1112 
1113 	case 1:
1114 		decode_mc1_mce(m);
1115 		break;
1116 
1117 	case 2:
1118 		decode_mc2_mce(m);
1119 		break;
1120 
1121 	case 3:
1122 		decode_mc3_mce(m);
1123 		break;
1124 
1125 	case 4:
1126 		decode_mc4_mce(m);
1127 		break;
1128 
1129 	case 5:
1130 		decode_mc5_mce(m);
1131 		break;
1132 
1133 	case 6:
1134 		decode_mc6_mce(m);
1135 		break;
1136 
1137 	default:
1138 		break;
1139 	}
1140 
1141  err_code:
1142 	amd_decode_err_code(m->status & 0xffff);
1143 
1144 	return NOTIFY_STOP;
1145 }
1146 
1147 static struct notifier_block amd_mce_dec_nb = {
1148 	.notifier_call	= amd_decode_mce,
1149 	.priority	= MCE_PRIO_EDAC,
1150 };
1151 
mce_amd_init(void)1152 static int __init mce_amd_init(void)
1153 {
1154 	struct cpuinfo_x86 *c = &boot_cpu_data;
1155 
1156 	if (c->x86_vendor != X86_VENDOR_AMD &&
1157 	    c->x86_vendor != X86_VENDOR_HYGON)
1158 		return -ENODEV;
1159 
1160 	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1161 	if (!fam_ops)
1162 		return -ENOMEM;
1163 
1164 	switch (c->x86) {
1165 	case 0xf:
1166 		fam_ops->mc0_mce = k8_mc0_mce;
1167 		fam_ops->mc1_mce = k8_mc1_mce;
1168 		fam_ops->mc2_mce = k8_mc2_mce;
1169 		break;
1170 
1171 	case 0x10:
1172 		fam_ops->mc0_mce = f10h_mc0_mce;
1173 		fam_ops->mc1_mce = k8_mc1_mce;
1174 		fam_ops->mc2_mce = k8_mc2_mce;
1175 		break;
1176 
1177 	case 0x11:
1178 		fam_ops->mc0_mce = k8_mc0_mce;
1179 		fam_ops->mc1_mce = k8_mc1_mce;
1180 		fam_ops->mc2_mce = k8_mc2_mce;
1181 		break;
1182 
1183 	case 0x12:
1184 		fam_ops->mc0_mce = f12h_mc0_mce;
1185 		fam_ops->mc1_mce = k8_mc1_mce;
1186 		fam_ops->mc2_mce = k8_mc2_mce;
1187 		break;
1188 
1189 	case 0x14:
1190 		fam_ops->mc0_mce = cat_mc0_mce;
1191 		fam_ops->mc1_mce = cat_mc1_mce;
1192 		fam_ops->mc2_mce = k8_mc2_mce;
1193 		break;
1194 
1195 	case 0x15:
1196 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1197 
1198 		fam_ops->mc0_mce = f15h_mc0_mce;
1199 		fam_ops->mc1_mce = f15h_mc1_mce;
1200 		fam_ops->mc2_mce = f15h_mc2_mce;
1201 		break;
1202 
1203 	case 0x16:
1204 		xec_mask = 0x1f;
1205 		fam_ops->mc0_mce = cat_mc0_mce;
1206 		fam_ops->mc1_mce = cat_mc1_mce;
1207 		fam_ops->mc2_mce = f16h_mc2_mce;
1208 		break;
1209 
1210 	case 0x17:
1211 	case 0x18:
1212 		xec_mask = 0x3f;
1213 		if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1214 			printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1215 			goto err_out;
1216 		}
1217 		break;
1218 
1219 	default:
1220 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1221 		goto err_out;
1222 	}
1223 
1224 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1225 
1226 	mce_register_decode_chain(&amd_mce_dec_nb);
1227 
1228 	return 0;
1229 
1230 err_out:
1231 	kfree(fam_ops);
1232 	fam_ops = NULL;
1233 	return -EINVAL;
1234 }
1235 early_initcall(mce_amd_init);
1236 
#ifdef MODULE
/* Module unload: leave the MCE decode chain and free the callback table. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif
1249