1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4
5 #include <asm/cpu.h>
6
7 #include "mce_amd.h"
8
9 static struct amd_decoder_ops *fam_ops;
10
11 static u8 xec_mask = 0xf;
12
13 static bool report_gart_errors;
14 static void (*decode_dram_ecc)(int node_id, struct mce *m);
15
/**
 * amd_report_gart_errors - enable or disable reporting of NB GART TLB errors
 * @v: true to have them decoded, false to have ignore_mce() filter them out
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
21
amd_register_ecc_decoder(void (* f)(int,struct mce *))22 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
23 {
24 decode_dram_ecc = f;
25 }
26 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
27
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))28 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
29 {
30 if (decode_dram_ecc) {
31 WARN_ON(decode_dram_ecc != f);
32
33 decode_dram_ecc = NULL;
34 }
35 }
36 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
37
/*
 * String representations of the individual fields of an MCA error code, see
 * F3x48 or MSR0000_0411. Each table is indexed by the raw field value.
 */

/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};

/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};

/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
53
/* participating processor; exported so code outside this file can use it */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
57
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};

/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};

/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
66
/*
 * Family 15h MC1 descriptions. The table is packed, so f15h_mc1_mce() maps the
 * extended error code (xec) to an index:
 *   xec 0x00-0x0a -> [0]-[10]
 *   xec 0x0d      -> [11]
 *   xec 0x10-0x15 -> [12]-[17]
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",
	[11] = "PFB valid bit parity error",		/* xec = 0xd */
	[12] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",
};
87
/*
 * Family 15h MC2 descriptions for memory errors. f15h_mc2_mce() maps the
 * extended error code (xec) to an index:
 *   xec 0x04-0x0c -> [0]-[8]
 *   xec 0x10-0x14 -> [9]-[13]
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",		/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",
	[9]  = "L2 Tag ECC error",			/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",
};
104
/*
 * MC4 descriptions. decode_mc4_mce() maps the extended error code (xec) to an
 * index:
 *   xec 0x00-0x0e -> [0]-[14]
 *   xec 0x1c-0x1f -> [15]-[18]
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",
	[15] = "L3 data cache ECC error",		/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",
};
126
/* MC5 descriptions, indexed directly by extended error code (0x0-0xd). */
static const char * const mc5_mce_desc[] = {
	[0x0] = "CPU Watchdog timer expire",
	[0x1] = "Wakeup array dest tag",
	[0x2] = "AG payload array",
	[0x3] = "EX payload array",
	[0x4] = "IDRF array",
	[0x5] = "Retire dispatch queue",
	[0x6] = "Mapper checkpoint array",
	[0x7] = "Physical register file EX0 port",
	[0x8] = "Physical register file EX1 port",
	[0x9] = "Physical register file AG0 port",
	[0xa] = "Physical register file AG1 port",
	[0xb] = "Flag register file",
	[0xc] = "DE error occurred",
	[0xd] = "Retire status queue",
};
143
/* MC6 descriptions, indexed directly by extended error code (0x0-0x5). */
static const char * const mc6_mce_desc[] = {
	[0x0] = "Hardware Assertion",
	[0x1] = "Free List",
	[0x2] = "Physical Register File",
	[0x3] = "Retire Queue",
	[0x4] = "Scheduler table",
	[0x5] = "Status Register File",
};
152
/*
 * Scalable MCA error strings. Each smca_*_mce_desc table below is indexed by
 * the extended error code, see decode_smca_error().
 */
static const char * const smca_ls_mce_desc[] = {
	[0]  = "Load queue parity error",
	[1]  = "Store queue parity error",
	[2]  = "Miss address buffer payload parity error",
	[3]  = "Level 1 TLB parity error",
	[4]  = "DC Tag error type 5",
	[5]  = "DC Tag error type 6",
	[6]  = "DC Tag error type 1",
	[7]  = "Internal error type 1",
	[8]  = "Internal error type 2",
	[9]  = "System Read Data Error Thread 0",
	[10] = "System Read Data Error Thread 1",
	[11] = "DC Tag error type 2",
	[12] = "DC Data error type 1 and poison consumption",
	[13] = "DC Data error type 2",
	[14] = "DC Data error type 3",
	[15] = "DC Tag error type 4",
	[16] = "Level 2 TLB parity error",
	[17] = "PDC parity error",
	[18] = "DC Tag error type 3",
	[19] = "DC Tag error type 5",
	[20] = "L2 Fill Data error",
};
177
/* SMCA_IF bank descriptions, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	[0]  = "Op Cache Microtag Probe Port Parity Error",
	[1]  = "IC Microtag or Full Tag Multi-hit Error",
	[2]  = "IC Full Tag Parity Error",
	[3]  = "IC Data Array Parity Error",
	[4]  = "Decoupling Queue PhysAddr Parity Error",
	[5]  = "L0 ITLB Parity Error",
	[6]  = "L1 ITLB Parity Error",
	[7]  = "L2 ITLB Parity Error",
	[8]  = "BPQ Thread 0 Snoop Parity Error",
	[9]  = "BPQ Thread 1 Snoop Parity Error",
	[10] = "L1 BTB Multi-Match Error",
	[11] = "L2 BTB Multi-Match Error",
	[12] = "L2 Cache Response Poison Error",
	[13] = "System Read Data Error",
};
194
/* SMCA_L2_CACHE bank descriptions, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	[0] = "L2M Tag Multiple-Way-Hit error",
	[1] = "L2M Tag or State Array ECC Error",
	[2] = "L2M Data Array ECC Error",
	[3] = "Hardware Assert Error",
};
201
/* SMCA_DE bank descriptions, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	[0] = "Micro-op cache tag parity error",
	[1] = "Micro-op cache data parity error",
	[2] = "Instruction buffer parity error",
	[3] = "Micro-op queue parity error",
	[4] = "Instruction dispatch queue parity error",
	[5] = "Fetch address FIFO parity error",
	[6] = "Patch RAM data parity error",
	[7] = "Patch RAM sequencer parity error",
	[8] = "Micro-op buffer parity error",
};
213
/* SMCA_EX bank descriptions, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	[0]  = "Watchdog Timeout error",
	[1]  = "Physical register file parity error",
	[2]  = "Flag register file parity error",
	[3]  = "Immediate displacement register file parity error",
	[4]  = "Address generator payload parity error",
	[5]  = "EX payload parity error",
	[6]  = "Checkpoint queue parity error",
	[7]  = "Retire dispatch queue parity error",
	[8]  = "Retire status queue parity error",
	[9]  = "Scheduling queue parity error",
	[10] = "Branch buffer queue parity error",
	[11] = "Hardware Assertion error",
};
228
/* SMCA_FP bank descriptions, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	[0] = "Physical register file (PRF) parity error",
	[1] = "Freelist (FL) parity error",
	[2] = "Schedule queue parity error",
	[3] = "NSQ parity error",
	[4] = "Retire queue (RQ) parity error",
	[5] = "Status register file (SRF) parity error",
	[6] = "Hardware assertion",
};
238
/* SMCA_L3_CACHE bank descriptions, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	[0] = "Shadow Tag Macro ECC Error",
	[1] = "Shadow Tag Macro Multi-way-hit Error",
	[2] = "L3M Tag ECC Error",
	[3] = "L3M Tag Multi-way-hit Error",
	[4] = "L3M Data ECC Error",
	[5] = "SDP Parity Error or SystemReadDataError from XI",
	[6] = "L3 Victim Queue Parity Error",
	[7] = "L3 Hardware Assertion",
};
249
/* SMCA_CS bank descriptions, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	[0] = "Illegal Request",
	[1] = "Address Violation",
	[2] = "Security Violation",
	[3] = "Illegal Response",
	[4] = "Unexpected Response",
	[5] = "Request or Probe Parity Error",
	[6] = "Read Response Parity Error",
	[7] = "Atomic Request Parity Error",
	[8] = "Probe Filter ECC Error",
};
261
/* SMCA_CS_V2 bank descriptions, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	[0]  = "Illegal Request",
	[1]  = "Address Violation",
	[2]  = "Security Violation",
	[3]  = "Illegal Response",
	[4]  = "Unexpected Response",
	[5]  = "Request or Probe Parity Error",
	[6]  = "Read Response Parity Error",
	[7]  = "Atomic Request Parity Error",
	[8]  = "SDP read response had no match in the CS queue",
	[9]  = "Probe Filter Protocol Error",
	[10] = "Probe Filter ECC Error",
	[11] = "SDP read response had an unexpected RETRY error",
	[12] = "Counter overflow error",
	[13] = "Counter underflow error",
};
278
/* SMCA_PIE bank descriptions, indexed by extended error code. */
static const char * const smca_pie_mce_desc[] = {
	[0] = "Hardware Assert",
	[1] = "Register security violation",
	[2] = "Link Error",
	[3] = "Poison data consumption",
	[4] = "A deferred error was detected in the DF",
};
286
/*
 * SMCA_UMC bank descriptions, indexed by extended error code. Entry 0 is the
 * case decode_smca_error() additionally hands to the DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	[0] = "DRAM ECC error",
	[1] = "Data poison error",
	[2] = "SDP parity error",
	[3] = "Advanced peripheral bus error",
	[4] = "Address/Command parity error",
	[5] = "Write data CRC error",
	[6] = "DCQ SRAM ECC error",
	[7] = "AES SRAM ECC error",
};
297
/* SMCA_PB bank descriptions, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	[0] = "An ECC error in the Parameter Block RAM array",
};
301
/* SMCA_PSP bank descriptions, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	[0] = "An ECC or parity error in a PSP RAM instance",
};
305
/* SMCA_PSP_V2 bank descriptions, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Instruction Cache Bank 0 ECC or parity error",
	[3]  = "Instruction Cache Bank 1 ECC or parity error",
	[4]  = "Instruction Tag Ram 0 parity error",
	[5]  = "Instruction Tag Ram 1 parity error",
	[6]  = "Data Cache Bank 0 ECC or parity error",
	[7]  = "Data Cache Bank 1 ECC or parity error",
	[8]  = "Data Cache Bank 2 ECC or parity error",
	[9]  = "Data Cache Bank 3 ECC or parity error",
	[10] = "Data Tag Bank 0 parity error",
	[11] = "Data Tag Bank 1 parity error",
	[12] = "Data Tag Bank 2 parity error",
	[13] = "Data Tag Bank 3 parity error",
	[14] = "Dirty Data Ram parity error",
	[15] = "TLB Bank 0 parity error",
	[16] = "TLB Bank 1 parity error",
	[17] = "System Hub Read Buffer ECC or parity error",
};
326
/* SMCA_SMU bank descriptions, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	[0] = "An ECC or parity error in an SMU RAM instance",
};
330
/* SMCA_SMU_V2 bank descriptions, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Data Cache Bank A ECC or parity error",
	[3]  = "Data Cache Bank B ECC or parity error",
	[4]  = "Data Tag Cache Bank A ECC or parity error",
	[5]  = "Data Tag Cache Bank B ECC or parity error",
	[6]  = "Instruction Cache Bank A ECC or parity error",
	[7]  = "Instruction Cache Bank B ECC or parity error",
	[8]  = "Instruction Tag Cache Bank A ECC or parity error",
	[9]  = "Instruction Tag Cache Bank B ECC or parity error",
	[10] = "System Hub Read Buffer ECC or parity error",
};
344
/* SMCA_MP5 bank descriptions, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	[0] = "High SRAM ECC or parity error",
	[1] = "Low SRAM ECC or parity error",
	[2] = "Data Cache Bank A ECC or parity error",
	[3] = "Data Cache Bank B ECC or parity error",
	[4] = "Data Tag Cache Bank A ECC or parity error",
	[5] = "Data Tag Cache Bank B ECC or parity error",
	[6] = "Instruction Cache Bank A ECC or parity error",
	[7] = "Instruction Cache Bank B ECC or parity error",
	[8] = "Instruction Tag Cache Bank A ECC or parity error",
	[9] = "Instruction Tag Cache Bank B ECC or parity error",
};
357
/* SMCA_NBIO bank descriptions, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	[0] = "ECC or Parity error",
	[1] = "PCIE error",
	[2] = "SDP ErrEvent error",
	[3] = "SDP Egress Poison Error",
	[4] = "IOHC Internal Poison Error",
};
365
/* SMCA_PCIE bank descriptions, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	[0] = "CCIX PER Message logging",
	[1] = "CCIX Read Response with Status: Non-Data Error",
	[2] = "CCIX Write Response with Status: Non-Data Error",
	[3] = "CCIX Read Response with Status: Data Error",
	[4] = "CCIX Non-okay write response with data error",
};
373
/* One bank type's table of extended-error-code description strings. */
struct smca_mce_desc {
	/* description strings, indexed by extended error code */
	const char * const *descs;
	/* number of entries in @descs */
	unsigned int num_descs;
};
378
379 static struct smca_mce_desc smca_mce_descs[] = {
380 [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
381 [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
382 [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
383 [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
384 [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
385 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
386 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
387 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
388 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
389 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
390 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
391 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
392 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
393 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
394 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
395 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
396 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
397 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
398 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
399 };
400
f12h_mc0_mce(u16 ec,u8 xec)401 static bool f12h_mc0_mce(u16 ec, u8 xec)
402 {
403 bool ret = false;
404
405 if (MEM_ERROR(ec)) {
406 u8 ll = LL(ec);
407 ret = true;
408
409 if (ll == LL_L2)
410 pr_cont("during L1 linefill from L2.\n");
411 else if (ll == LL_L1)
412 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
413 else
414 ret = false;
415 }
416 return ret;
417 }
418
f10h_mc0_mce(u16 ec,u8 xec)419 static bool f10h_mc0_mce(u16 ec, u8 xec)
420 {
421 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
422 pr_cont("during data scrub.\n");
423 return true;
424 }
425 return f12h_mc0_mce(ec, xec);
426 }
427
k8_mc0_mce(u16 ec,u8 xec)428 static bool k8_mc0_mce(u16 ec, u8 xec)
429 {
430 if (BUS_ERROR(ec)) {
431 pr_cont("during system linefill.\n");
432 return true;
433 }
434
435 return f10h_mc0_mce(ec, xec);
436 }
437
cat_mc0_mce(u16 ec,u8 xec)438 static bool cat_mc0_mce(u16 ec, u8 xec)
439 {
440 u8 r4 = R4(ec);
441 bool ret = true;
442
443 if (MEM_ERROR(ec)) {
444
445 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
446 return false;
447
448 switch (r4) {
449 case R4_DRD:
450 case R4_DWR:
451 pr_cont("Data/Tag parity error due to %s.\n",
452 (r4 == R4_DRD ? "load/hw prf" : "store"));
453 break;
454 case R4_EVICT:
455 pr_cont("Copyback parity error on a tag miss.\n");
456 break;
457 case R4_SNOOP:
458 pr_cont("Tag parity error during snoop.\n");
459 break;
460 default:
461 ret = false;
462 }
463 } else if (BUS_ERROR(ec)) {
464
465 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
466 return false;
467
468 pr_cont("System read data error on a ");
469
470 switch (r4) {
471 case R4_RD:
472 pr_cont("TLB reload.\n");
473 break;
474 case R4_DWR:
475 pr_cont("store.\n");
476 break;
477 case R4_DRD:
478 pr_cont("load.\n");
479 break;
480 default:
481 ret = false;
482 }
483 } else {
484 ret = false;
485 }
486
487 return ret;
488 }
489
f15h_mc0_mce(u16 ec,u8 xec)490 static bool f15h_mc0_mce(u16 ec, u8 xec)
491 {
492 bool ret = true;
493
494 if (MEM_ERROR(ec)) {
495
496 switch (xec) {
497 case 0x0:
498 pr_cont("Data Array access error.\n");
499 break;
500
501 case 0x1:
502 pr_cont("UC error during a linefill from L2/NB.\n");
503 break;
504
505 case 0x2:
506 case 0x11:
507 pr_cont("STQ access error.\n");
508 break;
509
510 case 0x3:
511 pr_cont("SCB access error.\n");
512 break;
513
514 case 0x10:
515 pr_cont("Tag error.\n");
516 break;
517
518 case 0x12:
519 pr_cont("LDQ access error.\n");
520 break;
521
522 default:
523 ret = false;
524 }
525 } else if (BUS_ERROR(ec)) {
526
527 if (!xec)
528 pr_cont("System Read Data Error.\n");
529 else
530 pr_cont(" Internal error condition type %d.\n", xec);
531 } else if (INT_ERROR(ec)) {
532 if (xec <= 0x1f)
533 pr_cont("Hardware Assert.\n");
534 else
535 ret = false;
536
537 } else
538 ret = false;
539
540 return ret;
541 }
542
decode_mc0_mce(struct mce * m)543 static void decode_mc0_mce(struct mce *m)
544 {
545 u16 ec = EC(m->status);
546 u8 xec = XEC(m->status, xec_mask);
547
548 pr_emerg(HW_ERR "MC0 Error: ");
549
550 /* TLB error signatures are the same across families */
551 if (TLB_ERROR(ec)) {
552 if (TT(ec) == TT_DATA) {
553 pr_cont("%s TLB %s.\n", LL_MSG(ec),
554 ((xec == 2) ? "locked miss"
555 : (xec ? "multimatch" : "parity")));
556 return;
557 }
558 } else if (fam_ops->mc0_mce(ec, xec))
559 ;
560 else
561 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
562 }
563
k8_mc1_mce(u16 ec,u8 xec)564 static bool k8_mc1_mce(u16 ec, u8 xec)
565 {
566 u8 ll = LL(ec);
567 bool ret = true;
568
569 if (!MEM_ERROR(ec))
570 return false;
571
572 if (ll == 0x2)
573 pr_cont("during a linefill from L2.\n");
574 else if (ll == 0x1) {
575 switch (R4(ec)) {
576 case R4_IRD:
577 pr_cont("Parity error during data load.\n");
578 break;
579
580 case R4_EVICT:
581 pr_cont("Copyback Parity/Victim error.\n");
582 break;
583
584 case R4_SNOOP:
585 pr_cont("Tag Snoop error.\n");
586 break;
587
588 default:
589 ret = false;
590 break;
591 }
592 } else
593 ret = false;
594
595 return ret;
596 }
597
cat_mc1_mce(u16 ec,u8 xec)598 static bool cat_mc1_mce(u16 ec, u8 xec)
599 {
600 u8 r4 = R4(ec);
601 bool ret = true;
602
603 if (!MEM_ERROR(ec))
604 return false;
605
606 if (TT(ec) != TT_INSTR)
607 return false;
608
609 if (r4 == R4_IRD)
610 pr_cont("Data/tag array parity error for a tag hit.\n");
611 else if (r4 == R4_SNOOP)
612 pr_cont("Tag error during snoop/victimization.\n");
613 else if (xec == 0x0)
614 pr_cont("Tag parity error from victim castout.\n");
615 else if (xec == 0x2)
616 pr_cont("Microcode patch RAM parity error.\n");
617 else
618 ret = false;
619
620 return ret;
621 }
622
f15h_mc1_mce(u16 ec,u8 xec)623 static bool f15h_mc1_mce(u16 ec, u8 xec)
624 {
625 bool ret = true;
626
627 if (!MEM_ERROR(ec))
628 return false;
629
630 switch (xec) {
631 case 0x0 ... 0xa:
632 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
633 break;
634
635 case 0xd:
636 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
637 break;
638
639 case 0x10:
640 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
641 break;
642
643 case 0x11 ... 0x15:
644 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
645 break;
646
647 default:
648 ret = false;
649 }
650 return ret;
651 }
652
decode_mc1_mce(struct mce * m)653 static void decode_mc1_mce(struct mce *m)
654 {
655 u16 ec = EC(m->status);
656 u8 xec = XEC(m->status, xec_mask);
657
658 pr_emerg(HW_ERR "MC1 Error: ");
659
660 if (TLB_ERROR(ec))
661 pr_cont("%s TLB %s.\n", LL_MSG(ec),
662 (xec ? "multimatch" : "parity error"));
663 else if (BUS_ERROR(ec)) {
664 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
665
666 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
667 } else if (INT_ERROR(ec)) {
668 if (xec <= 0x3f)
669 pr_cont("Hardware Assert.\n");
670 else
671 goto wrong_mc1_mce;
672 } else if (fam_ops->mc1_mce(ec, xec))
673 ;
674 else
675 goto wrong_mc1_mce;
676
677 return;
678
679 wrong_mc1_mce:
680 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
681 }
682
k8_mc2_mce(u16 ec,u8 xec)683 static bool k8_mc2_mce(u16 ec, u8 xec)
684 {
685 bool ret = true;
686
687 if (xec == 0x1)
688 pr_cont(" in the write data buffers.\n");
689 else if (xec == 0x3)
690 pr_cont(" in the victim data buffers.\n");
691 else if (xec == 0x2 && MEM_ERROR(ec))
692 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
693 else if (xec == 0x0) {
694 if (TLB_ERROR(ec))
695 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
696 TT_MSG(ec));
697 else if (BUS_ERROR(ec))
698 pr_cont(": %s/ECC error in data read from NB: %s.\n",
699 R4_MSG(ec), PP_MSG(ec));
700 else if (MEM_ERROR(ec)) {
701 u8 r4 = R4(ec);
702
703 if (r4 >= 0x7)
704 pr_cont(": %s error during data copyback.\n",
705 R4_MSG(ec));
706 else if (r4 <= 0x1)
707 pr_cont(": %s parity/ECC error during data "
708 "access from L2.\n", R4_MSG(ec));
709 else
710 ret = false;
711 } else
712 ret = false;
713 } else
714 ret = false;
715
716 return ret;
717 }
718
f15h_mc2_mce(u16 ec,u8 xec)719 static bool f15h_mc2_mce(u16 ec, u8 xec)
720 {
721 bool ret = true;
722
723 if (TLB_ERROR(ec)) {
724 if (xec == 0x0)
725 pr_cont("Data parity TLB read error.\n");
726 else if (xec == 0x1)
727 pr_cont("Poison data provided for TLB fill.\n");
728 else
729 ret = false;
730 } else if (BUS_ERROR(ec)) {
731 if (xec > 2)
732 ret = false;
733
734 pr_cont("Error during attempted NB data read.\n");
735 } else if (MEM_ERROR(ec)) {
736 switch (xec) {
737 case 0x4 ... 0xc:
738 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
739 break;
740
741 case 0x10 ... 0x14:
742 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
743 break;
744
745 default:
746 ret = false;
747 }
748 } else if (INT_ERROR(ec)) {
749 if (xec <= 0x3f)
750 pr_cont("Hardware Assert.\n");
751 else
752 ret = false;
753 }
754
755 return ret;
756 }
757
f16h_mc2_mce(u16 ec,u8 xec)758 static bool f16h_mc2_mce(u16 ec, u8 xec)
759 {
760 u8 r4 = R4(ec);
761
762 if (!MEM_ERROR(ec))
763 return false;
764
765 switch (xec) {
766 case 0x04 ... 0x05:
767 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
768 break;
769
770 case 0x09 ... 0x0b:
771 case 0x0d ... 0x0f:
772 pr_cont("ECC error in L2 tag (%s).\n",
773 ((r4 == R4_GEN) ? "BankReq" :
774 ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
775 break;
776
777 case 0x10 ... 0x19:
778 case 0x1b:
779 pr_cont("ECC error in L2 data array (%s).\n",
780 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
781 ((r4 == R4_GEN) ? "Attr" :
782 ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
783 break;
784
785 case 0x1c ... 0x1d:
786 case 0x1f:
787 pr_cont("Parity error in L2 attribute bits (%s).\n",
788 ((r4 == R4_RD) ? "Hit" :
789 ((r4 == R4_GEN) ? "Attr" : "Fill")));
790 break;
791
792 default:
793 return false;
794 }
795
796 return true;
797 }
798
decode_mc2_mce(struct mce * m)799 static void decode_mc2_mce(struct mce *m)
800 {
801 u16 ec = EC(m->status);
802 u8 xec = XEC(m->status, xec_mask);
803
804 pr_emerg(HW_ERR "MC2 Error: ");
805
806 if (!fam_ops->mc2_mce(ec, xec))
807 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
808 }
809
decode_mc3_mce(struct mce * m)810 static void decode_mc3_mce(struct mce *m)
811 {
812 u16 ec = EC(m->status);
813 u8 xec = XEC(m->status, xec_mask);
814
815 if (boot_cpu_data.x86 >= 0x14) {
816 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
817 " please report on LKML.\n");
818 return;
819 }
820
821 pr_emerg(HW_ERR "MC3 Error");
822
823 if (xec == 0x0) {
824 u8 r4 = R4(ec);
825
826 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
827 goto wrong_mc3_mce;
828
829 pr_cont(" during %s.\n", R4_MSG(ec));
830 } else
831 goto wrong_mc3_mce;
832
833 return;
834
835 wrong_mc3_mce:
836 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
837 }
838
decode_mc4_mce(struct mce * m)839 static void decode_mc4_mce(struct mce *m)
840 {
841 unsigned int fam = x86_family(m->cpuid);
842 int node_id = amd_get_nb_id(m->extcpu);
843 u16 ec = EC(m->status);
844 u8 xec = XEC(m->status, 0x1f);
845 u8 offset = 0;
846
847 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
848
849 switch (xec) {
850 case 0x0 ... 0xe:
851
852 /* special handling for DRAM ECCs */
853 if (xec == 0x0 || xec == 0x8) {
854 /* no ECCs on F11h */
855 if (fam == 0x11)
856 goto wrong_mc4_mce;
857
858 pr_cont("%s.\n", mc4_mce_desc[xec]);
859
860 if (decode_dram_ecc)
861 decode_dram_ecc(node_id, m);
862 return;
863 }
864 break;
865
866 case 0xf:
867 if (TLB_ERROR(ec))
868 pr_cont("GART Table Walk data error.\n");
869 else if (BUS_ERROR(ec))
870 pr_cont("DMA Exclusion Vector Table Walk error.\n");
871 else
872 goto wrong_mc4_mce;
873 return;
874
875 case 0x19:
876 if (fam == 0x15 || fam == 0x16)
877 pr_cont("Compute Unit Data Error.\n");
878 else
879 goto wrong_mc4_mce;
880 return;
881
882 case 0x1c ... 0x1f:
883 offset = 13;
884 break;
885
886 default:
887 goto wrong_mc4_mce;
888 }
889
890 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
891 return;
892
893 wrong_mc4_mce:
894 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
895 }
896
decode_mc5_mce(struct mce * m)897 static void decode_mc5_mce(struct mce *m)
898 {
899 unsigned int fam = x86_family(m->cpuid);
900 u16 ec = EC(m->status);
901 u8 xec = XEC(m->status, xec_mask);
902
903 if (fam == 0xf || fam == 0x11)
904 goto wrong_mc5_mce;
905
906 pr_emerg(HW_ERR "MC5 Error: ");
907
908 if (INT_ERROR(ec)) {
909 if (xec <= 0x1f) {
910 pr_cont("Hardware Assert.\n");
911 return;
912 } else
913 goto wrong_mc5_mce;
914 }
915
916 if (xec == 0x0 || xec == 0xc)
917 pr_cont("%s.\n", mc5_mce_desc[xec]);
918 else if (xec <= 0xd)
919 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
920 else
921 goto wrong_mc5_mce;
922
923 return;
924
925 wrong_mc5_mce:
926 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
927 }
928
decode_mc6_mce(struct mce * m)929 static void decode_mc6_mce(struct mce *m)
930 {
931 u8 xec = XEC(m->status, xec_mask);
932
933 pr_emerg(HW_ERR "MC6 Error: ");
934
935 if (xec > 0x5)
936 goto wrong_mc6_mce;
937
938 pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
939 return;
940
941 wrong_mc6_mce:
942 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
943 }
944
945 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)946 static void decode_smca_error(struct mce *m)
947 {
948 struct smca_hwid *hwid;
949 enum smca_bank_types bank_type;
950 const char *ip_name;
951 u8 xec = XEC(m->status, xec_mask);
952
953 if (m->bank >= ARRAY_SIZE(smca_banks))
954 return;
955
956 hwid = smca_banks[m->bank].hwid;
957 if (!hwid)
958 return;
959
960 bank_type = hwid->bank_type;
961
962 if (bank_type == SMCA_RESERVED) {
963 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
964 return;
965 }
966
967 ip_name = smca_get_long_name(bank_type);
968
969 pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
970
971 /* Only print the decode of valid error codes */
972 if (xec < smca_mce_descs[bank_type].num_descs &&
973 (hwid->xec_bitmap & BIT_ULL(xec))) {
974 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
975 }
976
977 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
978 decode_dram_ecc(cpu_to_node(m->extcpu), m);
979 }
980
/*
 * Print the generic fields of an MCA error code. Internal errors print only
 * the internal error type. Otherwise the cache level is always printed,
 * followed by either the mem/io field (bus errors) or the transaction type,
 * the memory transaction type for memory and bus errors, and the
 * participating processor plus timeout state for bus errors.
 */
static inline void amd_decode_err_code(u16 ec)
{
	bool bus = BUS_ERROR(ec);

	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (bus) {
		pr_cont(", mem/io: %s", II_MSG(ec));
		pr_cont(", mem-tx: %s", R4_MSG(ec));
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
	} else {
		pr_cont(", tx: %s", TT_MSG(ec));

		if (MEM_ERROR(ec))
			pr_cont(", mem-tx: %s", R4_MSG(ec));
	}

	pr_cont("\n");
}
1004
1005 /*
1006 * Filter out unwanted MCE signatures here.
1007 */
ignore_mce(struct mce * m)1008 static bool ignore_mce(struct mce *m)
1009 {
1010 /*
1011 * NB GART TLB error reporting is disabled by default.
1012 */
1013 if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
1014 return true;
1015
1016 return false;
1017 }
1018
decode_error_status(struct mce * m)1019 static const char *decode_error_status(struct mce *m)
1020 {
1021 if (m->status & MCI_STATUS_UC) {
1022 if (m->status & MCI_STATUS_PCC)
1023 return "System Fatal error.";
1024 if (m->mcgstatus & MCG_STATUS_RIPV)
1025 return "Uncorrected, software restartable error.";
1026 return "Uncorrected, software containable error.";
1027 }
1028
1029 if (m->status & MCI_STATUS_DEFERRED)
1030 return "Deferred error, no action required.";
1031
1032 return "Corrected error, no action required.";
1033 }
1034
1035 static int
amd_decode_mce(struct notifier_block * nb,unsigned long val,void * data)1036 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1037 {
1038 struct mce *m = (struct mce *)data;
1039 unsigned int fam = x86_family(m->cpuid);
1040 int ecc;
1041
1042 if (ignore_mce(m))
1043 return NOTIFY_STOP;
1044
1045 pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1046
1047 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1048 m->extcpu,
1049 fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1050 m->bank,
1051 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
1052 ((m->status & MCI_STATUS_UC) ? "UE" :
1053 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
1054 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
1055 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
1056 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
1057
1058 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1059 u32 low, high;
1060 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1061
1062 if (!rdmsr_safe(addr, &low, &high) &&
1063 (low & MCI_CONFIG_MCAX))
1064 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1065
1066 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1067 }
1068
1069 /* do the two bits[14:13] together */
1070 ecc = (m->status >> 45) & 0x3;
1071 if (ecc)
1072 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1073
1074 if (fam >= 0x15) {
1075 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1076
1077 /* F15h, bank4, bit 43 is part of McaStatSubCache. */
1078 if (fam != 0x15 || m->bank != 4)
1079 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1080 }
1081
1082 if (fam >= 0x17)
1083 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1084
1085 pr_cont("]: 0x%016llx\n", m->status);
1086
1087 if (m->status & MCI_STATUS_ADDRV)
1088 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1089
1090 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1091 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1092
1093 if (m->status & MCI_STATUS_SYNDV)
1094 pr_cont(", Syndrome: 0x%016llx", m->synd);
1095
1096 pr_cont("\n");
1097
1098 decode_smca_error(m);
1099 goto err_code;
1100 }
1101
1102 if (m->tsc)
1103 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1104
1105 if (!fam_ops)
1106 goto err_code;
1107
1108 switch (m->bank) {
1109 case 0:
1110 decode_mc0_mce(m);
1111 break;
1112
1113 case 1:
1114 decode_mc1_mce(m);
1115 break;
1116
1117 case 2:
1118 decode_mc2_mce(m);
1119 break;
1120
1121 case 3:
1122 decode_mc3_mce(m);
1123 break;
1124
1125 case 4:
1126 decode_mc4_mce(m);
1127 break;
1128
1129 case 5:
1130 decode_mc5_mce(m);
1131 break;
1132
1133 case 6:
1134 decode_mc6_mce(m);
1135 break;
1136
1137 default:
1138 break;
1139 }
1140
1141 err_code:
1142 amd_decode_err_code(m->status & 0xffff);
1143
1144 return NOTIFY_STOP;
1145 }
1146
1147 static struct notifier_block amd_mce_dec_nb = {
1148 .notifier_call = amd_decode_mce,
1149 .priority = MCE_PRIO_EDAC,
1150 };
1151
mce_amd_init(void)1152 static int __init mce_amd_init(void)
1153 {
1154 struct cpuinfo_x86 *c = &boot_cpu_data;
1155
1156 if (c->x86_vendor != X86_VENDOR_AMD &&
1157 c->x86_vendor != X86_VENDOR_HYGON)
1158 return -ENODEV;
1159
1160 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1161 if (!fam_ops)
1162 return -ENOMEM;
1163
1164 switch (c->x86) {
1165 case 0xf:
1166 fam_ops->mc0_mce = k8_mc0_mce;
1167 fam_ops->mc1_mce = k8_mc1_mce;
1168 fam_ops->mc2_mce = k8_mc2_mce;
1169 break;
1170
1171 case 0x10:
1172 fam_ops->mc0_mce = f10h_mc0_mce;
1173 fam_ops->mc1_mce = k8_mc1_mce;
1174 fam_ops->mc2_mce = k8_mc2_mce;
1175 break;
1176
1177 case 0x11:
1178 fam_ops->mc0_mce = k8_mc0_mce;
1179 fam_ops->mc1_mce = k8_mc1_mce;
1180 fam_ops->mc2_mce = k8_mc2_mce;
1181 break;
1182
1183 case 0x12:
1184 fam_ops->mc0_mce = f12h_mc0_mce;
1185 fam_ops->mc1_mce = k8_mc1_mce;
1186 fam_ops->mc2_mce = k8_mc2_mce;
1187 break;
1188
1189 case 0x14:
1190 fam_ops->mc0_mce = cat_mc0_mce;
1191 fam_ops->mc1_mce = cat_mc1_mce;
1192 fam_ops->mc2_mce = k8_mc2_mce;
1193 break;
1194
1195 case 0x15:
1196 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1197
1198 fam_ops->mc0_mce = f15h_mc0_mce;
1199 fam_ops->mc1_mce = f15h_mc1_mce;
1200 fam_ops->mc2_mce = f15h_mc2_mce;
1201 break;
1202
1203 case 0x16:
1204 xec_mask = 0x1f;
1205 fam_ops->mc0_mce = cat_mc0_mce;
1206 fam_ops->mc1_mce = cat_mc1_mce;
1207 fam_ops->mc2_mce = f16h_mc2_mce;
1208 break;
1209
1210 case 0x17:
1211 case 0x18:
1212 xec_mask = 0x3f;
1213 if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1214 printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1215 goto err_out;
1216 }
1217 break;
1218
1219 default:
1220 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1221 goto err_out;
1222 }
1223
1224 pr_info("MCE: In-kernel MCE decoding enabled.\n");
1225
1226 mce_register_decode_chain(&amd_mce_dec_nb);
1227
1228 return 0;
1229
1230 err_out:
1231 kfree(fam_ops);
1232 fam_ops = NULL;
1233 return -EINVAL;
1234 }
1235 early_initcall(mce_amd_init);
1236
#ifdef MODULE
/* Module unload: leave the decode chain, then release the callback table. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif
1249