Lines Matching +full:input +full:- +full:justification
96 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT) in get_ras_block_str()
99 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) in get_ras_block_str()
100 return ras_mca_block_string[ras_block->sub_block_index]; in get_ras_block_str()
102 return ras_block_string[ras_block->block]; in get_ras_block_str()
142 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
148 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
158 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
160 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
161 "RAS WARN: input address 0x%llx is invalid.\n", in amdgpu_reserve_page_direct()
163 return -EINVAL; in amdgpu_reserve_page_direct()
167 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
183 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
184 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
185 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
193 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private; in amdgpu_ras_debugfs_read()
195 .head = obj->head, in amdgpu_ras_debugfs_read()
200 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
201 return -EINVAL; in amdgpu_ras_debugfs_read()
204 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_debugfs_read()
205 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_debugfs_read()
206 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_debugfs_read()
207 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_debugfs_read()
216 s -= *pos; in amdgpu_ras_debugfs_read()
221 return -EINVAL; in amdgpu_ras_debugfs_read()
244 return -EINVAL; in amdgpu_ras_find_block_id_by_name()
255 int op = -1; in amdgpu_ras_debugfs_ctrl_parse_data()
263 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
270 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
282 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
284 if (op != -1) { in amdgpu_ras_debugfs_ctrl_parse_data()
288 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
290 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
291 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
297 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
299 data->head.block = block_id; in amdgpu_ras_debugfs_ctrl_parse_data()
302 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
304 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; in amdgpu_ras_debugfs_ctrl_parse_data()
306 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
308 data->op = op; in amdgpu_ras_debugfs_ctrl_parse_data()
319 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
320 data->head.sub_block_index = sub_block; in amdgpu_ras_debugfs_ctrl_parse_data()
321 data->inject.address = address; in amdgpu_ras_debugfs_ctrl_parse_data()
322 data->inject.value = value; in amdgpu_ras_debugfs_ctrl_parse_data()
323 data->inject.instance_mask = instance_mask; in amdgpu_ras_debugfs_ctrl_parse_data()
327 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
330 return -EINVAL; in amdgpu_ras_debugfs_ctrl_parse_data()
339 int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; in amdgpu_ras_instance_mask_check()
340 uint32_t mask, inst_mask = data->inject.instance_mask; in amdgpu_ras_instance_mask_check()
344 data->inject.instance_mask = 0; in amdgpu_ras_instance_mask_check()
345 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
352 switch (data->head.block) { in amdgpu_ras_instance_mask_check()
354 mask = GENMASK(num_xcc - 1, 0); in amdgpu_ras_instance_mask_check()
357 mask = GENMASK(adev->sdma.num_instances - 1, 0); in amdgpu_ras_instance_mask_check()
361 mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); in amdgpu_ras_instance_mask_check()
369 data->inject.instance_mask &= mask; in amdgpu_ras_instance_mask_check()
370 if (inst_mask != data->inject.instance_mask) in amdgpu_ras_instance_mask_check()
371 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
373 inst_mask, data->inject.instance_mask); in amdgpu_ras_instance_mask_check()
398 * - 0: disable RAS on the block. Take ::head as its data.
399 * - 1: enable RAS on the block. Take ::head as its data.
400 * - 2: inject errors on the block. Take ::inject as its data.
411 * .. code-block:: bash
415 …* echo "inject <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/…
427 * ue is multi-uncorrectable
428 * ce is single-correctable
430 * The sub-block is the sub-block index, pass 0 if there is no sub-block.
436 * .. code-block:: bash
460 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write()
465 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
483 return -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
493 if ((data.inject.address >= adev->gmc.mc_vram_size && in amdgpu_ras_debugfs_ctrl_write()
494 adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
496 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
499 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
506 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
518 ret = -EINVAL; in amdgpu_ras_debugfs_ctrl_write()
537 * .. code-block:: bash
549 (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_eeprom_write()
553 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
558 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
594 * .. code-block:: bash
605 .head = obj->head, in amdgpu_ras_sysfs_read()
608 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
611 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
612 return -EINVAL; in amdgpu_ras_sysfs_read()
614 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_sysfs_read()
615 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_sysfs_read()
616 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
617 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_sysfs_read()
626 #define get_obj(obj) do { (obj)->use++; } while (0)
627 #define alive_obj(obj) ((obj)->use)
631 if (obj && (--obj->use == 0)) in put_obj()
632 list_del(&obj->node); in put_obj()
633 if (obj && (obj->use < 0)) in put_obj()
634 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head)); in put_obj()
644 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
647 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_create_obj()
650 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_create_obj()
651 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_create_obj()
654 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_create_obj()
656 obj = &con->objs[head->block]; in amdgpu_ras_create_obj()
662 obj->head = *head; in amdgpu_ras_create_obj()
663 obj->adev = adev; in amdgpu_ras_create_obj()
664 list_add(&obj->node, &con->head); in amdgpu_ras_create_obj()
678 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
682 if (head->block >= AMDGPU_RAS_BLOCK_COUNT) in amdgpu_ras_find_obj()
685 if (head->block == AMDGPU_RAS_BLOCK__MCA) { in amdgpu_ras_find_obj()
686 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST) in amdgpu_ras_find_obj()
689 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index]; in amdgpu_ras_find_obj()
691 obj = &con->objs[head->block]; in amdgpu_ras_find_obj()
697 obj = &con->objs[i]; in amdgpu_ras_find_obj()
711 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
719 return con->features & BIT(head->block); in amdgpu_ras_is_feature_enabled()
734 * Ras framework checks con->hw_supported to see if it need do in __amdgpu_ras_feature_enable()
736 * IP checks con->support to see if it needs to disable ras. in __amdgpu_ras_feature_enable()
745 return -EINVAL; in __amdgpu_ras_feature_enable()
750 con->features |= BIT(head->block); in __amdgpu_ras_feature_enable()
753 con->features &= ~BIT(head->block); in __amdgpu_ras_feature_enable()
770 return -EINVAL; in amdgpu_ras_feature_enable()
774 head->block != AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
779 if (head->block == AMDGPU_RAS_BLOCK__GFX && in amdgpu_ras_feature_enable()
784 return -ENOMEM; in amdgpu_ras_feature_enable()
787 info->disable_features = (struct ta_ras_disable_features_input) { in amdgpu_ras_feature_enable()
788 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
789 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
792 info->enable_features = (struct ta_ras_enable_features_input) { in amdgpu_ras_feature_enable()
793 .block_id = amdgpu_ras_block_to_ta(head->block), in amdgpu_ras_feature_enable()
794 .error_type = amdgpu_ras_error_to_ta(head->type), in amdgpu_ras_feature_enable()
798 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
800 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
825 return -EINVAL; in amdgpu_ras_feature_enable_on_boot()
827 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_feature_enable_on_boot()
833 * with error code -EAGAIN. in amdgpu_ras_feature_enable_on_boot()
840 if (ret == -EINVAL) { in amdgpu_ras_feature_enable_on_boot()
843 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
853 /* gfx block ras disable cmd must be sent to ras-ta */ in amdgpu_ras_feature_enable_on_boot()
854 if (head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
855 con->features |= BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
860 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
861 con->features &= ~BIT(head->block); in amdgpu_ras_feature_enable_on_boot()
875 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_disable_all_features()
880 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
883 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
888 return con->features; in amdgpu_ras_disable_all_features()
941 return con->features; in amdgpu_ras_enable_all_features()
949 return -EINVAL; in amdgpu_ras_block_match_default()
951 if (block_obj->ras_comm.block == block) in amdgpu_ras_block_match_default()
954 return -EINVAL; in amdgpu_ras_block_match_default()
966 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_get_ras_block()
967 if (!node->ras_obj) { in amdgpu_ras_get_ras_block()
968 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
972 obj = node->ras_obj; in amdgpu_ras_get_ras_block()
973 if (obj->ras_block_match) { in amdgpu_ras_get_ras_block()
974 if (obj->ras_block_match(obj, block, sub_block_index) == 0) in amdgpu_ras_get_ras_block()
994 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
995 if (ret == -EOPNOTSUPP) { in amdgpu_ras_get_ecc_info()
996 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
997 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
998 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1003 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1004 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
1005 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1007 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1008 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
1009 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1011 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1012 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
1013 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1022 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status()
1026 return -EINVAL; in amdgpu_ras_query_error_status()
1028 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { in amdgpu_ras_query_error_status()
1031 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); in amdgpu_ras_query_error_status()
1032 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_query_error_status()
1033 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_query_error_status()
1034 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status()
1035 return -EINVAL; in amdgpu_ras_query_error_status()
1038 if (block_obj->hw_ops->query_ras_error_count) in amdgpu_ras_query_error_status()
1039 block_obj->hw_ops->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
1041 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || in amdgpu_ras_query_error_status()
1042 (info->head.block == AMDGPU_RAS_BLOCK__GFX) || in amdgpu_ras_query_error_status()
1043 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { in amdgpu_ras_query_error_status()
1044 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_query_error_status()
1045 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_query_error_status()
1049 obj->err_data.ue_count += err_data.ue_count; in amdgpu_ras_query_error_status()
1050 obj->err_data.ce_count += err_data.ce_count; in amdgpu_ras_query_error_status()
1052 info->ue_count = obj->err_data.ue_count; in amdgpu_ras_query_error_status()
1053 info->ce_count = obj->err_data.ce_count; in amdgpu_ras_query_error_status()
1056 if (!adev->aid_mask && in amdgpu_ras_query_error_status()
1057 adev->smuio.funcs && in amdgpu_ras_query_error_status()
1058 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
1059 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
1060 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
1064 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
1065 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
1066 obj->err_data.ce_count, in amdgpu_ras_query_error_status()
1067 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status()
1069 dev_info(adev->dev, "%ld correctable hardware errors " in amdgpu_ras_query_error_status()
1072 obj->err_data.ce_count, in amdgpu_ras_query_error_status()
1073 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status()
1077 if (!adev->aid_mask && in amdgpu_ras_query_error_status()
1078 adev->smuio.funcs && in amdgpu_ras_query_error_status()
1079 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
1080 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
1081 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
1084 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
1085 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
1086 obj->err_data.ue_count, in amdgpu_ras_query_error_status()
1087 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status()
1089 dev_info(adev->dev, "%ld uncorrectable hardware errors " in amdgpu_ras_query_error_status()
1091 obj->err_data.ue_count, in amdgpu_ras_query_error_status()
1092 get_ras_block_str(&info->head)); in amdgpu_ras_query_error_status()
1105 return -EINVAL; in amdgpu_ras_reset_error_status()
1107 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_reset_error_status()
1108 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_reset_error_status()
1110 return -EINVAL; in amdgpu_ras_reset_error_status()
1113 if (block_obj->hw_ops->reset_ras_error_count) in amdgpu_ras_reset_error_status()
1114 block_obj->hw_ops->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
1118 if (block_obj->hw_ops->reset_ras_error_status) in amdgpu_ras_reset_error_status()
1119 block_obj->hw_ops->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
1129 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1131 .block_id = amdgpu_ras_block_to_ta(info->head.block), in amdgpu_ras_error_inject()
1132 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type), in amdgpu_ras_error_inject()
1133 .sub_block_index = info->head.sub_block_index, in amdgpu_ras_error_inject()
1134 .address = info->address, in amdgpu_ras_error_inject()
1135 .value = info->value, in amdgpu_ras_error_inject()
1137 int ret = -EINVAL; in amdgpu_ras_error_inject()
1139 info->head.block, in amdgpu_ras_error_inject()
1140 info->head.sub_block_index); in amdgpu_ras_error_inject()
1147 return -EINVAL; in amdgpu_ras_error_inject()
1149 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_inject()
1150 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_inject()
1151 get_ras_block_str(&info->head)); in amdgpu_ras_error_inject()
1152 return -EINVAL; in amdgpu_ras_error_inject()
1156 if (adev->gmc.xgmi.num_physical_nodes > 1 && in amdgpu_ras_error_inject()
1157 info->head.block != AMDGPU_RAS_BLOCK__GFX) { in amdgpu_ras_error_inject()
1163 if (block_obj->hw_ops->ras_error_inject) { in amdgpu_ras_error_inject()
1164 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_error_inject()
1165 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); in amdgpu_ras_error_inject()
1167 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, in amdgpu_ras_error_inject()
1168 info->instance_mask); in amdgpu_ras_error_inject()
1171 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); in amdgpu_ras_error_inject()
1175 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1176 get_ras_block_str(&info->head), ret); in amdgpu_ras_error_inject()
1182 * amdgpu_ras_query_error_count_helper -- Get error counter for specific IP
1206 *ce_count += query_info->ce_count; in amdgpu_ras_query_error_count_helper()
1207 *ue_count += query_info->ue_count; in amdgpu_ras_query_error_count_helper()
1211 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_query_error_count_helper()
1212 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_query_error_count_helper()
1213 if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) in amdgpu_ras_query_error_count_helper()
1214 dev_warn(adev->dev, in amdgpu_ras_query_error_count_helper()
1222 * amdgpu_ras_query_error_count -- Get error counts of all IPs or specific IP
1233 * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS.
1245 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1246 return -EOPNOTSUPP; in amdgpu_ras_query_error_count()
1257 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_error_count()
1259 .head = obj->head, in amdgpu_ras_query_error_count()
1324 * .. code-block:: bash
1337 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read()
1339 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; in amdgpu_ras_sysfs_badpages_read()
1340 unsigned int start = div64_ul(ppos + element_size - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
1341 unsigned int end = div64_ul(ppos + count - 1, element_size); in amdgpu_ras_sysfs_badpages_read()
1369 return sysfs_emit(buf, "feature mask: 0x%x\n", con->features); in amdgpu_ras_sysfs_features_read()
1376 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1377 &con->badpages_attr.attr, in amdgpu_ras_sysfs_remove_bad_page_node()
1385 &con->features_attr.attr, in amdgpu_ras_sysfs_remove_feature_node()
1393 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_feature_node()
1403 if (!obj || obj->attr_inuse) in amdgpu_ras_sysfs_create()
1404 return -EINVAL; in amdgpu_ras_sysfs_create()
1408 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name), in amdgpu_ras_sysfs_create()
1409 "%s_err_count", head->name); in amdgpu_ras_sysfs_create()
1411 obj->sysfs_attr = (struct device_attribute){ in amdgpu_ras_sysfs_create()
1413 .name = obj->fs_data.sysfs_name, in amdgpu_ras_sysfs_create()
1418 sysfs_attr_init(&obj->sysfs_attr.attr); in amdgpu_ras_sysfs_create()
1420 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1421 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_create()
1424 return -EINVAL; in amdgpu_ras_sysfs_create()
1427 obj->attr_inuse = 1; in amdgpu_ras_sysfs_create()
1437 if (!obj || !obj->attr_inuse) in amdgpu_ras_sysfs_remove()
1438 return -EINVAL; in amdgpu_ras_sysfs_remove()
1440 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1441 &obj->sysfs_attr.attr, in amdgpu_ras_sysfs_remove()
1443 obj->attr_inuse = 0; in amdgpu_ras_sysfs_remove()
1454 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_sysfs_remove_all()
1455 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1480 * .. code-block:: bash
1489 struct amdgpu_ras_eeprom_control *eeprom = &con->eeprom_control; in amdgpu_ras_debugfs_create_ctrl_node()
1490 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1493 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); in amdgpu_ras_debugfs_create_ctrl_node()
1499 &con->bad_page_cnt_threshold); in amdgpu_ras_debugfs_create_ctrl_node()
1500 debugfs_create_u32("ras_num_recs", 0444, dir, &eeprom->ras_num_recs); in amdgpu_ras_debugfs_create_ctrl_node()
1501 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1502 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1505 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", in amdgpu_ras_debugfs_create_ctrl_node()
1508 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); in amdgpu_ras_debugfs_create_ctrl_node()
1518 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot); in amdgpu_ras_debugfs_create_ctrl_node()
1525 &con->disable_ras_err_cnt_harvest); in amdgpu_ras_debugfs_create_ctrl_node()
1533 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
1540 memcpy(obj->fs_data.debugfs_name, in amdgpu_ras_debugfs_create()
1541 head->debugfs_name, in amdgpu_ras_debugfs_create()
1542 sizeof(obj->fs_data.debugfs_name)); in amdgpu_ras_debugfs_create()
1544 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir, in amdgpu_ras_debugfs_create()
1564 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_debugfs_create_all()
1565 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
1566 (obj->attr_inuse == 1)) { in amdgpu_ras_debugfs_create_all()
1568 get_ras_block_str(&obj->head)); in amdgpu_ras_debugfs_create_all()
1569 fs_info.head = obj->head; in amdgpu_ras_debugfs_create_all()
1589 &con->features_attr.attr, in amdgpu_ras_fs_init()
1599 con->features_attr = dev_attr_features; in amdgpu_ras_fs_init()
1606 con->badpages_attr = bin_attr_gpu_vram_bad_pages; in amdgpu_ras_fs_init()
1607 bin_attrs[0] = &con->badpages_attr; in amdgpu_ras_fs_init()
1612 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
1614 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
1625 list_for_each_entry_safe(con_obj, tmp, &con->head, node) { in amdgpu_ras_fs_fini()
1626 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
1650 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1651 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1652 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1654 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1655 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1656 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1663 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_consumption_handler()
1665 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
1674 if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) { in amdgpu_ras_interrupt_poison_consumption_handler()
1675 poison_stat = block_obj->hw_ops->query_poison_status(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1678 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1679 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
1687 if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) in amdgpu_ras_interrupt_poison_consumption_handler()
1688 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1692 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1693 block_obj->ras_comm.name); in amdgpu_ras_interrupt_poison_consumption_handler()
1703 dev_info(obj->adev->dev, in amdgpu_ras_interrupt_poison_creation_handler()
1710 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_umc_handler()
1714 if (!data->cb) in amdgpu_ras_interrupt_umc_handler()
1720 ret = data->cb(obj->adev, &err_data, entry); in amdgpu_ras_interrupt_umc_handler()
1730 obj->err_data.ue_count += err_data.ue_count; in amdgpu_ras_interrupt_umc_handler()
1731 obj->err_data.ce_count += err_data.ce_count; in amdgpu_ras_interrupt_umc_handler()
1737 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_handler()
1740 while (data->rptr != data->wptr) { in amdgpu_ras_interrupt_handler()
1742 memcpy(&entry, &data->ring[data->rptr], in amdgpu_ras_interrupt_handler()
1743 data->element_size); in amdgpu_ras_interrupt_handler()
1746 data->rptr = (data->aligned_element_size + in amdgpu_ras_interrupt_handler()
1747 data->rptr) % data->ring_size; in amdgpu_ras_interrupt_handler()
1749 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { in amdgpu_ras_interrupt_handler()
1750 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
1755 if (obj->head.block == AMDGPU_RAS_BLOCK__UMC) in amdgpu_ras_interrupt_handler()
1758 dev_warn(obj->adev->dev, in amdgpu_ras_interrupt_handler()
1759 "No RAS interrupt handler for non-UMC block with poison disabled.\n"); in amdgpu_ras_interrupt_handler()
1777 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
1778 struct ras_ih_data *data = &obj->ih_data; in amdgpu_ras_interrupt_dispatch()
1781 return -EINVAL; in amdgpu_ras_interrupt_dispatch()
1783 if (data->inuse == 0) in amdgpu_ras_interrupt_dispatch()
1787 memcpy(&data->ring[data->wptr], info->entry, in amdgpu_ras_interrupt_dispatch()
1788 data->element_size); in amdgpu_ras_interrupt_dispatch()
1791 data->wptr = (data->aligned_element_size + in amdgpu_ras_interrupt_dispatch()
1792 data->wptr) % data->ring_size; in amdgpu_ras_interrupt_dispatch()
1794 schedule_work(&data->ih_work); in amdgpu_ras_interrupt_dispatch()
1806 return -EINVAL; in amdgpu_ras_interrupt_remove_handler()
1808 data = &obj->ih_data; in amdgpu_ras_interrupt_remove_handler()
1809 if (data->inuse == 0) in amdgpu_ras_interrupt_remove_handler()
1812 cancel_work_sync(&data->ih_work); in amdgpu_ras_interrupt_remove_handler()
1814 kfree(data->ring); in amdgpu_ras_interrupt_remove_handler()
1832 return -EINVAL; in amdgpu_ras_interrupt_add_handler()
1838 data = &obj->ih_data; in amdgpu_ras_interrupt_add_handler()
1842 .cb = ras_obj->ras_cb, in amdgpu_ras_interrupt_add_handler()
1848 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler); in amdgpu_ras_interrupt_add_handler()
1850 data->aligned_element_size = ALIGN(data->element_size, 8); in amdgpu_ras_interrupt_add_handler()
1852 data->ring_size = 64 * data->aligned_element_size; in amdgpu_ras_interrupt_add_handler()
1853 data->ring = kmalloc(data->ring_size, GFP_KERNEL); in amdgpu_ras_interrupt_add_handler()
1854 if (!data->ring) { in amdgpu_ras_interrupt_add_handler()
1856 return -ENOMEM; in amdgpu_ras_interrupt_add_handler()
1860 data->inuse = 1; in amdgpu_ras_interrupt_add_handler()
1870 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_interrupt_remove_all()
1871 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); in amdgpu_ras_interrupt_remove_all()
1884 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
1887 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_log_on_err_counter()
1889 .head = obj->head, in amdgpu_ras_log_on_err_counter()
1908 (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))) in amdgpu_ras_log_on_err_counter()
1913 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_log_on_err_counter()
1914 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && in amdgpu_ras_log_on_err_counter()
1915 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { in amdgpu_ras_log_on_err_counter()
1917 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_log_on_err_counter()
1931 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) && in amdgpu_ras_error_status_query()
1932 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB)) in amdgpu_ras_error_status_query()
1936 info->head.block, in amdgpu_ras_error_status_query()
1937 info->head.sub_block_index); in amdgpu_ras_error_status_query()
1939 if (!block_obj || !block_obj->hw_ops) { in amdgpu_ras_error_status_query()
1940 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_status_query()
1941 get_ras_block_str(&info->head)); in amdgpu_ras_error_status_query()
1945 if (block_obj->hw_ops->query_ras_error_status) in amdgpu_ras_error_status_query()
1946 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
1955 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
1958 list_for_each_entry(obj, &con->head, node) { in amdgpu_ras_query_err_status()
1960 .head = obj->head, in amdgpu_ras_query_err_status()
1980 if (!con || !con->eh_data || !bps || !count) in amdgpu_ras_badpages_read()
1981 return -EINVAL; in amdgpu_ras_badpages_read()
1983 mutex_lock(&con->recovery_lock); in amdgpu_ras_badpages_read()
1984 data = con->eh_data; in amdgpu_ras_badpages_read()
1985 if (!data || data->count == 0) { in amdgpu_ras_badpages_read()
1987 ret = -EINVAL; in amdgpu_ras_badpages_read()
1991 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); in amdgpu_ras_badpages_read()
1993 ret = -ENOMEM; in amdgpu_ras_badpages_read()
1997 for (; i < data->count; i++) { in amdgpu_ras_badpages_read()
1999 .bp = data->bps[i].retired_page, in amdgpu_ras_badpages_read()
2003 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, in amdgpu_ras_badpages_read()
2004 data->bps[i].retired_page); in amdgpu_ras_badpages_read()
2005 if (status == -EBUSY) in amdgpu_ras_badpages_read()
2007 else if (status == -ENOENT) in amdgpu_ras_badpages_read()
2011 *count = data->count; in amdgpu_ras_badpages_read()
2013 mutex_unlock(&con->recovery_lock); in amdgpu_ras_badpages_read()
2022 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery()
2025 if (!ras->disable_ras_err_cnt_harvest) { in amdgpu_ras_do_recovery()
2029 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
2030 device_list_handle = &hive->device_list; in amdgpu_ras_do_recovery()
2033 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
2046 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
2054 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) in amdgpu_ras_do_recovery()
2059 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) { in amdgpu_ras_do_recovery()
2060 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET; in amdgpu_ras_do_recovery()
2067 if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) { in amdgpu_ras_do_recovery()
2068 ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_do_recovery()
2071 psp_fatal_error_recovery_quirk(&adev->psp); in amdgpu_ras_do_recovery()
2075 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
2077 atomic_set(&ras->in_recovery, 0); in amdgpu_ras_do_recovery()
2084 unsigned int old_space = data->count + data->space_left; in amdgpu_ras_realloc_eh_data_space()
2087 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); in amdgpu_ras_realloc_eh_data_space()
2090 return -ENOMEM; in amdgpu_ras_realloc_eh_data_space()
2093 if (data->bps) { in amdgpu_ras_realloc_eh_data_space()
2094 memcpy(bps, data->bps, in amdgpu_ras_realloc_eh_data_space()
2095 data->count * sizeof(*data->bps)); in amdgpu_ras_realloc_eh_data_space()
2096 kfree(data->bps); in amdgpu_ras_realloc_eh_data_space()
2099 data->bps = bps; in amdgpu_ras_realloc_eh_data_space()
2100 data->space_left += align_space - old_space; in amdgpu_ras_realloc_eh_data_space()
2113 if (!con || !con->eh_data || !bps || pages <= 0) in amdgpu_ras_add_bad_pages()
2116 mutex_lock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2117 data = con->eh_data; in amdgpu_ras_add_bad_pages()
2126 if (!data->space_left && in amdgpu_ras_add_bad_pages()
2128 ret = -ENOMEM; in amdgpu_ras_add_bad_pages()
2132 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, in amdgpu_ras_add_bad_pages()
2136 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); in amdgpu_ras_add_bad_pages()
2137 data->count++; in amdgpu_ras_add_bad_pages()
2138 data->space_left--; in amdgpu_ras_add_bad_pages()
2141 mutex_unlock(&con->recovery_lock); in amdgpu_ras_add_bad_pages()
2159 if (!con || !con->eh_data) { in amdgpu_ras_save_bad_pages()
2166 mutex_lock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2167 control = &con->eeprom_control; in amdgpu_ras_save_bad_pages()
2168 data = con->eh_data; in amdgpu_ras_save_bad_pages()
2169 save_count = data->count - control->ras_num_recs; in amdgpu_ras_save_bad_pages()
2170 mutex_unlock(&con->recovery_lock); in amdgpu_ras_save_bad_pages()
2173 *new_cnt = save_count / adev->umc.retire_unit; in amdgpu_ras_save_bad_pages()
2178 &data->bps[control->ras_num_recs], in amdgpu_ras_save_bad_pages()
2180 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2181 return -EIO; in amdgpu_ras_save_bad_pages()
2184 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
2197 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
2202 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) in amdgpu_ras_load_bad_pages()
2205 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); in amdgpu_ras_load_bad_pages()
2207 return -ENOMEM; in amdgpu_ras_load_bad_pages()
2209 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
2211 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
2213 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
2222 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_check_bad_page_unlock()
2226 for (i = 0; i < data->count; i++) in amdgpu_ras_check_bad_page_unlock()
2227 if (addr == data->bps[i].retired_page) in amdgpu_ras_check_bad_page_unlock()
2244 if (!con || !con->eh_data) in amdgpu_ras_check_bad_page()
2247 mutex_lock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
2249 mutex_unlock(&con->recovery_lock); in amdgpu_ras_check_bad_page()
2259 * Justification of value bad_page_cnt_threshold in ras structure in amdgpu_ras_validate_threshold()
2262 * in eeprom or amdgpu_bad_page_threshold == -2, introduce two in amdgpu_ras_validate_threshold()
2266 * - If amdgpu_bad_page_threshold = -2, in amdgpu_ras_validate_threshold()
2269 * - When the value from user is 0 < amdgpu_bad_page_threshold < in amdgpu_ras_validate_threshold()
2273 * - If amdgpu_bad_page_threshold = 0, bad page retirement in amdgpu_ras_validate_threshold()
2279 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
2282 con->bad_page_cnt_threshold = min(lower_32_bits(val), in amdgpu_ras_validate_threshold()
2285 con->bad_page_cnt_threshold = min_t(int, max_count, in amdgpu_ras_validate_threshold()
2303 * adev->ras_enabled is unset, i.e. when "ras_enable" in amdgpu_ras_recovery_init()
2306 con->adev = adev; in amdgpu_ras_recovery_init()
2308 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
2311 data = &con->eh_data; in amdgpu_ras_recovery_init()
2314 ret = -ENOMEM; in amdgpu_ras_recovery_init()
2318 mutex_init(&con->recovery_lock); in amdgpu_ras_recovery_init()
2319 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); in amdgpu_ras_recovery_init()
2320 atomic_set(&con->in_recovery, 0); in amdgpu_ras_recovery_init()
2321 con->eeprom_control.bad_channel_bitmap = 0; in amdgpu_ras_recovery_init()
2323 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); in amdgpu_ras_recovery_init()
2330 if (adev->gmc.xgmi.pending_reset) in amdgpu_ras_recovery_init()
2332 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); in amdgpu_ras_recovery_init()
2340 if (con->eeprom_control.ras_num_recs) { in amdgpu_ras_recovery_init()
2345 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); in amdgpu_ras_recovery_init()
2347 if (con->update_channel_flag == true) { in amdgpu_ras_recovery_init()
2348 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); in amdgpu_ras_recovery_init()
2349 con->update_channel_flag = false; in amdgpu_ras_recovery_init()
2354 if ((adev->asic_type == CHIP_ALDEBARAN) && in amdgpu_ras_recovery_init()
2355 (adev->gmc.xgmi.connected_to_cpu)) in amdgpu_ras_recovery_init()
2361 kfree((*data)->bps); in amdgpu_ras_recovery_init()
2363 con->eh_data = NULL; in amdgpu_ras_recovery_init()
2365 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
2374 ret = -EINVAL; in amdgpu_ras_recovery_init()
2382 struct ras_err_handler_data *data = con->eh_data; in amdgpu_ras_recovery_fini()
2388 cancel_work_sync(&con->recovery_work); in amdgpu_ras_recovery_fini()
2390 mutex_lock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
2391 con->eh_data = NULL; in amdgpu_ras_recovery_fini()
2392 kfree(data->bps); in amdgpu_ras_recovery_fini()
2394 mutex_unlock(&con->recovery_lock); in amdgpu_ras_recovery_fini()
2403 switch (adev->ip_versions[MP0_HWIP][0]) { in amdgpu_ras_asic_supported()
2412 if (adev->asic_type == CHIP_IP_DISCOVERY) { in amdgpu_ras_asic_supported()
2413 switch (adev->ip_versions[MP0_HWIP][0]) { in amdgpu_ras_asic_supported()
2423 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
2424 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
2425 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
2426 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
2427 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
2437 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
2442 if (strnstr(ctx->vbios_pn, "D16406", in amdgpu_ras_get_quirks()
2443 sizeof(ctx->vbios_pn)) || in amdgpu_ras_get_quirks()
2444 strnstr(ctx->vbios_pn, "D36002", in amdgpu_ras_get_quirks()
2445 sizeof(ctx->vbios_pn))) in amdgpu_ras_get_quirks()
2446 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
2460 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
2465 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { in amdgpu_ras_check_supported()
2467 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_check_supported()
2468 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2471 dev_info(adev->dev, "MEM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2475 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_check_supported()
2477 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2480 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | in amdgpu_ras_check_supported()
2487 if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || in amdgpu_ras_check_supported()
2488 adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0)) in amdgpu_ras_check_supported()
2489 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2492 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2499 if (!adev->gmc.xgmi.num_physical_nodes) in amdgpu_ras_check_supported()
2500 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); in amdgpu_ras_check_supported()
2502 dev_info(adev->dev, "SRAM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2507 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
2515 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
2522 if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) && in amdgpu_ras_check_supported()
2523 adev->gmc.is_app_apu) in amdgpu_ras_check_supported()
2524 adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 : in amdgpu_ras_check_supported()
2525 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2527 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
2528 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2535 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw()
2540 res = pm_runtime_get_sync(dev->dev); in amdgpu_ras_counte_dw()
2547 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_counte_dw()
2548 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_counte_dw()
2551 pm_runtime_mark_last_busy(dev->dev); in amdgpu_ras_counte_dw()
2553 pm_runtime_put_autosuspend(dev->dev); in amdgpu_ras_counte_dw()
2566 if (adev->gmc.xgmi.connected_to_cpu) { in amdgpu_ras_query_poison_mode()
2568 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
2569 } else if (adev->df.funcs && in amdgpu_ras_query_poison_mode()
2570 adev->df.funcs->query_ras_poison_mode && in amdgpu_ras_query_poison_mode()
2571 adev->umc.ras && in amdgpu_ras_query_poison_mode()
2572 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_query_poison_mode()
2574 adev->df.funcs->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
2576 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
2580 con->poison_supported = true; in amdgpu_ras_query_poison_mode()
2582 dev_warn(adev->dev, in amdgpu_ras_query_poison_mode()
2601 return -ENOMEM; in amdgpu_ras_init()
2603 con->adev = adev; in amdgpu_ras_init()
2604 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw); in amdgpu_ras_init()
2605 atomic_set(&con->ras_ce_count, 0); in amdgpu_ras_init()
2606 atomic_set(&con->ras_ue_count, 0); in amdgpu_ras_init()
2608 con->objs = (struct ras_manager *)(con + 1); in amdgpu_ras_init()
2614 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
2618 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
2619 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_init()
2628 con->update_channel_flag = false; in amdgpu_ras_init()
2629 con->features = 0; in amdgpu_ras_init()
2630 INIT_LIST_HEAD(&con->head); in amdgpu_ras_init()
2632 con->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_init()
2637 switch (adev->ip_versions[NBIO_HWIP][0]) { in amdgpu_ras_init()
2641 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_init()
2642 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
2645 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
2652 adev->nbio.ras = &nbio_v4_3_ras; in amdgpu_ras_init()
2655 if (!adev->gmc.is_app_apu) in amdgpu_ras_init()
2656 adev->nbio.ras = &nbio_v7_9_ras; in amdgpu_ras_init()
2669 if (adev->nbio.ras && in amdgpu_ras_init()
2670 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
2671 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
2676 if (adev->nbio.ras && in amdgpu_ras_init()
2677 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
2678 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
2686 r = -EINVAL; in amdgpu_ras_init()
2690 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
2692 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
2704 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_persistent_edc_harvesting_supported()
2705 adev->gmc.is_app_apu) in amdgpu_persistent_edc_harvesting_supported()
2723 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
2736 return con->poison_supported; in amdgpu_ras_is_poison_mode_supported()
2750 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_block_late_init()
2757 if (adev->in_suspend || amdgpu_in_reset(adev)) { in amdgpu_ras_block_late_init()
2769 if (adev->in_suspend || amdgpu_in_reset(adev)) in amdgpu_ras_block_late_init()
2773 if (ras_obj->ras_cb || (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
2774 (ras_obj->hw_ops->query_poison_status || in amdgpu_ras_block_late_init()
2775 ras_obj->hw_ops->handle_poison_consumption))) { in amdgpu_ras_block_late_init()
2781 if (ras_obj->hw_ops && in amdgpu_ras_block_late_init()
2782 (ras_obj->hw_ops->query_ras_error_count || in amdgpu_ras_block_late_init()
2783 ras_obj->hw_ops->query_ras_error_status)) { in amdgpu_ras_block_late_init()
2792 return -ENOMEM; in amdgpu_ras_block_late_init()
2793 memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); in amdgpu_ras_block_late_init()
2796 atomic_set(&con->ras_ce_count, ce_count); in amdgpu_ras_block_late_init()
2797 atomic_set(&con->ras_ue_count, ue_count); in amdgpu_ras_block_late_init()
2806 if (ras_obj->ras_cb) in amdgpu_ras_block_late_init()
2830 if (ras_obj->ras_cb) in amdgpu_ras_block_late_fini()
2848 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
2855 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { in amdgpu_ras_resume()
2867 list_for_each_entry_safe(obj, tmp, &con->head, node) { in amdgpu_ras_resume()
2868 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
2869 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
2881 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
2886 if (con->features) in amdgpu_ras_suspend()
2900 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_late_init()
2901 if (!node->ras_obj) { in amdgpu_ras_late_init()
2902 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
2906 obj = node->ras_obj; in amdgpu_ras_late_init()
2907 if (obj->ras_late_init) { in amdgpu_ras_late_init()
2908 r = obj->ras_late_init(adev, &obj->ras_comm); in amdgpu_ras_late_init()
2910 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", in amdgpu_ras_late_init()
2911 obj->ras_comm.name, r); in amdgpu_ras_late_init()
2915 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); in amdgpu_ras_late_init()
2926 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
2931 if (con->features) in amdgpu_ras_pre_fini()
2943 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
2946 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { in amdgpu_ras_fini()
2947 if (ras_node->ras_obj) { in amdgpu_ras_fini()
2948 obj = ras_node->ras_obj; in amdgpu_ras_fini()
2949 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && in amdgpu_ras_fini()
2950 obj->ras_fini) in amdgpu_ras_fini()
2951 obj->ras_fini(adev, &obj->ras_comm); in amdgpu_ras_fini()
2953 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); in amdgpu_ras_fini()
2957 list_del(&ras_node->node); in amdgpu_ras_fini()
2964 WARN(con->features, "Feature mask is not cleared"); in amdgpu_ras_fini()
2966 if (con->features) in amdgpu_ras_fini()
2969 cancel_delayed_work_sync(&con->ras_counte_delay_work); in amdgpu_ras_fini()
2982 dev_info(adev->dev, "uncorrectable hardware error" in amdgpu_ras_global_ras_isr()
2985 ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; in amdgpu_ras_global_ras_isr()
2992 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
2993 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
3008 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
3009 con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); in amdgpu_release_ras_context()
3024 if (adev && adev->gmc.xgmi.connected_to_cpu && in find_adev()
3025 adev->gmc.xgmi.physical_node_id == node_id) in find_adev()
3051 if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) && in amdgpu_bad_page_notifier()
3052 (XEC(m->status, 0x3f) == 0x0))) in amdgpu_bad_page_notifier()
3064 gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET; in amdgpu_bad_page_notifier()
3077 umc_inst = GET_UMC_INST(m->ipid); in amdgpu_bad_page_notifier()
3078 ch_inst = GET_CHAN_INDEX(m->ipid); in amdgpu_bad_page_notifier()
3080 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", in amdgpu_bad_page_notifier()
3083 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) in amdgpu_bad_page_notifier()
3122 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
3128 return -EINVAL; in amdgpu_ras_set_context()
3130 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
3144 ret = ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
3168 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) in amdgpu_ras_reset_gpu()
3169 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
3180 return -EINVAL; in amdgpu_ras_register_ras_block()
3184 return -ENOMEM; in amdgpu_ras_register_ras_block()
3186 INIT_LIST_HEAD(&ras_node->node); in amdgpu_ras_register_ras_block()
3187 ras_node->ras_obj = ras_block_obj; in amdgpu_ras_register_ras_block()
3188 list_add_tail(&ras_node->node, &adev->ras_list); in amdgpu_ras_register_ras_block()
3222 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_memory_id_field()
3223 reg_entry->seg_lo, reg_entry->reg_lo); in amdgpu_ras_inst_get_memory_id_field()
3226 if ((reg_entry->flags & AMDGPU_RAS_ERR_STATUS_VALID) && in amdgpu_ras_inst_get_memory_id_field()
3246 AMDGPU_RAS_REG_ENTRY_OFFSET(reg_entry->hwip, instance, in amdgpu_ras_inst_get_err_cnt_field()
3247 reg_entry->seg_hi, reg_entry->reg_hi); in amdgpu_ras_inst_get_err_cnt_field()
3250 if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && in amdgpu_ras_inst_get_err_cnt_field()
3253 dev_dbg(adev->dev, "Invalid err_info field\n"); in amdgpu_ras_inst_get_err_cnt_field()
3293 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
3301 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()