Lines Matching refs:adev

126 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
129 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
137 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) in amdgpu_ras_set_error_query_ready() argument
139 if (adev && amdgpu_ras_get_context(adev)) in amdgpu_ras_set_error_query_ready()
140 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
143 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) in amdgpu_ras_get_error_query_ready() argument
145 if (adev && amdgpu_ras_get_context(adev)) in amdgpu_ras_get_error_query_ready()
146 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
151 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) in amdgpu_reserve_page_direct() argument
156 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
158 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
164 if (amdgpu_ras_check_bad_page(adev, address)) { in amdgpu_reserve_page_direct()
165 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
177 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, in amdgpu_reserve_page_direct()
179 amdgpu_ras_save_bad_pages(adev); in amdgpu_reserve_page_direct()
182 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
183 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
184 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
199 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
203 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_debugfs_read()
204 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_debugfs_read()
205 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_debugfs_read()
206 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_debugfs_read()
411 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write() local
415 if (!amdgpu_ras_get_error_query_ready(adev)) { in amdgpu_ras_debugfs_ctrl_write()
416 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
426 ret = amdgpu_reserve_page_direct(adev, data.inject.address); in amdgpu_ras_debugfs_ctrl_write()
433 if (!amdgpu_ras_is_supported(adev, data.head.block)) in amdgpu_ras_debugfs_ctrl_write()
438 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); in amdgpu_ras_debugfs_ctrl_write()
441 ret = amdgpu_ras_feature_enable(adev, &data.head, 1); in amdgpu_ras_debugfs_ctrl_write()
444 if ((data.inject.address >= adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
446 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
455 amdgpu_ras_check_bad_page(adev, data.inject.address)) { in amdgpu_ras_debugfs_ctrl_write()
456 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
463 ret = amdgpu_ras_error_inject(adev, &data.inject); in amdgpu_ras_debugfs_ctrl_write()
496 struct amdgpu_device *adev = in amdgpu_ras_debugfs_eeprom_write() local
501 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
506 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
556 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
559 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
562 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_sysfs_read()
563 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_sysfs_read()
564 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
565 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_sysfs_read()
586 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, in amdgpu_ras_create_obj() argument
589 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_create_obj()
592 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
611 obj->adev = adev; in amdgpu_ras_create_obj()
619 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, in amdgpu_ras_find_obj() argument
622 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_find_obj()
626 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
656 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, in amdgpu_ras_is_feature_allowed() argument
659 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
662 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, in amdgpu_ras_is_feature_enabled() argument
665 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_is_feature_enabled()
674 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, in __amdgpu_ras_feature_enable() argument
677 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in __amdgpu_ras_feature_enable()
678 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in __amdgpu_ras_feature_enable()
686 if (!amdgpu_ras_is_feature_allowed(adev, head)) in __amdgpu_ras_feature_enable()
691 obj = amdgpu_ras_create_obj(adev, head); in __amdgpu_ras_feature_enable()
700 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { in __amdgpu_ras_feature_enable()
710 int amdgpu_ras_feature_enable(struct amdgpu_device *adev, in amdgpu_ras_feature_enable() argument
713 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_feature_enable()
739 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); in amdgpu_ras_feature_enable()
743 !amdgpu_sriov_vf(adev) && in amdgpu_ras_feature_enable()
745 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
747 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
750 amdgpu_ras_is_poison_mode_supported(adev), ret); in amdgpu_ras_feature_enable()
756 __amdgpu_ras_feature_enable(adev, head, enable); in amdgpu_ras_feature_enable()
765 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, in amdgpu_ras_feature_enable_on_boot() argument
768 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_feature_enable_on_boot()
782 ret = amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
788 ret = __amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
790 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
796 ret = __amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
804 ret = amdgpu_ras_feature_enable(adev, head, 0); in amdgpu_ras_feature_enable_on_boot()
807 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
811 ret = amdgpu_ras_feature_enable(adev, head, enable); in amdgpu_ras_feature_enable_on_boot()
816 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, in amdgpu_ras_disable_all_features() argument
819 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_disable_all_features()
827 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
830 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
838 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, in amdgpu_ras_enable_all_features() argument
841 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_enable_all_features()
860 if (__amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
863 if (amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
880 if (__amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
883 if (amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
904 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, in amdgpu_ras_get_ras_block() argument
913 if (!amdgpu_ras_is_supported(adev, block)) in amdgpu_ras_get_ras_block()
916 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_get_ras_block()
918 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
935 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data) in amdgpu_ras_get_ecc_info() argument
937 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_get_ecc_info()
944 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
946 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
947 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
948 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
953 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
954 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
955 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
957 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
958 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
959 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
961 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
962 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
963 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
968 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, in amdgpu_ras_query_error_status() argument
972 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status()
979 amdgpu_ras_get_ecc_info(adev, &err_data); in amdgpu_ras_query_error_status()
981 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); in amdgpu_ras_query_error_status()
983 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_query_error_status()
989 block_obj->hw_ops->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
995 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_query_error_status()
1006 if (adev->smuio.funcs && in amdgpu_ras_query_error_status()
1007 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
1008 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
1009 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
1013 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
1014 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
1018 dev_info(adev->dev, "%ld correctable hardware errors " in amdgpu_ras_query_error_status()
1026 if (adev->smuio.funcs && in amdgpu_ras_query_error_status()
1027 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
1028 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
1029 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
1032 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
1033 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
1037 dev_info(adev->dev, "%ld uncorrectable hardware errors " in amdgpu_ras_query_error_status()
1047 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, in amdgpu_ras_reset_error_status() argument
1050 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); in amdgpu_ras_reset_error_status()
1052 if (!amdgpu_ras_is_supported(adev, block)) in amdgpu_ras_reset_error_status()
1056 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_reset_error_status()
1062 block_obj->hw_ops->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
1067 block_obj->hw_ops->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
1074 int amdgpu_ras_error_inject(struct amdgpu_device *adev, in amdgpu_ras_error_inject() argument
1077 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1086 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, in amdgpu_ras_error_inject()
1094 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_inject()
1100 if (adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_error_inject()
1102 amdgpu_xgmi_get_relative_phy_addr(adev, in amdgpu_ras_error_inject()
1108 ret = block_obj->hw_ops->ras_error_inject(adev, info); in amdgpu_ras_error_inject()
1112 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info); in amdgpu_ras_error_inject()
1114 ret = psp_ras_trigger_error(&adev->psp, &block_info); in amdgpu_ras_error_inject()
1118 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1135 int amdgpu_ras_query_error_count(struct amdgpu_device *adev, in amdgpu_ras_query_error_count() argument
1139 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_error_count()
1143 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1159 res = amdgpu_ras_query_error_status(adev, &info); in amdgpu_ras_query_error_count()
1163 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_query_error_count()
1164 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_query_error_count()
1165 if (amdgpu_ras_reset_error_status(adev, info.head.block)) in amdgpu_ras_query_error_count()
1166 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_query_error_count()
1186 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1238 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read() local
1249 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) in amdgpu_ras_sysfs_badpages_read()
1273 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_bad_page_node() argument
1275 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_bad_page_node()
1277 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1282 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_feature_node() argument
1284 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_feature_node()
1294 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_feature_node()
1299 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, in amdgpu_ras_sysfs_create() argument
1302 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_sysfs_create()
1321 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1333 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, in amdgpu_ras_sysfs_remove() argument
1336 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_sysfs_remove()
1341 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1350 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_all() argument
1352 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_all()
1356 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1360 amdgpu_ras_sysfs_remove_bad_page_node(adev); in amdgpu_ras_sysfs_remove_all()
1362 amdgpu_ras_sysfs_remove_feature_node(adev); in amdgpu_ras_sysfs_remove_all()
1387 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) in amdgpu_ras_debugfs_create_ctrl_node() argument
1389 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_debugfs_create_ctrl_node()
1390 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1394 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1396 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1400 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1401 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1402 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1405 S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1428 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, in amdgpu_ras_debugfs_create() argument
1432 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
1447 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) in amdgpu_ras_debugfs_create_all() argument
1449 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_debugfs_create_all()
1461 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); in amdgpu_ras_debugfs_create_all()
1464 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
1469 amdgpu_ras_debugfs_create(adev, &fs_info, dir); in amdgpu_ras_debugfs_create_all()
1481 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) in amdgpu_ras_fs_init() argument
1483 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fs_init()
1511 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
1513 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
1518 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) in amdgpu_ras_fs_fini() argument
1520 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fs_fini()
1525 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
1531 amdgpu_ras_sysfs_remove_all(adev); in amdgpu_ras_fs_fini()
1543 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) in amdgpu_ras_interrupt_fatal_error_handler() argument
1546 if (amdgpu_sriov_vf(adev) || in amdgpu_ras_interrupt_fatal_error_handler()
1547 !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) in amdgpu_ras_interrupt_fatal_error_handler()
1550 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1551 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1552 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1554 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1555 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1556 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1563 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_consumption_handler() local
1566 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
1576 poison_stat = block_obj->hw_ops->query_poison_status(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1579 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1586 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_interrupt_poison_consumption_handler()
1587 amdgpu_umc_poison_handler(adev, &err_data, false); in amdgpu_ras_interrupt_poison_consumption_handler()
1590 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1594 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1596 amdgpu_ras_reset_gpu(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1603 dev_info(obj->adev->dev, in amdgpu_ras_interrupt_poison_creation_handler()
1620 ret = data->cb(obj->adev, &err_data, entry); in amdgpu_ras_interrupt_umc_handler()
1649 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { in amdgpu_ras_interrupt_handler()
1658 dev_warn(obj->adev->dev, in amdgpu_ras_interrupt_handler()
1674 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, in amdgpu_ras_interrupt_dispatch() argument
1677 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
1699 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, in amdgpu_ras_interrupt_remove_handler() argument
1702 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_interrupt_remove_handler()
1721 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, in amdgpu_ras_interrupt_add_handler() argument
1724 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_interrupt_add_handler()
1730 obj = amdgpu_ras_create_obj(adev, head); in amdgpu_ras_interrupt_add_handler()
1765 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) in amdgpu_ras_interrupt_remove_all() argument
1767 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_interrupt_remove_all()
1771 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); in amdgpu_ras_interrupt_remove_all()
1779 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) in amdgpu_ras_log_on_err_counter() argument
1781 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_log_on_err_counter()
1784 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
1808 (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))) in amdgpu_ras_log_on_err_counter()
1811 amdgpu_ras_query_error_status(adev, &info); in amdgpu_ras_log_on_err_counter()
1813 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_log_on_err_counter()
1814 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && in amdgpu_ras_log_on_err_counter()
1815 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { in amdgpu_ras_log_on_err_counter()
1816 if (amdgpu_ras_reset_error_status(adev, info.head.block)) in amdgpu_ras_log_on_err_counter()
1817 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_log_on_err_counter()
1823 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, in amdgpu_ras_error_status_query() argument
1835 block_obj = amdgpu_ras_get_ras_block(adev, in amdgpu_ras_error_status_query()
1840 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_status_query()
1846 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
1850 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) in amdgpu_ras_query_err_status() argument
1852 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_err_status()
1855 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
1863 amdgpu_ras_error_status_query(adev, &info); in amdgpu_ras_query_err_status()
1872 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, in amdgpu_ras_badpages_read() argument
1875 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_badpages_read()
1903 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, in amdgpu_ras_badpages_read()
1922 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery() local
1926 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); in amdgpu_ras_do_recovery()
1929 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
1933 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
1946 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
1951 reset_context.reset_req_dev = adev; in amdgpu_ras_do_recovery()
1954 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
1960 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, in amdgpu_ras_realloc_eh_data_space() argument
1984 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, in amdgpu_ras_add_bad_pages() argument
1987 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_add_bad_pages()
2006 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { in amdgpu_ras_add_bad_pages()
2011 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, in amdgpu_ras_add_bad_pages()
2029 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) in amdgpu_ras_save_bad_pages() argument
2031 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_save_bad_pages()
2049 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2053 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
2063 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) in amdgpu_ras_load_bad_pages() argument
2066 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
2080 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
2082 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
2107 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, in amdgpu_ras_check_bad_page() argument
2110 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_check_bad_page()
2122 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, in amdgpu_ras_validate_threshold() argument
2125 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_validate_threshold()
2147 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
2158 int amdgpu_ras_recovery_init(struct amdgpu_device *adev) in amdgpu_ras_recovery_init() argument
2160 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_recovery_init()
2166 if (!con || amdgpu_sriov_vf(adev)) in amdgpu_ras_recovery_init()
2174 con->adev = adev; in amdgpu_ras_recovery_init()
2176 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
2192 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); in amdgpu_ras_recovery_init()
2198 if (adev->gmc.xgmi.pending_reset) in amdgpu_ras_recovery_init()
2209 ret = amdgpu_ras_load_bad_pages(adev); in amdgpu_ras_recovery_init()
2213 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); in amdgpu_ras_recovery_init()
2216 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); in amdgpu_ras_recovery_init()
2222 if ((adev->asic_type == CHIP_ALDEBARAN) && in amdgpu_ras_recovery_init()
2223 (adev->gmc.xgmi.connected_to_cpu)) in amdgpu_ras_recovery_init()
2224 amdgpu_register_bad_pages_mca_notifier(adev); in amdgpu_ras_recovery_init()
2233 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
2247 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) in amdgpu_ras_recovery_fini() argument
2249 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_recovery_fini()
2268 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) in amdgpu_ras_asic_supported() argument
2270 if (amdgpu_sriov_vf(adev)) { in amdgpu_ras_asic_supported()
2271 switch (adev->ip_versions[MP0_HWIP][0]) { in amdgpu_ras_asic_supported()
2279 if (adev->asic_type == CHIP_IP_DISCOVERY) { in amdgpu_ras_asic_supported()
2280 switch (adev->ip_versions[MP0_HWIP][0]) { in amdgpu_ras_asic_supported()
2289 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
2290 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
2291 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
2292 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
2293 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
2301 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) in amdgpu_ras_get_quirks() argument
2303 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
2312 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
2324 static void amdgpu_ras_check_supported(struct amdgpu_device *adev) in amdgpu_ras_check_supported() argument
2326 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
2328 if (!adev->is_atom_fw || in amdgpu_ras_check_supported()
2329 !amdgpu_ras_asic_supported(adev)) in amdgpu_ras_check_supported()
2332 if (!adev->gmc.xgmi.connected_to_cpu) { in amdgpu_ras_check_supported()
2333 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { in amdgpu_ras_check_supported()
2334 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_check_supported()
2335 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2338 dev_info(adev->dev, "MEM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2341 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { in amdgpu_ras_check_supported()
2342 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_check_supported()
2343 if (!amdgpu_sriov_vf(adev)) { in amdgpu_ras_check_supported()
2344 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2347 if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0)) in amdgpu_ras_check_supported()
2348 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2351 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2354 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | in amdgpu_ras_check_supported()
2359 dev_info(adev->dev, "SRAM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2364 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
2369 amdgpu_ras_get_quirks(adev); in amdgpu_ras_check_supported()
2372 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
2374 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
2375 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2382 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw() local
2383 struct drm_device *dev = adev_to_drm(adev); in amdgpu_ras_counte_dw()
2393 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { in amdgpu_ras_counte_dw()
2403 int amdgpu_ras_init(struct amdgpu_device *adev) in amdgpu_ras_init() argument
2405 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_init()
2419 con->adev = adev; in amdgpu_ras_init()
2426 amdgpu_ras_set_context(adev, con); in amdgpu_ras_init()
2428 amdgpu_ras_check_supported(adev); in amdgpu_ras_init()
2430 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
2434 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
2453 switch (adev->asic_type) { in amdgpu_ras_init()
2457 if (!adev->gmc.xgmi.connected_to_cpu) { in amdgpu_ras_init()
2458 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
2459 amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block); in amdgpu_ras_init()
2460 adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm; in amdgpu_ras_init()
2468 if (adev->nbio.ras && in amdgpu_ras_init()
2469 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
2470 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
2475 if (adev->nbio.ras && in amdgpu_ras_init()
2476 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
2477 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
2483 if (adev->gmc.xgmi.connected_to_cpu) { in amdgpu_ras_init()
2487 else if (adev->df.funcs && in amdgpu_ras_init()
2488 adev->df.funcs->query_ras_poison_mode && in amdgpu_ras_init()
2489 adev->umc.ras && in amdgpu_ras_init()
2490 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_init()
2492 adev->df.funcs->query_ras_poison_mode(adev); in amdgpu_ras_init()
2494 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_init()
2499 dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", in amdgpu_ras_init()
2503 if (amdgpu_ras_fs_init(adev)) { in amdgpu_ras_init()
2508 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
2510 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
2514 amdgpu_ras_set_context(adev, NULL); in amdgpu_ras_init()
2520 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) in amdgpu_persistent_edc_harvesting_supported() argument
2522 if (adev->gmc.xgmi.connected_to_cpu) in amdgpu_persistent_edc_harvesting_supported()
2527 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, in amdgpu_persistent_edc_harvesting() argument
2534 if (!amdgpu_persistent_edc_harvesting_supported(adev)) in amdgpu_persistent_edc_harvesting()
2537 if (amdgpu_ras_query_error_status(adev, &info) != 0) in amdgpu_persistent_edc_harvesting()
2540 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
2546 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) in amdgpu_ras_is_poison_mode_supported() argument
2548 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_is_poison_mode_supported()
2557 int amdgpu_ras_block_late_init(struct amdgpu_device *adev, in amdgpu_ras_block_late_init() argument
2561 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_block_late_init()
2566 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_block_late_init()
2567 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); in amdgpu_ras_block_late_init()
2571 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); in amdgpu_ras_block_late_init()
2573 if (adev->in_suspend || amdgpu_in_reset(adev)) { in amdgpu_ras_block_late_init()
2582 amdgpu_persistent_edc_harvesting(adev, ras_block); in amdgpu_ras_block_late_init()
2585 if (adev->in_suspend || amdgpu_in_reset(adev)) in amdgpu_ras_block_late_init()
2592 r = amdgpu_ras_interrupt_add_handler(adev, ras_block); in amdgpu_ras_block_late_init()
2597 r = amdgpu_ras_sysfs_create(adev, ras_block); in amdgpu_ras_block_late_init()
2603 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { in amdgpu_ras_block_late_init()
2612 amdgpu_ras_interrupt_remove_handler(adev, ras_block); in amdgpu_ras_block_late_init()
2614 amdgpu_ras_feature_enable(adev, ras_block, 0); in amdgpu_ras_block_late_init()
2618 static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev, in amdgpu_ras_block_late_init_default() argument
2621 return amdgpu_ras_block_late_init(adev, ras_block); in amdgpu_ras_block_late_init_default()
2625 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, in amdgpu_ras_block_late_fini() argument
2632 amdgpu_ras_sysfs_remove(adev, ras_block); in amdgpu_ras_block_late_fini()
2636 amdgpu_ras_interrupt_remove_handler(adev, ras_block); in amdgpu_ras_block_late_fini()
2639 static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev, in amdgpu_ras_block_late_fini_default() argument
2642 return amdgpu_ras_block_late_fini(adev, ras_block); in amdgpu_ras_block_late_fini_default()
2648 void amdgpu_ras_resume(struct amdgpu_device *adev) in amdgpu_ras_resume() argument
2650 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_resume()
2653 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
2655 amdgpu_release_ras_context(adev); in amdgpu_ras_resume()
2666 amdgpu_ras_enable_all_features(adev, 1); in amdgpu_ras_resume()
2673 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
2674 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
2682 void amdgpu_ras_suspend(struct amdgpu_device *adev) in amdgpu_ras_suspend() argument
2684 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_suspend()
2686 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
2689 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_suspend()
2692 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_suspend()
2695 int amdgpu_ras_late_init(struct amdgpu_device *adev) in amdgpu_ras_late_init() argument
2702 if (amdgpu_sriov_vf(adev)) in amdgpu_ras_late_init()
2705 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_late_init()
2707 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
2713 r = obj->ras_late_init(adev, &obj->ras_comm); in amdgpu_ras_late_init()
2715 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", in amdgpu_ras_late_init()
2720 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); in amdgpu_ras_late_init()
2727 int amdgpu_ras_pre_fini(struct amdgpu_device *adev) in amdgpu_ras_pre_fini() argument
2729 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_pre_fini()
2731 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
2737 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_pre_fini()
2738 amdgpu_ras_recovery_fini(adev); in amdgpu_ras_pre_fini()
2742 int amdgpu_ras_fini(struct amdgpu_device *adev) in amdgpu_ras_fini() argument
2746 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fini()
2748 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
2751 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { in amdgpu_ras_fini()
2754 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && in amdgpu_ras_fini()
2756 obj->ras_fini(adev, &obj->ras_comm); in amdgpu_ras_fini()
2758 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); in amdgpu_ras_fini()
2766 amdgpu_ras_fs_fini(adev); in amdgpu_ras_fini()
2767 amdgpu_ras_interrupt_remove_all(adev); in amdgpu_ras_fini()
2772 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_fini()
2776 amdgpu_ras_set_context(adev, NULL); in amdgpu_ras_fini()
2782 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) in amdgpu_ras_global_ras_isr() argument
2784 amdgpu_ras_check_supported(adev); in amdgpu_ras_global_ras_isr()
2785 if (!adev->ras_hw_enabled) in amdgpu_ras_global_ras_isr()
2789 dev_info(adev->dev, "uncorrectable hardware error" in amdgpu_ras_global_ras_isr()
2792 amdgpu_ras_reset_gpu(adev); in amdgpu_ras_global_ras_isr()
2796 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) in amdgpu_ras_need_emergency_restart() argument
2798 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
2799 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
2800 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && in amdgpu_ras_need_emergency_restart()
2807 void amdgpu_release_ras_context(struct amdgpu_device *adev) in amdgpu_release_ras_context() argument
2809 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_release_ras_context()
2814 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
2816 amdgpu_ras_set_context(adev, NULL); in amdgpu_release_ras_context()
2825 struct amdgpu_device *adev = NULL; in find_adev() local
2828 adev = mce_adev_list.devs[i]; in find_adev()
2830 if (adev && adev->gmc.xgmi.connected_to_cpu && in find_adev()
2831 adev->gmc.xgmi.physical_node_id == node_id) in find_adev()
2833 adev = NULL; in find_adev()
2836 return adev; in find_adev()
2848 struct amdgpu_device *adev = NULL; in amdgpu_bad_page_notifier() local
2873 adev = find_adev(gpu_id); in amdgpu_bad_page_notifier()
2874 if (!adev) { in amdgpu_bad_page_notifier()
2887 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", in amdgpu_bad_page_notifier()
2891 kcalloc(adev->umc.max_ras_err_cnt_per_query, in amdgpu_bad_page_notifier()
2894 dev_warn(adev->dev, in amdgpu_bad_page_notifier()
2902 if (adev->umc.ras && in amdgpu_bad_page_notifier()
2903 adev->umc.ras->convert_ras_error_address) in amdgpu_bad_page_notifier()
2904 adev->umc.ras->convert_ras_error_address(adev, in amdgpu_bad_page_notifier()
2908 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, in amdgpu_bad_page_notifier()
2910 amdgpu_ras_save_bad_pages(adev); in amdgpu_bad_page_notifier()
2922 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) in amdgpu_register_bad_pages_mca_notifier() argument
2932 mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; in amdgpu_register_bad_pages_mca_notifier()
2945 struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) in amdgpu_ras_get_context() argument
2947 if (!adev) in amdgpu_ras_get_context()
2950 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
2953 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) in amdgpu_ras_set_context() argument
2955 if (!adev) in amdgpu_ras_set_context()
2958 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
2963 int amdgpu_ras_is_supported(struct amdgpu_device *adev, in amdgpu_ras_is_supported() argument
2966 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_is_supported()
2970 return ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
2973 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) in amdgpu_ras_reset_gpu() argument
2975 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_reset_gpu()
2978 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
2984 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, in amdgpu_ras_register_ras_block() argument
2988 if (!adev || !ras_block_obj) in amdgpu_ras_register_ras_block()
2991 if (!amdgpu_ras_asic_supported(adev)) in amdgpu_ras_register_ras_block()
3000 list_add_tail(&ras_node->node, &adev->ras_list); in amdgpu_ras_register_ras_block()