Lines Matching refs:adev
86 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
89 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) in amdgpu_ras_set_error_query_ready() argument
91 if (adev && amdgpu_ras_get_context(adev)) in amdgpu_ras_set_error_query_ready()
92 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
95 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) in amdgpu_ras_get_error_query_ready() argument
97 if (adev && amdgpu_ras_get_context(adev)) in amdgpu_ras_get_error_query_ready()
98 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
103 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) in amdgpu_reserve_page_direct() argument
108 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
110 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
116 if (amdgpu_ras_check_bad_page(adev, address)) { in amdgpu_reserve_page_direct()
117 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
134 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, in amdgpu_reserve_page_direct()
136 amdgpu_ras_save_bad_pages(adev); in amdgpu_reserve_page_direct()
139 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
140 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
141 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
156 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
361 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write() local
365 if (!amdgpu_ras_get_error_query_ready(adev)) { in amdgpu_ras_debugfs_ctrl_write()
366 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
376 ret = amdgpu_reserve_page_direct(adev, data.inject.address); in amdgpu_ras_debugfs_ctrl_write()
383 if (!amdgpu_ras_is_supported(adev, data.head.block)) in amdgpu_ras_debugfs_ctrl_write()
388 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); in amdgpu_ras_debugfs_ctrl_write()
391 ret = amdgpu_ras_feature_enable(adev, &data.head, 1); in amdgpu_ras_debugfs_ctrl_write()
394 if ((data.inject.address >= adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
396 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
405 amdgpu_ras_check_bad_page(adev, data.inject.address)) { in amdgpu_ras_debugfs_ctrl_write()
406 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
413 ret = amdgpu_ras_error_inject(adev, &data.inject); in amdgpu_ras_debugfs_ctrl_write()
446 struct amdgpu_device *adev = in amdgpu_ras_debugfs_eeprom_write() local
451 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
456 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
506 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
509 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
513 if (obj->adev->asic_type == CHIP_ALDEBARAN) { in amdgpu_ras_sysfs_read()
514 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
536 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, in amdgpu_ras_create_obj() argument
539 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_create_obj()
542 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
554 obj->adev = adev; in amdgpu_ras_create_obj()
562 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, in amdgpu_ras_find_obj() argument
565 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_find_obj()
569 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
597 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, in amdgpu_ras_is_feature_allowed() argument
600 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
603 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, in amdgpu_ras_is_feature_enabled() argument
606 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_is_feature_enabled()
615 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, in __amdgpu_ras_feature_enable() argument
618 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in __amdgpu_ras_feature_enable()
619 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in __amdgpu_ras_feature_enable()
627 if (!amdgpu_ras_is_feature_allowed(adev, head)) in __amdgpu_ras_feature_enable()
629 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) in __amdgpu_ras_feature_enable()
634 obj = amdgpu_ras_create_obj(adev, head); in __amdgpu_ras_feature_enable()
643 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { in __amdgpu_ras_feature_enable()
653 int amdgpu_ras_feature_enable(struct amdgpu_device *adev, in amdgpu_ras_feature_enable() argument
656 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_feature_enable()
680 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); in amdgpu_ras_feature_enable()
682 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) { in amdgpu_ras_feature_enable()
688 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
690 dev_err(adev->dev, "ras %s %s failed %d\n", in amdgpu_ras_feature_enable()
699 __amdgpu_ras_feature_enable(adev, head, enable); in amdgpu_ras_feature_enable()
707 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, in amdgpu_ras_feature_enable_on_boot() argument
710 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_feature_enable_on_boot()
724 ret = amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
730 ret = __amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
732 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
738 ret = __amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
746 ret = amdgpu_ras_feature_enable(adev, head, 0); in amdgpu_ras_feature_enable_on_boot()
749 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
753 ret = amdgpu_ras_feature_enable(adev, head, enable); in amdgpu_ras_feature_enable_on_boot()
758 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, in amdgpu_ras_disable_all_features() argument
761 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_disable_all_features()
769 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
772 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
780 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, in amdgpu_ras_enable_all_features() argument
783 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_enable_all_features()
800 if (__amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
803 if (amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
813 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, in amdgpu_ras_query_error_status() argument
816 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status()
825 if (adev->umc.ras_funcs && in amdgpu_ras_query_error_status()
826 adev->umc.ras_funcs->query_ras_error_count) in amdgpu_ras_query_error_status()
827 adev->umc.ras_funcs->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
831 if (adev->umc.ras_funcs && in amdgpu_ras_query_error_status()
832 adev->umc.ras_funcs->query_ras_error_address) in amdgpu_ras_query_error_status()
833 adev->umc.ras_funcs->query_ras_error_address(adev, &err_data); in amdgpu_ras_query_error_status()
836 if (adev->sdma.funcs->query_ras_error_count) { in amdgpu_ras_query_error_status()
837 for (i = 0; i < adev->sdma.num_instances; i++) in amdgpu_ras_query_error_status()
838 adev->sdma.funcs->query_ras_error_count(adev, i, in amdgpu_ras_query_error_status()
843 if (adev->gfx.ras_funcs && in amdgpu_ras_query_error_status()
844 adev->gfx.ras_funcs->query_ras_error_count) in amdgpu_ras_query_error_status()
845 adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
847 if (adev->gfx.ras_funcs && in amdgpu_ras_query_error_status()
848 adev->gfx.ras_funcs->query_ras_error_status) in amdgpu_ras_query_error_status()
849 adev->gfx.ras_funcs->query_ras_error_status(adev); in amdgpu_ras_query_error_status()
852 if (adev->mmhub.ras_funcs && in amdgpu_ras_query_error_status()
853 adev->mmhub.ras_funcs->query_ras_error_count) in amdgpu_ras_query_error_status()
854 adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
856 if (adev->mmhub.ras_funcs && in amdgpu_ras_query_error_status()
857 adev->mmhub.ras_funcs->query_ras_error_status) in amdgpu_ras_query_error_status()
858 adev->mmhub.ras_funcs->query_ras_error_status(adev); in amdgpu_ras_query_error_status()
861 if (adev->nbio.ras_funcs && in amdgpu_ras_query_error_status()
862 adev->nbio.ras_funcs->query_ras_error_count) in amdgpu_ras_query_error_status()
863 adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
866 if (adev->gmc.xgmi.ras_funcs && in amdgpu_ras_query_error_status()
867 adev->gmc.xgmi.ras_funcs->query_ras_error_count) in amdgpu_ras_query_error_status()
868 adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
871 if (adev->hdp.ras_funcs && in amdgpu_ras_query_error_status()
872 adev->hdp.ras_funcs->query_ras_error_count) in amdgpu_ras_query_error_status()
873 adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
886 if (adev->smuio.funcs && in amdgpu_ras_query_error_status()
887 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
888 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
889 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
893 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
894 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
898 dev_info(adev->dev, "%ld correctable hardware errors " in amdgpu_ras_query_error_status()
906 if (adev->smuio.funcs && in amdgpu_ras_query_error_status()
907 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
908 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
909 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
912 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
913 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
917 dev_info(adev->dev, "%ld uncorrectable hardware errors " in amdgpu_ras_query_error_status()
927 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, in amdgpu_ras_reset_error_status() argument
930 if (!amdgpu_ras_is_supported(adev, block)) in amdgpu_ras_reset_error_status()
935 if (adev->gfx.ras_funcs && in amdgpu_ras_reset_error_status()
936 adev->gfx.ras_funcs->reset_ras_error_count) in amdgpu_ras_reset_error_status()
937 adev->gfx.ras_funcs->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
939 if (adev->gfx.ras_funcs && in amdgpu_ras_reset_error_status()
940 adev->gfx.ras_funcs->reset_ras_error_status) in amdgpu_ras_reset_error_status()
941 adev->gfx.ras_funcs->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
944 if (adev->mmhub.ras_funcs && in amdgpu_ras_reset_error_status()
945 adev->mmhub.ras_funcs->reset_ras_error_count) in amdgpu_ras_reset_error_status()
946 adev->mmhub.ras_funcs->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
948 if (adev->mmhub.ras_funcs && in amdgpu_ras_reset_error_status()
949 adev->mmhub.ras_funcs->reset_ras_error_status) in amdgpu_ras_reset_error_status()
950 adev->mmhub.ras_funcs->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
953 if (adev->sdma.funcs->reset_ras_error_count) in amdgpu_ras_reset_error_status()
954 adev->sdma.funcs->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
957 if (adev->hdp.ras_funcs && in amdgpu_ras_reset_error_status()
958 adev->hdp.ras_funcs->reset_ras_error_count) in amdgpu_ras_reset_error_status()
959 adev->hdp.ras_funcs->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
969 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, in amdgpu_ras_error_inject_xgmi() argument
974 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) in amdgpu_ras_error_inject_xgmi()
975 dev_warn(adev->dev, "Failed to disallow df cstate"); in amdgpu_ras_error_inject_xgmi()
977 if (amdgpu_dpm_allow_xgmi_power_down(adev, false)) in amdgpu_ras_error_inject_xgmi()
978 dev_warn(adev->dev, "Failed to disallow XGMI power down"); in amdgpu_ras_error_inject_xgmi()
980 ret = psp_ras_trigger_error(&adev->psp, block_info); in amdgpu_ras_error_inject_xgmi()
985 if (amdgpu_dpm_allow_xgmi_power_down(adev, true)) in amdgpu_ras_error_inject_xgmi()
986 dev_warn(adev->dev, "Failed to allow XGMI power down"); in amdgpu_ras_error_inject_xgmi()
988 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) in amdgpu_ras_error_inject_xgmi()
989 dev_warn(adev->dev, "Failed to allow df cstate"); in amdgpu_ras_error_inject_xgmi()
995 int amdgpu_ras_error_inject(struct amdgpu_device *adev, in amdgpu_ras_error_inject() argument
998 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1012 if (adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_error_inject()
1014 amdgpu_xgmi_get_relative_phy_addr(adev, in amdgpu_ras_error_inject()
1020 if (adev->gfx.ras_funcs && in amdgpu_ras_error_inject()
1021 adev->gfx.ras_funcs->ras_error_inject) in amdgpu_ras_error_inject()
1022 ret = adev->gfx.ras_funcs->ras_error_inject(adev, info); in amdgpu_ras_error_inject()
1030 ret = psp_ras_trigger_error(&adev->psp, &block_info); in amdgpu_ras_error_inject()
1033 ret = amdgpu_ras_error_inject_xgmi(adev, &block_info); in amdgpu_ras_error_inject()
1036 dev_info(adev->dev, "%s error injection is not supported yet\n", in amdgpu_ras_error_inject()
1042 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1059 int amdgpu_ras_query_error_count(struct amdgpu_device *adev, in amdgpu_ras_query_error_count() argument
1063 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_error_count()
1067 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1083 res = amdgpu_ras_query_error_status(adev, &info); in amdgpu_ras_query_error_count()
1104 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1156 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read() local
1167 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) in amdgpu_ras_sysfs_badpages_read()
1191 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_bad_page_node() argument
1193 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_bad_page_node()
1195 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1200 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_feature_node() argument
1202 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_feature_node()
1212 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_feature_node()
1217 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, in amdgpu_ras_sysfs_create() argument
1220 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_sysfs_create()
1240 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1252 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, in amdgpu_ras_sysfs_remove() argument
1255 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_sysfs_remove()
1260 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1269 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_all() argument
1271 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_all()
1275 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1279 amdgpu_ras_sysfs_remove_bad_page_node(adev); in amdgpu_ras_sysfs_remove_all()
1281 amdgpu_ras_sysfs_remove_feature_node(adev); in amdgpu_ras_sysfs_remove_all()
1306 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) in amdgpu_ras_debugfs_create_ctrl_node() argument
1308 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_debugfs_create_ctrl_node()
1309 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1313 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1315 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1319 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1320 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1321 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1324 S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1347 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, in amdgpu_ras_debugfs_create() argument
1351 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
1366 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) in amdgpu_ras_debugfs_create_all() argument
1368 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_debugfs_create_all()
1380 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); in amdgpu_ras_debugfs_create_all()
1383 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
1388 amdgpu_ras_debugfs_create(adev, &fs_info, dir); in amdgpu_ras_debugfs_create_all()
1400 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) in amdgpu_ras_fs_init() argument
1402 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fs_init()
1430 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
1432 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
1437 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) in amdgpu_ras_fs_fini() argument
1439 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fs_fini()
1444 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
1450 amdgpu_ras_sysfs_remove_all(adev); in amdgpu_ras_fs_fini()
1476 ret = data->cb(obj->adev, &err_data, &entry); in amdgpu_ras_interrupt_handler()
1503 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, in amdgpu_ras_interrupt_dispatch() argument
1506 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
1528 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, in amdgpu_ras_interrupt_remove_handler() argument
1531 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_remove_handler()
1550 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, in amdgpu_ras_interrupt_add_handler() argument
1553 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_add_handler()
1558 obj = amdgpu_ras_create_obj(adev, &info->head); in amdgpu_ras_interrupt_add_handler()
1591 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) in amdgpu_ras_interrupt_remove_all() argument
1593 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_interrupt_remove_all()
1600 amdgpu_ras_interrupt_remove_handler(adev, &info); in amdgpu_ras_interrupt_remove_all()
1608 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) in amdgpu_ras_log_on_err_counter() argument
1610 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_log_on_err_counter()
1613 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
1630 amdgpu_ras_query_error_status(adev, &info); in amdgpu_ras_log_on_err_counter()
1635 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, in amdgpu_ras_error_status_query() argument
1644 if (adev->gfx.ras_funcs && in amdgpu_ras_error_status_query()
1645 adev->gfx.ras_funcs->query_ras_error_status) in amdgpu_ras_error_status_query()
1646 adev->gfx.ras_funcs->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
1649 if (adev->mmhub.ras_funcs && in amdgpu_ras_error_status_query()
1650 adev->mmhub.ras_funcs->query_ras_error_status) in amdgpu_ras_error_status_query()
1651 adev->mmhub.ras_funcs->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
1658 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) in amdgpu_ras_query_err_status() argument
1660 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_err_status()
1663 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
1671 amdgpu_ras_error_status_query(adev, &info); in amdgpu_ras_query_err_status()
1680 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, in amdgpu_ras_badpages_read() argument
1683 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_badpages_read()
1712 ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), in amdgpu_ras_badpages_read()
1731 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery() local
1735 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); in amdgpu_ras_do_recovery()
1738 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
1742 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
1755 if (amdgpu_device_should_recover_gpu(ras->adev)) in amdgpu_ras_do_recovery()
1756 amdgpu_device_gpu_recover(ras->adev, NULL); in amdgpu_ras_do_recovery()
1761 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, in amdgpu_ras_realloc_eh_data_space() argument
1786 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, in amdgpu_ras_add_bad_pages() argument
1789 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_add_bad_pages()
1808 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { in amdgpu_ras_add_bad_pages()
1814 ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), in amdgpu_ras_add_bad_pages()
1832 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) in amdgpu_ras_save_bad_pages() argument
1834 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_save_bad_pages()
1850 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
1854 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
1864 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) in amdgpu_ras_load_bad_pages() argument
1867 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
1881 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
1883 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
1908 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, in amdgpu_ras_check_bad_page() argument
1911 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_check_bad_page()
1923 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, in amdgpu_ras_validate_threshold() argument
1926 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_validate_threshold()
1948 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
1959 int amdgpu_ras_recovery_init(struct amdgpu_device *adev) in amdgpu_ras_recovery_init() argument
1961 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_recovery_init()
1975 con->adev = adev; in amdgpu_ras_recovery_init()
1977 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
1992 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); in amdgpu_ras_recovery_init()
1998 if (adev->gmc.xgmi.pending_reset) in amdgpu_ras_recovery_init()
2009 ret = amdgpu_ras_load_bad_pages(adev); in amdgpu_ras_recovery_init()
2013 if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num) in amdgpu_ras_recovery_init()
2014 adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); in amdgpu_ras_recovery_init()
2024 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
2038 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) in amdgpu_ras_recovery_fini() argument
2040 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_recovery_fini()
2060 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, in amdgpu_ras_request_reset_on_boot() argument
2063 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_request_reset_on_boot()
2072 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) in amdgpu_ras_asic_supported() argument
2074 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
2075 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
2076 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
2077 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
2078 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
2086 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) in amdgpu_ras_get_quirks() argument
2088 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
2097 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
2109 static void amdgpu_ras_check_supported(struct amdgpu_device *adev) in amdgpu_ras_check_supported() argument
2111 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
2113 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || in amdgpu_ras_check_supported()
2114 !amdgpu_ras_asic_supported(adev)) in amdgpu_ras_check_supported()
2117 if (!adev->gmc.xgmi.connected_to_cpu) { in amdgpu_ras_check_supported()
2118 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { in amdgpu_ras_check_supported()
2119 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_check_supported()
2120 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2123 dev_info(adev->dev, "MEM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2126 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { in amdgpu_ras_check_supported()
2127 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_check_supported()
2128 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2131 dev_info(adev->dev, "SRAM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2136 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
2141 amdgpu_ras_get_quirks(adev); in amdgpu_ras_check_supported()
2144 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
2146 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
2147 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2154 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw() local
2155 struct drm_device *dev = adev_to_drm(adev); in amdgpu_ras_counte_dw()
2165 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { in amdgpu_ras_counte_dw()
2175 int amdgpu_ras_init(struct amdgpu_device *adev) in amdgpu_ras_init() argument
2177 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_init()
2189 con->adev = adev; in amdgpu_ras_init()
2196 amdgpu_ras_set_context(adev, con); in amdgpu_ras_init()
2198 amdgpu_ras_check_supported(adev); in amdgpu_ras_init()
2200 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
2204 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
2222 switch (adev->asic_type) { in amdgpu_ras_init()
2226 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_init()
2227 adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; in amdgpu_ras_init()
2234 if (adev->nbio.ras_funcs && in amdgpu_ras_init()
2235 adev->nbio.ras_funcs->init_ras_controller_interrupt) { in amdgpu_ras_init()
2236 r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
2241 if (adev->nbio.ras_funcs && in amdgpu_ras_init()
2242 adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
2243 r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
2248 if (amdgpu_ras_fs_init(adev)) { in amdgpu_ras_init()
2253 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
2255 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
2259 amdgpu_ras_set_context(adev, NULL); in amdgpu_ras_init()
2265 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) in amdgpu_persistent_edc_harvesting_supported() argument
2267 if (adev->gmc.xgmi.connected_to_cpu) in amdgpu_persistent_edc_harvesting_supported()
2272 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, in amdgpu_persistent_edc_harvesting() argument
2279 if (!amdgpu_persistent_edc_harvesting_supported(adev)) in amdgpu_persistent_edc_harvesting()
2282 if (amdgpu_ras_query_error_status(adev, &info) != 0) in amdgpu_persistent_edc_harvesting()
2285 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
2292 int amdgpu_ras_late_init(struct amdgpu_device *adev, in amdgpu_ras_late_init() argument
2297 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_late_init()
2302 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_late_init()
2303 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); in amdgpu_ras_late_init()
2307 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); in amdgpu_ras_late_init()
2311 amdgpu_ras_request_reset_on_boot(adev, in amdgpu_ras_late_init()
2314 } else if (adev->in_suspend || amdgpu_in_reset(adev)) { in amdgpu_ras_late_init()
2323 amdgpu_persistent_edc_harvesting(adev, ras_block); in amdgpu_ras_late_init()
2326 if (adev->in_suspend || amdgpu_in_reset(adev)) in amdgpu_ras_late_init()
2330 r = amdgpu_ras_interrupt_add_handler(adev, ih_info); in amdgpu_ras_late_init()
2335 r = amdgpu_ras_sysfs_create(adev, fs_info); in amdgpu_ras_late_init()
2341 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { in amdgpu_ras_late_init()
2348 amdgpu_ras_sysfs_remove(adev, ras_block); in amdgpu_ras_late_init()
2351 amdgpu_ras_interrupt_remove_handler(adev, ih_info); in amdgpu_ras_late_init()
2353 amdgpu_ras_feature_enable(adev, ras_block, 0); in amdgpu_ras_late_init()
2358 void amdgpu_ras_late_fini(struct amdgpu_device *adev, in amdgpu_ras_late_fini() argument
2365 amdgpu_ras_sysfs_remove(adev, ras_block); in amdgpu_ras_late_fini()
2367 amdgpu_ras_interrupt_remove_handler(adev, ih_info); in amdgpu_ras_late_fini()
2368 amdgpu_ras_feature_enable(adev, ras_block, 0); in amdgpu_ras_late_fini()
2374 void amdgpu_ras_resume(struct amdgpu_device *adev) in amdgpu_ras_resume() argument
2376 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_resume()
2379 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
2381 amdgpu_release_ras_context(adev); in amdgpu_ras_resume()
2392 amdgpu_ras_enable_all_features(adev, 1); in amdgpu_ras_resume()
2399 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
2400 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
2416 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_resume()
2417 amdgpu_ras_reset_gpu(adev); in amdgpu_ras_resume()
2421 void amdgpu_ras_suspend(struct amdgpu_device *adev) in amdgpu_ras_suspend() argument
2423 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_suspend()
2425 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
2428 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_suspend()
2431 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_suspend()
2435 int amdgpu_ras_pre_fini(struct amdgpu_device *adev) in amdgpu_ras_pre_fini() argument
2437 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_pre_fini()
2439 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
2444 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_pre_fini()
2445 amdgpu_ras_recovery_fini(adev); in amdgpu_ras_pre_fini()
2449 int amdgpu_ras_fini(struct amdgpu_device *adev) in amdgpu_ras_fini() argument
2451 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fini()
2453 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
2456 amdgpu_ras_fs_fini(adev); in amdgpu_ras_fini()
2457 amdgpu_ras_interrupt_remove_all(adev); in amdgpu_ras_fini()
2462 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_fini()
2466 amdgpu_ras_set_context(adev, NULL); in amdgpu_ras_fini()
2472 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) in amdgpu_ras_global_ras_isr() argument
2474 amdgpu_ras_check_supported(adev); in amdgpu_ras_global_ras_isr()
2475 if (!adev->ras_hw_enabled) in amdgpu_ras_global_ras_isr()
2479 dev_info(adev->dev, "uncorrectable hardware error" in amdgpu_ras_global_ras_isr()
2482 amdgpu_ras_reset_gpu(adev); in amdgpu_ras_global_ras_isr()
2486 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) in amdgpu_ras_need_emergency_restart() argument
2488 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
2489 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
2490 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && in amdgpu_ras_need_emergency_restart()
2497 void amdgpu_release_ras_context(struct amdgpu_device *adev) in amdgpu_release_ras_context() argument
2499 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_release_ras_context()
2504 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
2506 amdgpu_ras_set_context(adev, NULL); in amdgpu_release_ras_context()