Lines matching refs: hdev

43 static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region) in hl_set_dram_bar() argument
45 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_set_dram_bar()
54 old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); in hl_set_dram_bar()
63 static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, in hl_access_sram_dram_region() argument
66 struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; in hl_access_sram_dram_region()
71 old_base = hl_set_dram_bar(hdev, addr, region); in hl_access_sram_dram_region()
76 acc_addr = hdev->pcie_bar[region->bar_id] + addr - region->region_base + in hl_access_sram_dram_region()
100 rc = hl_set_dram_bar(hdev, old_base, region); in hl_access_sram_dram_region()
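
hl_set_dram_bar() and hl_access_sram_dram_region() follow the usual sliding-BAR pattern: when the requested device address falls outside the window the PCI BAR currently exposes, the BAR is re-pointed at an aligned window containing it, and the CPU-visible address is the BAR mapping plus the offset into that window (the acc_addr computation above). A minimal self-contained sketch of the window arithmetic, assuming a power-of-two BAR size; the struct and field names are stand-ins, not the driver's:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for struct pci_mem_region; field names are illustrative. */
    struct mem_region {
            uint64_t region_base;   /* device-side base of the region     */
            uint64_t bar_size;      /* size of the window the BAR exposes */
            uint64_t cur_bar_off;   /* window offset the BAR points at    */
    };

    /* Move the BAR only if addr falls outside the current window. */
    static uint64_t set_bar_window(struct mem_region *r, uint64_t addr)
    {
            uint64_t new_off = (addr - r->region_base) & ~(r->bar_size - 1);
            uint64_t old_off = r->cur_bar_off;

            if (new_off != old_off)
                    r->cur_bar_off = new_off;   /* would reprogram the BAR */
            return old_off;
    }

    int main(void)
    {
            struct mem_region r = { 0x0, 0x10000, 0x0 };
            uint64_t addr = 0x23456;

            set_bar_window(&r, addr);
            /* CPU address = BAR mapping + (addr - region_base - window) */
            printf("window %#llx, offset in window %#llx\n",
                   (unsigned long long)r.cur_bar_off,
                   (unsigned long long)(addr - r.region_base - r.cur_bar_off));
            return 0;
    }

Note the restore call above: after the access, the helper puts the BAR back at old_base so concurrent users see a consistent window.
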
108 static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, in hl_dma_alloc_common() argument
116 ptr = hdev->asic_funcs->asic_dma_alloc_coherent(hdev, size, dma_handle, flag); in hl_dma_alloc_common()
119 ptr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, size, dma_handle); in hl_dma_alloc_common()
122 ptr = hdev->asic_funcs->asic_dma_pool_zalloc(hdev, size, flag, dma_handle); in hl_dma_alloc_common()
127 trace_habanalabs_dma_alloc(hdev->dev, (u64) (uintptr_t) ptr, *dma_handle, size, in hl_dma_alloc_common()
133 static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *cpu_addr, in hl_asic_dma_free_common() argument
139 hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); in hl_asic_dma_free_common()
142 hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, size, cpu_addr); in hl_asic_dma_free_common()
145 hdev->asic_funcs->asic_dma_pool_free(hdev, cpu_addr, dma_handle); in hl_asic_dma_free_common()
149 trace_habanalabs_dma_free(hdev->dev, (u64) (uintptr_t) cpu_addr, dma_handle, size, caller); in hl_asic_dma_free_common()
152 void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, in hl_asic_dma_alloc_coherent_caller() argument
155 return hl_dma_alloc_common(hdev, size, dma_handle, flag, DMA_ALLOC_COHERENT, caller); in hl_asic_dma_alloc_coherent_caller()
158 void hl_asic_dma_free_coherent_caller(struct hl_device *hdev, size_t size, void *cpu_addr, in hl_asic_dma_free_coherent_caller() argument
161 hl_asic_dma_free_common(hdev, size, cpu_addr, dma_handle, DMA_ALLOC_COHERENT, caller); in hl_asic_dma_free_coherent_caller()
164 void *hl_cpu_accessible_dma_pool_alloc_caller(struct hl_device *hdev, size_t size, in hl_cpu_accessible_dma_pool_alloc_caller() argument
167 return hl_dma_alloc_common(hdev, size, dma_handle, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller); in hl_cpu_accessible_dma_pool_alloc_caller()
170 void hl_cpu_accessible_dma_pool_free_caller(struct hl_device *hdev, size_t size, void *vaddr, in hl_cpu_accessible_dma_pool_free_caller() argument
173 hl_asic_dma_free_common(hdev, size, vaddr, 0, DMA_ALLOC_CPU_ACCESSIBLE, caller); in hl_cpu_accessible_dma_pool_free_caller()
176 void *hl_asic_dma_pool_zalloc_caller(struct hl_device *hdev, size_t size, gfp_t mem_flags, in hl_asic_dma_pool_zalloc_caller() argument
179 return hl_dma_alloc_common(hdev, size, dma_handle, mem_flags, DMA_ALLOC_POOL, caller); in hl_asic_dma_pool_zalloc_caller()
182 void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_t dma_addr, in hl_asic_dma_pool_free_caller() argument
185 hl_asic_dma_free_common(hdev, 0, vaddr, dma_addr, DMA_ALLOC_POOL, caller); in hl_asic_dma_pool_free_caller()
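
The *_caller wrappers above all funnel into hl_dma_alloc_common()/hl_asic_dma_free_common(), which switch on an allocation type so that the trace_habanalabs_dma_alloc/free events fire from one place. A hedged userspace model of that dispatch-plus-trace shape (the enum values, macro, and calloc() backend are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    enum dma_alloc_type { ALLOC_COHERENT, ALLOC_CPU_ACCESSIBLE, ALLOC_POOL };

    /* Single choke point: every flavour allocates here, so one trace
     * call covers all of them. */
    static void *dma_alloc_common(size_t size, enum dma_alloc_type type,
                                  const char *caller)
    {
            void *ptr = NULL;

            switch (type) {
            case ALLOC_COHERENT:        /* asic_dma_alloc_coherent()       */
            case ALLOC_CPU_ACCESSIBLE:  /* cpu_accessible_dma_pool_alloc() */
            case ALLOC_POOL:            /* asic_dma_pool_zalloc()          */
                    ptr = calloc(1, size);  /* one stub for all backends */
                    break;
            }
            if (ptr)
                    printf("trace: %s allocated %zu bytes (type %d)\n",
                           caller, size, (int)type);
            return ptr;
    }

    /* Thin wrapper in the style of hl_asic_dma_alloc_coherent_caller(). */
    #define dma_alloc_coherent_traced(sz) \
            dma_alloc_common((sz), ALLOC_COHERENT, __func__)

    int main(void)
    {
            free(dma_alloc_coherent_traced(64));
            return 0;
    }
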
188 int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir) in hl_dma_map_sgtable() argument
190 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_dma_map_sgtable()
194 rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0); in hl_dma_map_sgtable()
206 void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir) in hl_dma_unmap_sgtable() argument
208 struct asic_fixed_properties *prop = &hdev->asic_prop; in hl_dma_unmap_sgtable()
217 dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0); in hl_dma_unmap_sgtable()
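
Both sg-table helpers read asic_prop around the core dma_map_sgtable()/dma_unmap_sgtable() calls; a common reason for that in device drivers is a device-side DMA offset that must be added to every mapped entry and stripped again before unmapping. A sketch of that bookkeeping under that assumption (the struct and offset value are stand-ins, not taken from the driver):

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for a mapped scatter-gather entry. */
    struct sg_entry { uint64_t dma_address; size_t len; };

    /* After mapping, shift every entry by the device-side offset; before
     * unmapping, shift back so the DMA core sees the addresses it issued. */
    static void shift_dma_addresses(struct sg_entry *e, size_t n, int64_t off)
    {
            for (size_t i = 0; i < n; i++)
                    e[i].dma_address += (uint64_t)off;
    }

    int main(void)
    {
            struct sg_entry tbl[2] = { { 0x1000, 64 }, { 0x2000, 64 } };

            shift_dma_addresses(tbl, 2, 0x100000000LL);   /* map side   */
            shift_dma_addresses(tbl, 2, -0x100000000LL);  /* unmap side */
            return tbl[0].dma_address == 0x1000 ? 0 : 1;
    }
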
228 int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val, in hl_access_cfg_region() argument
231 struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG]; in hl_access_cfg_region()
235 dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32)); in hl_access_cfg_region()
257 dev_err(hdev->dev, "access type %d is not supported\n", acc_type); in hl_access_cfg_region()
273 int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, in hl_access_dev_mem() argument
278 return hl_access_cfg_region(hdev, addr, val, acc_type); in hl_access_dev_mem()
281 return hl_access_sram_dram_region(hdev, addr, val, acc_type, in hl_access_dev_mem()
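
hl_access_dev_mem() is a thin dispatcher: PCI_REGION_CFG goes through hl_access_cfg_region(), which rejects addresses that are not u32-aligned, while SRAM/DRAM go through the BAR-window path. A small model of the dispatch and the alignment check (enum and function names here are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    enum region { REGION_CFG, REGION_SRAM, REGION_DRAM };

    /* Mirrors the u32-alignment check in hl_access_cfg_region(). */
    static int access_cfg(uint64_t addr)
    {
            if (addr & (sizeof(uint32_t) - 1)) {
                    fprintf(stderr, "address %#llx not a multiple of %zu\n",
                            (unsigned long long)addr, sizeof(uint32_t));
                    return -1;
            }
            return 0;
    }

    static int access_dev_mem(enum region r, uint64_t addr)
    {
            switch (r) {
            case REGION_CFG:
                    return access_cfg(addr);
            case REGION_SRAM:
            case REGION_DRAM:
                    return 0;   /* would take the BAR-window path */
            }
            return -1;
    }

    int main(void)
    {
            access_dev_mem(REGION_CFG, 0x1002);     /* triggers the error */
            return 0;
    }
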
314 enum hl_device_status hl_device_status(struct hl_device *hdev) in hl_device_status() argument
318 if (hdev->reset_info.in_reset) { in hl_device_status()
319 if (hdev->reset_info.in_compute_reset) in hl_device_status()
323 } else if (hdev->reset_info.needs_reset) { in hl_device_status()
325 } else if (hdev->disabled) { in hl_device_status()
327 } else if (!hdev->init_done) { in hl_device_status()
336 bool hl_device_operational(struct hl_device *hdev, in hl_device_operational() argument
341 current_status = hl_device_status(hdev); in hl_device_operational()
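
hl_device_status() resolves the state flags in strict priority order: an in-flight reset (compute or hard) wins, then needs_reset, then disabled, then not-yet-initialized, and only then operational; hl_device_operational() just compares the result. A compact model of that ordering, with illustrative enum names:

    #include <stdbool.h>

    enum dev_status {
            DEV_OPERATIONAL, DEV_IN_RESET, DEV_IN_COMPUTE_RESET,
            DEV_NEEDS_RESET, DEV_MALFUNCTION, DEV_IN_DEVICE_CREATION,
    };

    struct dev_state {
            bool in_reset, in_compute_reset, needs_reset, disabled, init_done;
    };

    /* Priority order matters: a reset in flight wins over everything. */
    static enum dev_status device_status(const struct dev_state *s)
    {
            if (s->in_reset)
                    return s->in_compute_reset ? DEV_IN_COMPUTE_RESET
                                               : DEV_IN_RESET;
            if (s->needs_reset)
                    return DEV_NEEDS_RESET;
            if (s->disabled)
                    return DEV_MALFUNCTION;
            if (!s->init_done)
                    return DEV_IN_DEVICE_CREATION;
            return DEV_OPERATIONAL;
    }

    int main(void)
    {
            struct dev_state s = { .init_done = true };

            return device_status(&s) == DEV_OPERATIONAL ? 0 : 1;
    }
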
363 struct hl_device *hdev; in hpriv_release() local
367 hdev = hpriv->hdev; in hpriv_release()
369 hdev->asic_funcs->send_device_activity(hdev, false); in hpriv_release()
378 if ((!hdev->pldm) && (hdev->pdev) && in hpriv_release()
379 (!hdev->asic_funcs->is_device_idle(hdev, in hpriv_release()
382 dev_err(hdev->dev, in hpriv_release()
398 mutex_lock(&hdev->fpriv_list_lock); in hpriv_release()
400 mutex_unlock(&hdev->fpriv_list_lock); in hpriv_release()
402 if (!device_is_idle || hdev->reset_upon_device_release) { in hpriv_release()
403 hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); in hpriv_release()
405 int rc = hdev->asic_funcs->scrub_device_mem(hdev); in hpriv_release()
408 dev_err(hdev->dev, "failed to scrub memory from hpriv release (%d)\n", rc); in hpriv_release()
415 mutex_lock(&hdev->fpriv_list_lock); in hpriv_release()
416 hdev->is_compute_ctx_active = false; in hpriv_release()
417 mutex_unlock(&hdev->fpriv_list_lock); in hpriv_release()
419 hdev->compute_ctx_in_release = 0; in hpriv_release()
451 struct hl_device *hdev = hpriv->hdev; in hl_device_release() local
455 if (!hdev) { in hl_device_release()
464 hl_release_pending_user_interrupts(hpriv->hdev); in hl_device_release()
466 hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr); in hl_device_release()
469 hdev->compute_ctx_in_release = 1; in hl_device_release()
472 dev_notice(hdev->dev, in hl_device_release()
475 hdev->last_open_session_duration_jif = in hl_device_release()
476 jiffies - hdev->last_successful_open_jif; in hl_device_release()
484 struct hl_device *hdev = hpriv->hdev; in hl_device_release_ctrl() local
488 if (!hdev) { in hl_device_release_ctrl()
493 mutex_lock(&hdev->fpriv_ctrl_list_lock); in hl_device_release_ctrl()
495 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in hl_device_release_ctrl()
521 struct hl_device *hdev = hpriv->hdev; in hl_mmap() local
524 if (!hdev) { in hl_mmap()
578 static int device_init_cdev(struct hl_device *hdev, struct class *hclass, in device_init_cdev() argument
591 (*dev)->devt = MKDEV(hdev->major, minor); in device_init_cdev()
594 dev_set_drvdata(*dev, hdev); in device_init_cdev()
600 static int device_cdev_sysfs_add(struct hl_device *hdev) in device_cdev_sysfs_add() argument
604 rc = cdev_device_add(&hdev->cdev, hdev->dev); in device_cdev_sysfs_add()
606 dev_err(hdev->dev, in device_cdev_sysfs_add()
611 rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl); in device_cdev_sysfs_add()
613 dev_err(hdev->dev, in device_cdev_sysfs_add()
619 rc = hl_sysfs_init(hdev); in device_cdev_sysfs_add()
621 dev_err(hdev->dev, "failed to initialize sysfs\n"); in device_cdev_sysfs_add()
625 hdev->cdev_sysfs_created = true; in device_cdev_sysfs_add()
630 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); in device_cdev_sysfs_add()
632 cdev_device_del(&hdev->cdev, hdev->dev); in device_cdev_sysfs_add()
636 static void device_cdev_sysfs_del(struct hl_device *hdev) in device_cdev_sysfs_del() argument
638 if (!hdev->cdev_sysfs_created) in device_cdev_sysfs_del()
641 hl_sysfs_fini(hdev); in device_cdev_sysfs_del()
642 cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl); in device_cdev_sysfs_del()
643 cdev_device_del(&hdev->cdev, hdev->dev); in device_cdev_sysfs_del()
646 put_device(hdev->dev); in device_cdev_sysfs_del()
647 put_device(hdev->dev_ctrl); in device_cdev_sysfs_del()
654 struct hl_device *hdev = device_reset_work->hdev; in device_hard_reset_pending() local
660 rc = hl_device_reset(hdev, flags); in device_hard_reset_pending()
661 if ((rc == -EBUSY) && !hdev->device_fini_pending) { in device_hard_reset_pending()
662 dev_info(hdev->dev, in device_hard_reset_pending()
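
device_hard_reset_pending() retries: if hl_device_reset() returns -EBUSY and device teardown has not started (device_fini_pending), the work re-arms itself. A userspace model of that retry loop, with a sleep standing in for re-queuing the delayed work (the three-attempt stub is illustrative):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static bool fini_pending;               /* set once teardown starts */

    static int try_reset(int attempt)
    {
            return attempt < 3 ? -EBUSY : 0;  /* busy while FDs remain */
    }

    int main(void)
    {
            /* On -EBUSY, re-arm the work (modelled as a delayed loop)
             * unless device removal has already begun. */
            for (int attempt = 0; ; attempt++) {
                    int rc = try_reset(attempt);

                    if (rc != -EBUSY || fini_pending)
                            break;
                    printf("could not reset device, retrying\n");
                    usleep(1000);   /* stands in for queue_delayed_work() */
            }
            return 0;
    }
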
680 static int device_early_init(struct hl_device *hdev) in device_early_init() argument
685 switch (hdev->asic_type) { in device_early_init()
687 goya_set_asic_funcs(hdev); in device_early_init()
688 strscpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name)); in device_early_init()
691 gaudi_set_asic_funcs(hdev); in device_early_init()
692 strscpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name)); in device_early_init()
695 gaudi_set_asic_funcs(hdev); in device_early_init()
696 strscpy(hdev->asic_name, "GAUDI SEC", sizeof(hdev->asic_name)); in device_early_init()
699 gaudi2_set_asic_funcs(hdev); in device_early_init()
700 strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); in device_early_init()
703 gaudi2_set_asic_funcs(hdev); in device_early_init()
704 strscpy(hdev->asic_name, "GAUDI2 SEC", sizeof(hdev->asic_name)); in device_early_init()
707 dev_err(hdev->dev, "Unrecognized ASIC type %d\n", in device_early_init()
708 hdev->asic_type); in device_early_init()
712 rc = hdev->asic_funcs->early_init(hdev); in device_early_init()
716 rc = hl_asid_init(hdev); in device_early_init()
720 if (hdev->asic_prop.completion_queues_count) { in device_early_init()
721 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count, in device_early_init()
724 if (!hdev->cq_wq) { in device_early_init()
730 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) { in device_early_init()
732 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name); in device_early_init()
733 if (hdev->cq_wq[i] == NULL) { in device_early_init()
734 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n"); in device_early_init()
740 hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); in device_early_init()
741 if (hdev->eq_wq == NULL) { in device_early_init()
742 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); in device_early_init()
747 hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0); in device_early_init()
748 if (!hdev->cs_cmplt_wq) { in device_early_init()
749 dev_err(hdev->dev, in device_early_init()
755 hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0); in device_early_init()
756 if (!hdev->ts_free_obj_wq) { in device_early_init()
757 dev_err(hdev->dev, in device_early_init()
763 hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); in device_early_init()
764 if (!hdev->pf_wq) { in device_early_init()
765 dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); in device_early_init()
770 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info), in device_early_init()
772 if (!hdev->hl_chip_info) { in device_early_init()
777 rc = hl_mmu_if_set_funcs(hdev); in device_early_init()
781 hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); in device_early_init()
783 hdev->device_reset_work.wq = in device_early_init()
785 if (!hdev->device_reset_work.wq) { in device_early_init()
787 dev_err(hdev->dev, "Failed to create device reset WQ\n"); in device_early_init()
791 INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, in device_early_init()
793 hdev->device_reset_work.hdev = hdev; in device_early_init()
794 hdev->device_fini_pending = 0; in device_early_init()
796 mutex_init(&hdev->send_cpu_message_lock); in device_early_init()
797 mutex_init(&hdev->debug_lock); in device_early_init()
798 INIT_LIST_HEAD(&hdev->cs_mirror_list); in device_early_init()
799 spin_lock_init(&hdev->cs_mirror_lock); in device_early_init()
800 spin_lock_init(&hdev->reset_info.lock); in device_early_init()
801 INIT_LIST_HEAD(&hdev->fpriv_list); in device_early_init()
802 INIT_LIST_HEAD(&hdev->fpriv_ctrl_list); in device_early_init()
803 mutex_init(&hdev->fpriv_list_lock); in device_early_init()
804 mutex_init(&hdev->fpriv_ctrl_list_lock); in device_early_init()
805 mutex_init(&hdev->clk_throttling.lock); in device_early_init()
810 hl_mem_mgr_fini(&hdev->kernel_mem_mgr); in device_early_init()
812 kfree(hdev->hl_chip_info); in device_early_init()
814 destroy_workqueue(hdev->pf_wq); in device_early_init()
816 destroy_workqueue(hdev->ts_free_obj_wq); in device_early_init()
818 destroy_workqueue(hdev->cs_cmplt_wq); in device_early_init()
820 destroy_workqueue(hdev->eq_wq); in device_early_init()
822 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in device_early_init()
823 if (hdev->cq_wq[i]) in device_early_init()
824 destroy_workqueue(hdev->cq_wq[i]); in device_early_init()
825 kfree(hdev->cq_wq); in device_early_init()
827 hl_asid_fini(hdev); in device_early_init()
829 if (hdev->asic_funcs->early_fini) in device_early_init()
830 hdev->asic_funcs->early_fini(hdev); in device_early_init()
841 static void device_early_fini(struct hl_device *hdev) in device_early_fini() argument
845 mutex_destroy(&hdev->debug_lock); in device_early_fini()
846 mutex_destroy(&hdev->send_cpu_message_lock); in device_early_fini()
848 mutex_destroy(&hdev->fpriv_list_lock); in device_early_fini()
849 mutex_destroy(&hdev->fpriv_ctrl_list_lock); in device_early_fini()
851 mutex_destroy(&hdev->clk_throttling.lock); in device_early_fini()
853 hl_mem_mgr_fini(&hdev->kernel_mem_mgr); in device_early_fini()
855 kfree(hdev->hl_chip_info); in device_early_fini()
857 destroy_workqueue(hdev->pf_wq); in device_early_fini()
858 destroy_workqueue(hdev->ts_free_obj_wq); in device_early_fini()
859 destroy_workqueue(hdev->cs_cmplt_wq); in device_early_fini()
860 destroy_workqueue(hdev->eq_wq); in device_early_fini()
861 destroy_workqueue(hdev->device_reset_work.wq); in device_early_fini()
863 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in device_early_fini()
864 destroy_workqueue(hdev->cq_wq[i]); in device_early_fini()
865 kfree(hdev->cq_wq); in device_early_fini()
867 hl_asid_fini(hdev); in device_early_fini()
869 if (hdev->asic_funcs->early_fini) in device_early_fini()
870 hdev->asic_funcs->early_fini(hdev); in device_early_fini()
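
device_early_init() uses the kernel's usual goto-unwind: each failure label releases everything acquired before it, in reverse order, and device_early_fini() repeats the same teardown for the success path (compare the two destroy_workqueue()/kfree() sequences above). A minimal sketch of the pattern:

    #include <stdlib.h>

    struct ctx { void *a, *b, *c; };

    /* Acquire a, then b, then c; each failure label unwinds exactly
     * what was acquired before it, in reverse order. */
    static int early_init(struct ctx *x)
    {
            x->a = malloc(16);
            if (!x->a)
                    goto out;
            x->b = malloc(16);
            if (!x->b)
                    goto free_a;
            x->c = malloc(16);
            if (!x->c)
                    goto free_b;
            return 0;

    free_b:
            free(x->b);
    free_a:
            free(x->a);
    out:
            return -1;
    }

    /* The matching fini repeats the same teardown, newest first. */
    static void early_fini(struct ctx *x)
    {
            free(x->c);
            free(x->b);
            free(x->a);
    }

    int main(void)
    {
            struct ctx x = { 0 };

            if (!early_init(&x))
                    early_fini(&x);
            return 0;
    }
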
875 struct hl_device *hdev = container_of(work, struct hl_device, in hl_device_heartbeat() local
878 if (!hl_device_operational(hdev, NULL)) in hl_device_heartbeat()
881 if (!hdev->asic_funcs->send_heartbeat(hdev)) in hl_device_heartbeat()
884 if (hl_device_operational(hdev, NULL)) in hl_device_heartbeat()
885 dev_err(hdev->dev, "Device heartbeat failed!\n"); in hl_device_heartbeat()
887 hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); in hl_device_heartbeat()
902 if (!hdev->reset_info.in_reset) in hl_device_heartbeat()
903 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT; in hl_device_heartbeat()
905 schedule_delayed_work(&hdev->work_heartbeat, in hl_device_heartbeat()
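
The heartbeat work pings the firmware; a miss while the device still reports operational logs an error and escalates to a hard reset flagged HL_DRV_RESET_HEARTBEAT, otherwise the work re-arms itself via schedule_delayed_work(). A stripped-down model of one tick (the stub ping always succeeds):

    #include <stdbool.h>
    #include <stdio.h>

    static bool send_heartbeat_ok(void) { return true; }  /* stub F/W ping */

    /* One heartbeat tick: a miss while the device still claims to be
     * operational escalates to a hard reset instead of re-arming. */
    static void heartbeat_tick(bool operational, bool *want_hard_reset)
    {
            if (!operational)
                    return;
            if (send_heartbeat_ok())
                    return;         /* would re-arm the delayed work */
            fprintf(stderr, "Device heartbeat failed!\n");
            *want_hard_reset = true;
    }

    int main(void)
    {
            bool reset = false;

            heartbeat_tick(true, &reset);
            return reset ? 1 : 0;
    }
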
917 static int device_late_init(struct hl_device *hdev) in device_late_init() argument
921 if (hdev->asic_funcs->late_init) { in device_late_init()
922 rc = hdev->asic_funcs->late_init(hdev); in device_late_init()
924 dev_err(hdev->dev, in device_late_init()
930 hdev->high_pll = hdev->asic_prop.high_pll; in device_late_init()
932 if (hdev->heartbeat) { in device_late_init()
933 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); in device_late_init()
934 schedule_delayed_work(&hdev->work_heartbeat, in device_late_init()
938 hdev->late_init_done = true; in device_late_init()
949 static void device_late_fini(struct hl_device *hdev) in device_late_fini() argument
951 if (!hdev->late_init_done) in device_late_fini()
954 if (hdev->heartbeat) in device_late_fini()
955 cancel_delayed_work_sync(&hdev->work_heartbeat); in device_late_fini()
957 if (hdev->asic_funcs->late_fini) in device_late_fini()
958 hdev->asic_funcs->late_fini(hdev); in device_late_fini()
960 hdev->late_init_done = false; in device_late_fini()
963 int hl_device_utilization(struct hl_device *hdev, u32 *utilization) in hl_device_utilization() argument
968 max_power = hdev->max_power; in hl_device_utilization()
969 dc_power = hdev->asic_prop.dc_power_default; in hl_device_utilization()
970 rc = hl_fw_cpucp_power_get(hdev, &curr_power); in hl_device_utilization()
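
hl_device_utilization() works from three power numbers: the current reading fetched from the firmware via hl_fw_cpucp_power_get(), the idle (DC) power default, and the configured max. A plausible reading, sketched below, scales current power linearly between the DC floor and the max cap; the exact formula and clamping are an assumption, not quoted from the driver:

    #include <stdint.h>
    #include <stdio.h>

    /* Position of the current power draw between the idle (DC) floor and
     * the max cap, in percent. Clamping keeps out-of-range readings from
     * wrapping the scale; assumes max > dc. */
    static uint32_t utilization_pct(uint64_t curr, uint64_t dc, uint64_t max)
    {
            if (curr < dc)
                    curr = dc;
            if (curr > max)
                    curr = max;
            return (uint32_t)((curr - dc) * 100 / (max - dc));
    }

    int main(void)
    {
            printf("%u%%\n", utilization_pct(150, 100, 300)); /* -> 25% */
            return 0;
    }
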
983 int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable) in hl_device_set_debug_mode() argument
987 mutex_lock(&hdev->debug_lock); in hl_device_set_debug_mode()
990 if (!hdev->in_debug) { in hl_device_set_debug_mode()
991 dev_err(hdev->dev, in hl_device_set_debug_mode()
997 if (!hdev->reset_info.hard_reset_pending) in hl_device_set_debug_mode()
998 hdev->asic_funcs->halt_coresight(hdev, ctx); in hl_device_set_debug_mode()
1000 hdev->in_debug = 0; in hl_device_set_debug_mode()
1005 if (hdev->in_debug) { in hl_device_set_debug_mode()
1006 dev_err(hdev->dev, in hl_device_set_debug_mode()
1012 hdev->in_debug = 1; in hl_device_set_debug_mode()
1015 mutex_unlock(&hdev->debug_lock); in hl_device_set_debug_mode()
1020 static void take_release_locks(struct hl_device *hdev) in take_release_locks() argument
1025 hdev->asic_funcs->hw_queues_lock(hdev); in take_release_locks()
1026 hdev->asic_funcs->hw_queues_unlock(hdev); in take_release_locks()
1029 mutex_lock(&hdev->send_cpu_message_lock); in take_release_locks()
1030 mutex_unlock(&hdev->send_cpu_message_lock); in take_release_locks()
1033 mutex_lock(&hdev->fpriv_list_lock); in take_release_locks()
1034 mutex_unlock(&hdev->fpriv_list_lock); in take_release_locks()
1035 mutex_lock(&hdev->fpriv_ctrl_list_lock); in take_release_locks()
1036 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in take_release_locks()
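
take_release_locks() looks like a no-op but acts as a flush: after hdev->disabled (or the reset flag) is set, locking and immediately unlocking each lock guarantees no thread is still inside a critical section it entered before the flag flipped. A pthread model of the idiom:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
    static volatile bool disabled;

    /* Lock/unlock pairs act as a flush: once this returns, no thread is
     * still inside a critical section entered before `disabled` flipped. */
    static void take_release_locks(void)
    {
            pthread_mutex_lock(&io_lock);
            pthread_mutex_unlock(&io_lock);
    }

    int main(void)
    {
            disabled = true;        /* new entrants bail out early */
            take_release_locks();   /* wait out the old entrants   */
            return 0;
    }
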
1039 static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset, in cleanup_resources() argument
1043 device_late_fini(hdev); in cleanup_resources()
1050 hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset); in cleanup_resources()
1053 hl_cs_rollback_all(hdev, skip_wq_flush); in cleanup_resources()
1056 flush_workqueue(hdev->pf_wq); in cleanup_resources()
1061 hl_release_pending_user_interrupts(hdev); in cleanup_resources()
1073 int hl_device_suspend(struct hl_device *hdev) in hl_device_suspend() argument
1077 pci_save_state(hdev->pdev); in hl_device_suspend()
1080 spin_lock(&hdev->reset_info.lock); in hl_device_suspend()
1081 if (hdev->reset_info.in_reset) { in hl_device_suspend()
1082 spin_unlock(&hdev->reset_info.lock); in hl_device_suspend()
1083 dev_err(hdev->dev, "Can't suspend while in reset\n"); in hl_device_suspend()
1086 hdev->reset_info.in_reset = 1; in hl_device_suspend()
1087 spin_unlock(&hdev->reset_info.lock); in hl_device_suspend()
1090 hdev->disabled = true; in hl_device_suspend()
1092 take_release_locks(hdev); in hl_device_suspend()
1094 rc = hdev->asic_funcs->suspend(hdev); in hl_device_suspend()
1096 dev_err(hdev->dev, in hl_device_suspend()
1100 pci_disable_device(hdev->pdev); in hl_device_suspend()
1101 pci_set_power_state(hdev->pdev, PCI_D3hot); in hl_device_suspend()
1115 int hl_device_resume(struct hl_device *hdev) in hl_device_resume() argument
1119 pci_set_power_state(hdev->pdev, PCI_D0); in hl_device_resume()
1120 pci_restore_state(hdev->pdev); in hl_device_resume()
1121 rc = pci_enable_device_mem(hdev->pdev); in hl_device_resume()
1123 dev_err(hdev->dev, in hl_device_resume()
1128 pci_set_master(hdev->pdev); in hl_device_resume()
1130 rc = hdev->asic_funcs->resume(hdev); in hl_device_resume()
1132 dev_err(hdev->dev, "Failed to resume device after suspend\n"); in hl_device_resume()
1140 spin_lock(&hdev->reset_info.lock); in hl_device_resume()
1141 hdev->reset_info.in_reset = 0; in hl_device_resume()
1142 spin_unlock(&hdev->reset_info.lock); in hl_device_resume()
1144 rc = hl_device_reset(hdev, HL_DRV_RESET_HARD); in hl_device_resume()
1146 dev_err(hdev->dev, "Failed to reset device during resume\n"); in hl_device_resume()
1153 pci_clear_master(hdev->pdev); in hl_device_resume()
1154 pci_disable_device(hdev->pdev); in hl_device_resume()
1159 static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev) in device_kill_open_processes() argument
1167 fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; in device_kill_open_processes()
1168 fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; in device_kill_open_processes()
1179 if (hdev->process_kill_trial_cnt) { in device_kill_open_processes()
1197 dev_info(hdev->dev, "Killing user process pid=%d\n", in device_kill_open_processes()
1209 dev_dbg(hdev->dev, in device_kill_open_processes()
1227 dev_dbg(hdev->dev, in device_kill_open_processes()
1240 if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS) in device_kill_open_processes()
1243 hdev->process_kill_trial_cnt++; in device_kill_open_processes()
1248 static void device_disable_open_processes(struct hl_device *hdev, bool control_dev) in device_disable_open_processes() argument
1254 fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock; in device_disable_open_processes()
1255 fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list; in device_disable_open_processes()
1259 hpriv->hdev = NULL; in device_disable_open_processes()
1263 static void handle_reset_trigger(struct hl_device *hdev, u32 flags) in handle_reset_trigger() argument
1274 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT; in handle_reset_trigger()
1277 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR; in handle_reset_trigger()
1280 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; in handle_reset_trigger()
1283 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; in handle_reset_trigger()
1291 if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) { in handle_reset_trigger()
1292 hdev->reset_info.prev_reset_trigger = cur_reset_trigger; in handle_reset_trigger()
1293 hdev->reset_info.reset_trigger_repeated = 0; in handle_reset_trigger()
1295 hdev->reset_info.reset_trigger_repeated = 1; in handle_reset_trigger()
1315 if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) in handle_reset_trigger()
1316 dev_warn(hdev->dev, in handle_reset_trigger()
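
handle_reset_trigger() tracks whether the same cause fired twice in a row: a changed trigger updates prev_reset_trigger and clears the repeated flag, an identical one sets it, letting the next reset escalate. A small model of that bookkeeping (trigger names are illustrative):

    #include <stdbool.h>

    enum trigger { TRIG_DEFAULT, TRIG_HEARTBEAT, TRIG_TDR };

    struct reset_info {
            enum trigger prev;
            bool repeated;
    };

    /* A reset fired by the same trigger twice in a row is flagged so the
     * next attempt can escalate (e.g. request a deeper reset). */
    static void note_trigger(struct reset_info *ri, enum trigger cur)
    {
            if (ri->prev != cur) {
                    ri->prev = cur;
                    ri->repeated = false;
            } else {
                    ri->repeated = true;
            }
    }

    int main(void)
    {
            struct reset_info ri = { TRIG_DEFAULT, false };

            note_trigger(&ri, TRIG_HEARTBEAT);
            note_trigger(&ri, TRIG_HEARTBEAT);
            return ri.repeated ? 0 : 1;
    }
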
1337 int hl_device_reset(struct hl_device *hdev, u32 flags) in hl_device_reset() argument
1346 if (!hdev->init_done) { in hl_device_reset()
1347 dev_err(hdev->dev, "Can't reset before initialization is done\n"); in hl_device_reset()
1357 if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { in hl_device_reset()
1362 if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) { in hl_device_reset()
1364 dev_crit(hdev->dev, in hl_device_reset()
1374 if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) { in hl_device_reset()
1380 dev_dbg(hdev->dev, "Doing hard-reset instead of compute reset\n"); in hl_device_reset()
1384 if (from_hard_reset_thread && hdev->process_kill_trial_cnt) in hl_device_reset()
1394 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1395 if (hdev->reset_info.in_reset) { in hl_device_reset()
1397 if (hard_reset && hdev->reset_info.in_compute_reset) in hl_device_reset()
1398 hdev->reset_info.hard_reset_schedule_flags = flags; in hl_device_reset()
1399 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1406 hdev->reset_info.in_compute_reset = !hard_reset; in hl_device_reset()
1408 hdev->reset_info.in_reset = 1; in hl_device_reset()
1410 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1415 handle_reset_trigger(hdev, flags); in hl_device_reset()
1418 hdev->disabled = true; in hl_device_reset()
1420 take_release_locks(hdev); in hl_device_reset()
1423 dev_info(hdev->dev, "Going to reset device\n"); in hl_device_reset()
1425 dev_dbg(hdev->dev, "Going to reset device after release by user\n"); in hl_device_reset()
1427 dev_dbg(hdev->dev, "Going to reset engines of inference device\n"); in hl_device_reset()
1432 hdev->reset_info.hard_reset_pending = true; in hl_device_reset()
1434 hdev->process_kill_trial_cnt = 0; in hl_device_reset()
1436 hdev->device_reset_work.flags = flags; in hl_device_reset()
1442 queue_delayed_work(hdev->device_reset_work.wq, in hl_device_reset()
1443 &hdev->device_reset_work.reset_work, 0); in hl_device_reset()
1448 cleanup_resources(hdev, hard_reset, fw_reset, skip_wq_flush); in hl_device_reset()
1456 rc = device_kill_open_processes(hdev, 0, false); in hl_device_reset()
1459 if (hdev->device_fini_pending) { in hl_device_reset()
1460 dev_crit(hdev->dev, in hl_device_reset()
1470 dev_crit(hdev->dev, in hl_device_reset()
1478 flush_workqueue(hdev->eq_wq); in hl_device_reset()
1482 hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset); in hl_device_reset()
1485 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; in hl_device_reset()
1488 if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) in hl_device_reset()
1489 hdev->kernel_ctx = NULL; in hl_device_reset()
1491 hl_vm_fini(hdev); in hl_device_reset()
1492 hl_mmu_fini(hdev); in hl_device_reset()
1493 hl_eq_reset(hdev, &hdev->event_queue); in hl_device_reset()
1497 hl_hw_queue_reset(hdev, hard_reset); in hl_device_reset()
1498 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in hl_device_reset()
1499 hl_cq_reset(hdev, &hdev->completion_queue[i]); in hl_device_reset()
1502 ctx = hl_get_compute_ctx(hdev); in hl_device_reset()
1512 hdev->device_cpu_disabled = false; in hl_device_reset()
1513 hdev->reset_info.hard_reset_pending = false; in hl_device_reset()
1515 if (hdev->reset_info.reset_trigger_repeated && in hl_device_reset()
1516 (hdev->reset_info.prev_reset_trigger == in hl_device_reset()
1521 dev_crit(hdev->dev, in hl_device_reset()
1527 if (hdev->kernel_ctx) { in hl_device_reset()
1528 dev_crit(hdev->dev, in hl_device_reset()
1534 rc = hl_mmu_init(hdev); in hl_device_reset()
1536 dev_err(hdev->dev, in hl_device_reset()
1542 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), in hl_device_reset()
1544 if (!hdev->kernel_ctx) { in hl_device_reset()
1546 hl_mmu_fini(hdev); in hl_device_reset()
1550 hdev->is_compute_ctx_active = false; in hl_device_reset()
1552 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); in hl_device_reset()
1554 dev_err(hdev->dev, in hl_device_reset()
1556 kfree(hdev->kernel_ctx); in hl_device_reset()
1557 hdev->kernel_ctx = NULL; in hl_device_reset()
1558 hl_mmu_fini(hdev); in hl_device_reset()
1567 hdev->disabled = false; in hl_device_reset()
1571 rc = hl_fw_read_preboot_status(hdev); in hl_device_reset()
1576 rc = hdev->asic_funcs->hw_init(hdev); in hl_device_reset()
1578 dev_err(hdev->dev, "failed to initialize the H/W after reset\n"); in hl_device_reset()
1583 if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, in hl_device_reset()
1585 dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n", in hl_device_reset()
1592 rc = hdev->asic_funcs->test_queues(hdev); in hl_device_reset()
1594 dev_err(hdev->dev, "Failed to detect if device is alive after reset\n"); in hl_device_reset()
1599 rc = device_late_init(hdev); in hl_device_reset()
1601 dev_err(hdev->dev, "Failed late init after hard reset\n"); in hl_device_reset()
1605 rc = hl_vm_init(hdev); in hl_device_reset()
1607 dev_err(hdev->dev, "Failed to init memory module after hard reset\n"); in hl_device_reset()
1611 if (!hdev->asic_prop.fw_security_enabled) in hl_device_reset()
1612 hl_fw_set_max_power(hdev); in hl_device_reset()
1614 rc = hdev->asic_funcs->compute_reset_late_init(hdev); in hl_device_reset()
1617 dev_err(hdev->dev, in hl_device_reset()
1620 dev_err(hdev->dev, "Failed late init after compute reset\n"); in hl_device_reset()
1625 rc = hdev->asic_funcs->scrub_device_mem(hdev); in hl_device_reset()
1627 dev_err(hdev->dev, "scrub mem failed from device reset (%d)\n", rc); in hl_device_reset()
1631 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1632 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1638 if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags) in hl_device_reset()
1641 hdev->reset_info.in_reset = 0; in hl_device_reset()
1643 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1645 hdev->reset_info.needs_reset = false; in hl_device_reset()
1648 dev_info(hdev->dev, "Successfully finished resetting the device\n"); in hl_device_reset()
1650 dev_dbg(hdev->dev, "Successfully finished resetting the device\n"); in hl_device_reset()
1653 hdev->reset_info.hard_reset_cnt++; in hl_device_reset()
1660 hdev->asic_funcs->enable_events_from_fw(hdev); in hl_device_reset()
1662 hdev->reset_info.compute_reset_cnt++; in hl_device_reset()
1666 dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); in hl_device_reset()
1667 flags = hdev->reset_info.hard_reset_schedule_flags; in hl_device_reset()
1668 hdev->reset_info.hard_reset_schedule_flags = 0; in hl_device_reset()
1669 hdev->disabled = true; in hl_device_reset()
1671 handle_reset_trigger(hdev, flags); in hl_device_reset()
1678 hdev->disabled = true; in hl_device_reset()
1680 spin_lock(&hdev->reset_info.lock); in hl_device_reset()
1681 hdev->reset_info.in_compute_reset = 0; in hl_device_reset()
1684 dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n"); in hl_device_reset()
1685 hdev->reset_info.hard_reset_cnt++; in hl_device_reset()
1687 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1688 dev_err(hdev->dev, "Failed to reset device after user release\n"); in hl_device_reset()
1694 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1695 dev_err(hdev->dev, "Failed to do compute reset\n"); in hl_device_reset()
1696 hdev->reset_info.compute_reset_cnt++; in hl_device_reset()
1702 hdev->reset_info.in_reset = 0; in hl_device_reset()
1704 spin_unlock(&hdev->reset_info.lock); in hl_device_reset()
1727 void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask) in hl_notifier_event_send_all() argument
1731 mutex_lock(&hdev->fpriv_list_lock); in hl_notifier_event_send_all()
1733 list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) in hl_notifier_event_send_all()
1736 mutex_unlock(&hdev->fpriv_list_lock); in hl_notifier_event_send_all()
1739 mutex_lock(&hdev->fpriv_ctrl_list_lock); in hl_notifier_event_send_all()
1741 list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node) in hl_notifier_event_send_all()
1744 mutex_unlock(&hdev->fpriv_ctrl_list_lock); in hl_notifier_event_send_all()
1756 int hl_device_init(struct hl_device *hdev, struct class *hclass) in hl_device_init() argument
1762 hdev->cdev_idx = hdev->id / 2; in hl_device_init()
1764 name = kasprintf(GFP_KERNEL, "hl%d", hdev->cdev_idx); in hl_device_init()
1771 rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name, in hl_device_init()
1772 &hdev->cdev, &hdev->dev); in hl_device_init()
1779 name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->cdev_idx); in hl_device_init()
1786 rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops, in hl_device_init()
1787 name, &hdev->cdev_ctrl, &hdev->dev_ctrl); in hl_device_init()
1795 rc = device_early_init(hdev); in hl_device_init()
1799 user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count + in hl_device_init()
1800 hdev->asic_prop.user_interrupt_count; in hl_device_init()
1803 hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt), in hl_device_init()
1805 if (!hdev->user_interrupt) { in hl_device_init()
1815 rc = hdev->asic_funcs->sw_init(hdev); in hl_device_init()
1821 hl_multi_cs_completion_init(hdev); in hl_device_init()
1828 rc = hl_hw_queues_create(hdev); in hl_device_init()
1830 dev_err(hdev->dev, "failed to initialize kernel queues\n"); in hl_device_init()
1834 cq_cnt = hdev->asic_prop.completion_queues_count; in hl_device_init()
1842 hdev->completion_queue = kcalloc(cq_cnt, in hl_device_init()
1843 sizeof(*hdev->completion_queue), in hl_device_init()
1846 if (!hdev->completion_queue) { in hl_device_init()
1847 dev_err(hdev->dev, in hl_device_init()
1855 rc = hl_cq_init(hdev, &hdev->completion_queue[i], in hl_device_init()
1856 hdev->asic_funcs->get_queue_id_for_cq(hdev, i)); in hl_device_init()
1858 dev_err(hdev->dev, in hl_device_init()
1862 hdev->completion_queue[i].cq_idx = i; in hl_device_init()
1865 hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs, in hl_device_init()
1867 if (!hdev->shadow_cs_queue) { in hl_device_init()
1877 rc = hl_eq_init(hdev, &hdev->event_queue); in hl_device_init()
1879 dev_err(hdev->dev, "failed to initialize event queue\n"); in hl_device_init()
1884 rc = hl_mmu_init(hdev); in hl_device_init()
1886 dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n"); in hl_device_init()
1891 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL); in hl_device_init()
1892 if (!hdev->kernel_ctx) { in hl_device_init()
1897 hdev->is_compute_ctx_active = false; in hl_device_init()
1899 hdev->asic_funcs->state_dump_init(hdev); in hl_device_init()
1901 hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; in hl_device_init()
1902 hl_debugfs_add_device(hdev); in hl_device_init()
1907 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); in hl_device_init()
1909 dev_err(hdev->dev, "failed to initialize kernel context\n"); in hl_device_init()
1910 kfree(hdev->kernel_ctx); in hl_device_init()
1914 rc = hl_cb_pool_init(hdev); in hl_device_init()
1916 dev_err(hdev->dev, "failed to initialize CB pool\n"); in hl_device_init()
1920 rc = hl_dec_init(hdev); in hl_device_init()
1922 dev_err(hdev->dev, "Failed to initialize the decoder module\n"); in hl_device_init()
1937 hdev->disabled = false; in hl_device_init()
1939 rc = hdev->asic_funcs->hw_init(hdev); in hl_device_init()
1941 dev_err(hdev->dev, "failed to initialize the H/W\n"); in hl_device_init()
1947 rc = hdev->asic_funcs->test_queues(hdev); in hl_device_init()
1949 dev_err(hdev->dev, "Failed to detect if device is alive\n"); in hl_device_init()
1954 rc = device_late_init(hdev); in hl_device_init()
1956 dev_err(hdev->dev, "Failed late initialization\n"); in hl_device_init()
1961 dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n", in hl_device_init()
1962 hdev->asic_name, in hl_device_init()
1963 hdev->asic_prop.dram_size / SZ_1G); in hl_device_init()
1965 rc = hl_vm_init(hdev); in hl_device_init()
1967 dev_err(hdev->dev, "Failed to initialize memory module\n"); in hl_device_init()
1978 rc = device_cdev_sysfs_add(hdev); in hl_device_init()
1980 dev_err(hdev->dev, in hl_device_init()
1989 if (hdev->asic_prop.set_max_power_on_device_init && in hl_device_init()
1990 !hdev->asic_prop.fw_security_enabled) in hl_device_init()
1991 hl_fw_set_max_power(hdev); in hl_device_init()
1999 rc = hl_hwmon_init(hdev); in hl_device_init()
2001 dev_err(hdev->dev, "Failed to initialize hwmon\n"); in hl_device_init()
2006 dev_notice(hdev->dev, in hl_device_init()
2009 hdev->init_done = true; in hl_device_init()
2016 hdev->asic_funcs->enable_events_from_fw(hdev); in hl_device_init()
2021 hl_cb_pool_fini(hdev); in hl_device_init()
2023 if (hl_ctx_put(hdev->kernel_ctx) != 1) in hl_device_init()
2024 dev_err(hdev->dev, in hl_device_init()
2027 hl_debugfs_remove_device(hdev); in hl_device_init()
2029 hl_mmu_fini(hdev); in hl_device_init()
2031 hl_eq_fini(hdev, &hdev->event_queue); in hl_device_init()
2033 kfree(hdev->shadow_cs_queue); in hl_device_init()
2036 hl_cq_fini(hdev, &hdev->completion_queue[i]); in hl_device_init()
2037 kfree(hdev->completion_queue); in hl_device_init()
2039 hl_hw_queues_destroy(hdev); in hl_device_init()
2041 hdev->asic_funcs->sw_fini(hdev); in hl_device_init()
2043 kfree(hdev->user_interrupt); in hl_device_init()
2045 device_early_fini(hdev); in hl_device_init()
2047 put_device(hdev->dev_ctrl); in hl_device_init()
2049 put_device(hdev->dev); in hl_device_init()
2051 hdev->disabled = true; in hl_device_init()
2053 device_cdev_sysfs_add(hdev); in hl_device_init()
2054 if (hdev->pdev) in hl_device_init()
2055 dev_err(&hdev->pdev->dev, in hl_device_init()
2057 hdev->cdev_idx); in hl_device_init()
2060 hdev->cdev_idx); in hl_device_init()
2072 void hl_device_fini(struct hl_device *hdev) in hl_device_fini() argument
2079 dev_info(hdev->dev, "Removing device\n"); in hl_device_fini()
2081 hdev->device_fini_pending = 1; in hl_device_fini()
2082 flush_delayed_work(&hdev->device_reset_work.reset_work); in hl_device_fini()
2084 if (hdev->pldm) in hl_device_fini()
2099 spin_lock(&hdev->reset_info.lock); in hl_device_fini()
2100 device_in_reset = !!hdev->reset_info.in_reset; in hl_device_fini()
2102 hdev->reset_info.in_reset = 1; in hl_device_fini()
2103 spin_unlock(&hdev->reset_info.lock); in hl_device_fini()
2108 spin_lock(&hdev->reset_info.lock); in hl_device_fini()
2109 device_in_reset = !!hdev->reset_info.in_reset; in hl_device_fini()
2111 hdev->reset_info.in_reset = 1; in hl_device_fini()
2112 spin_unlock(&hdev->reset_info.lock); in hl_device_fini()
2115 dev_crit(hdev->dev, in hl_device_fini()
2128 hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); in hl_device_fini()
2131 hdev->disabled = true; in hl_device_fini()
2133 take_release_locks(hdev); in hl_device_fini()
2135 hdev->reset_info.hard_reset_pending = true; in hl_device_fini()
2137 hl_hwmon_fini(hdev); in hl_device_fini()
2139 cleanup_resources(hdev, true, false, false); in hl_device_fini()
2145 dev_info(hdev->dev, in hl_device_fini()
2149 rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false); in hl_device_fini()
2151 dev_crit(hdev->dev, "Failed to kill all open processes\n"); in hl_device_fini()
2152 device_disable_open_processes(hdev, false); in hl_device_fini()
2155 rc = device_kill_open_processes(hdev, 0, true); in hl_device_fini()
2157 dev_crit(hdev->dev, "Failed to kill all control device open processes\n"); in hl_device_fini()
2158 device_disable_open_processes(hdev, true); in hl_device_fini()
2161 hl_cb_pool_fini(hdev); in hl_device_fini()
2164 hdev->asic_funcs->hw_fini(hdev, true, false); in hl_device_fini()
2166 hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE; in hl_device_fini()
2169 if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) in hl_device_fini()
2170 dev_err(hdev->dev, "kernel ctx is still alive\n"); in hl_device_fini()
2172 hl_debugfs_remove_device(hdev); in hl_device_fini()
2174 hl_dec_fini(hdev); in hl_device_fini()
2176 hl_vm_fini(hdev); in hl_device_fini()
2178 hl_mmu_fini(hdev); in hl_device_fini()
2180 hl_eq_fini(hdev, &hdev->event_queue); in hl_device_fini()
2182 kfree(hdev->shadow_cs_queue); in hl_device_fini()
2184 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) in hl_device_fini()
2185 hl_cq_fini(hdev, &hdev->completion_queue[i]); in hl_device_fini()
2186 kfree(hdev->completion_queue); in hl_device_fini()
2187 kfree(hdev->user_interrupt); in hl_device_fini()
2189 hl_hw_queues_destroy(hdev); in hl_device_fini()
2192 hdev->asic_funcs->sw_fini(hdev); in hl_device_fini()
2194 device_early_fini(hdev); in hl_device_fini()
2197 device_cdev_sysfs_del(hdev); in hl_device_fini()
2215 inline u32 hl_rreg(struct hl_device *hdev, u32 reg) in hl_rreg() argument
2217 return readl(hdev->rmmio + reg); in hl_rreg()
2230 inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) in hl_wreg() argument
2232 writel(val, hdev->rmmio + reg); in hl_wreg()
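
hl_rreg()/hl_wreg() reduce register access to readl()/writel() at an offset from the mapped rmmio base. A plain-memory model of the same shape (real MMIO additionally needs the ordering and byte-swapping guarantees that readl()/writel() provide):

    #include <stdint.h>

    /* Plain-memory stand-ins for hl_rreg()/hl_wreg(): a register access
     * is a 32-bit load/store at an offset from the mapped MMIO base. */
    static inline uint32_t rreg(volatile uint32_t *rmmio, uint32_t reg)
    {
            return rmmio[reg / 4];
    }

    static inline void wreg(volatile uint32_t *rmmio, uint32_t reg, uint32_t val)
    {
            rmmio[reg / 4] = val;
    }

    int main(void)
    {
            uint32_t regs[4] = { 0 };

            wreg(regs, 8, 0xdeadbeef);
            return rreg(regs, 8) == 0xdeadbeef ? 0 : 1;
    }
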