1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2021 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 
13 #include <linux/pci.h>
14 #include <linux/aer.h>
15 #include <linux/module.h>
16 
17 #define CREATE_TRACE_POINTS
18 #include <trace/events/habanalabs.h>
19 
/* Author string published through MODULE_AUTHOR() below */
#define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"

/* One-line description published through MODULE_DESCRIPTION() below */
#define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"

MODULE_AUTHOR(HL_DRIVER_AUTHOR);
MODULE_DESCRIPTION(HL_DRIVER_DESC);
MODULE_LICENSE("GPL v2");
27 
/* Char device major number, taken from alloc_chrdev_region() in hl_init() */
static int hl_major;
/* Device class created in hl_init() and passed to hl_device_init() on probe */
static struct class *hl_class;
/*
 * Maps minor numbers to their hl_device. create_hdev() registers every device
 * under two consecutive IDs (main and control) that point at the same hdev.
 */
static DEFINE_IDR(hl_devs_idr);
/* Held around the idr lookups, allocations and removals done in this file */
static DEFINE_MUTEX(hl_devs_idr_lock);

#define HL_DEFAULT_TIMEOUT_LOCKED	30	/* 30 seconds */
#define GAUDI_DEFAULT_TIMEOUT_LOCKED	600	/* 10 minutes */

/* Backing variables of the module parameters declared below */
static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
static int reset_on_lockup = 1;
static int memory_scrub;
static ulong boot_error_status_mask = ULONG_MAX;
40 
/*
 * All parameters are registered with mode 0444.
 *
 * NOTE(review): the description below advertises a 30s default, but
 * fixup_device_params_per_asic() replaces an untouched default with
 * GAUDI_DEFAULT_TIMEOUT_LOCKED (600s) on Gaudi devices.
 */
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
	"Device lockup timeout in seconds (0 = disabled, default 30s)");

module_param(reset_on_lockup, int, 0444);
MODULE_PARM_DESC(reset_on_lockup,
	"Do device reset on lockup (0 = no, 1 = yes, default yes)");

module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub,
	"Scrub device memory in various states (0 = no, 1 = yes, default no)");

module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask,
	"Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
56 
#define PCI_VENDOR_ID_HABANALABS	0x1da3

/* PCI device IDs; get_asic_type() maps each one to an hl_asic_type value */
#define PCI_IDS_GOYA			0x0001
#define PCI_IDS_GAUDI			0x1000
#define PCI_IDS_GAUDI_SEC		0x1010

#define PCI_IDS_GAUDI2			0x1020

/*
 * Devices this driver binds to. Every entry here has a matching case in
 * get_asic_type(); keep the two in sync when adding a new device.
 */
static const struct pci_device_id ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, ids);
73 
74 /*
75  * get_asic_type - translate device id to asic type
76  *
77  * @device: id of the PCI device
78  *
79  * Translate device id to asic type.
80  * In case of unidentified device, return -1
81  */
get_asic_type(u16 device)82 static enum hl_asic_type get_asic_type(u16 device)
83 {
84 	enum hl_asic_type asic_type;
85 
86 	switch (device) {
87 	case PCI_IDS_GOYA:
88 		asic_type = ASIC_GOYA;
89 		break;
90 	case PCI_IDS_GAUDI:
91 		asic_type = ASIC_GAUDI;
92 		break;
93 	case PCI_IDS_GAUDI_SEC:
94 		asic_type = ASIC_GAUDI_SEC;
95 		break;
96 	case PCI_IDS_GAUDI2:
97 		asic_type = ASIC_GAUDI2;
98 		break;
99 	default:
100 		asic_type = ASIC_INVALID;
101 		break;
102 	}
103 
104 	return asic_type;
105 }
106 
is_asic_secured(enum hl_asic_type asic_type)107 static bool is_asic_secured(enum hl_asic_type asic_type)
108 {
109 	switch (asic_type) {
110 	case ASIC_GAUDI_SEC:
111 		return true;
112 	default:
113 		return false;
114 	}
115 }
116 
117 /*
118  * hl_device_open - open function for habanalabs device
119  *
120  * @inode: pointer to inode structure
121  * @filp: pointer to file structure
122  *
123  * Called when process opens an habanalabs device.
124  */
hl_device_open(struct inode * inode,struct file * filp)125 int hl_device_open(struct inode *inode, struct file *filp)
126 {
127 	enum hl_device_status status;
128 	struct hl_device *hdev;
129 	struct hl_fpriv *hpriv;
130 	int rc;
131 
132 	mutex_lock(&hl_devs_idr_lock);
133 	hdev = idr_find(&hl_devs_idr, iminor(inode));
134 	mutex_unlock(&hl_devs_idr_lock);
135 
136 	if (!hdev) {
137 		pr_err("Couldn't find device %d:%d\n",
138 			imajor(inode), iminor(inode));
139 		return -ENXIO;
140 	}
141 
142 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
143 	if (!hpriv)
144 		return -ENOMEM;
145 
146 	hpriv->hdev = hdev;
147 	filp->private_data = hpriv;
148 	hpriv->filp = filp;
149 
150 	mutex_init(&hpriv->notifier_event.lock);
151 	mutex_init(&hpriv->restore_phase_mutex);
152 	mutex_init(&hpriv->ctx_lock);
153 	kref_init(&hpriv->refcount);
154 	nonseekable_open(inode, filp);
155 
156 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
157 	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
158 
159 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
160 
161 	mutex_lock(&hdev->fpriv_list_lock);
162 
163 	if (!hl_device_operational(hdev, &status)) {
164 		dev_dbg_ratelimited(hdev->dev,
165 			"Can't open %s because it is %s\n",
166 			dev_name(hdev->dev), hdev->status[status]);
167 
168 		if (status == HL_DEVICE_STATUS_IN_RESET ||
169 					status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
170 			rc = -EAGAIN;
171 		else
172 			rc = -EPERM;
173 
174 		goto out_err;
175 	}
176 
177 	if (hdev->is_in_dram_scrub) {
178 		dev_dbg_ratelimited(hdev->dev,
179 			"Can't open %s during dram scrub\n",
180 			dev_name(hdev->dev));
181 		rc = -EAGAIN;
182 		goto out_err;
183 	}
184 
185 	if (hdev->compute_ctx_in_release) {
186 		dev_dbg_ratelimited(hdev->dev,
187 			"Can't open %s because another user is still releasing it\n",
188 			dev_name(hdev->dev));
189 		rc = -EAGAIN;
190 		goto out_err;
191 	}
192 
193 	if (hdev->is_compute_ctx_active) {
194 		dev_dbg_ratelimited(hdev->dev,
195 			"Can't open %s because another user is working on it\n",
196 			dev_name(hdev->dev));
197 		rc = -EBUSY;
198 		goto out_err;
199 	}
200 
201 	rc = hl_ctx_create(hdev, hpriv);
202 	if (rc) {
203 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
204 		goto out_err;
205 	}
206 
207 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
208 	mutex_unlock(&hdev->fpriv_list_lock);
209 
210 	hdev->asic_funcs->send_device_activity(hdev, true);
211 
212 	hl_debugfs_add_file(hpriv);
213 
214 	atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
215 	atomic_set(&hdev->captured_err_info.razwi.write_enable, 1);
216 	hdev->captured_err_info.undef_opcode.write_enable = true;
217 
218 	hdev->open_counter++;
219 	hdev->last_successful_open_jif = jiffies;
220 	hdev->last_successful_open_ktime = ktime_get();
221 
222 	return 0;
223 
224 out_err:
225 	mutex_unlock(&hdev->fpriv_list_lock);
226 	hl_mem_mgr_fini(&hpriv->mem_mgr);
227 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
228 	filp->private_data = NULL;
229 	mutex_destroy(&hpriv->ctx_lock);
230 	mutex_destroy(&hpriv->restore_phase_mutex);
231 	mutex_destroy(&hpriv->notifier_event.lock);
232 	put_pid(hpriv->taskpid);
233 
234 	kfree(hpriv);
235 
236 	return rc;
237 }
238 
hl_device_open_ctrl(struct inode * inode,struct file * filp)239 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
240 {
241 	struct hl_device *hdev;
242 	struct hl_fpriv *hpriv;
243 	int rc;
244 
245 	mutex_lock(&hl_devs_idr_lock);
246 	hdev = idr_find(&hl_devs_idr, iminor(inode));
247 	mutex_unlock(&hl_devs_idr_lock);
248 
249 	if (!hdev) {
250 		pr_err("Couldn't find device %d:%d\n",
251 			imajor(inode), iminor(inode));
252 		return -ENXIO;
253 	}
254 
255 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
256 	if (!hpriv)
257 		return -ENOMEM;
258 
259 	/* Prevent other routines from reading partial hpriv data by
260 	 * initializing hpriv fields before inserting it to the list
261 	 */
262 	hpriv->hdev = hdev;
263 	filp->private_data = hpriv;
264 	hpriv->filp = filp;
265 
266 	mutex_init(&hpriv->notifier_event.lock);
267 	nonseekable_open(inode, filp);
268 
269 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
270 
271 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
272 
273 	if (!hl_device_operational(hdev, NULL)) {
274 		dev_dbg_ratelimited(hdev->dev_ctrl,
275 			"Can't open %s because it is disabled or in reset\n",
276 			dev_name(hdev->dev_ctrl));
277 		rc = -EPERM;
278 		goto out_err;
279 	}
280 
281 	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
282 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
283 
284 	return 0;
285 
286 out_err:
287 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
288 	filp->private_data = NULL;
289 	put_pid(hpriv->taskpid);
290 
291 	kfree(hpriv);
292 
293 	return rc;
294 }
295 
set_driver_behavior_per_device(struct hl_device * hdev)296 static void set_driver_behavior_per_device(struct hl_device *hdev)
297 {
298 	hdev->nic_ports_mask = 0;
299 	hdev->fw_components = FW_TYPE_ALL_TYPES;
300 	hdev->mmu_enable = MMU_EN_ALL;
301 	hdev->cpu_queues_enable = 1;
302 	hdev->pldm = 0;
303 	hdev->hard_reset_on_fw_events = 1;
304 	hdev->bmc_enable = 1;
305 	hdev->reset_on_preboot_fail = 1;
306 	hdev->heartbeat = 1;
307 }
308 
copy_kernel_module_params_to_device(struct hl_device * hdev)309 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
310 {
311 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
312 
313 	hdev->major = hl_major;
314 	hdev->memory_scrub = memory_scrub;
315 	hdev->reset_on_lockup = reset_on_lockup;
316 	hdev->boot_error_status_mask = boot_error_status_mask;
317 }
318 
fixup_device_params_per_asic(struct hl_device * hdev,int timeout)319 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
320 {
321 	switch (hdev->asic_type) {
322 	case ASIC_GAUDI:
323 	case ASIC_GAUDI_SEC:
324 		/* If user didn't request a different timeout than the default one, we have
325 		 * a different default timeout for Gaudi
326 		 */
327 		if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
328 			hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
329 										MSEC_PER_SEC);
330 
331 		hdev->reset_upon_device_release = 0;
332 		break;
333 
334 	case ASIC_GOYA:
335 		hdev->reset_upon_device_release = 0;
336 		break;
337 
338 	default:
339 		hdev->reset_upon_device_release = 1;
340 		break;
341 	}
342 }
343 
fixup_device_params(struct hl_device * hdev)344 static int fixup_device_params(struct hl_device *hdev)
345 {
346 	int tmp_timeout;
347 
348 	tmp_timeout = timeout_locked;
349 
350 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
351 	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
352 
353 	if (tmp_timeout)
354 		hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
355 	else
356 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
357 
358 	hdev->stop_on_err = true;
359 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
360 	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
361 
362 	/* Enable only after the initialization of the device */
363 	hdev->disabled = true;
364 
365 	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
366 			(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
367 		pr_err("Preboot must be set along with other components");
368 		return -EINVAL;
369 	}
370 
371 	/* If CPU queues not enabled, no way to do heartbeat */
372 	if (!hdev->cpu_queues_enable)
373 		hdev->heartbeat = 0;
374 
375 	fixup_device_params_per_asic(hdev, tmp_timeout);
376 
377 	return 0;
378 }
379 
380 /**
381  * create_hdev - create habanalabs device instance
382  *
383  * @dev: will hold the pointer to the new habanalabs device structure
384  * @pdev: pointer to the pci device
385  *
386  * Allocate memory for habanalabs device and initialize basic fields
387  * Identify the ASIC type
388  * Allocate ID (minor) for the device (only for real devices)
389  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev)390 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
391 {
392 	int main_id, ctrl_id = 0, rc = 0;
393 	struct hl_device *hdev;
394 
395 	*dev = NULL;
396 
397 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
398 	if (!hdev)
399 		return -ENOMEM;
400 
401 	/* Will be NULL in case of simulator device */
402 	hdev->pdev = pdev;
403 
404 	/* Assign status description string */
405 	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
406 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
407 	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
408 	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
409 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
410 					"in device creation", HL_STR_MAX);
411 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
412 					"in reset after device release", HL_STR_MAX);
413 
414 
415 	/* First, we must find out which ASIC are we handling. This is needed
416 	 * to configure the behavior of the driver (kernel parameters)
417 	 */
418 	hdev->asic_type = get_asic_type(pdev->device);
419 	if (hdev->asic_type == ASIC_INVALID) {
420 		dev_err(&pdev->dev, "Unsupported ASIC\n");
421 		rc = -ENODEV;
422 		goto free_hdev;
423 	}
424 
425 	copy_kernel_module_params_to_device(hdev);
426 
427 	set_driver_behavior_per_device(hdev);
428 
429 	fixup_device_params(hdev);
430 
431 	mutex_lock(&hl_devs_idr_lock);
432 
433 	/* Always save 2 numbers, 1 for main device and 1 for control.
434 	 * They must be consecutive
435 	 */
436 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
437 
438 	if (main_id >= 0)
439 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
440 					main_id + 2, GFP_KERNEL);
441 
442 	mutex_unlock(&hl_devs_idr_lock);
443 
444 	if ((main_id < 0) || (ctrl_id < 0)) {
445 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
446 			pr_err("too many devices in the system\n");
447 
448 		if (main_id >= 0) {
449 			mutex_lock(&hl_devs_idr_lock);
450 			idr_remove(&hl_devs_idr, main_id);
451 			mutex_unlock(&hl_devs_idr_lock);
452 		}
453 
454 		rc = -EBUSY;
455 		goto free_hdev;
456 	}
457 
458 	hdev->id = main_id;
459 	hdev->id_control = ctrl_id;
460 
461 	*dev = hdev;
462 
463 	return 0;
464 
465 free_hdev:
466 	kfree(hdev);
467 	return rc;
468 }
469 
470 /*
471  * destroy_hdev - destroy habanalabs device instance
472  *
473  * @dev: pointer to the habanalabs device structure
474  *
475  */
destroy_hdev(struct hl_device * hdev)476 static void destroy_hdev(struct hl_device *hdev)
477 {
478 	/* Remove device from the device list */
479 	mutex_lock(&hl_devs_idr_lock);
480 	idr_remove(&hl_devs_idr, hdev->id);
481 	idr_remove(&hl_devs_idr, hdev->id_control);
482 	mutex_unlock(&hl_devs_idr_lock);
483 
484 	kfree(hdev);
485 }
486 
/*
 * hl_pmops_suspend - suspend callback of the driver's dev_pm_ops
 *
 * @dev: pointer to the generic device structure
 *
 * Returns the result of hl_device_suspend(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
500 
/*
 * hl_pmops_resume - resume callback of the driver's dev_pm_ops
 *
 * @dev: pointer to the generic device structure
 *
 * Returns the result of hl_device_resume(), or 0 if no habanalabs device is
 * attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");

	return 0;
}
514 
515 /**
516  * hl_pci_probe - probe PCI habanalabs devices
517  *
518  * @pdev: pointer to pci device
519  * @id: pointer to pci device id structure
520  *
521  * Standard PCI probe function for habanalabs device.
522  * Create a new habanalabs device and initialize it according to the
523  * device's type
524  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)525 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
526 {
527 	struct hl_device *hdev;
528 	int rc;
529 
530 	dev_info(&pdev->dev, HL_NAME
531 		 " device found [%04x:%04x] (rev %x)\n",
532 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
533 
534 	rc = create_hdev(&hdev, pdev);
535 	if (rc)
536 		return rc;
537 
538 	pci_set_drvdata(pdev, hdev);
539 
540 	pci_enable_pcie_error_reporting(pdev);
541 
542 	rc = hl_device_init(hdev, hl_class);
543 	if (rc) {
544 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
545 		rc = -ENODEV;
546 		goto disable_device;
547 	}
548 
549 	return 0;
550 
551 disable_device:
552 	pci_disable_pcie_error_reporting(pdev);
553 	pci_set_drvdata(pdev, NULL);
554 	destroy_hdev(hdev);
555 
556 	return rc;
557 }
558 
559 /*
560  * hl_pci_remove - remove PCI habanalabs devices
561  *
562  * @pdev: pointer to pci device
563  *
564  * Standard PCI remove function for habanalabs device
565  */
hl_pci_remove(struct pci_dev * pdev)566 static void hl_pci_remove(struct pci_dev *pdev)
567 {
568 	struct hl_device *hdev;
569 
570 	hdev = pci_get_drvdata(pdev);
571 	if (!hdev)
572 		return;
573 
574 	hl_device_fini(hdev);
575 	pci_disable_pcie_error_reporting(pdev);
576 	pci_set_drvdata(pdev, NULL);
577 	destroy_hdev(hdev);
578 }
579 
580 /**
581  * hl_pci_err_detected - a PCI bus error detected on this device
582  *
583  * @pdev: pointer to pci device
584  * @state: PCI error type
585  *
586  * Called by the PCI subsystem whenever a non-correctable
587  * PCI bus error is detected
588  */
589 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)590 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
591 {
592 	struct hl_device *hdev = pci_get_drvdata(pdev);
593 	enum pci_ers_result result;
594 
595 	switch (state) {
596 	case pci_channel_io_normal:
597 		return PCI_ERS_RESULT_CAN_RECOVER;
598 
599 	case pci_channel_io_frozen:
600 		dev_warn(hdev->dev, "frozen state error detected\n");
601 		result = PCI_ERS_RESULT_NEED_RESET;
602 		break;
603 
604 	case pci_channel_io_perm_failure:
605 		dev_warn(hdev->dev, "failure state error detected\n");
606 		result = PCI_ERS_RESULT_DISCONNECT;
607 		break;
608 
609 	default:
610 		result = PCI_ERS_RESULT_NONE;
611 	}
612 
613 	hdev->asic_funcs->halt_engines(hdev, true, false);
614 
615 	return result;
616 }
617 
618 /**
619  * hl_pci_err_resume - resume after a PCI slot reset
620  *
621  * @pdev: pointer to pci device
622  *
623  */
hl_pci_err_resume(struct pci_dev * pdev)624 static void hl_pci_err_resume(struct pci_dev *pdev)
625 {
626 	struct hl_device *hdev = pci_get_drvdata(pdev);
627 
628 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
629 	hl_device_resume(hdev);
630 }
631 
/**
 * hl_pci_err_slot_reset - a PCI slot reset has just happened
 *
 * @pdev: pointer to pci device
 *
 * Determine if the driver can recover from the PCI slot reset.
 * No device state is inspected: the reset is always reported as recovered.
 *
 * Return: PCI_ERS_RESULT_RECOVERED, unconditionally.
 */
static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{
	return PCI_ERS_RESULT_RECOVERED;
}
643 
/* Power management callbacks; only suspend and resume are provided */
static const struct dev_pm_ops hl_pm_ops = {
	.suspend = hl_pmops_suspend,
	.resume = hl_pmops_resume,
};
648 
/* PCI error recovery callbacks, hooked into hl_pci_driver below */
static const struct pci_error_handlers hl_pci_err_handler = {
	.error_detected = hl_pci_err_detected,
	.slot_reset = hl_pci_err_slot_reset,
	.resume = hl_pci_err_resume,
};
654 
/* PCI driver descriptor, registered in hl_init() and unregistered in hl_exit() */
static struct pci_driver hl_pci_driver = {
	.name = HL_NAME,
	.id_table = ids,
	.probe = hl_pci_probe,
	.remove = hl_pci_remove,
	/* Shutdown runs the same teardown as remove */
	.shutdown = hl_pci_remove,
	.driver = {
		.name = HL_NAME,
		.pm = &hl_pm_ops,
		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
	},
	.err_handler = &hl_pci_err_handler,
};
668 
669 /*
670  * hl_init - Initialize the habanalabs kernel driver
671  */
hl_init(void)672 static int __init hl_init(void)
673 {
674 	int rc;
675 	dev_t dev;
676 
677 	pr_info("loading driver\n");
678 
679 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
680 	if (rc < 0) {
681 		pr_err("unable to get major\n");
682 		return rc;
683 	}
684 
685 	hl_major = MAJOR(dev);
686 
687 	hl_class = class_create(THIS_MODULE, HL_NAME);
688 	if (IS_ERR(hl_class)) {
689 		pr_err("failed to allocate class\n");
690 		rc = PTR_ERR(hl_class);
691 		goto remove_major;
692 	}
693 
694 	hl_debugfs_init();
695 
696 	rc = pci_register_driver(&hl_pci_driver);
697 	if (rc) {
698 		pr_err("failed to register pci device\n");
699 		goto remove_debugfs;
700 	}
701 
702 	pr_debug("driver loaded\n");
703 
704 	return 0;
705 
706 remove_debugfs:
707 	hl_debugfs_fini();
708 	class_destroy(hl_class);
709 remove_major:
710 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
711 	return rc;
712 }
713 
/*
 * hl_exit - Release all resources of the habanalabs kernel driver
 *
 * Undoes hl_init() in reverse order: unregisters the PCI driver, removes
 * debugfs, destroys the device class and releases the char device region.
 * The device idr is destroyed last.
 */
static void __exit hl_exit(void)
{
	pci_unregister_driver(&hl_pci_driver);

	/*
	 * Removing debugfs must be after all devices or simulator devices
	 * have been removed because otherwise we get a bug in the
	 * debugfs module for referencing NULL objects
	 */
	hl_debugfs_fini();

	class_destroy(hl_class);
	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);

	idr_destroy(&hl_devs_idr);

	pr_debug("driver removed\n");
}
735 
736 module_init(hl_init);
737 module_exit(hl_exit);
738