1 // SPDX-License-Identifier: GPL-2.0
2 
3 /*
4  * Copyright 2016-2019 HabanaLabs, Ltd.
5  * All Rights Reserved.
6  *
7  */
8 
9 #define pr_fmt(fmt)		"habanalabs: " fmt
10 
11 #include "habanalabs.h"
12 
13 #include <linux/pci.h>
14 #include <linux/aer.h>
15 #include <linux/module.h>
16 
17 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
18 
19 #define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"
20 
21 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
22 MODULE_DESCRIPTION(HL_DRIVER_DESC);
23 MODULE_LICENSE("GPL v2");
24 
25 static int hl_major;
26 static struct class *hl_class;
27 static DEFINE_IDR(hl_devs_idr);
28 static DEFINE_MUTEX(hl_devs_idr_lock);
29 
30 static int timeout_locked = 5;
31 static int reset_on_lockup = 1;
32 
33 module_param(timeout_locked, int, 0444);
34 MODULE_PARM_DESC(timeout_locked,
35 	"Device lockup timeout in seconds (0 = disabled, default 5s)");
36 
37 module_param(reset_on_lockup, int, 0444);
38 MODULE_PARM_DESC(reset_on_lockup,
39 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
40 
41 #define PCI_VENDOR_ID_HABANALABS	0x1da3
42 
43 #define PCI_IDS_GOYA			0x0001
44 #define PCI_IDS_GAUDI			0x1000
45 
46 static const struct pci_device_id ids[] = {
47 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
48 	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
49 	{ 0, }
50 };
51 MODULE_DEVICE_TABLE(pci, ids);
52 
53 /*
54  * get_asic_type - translate device id to asic type
55  *
56  * @device: id of the PCI device
57  *
58  * Translate device id to asic type.
59  * In case of unidentified device, return -1
60  */
get_asic_type(u16 device)61 static enum hl_asic_type get_asic_type(u16 device)
62 {
63 	enum hl_asic_type asic_type;
64 
65 	switch (device) {
66 	case PCI_IDS_GOYA:
67 		asic_type = ASIC_GOYA;
68 		break;
69 	case PCI_IDS_GAUDI:
70 		asic_type = ASIC_GAUDI;
71 		break;
72 	default:
73 		asic_type = ASIC_INVALID;
74 		break;
75 	}
76 
77 	return asic_type;
78 }
79 
80 /*
81  * hl_device_open - open function for habanalabs device
82  *
83  * @inode: pointer to inode structure
84  * @filp: pointer to file structure
85  *
86  * Called when process opens an habanalabs device.
87  */
hl_device_open(struct inode * inode,struct file * filp)88 int hl_device_open(struct inode *inode, struct file *filp)
89 {
90 	struct hl_device *hdev;
91 	struct hl_fpriv *hpriv;
92 	int rc;
93 
94 	mutex_lock(&hl_devs_idr_lock);
95 	hdev = idr_find(&hl_devs_idr, iminor(inode));
96 	mutex_unlock(&hl_devs_idr_lock);
97 
98 	if (!hdev) {
99 		pr_err("Couldn't find device %d:%d\n",
100 			imajor(inode), iminor(inode));
101 		return -ENXIO;
102 	}
103 
104 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
105 	if (!hpriv)
106 		return -ENOMEM;
107 
108 	hpriv->hdev = hdev;
109 	filp->private_data = hpriv;
110 	hpriv->filp = filp;
111 	mutex_init(&hpriv->restore_phase_mutex);
112 	kref_init(&hpriv->refcount);
113 	nonseekable_open(inode, filp);
114 
115 	hl_cb_mgr_init(&hpriv->cb_mgr);
116 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
117 
118 	hpriv->taskpid = find_get_pid(current->pid);
119 
120 	mutex_lock(&hdev->fpriv_list_lock);
121 
122 	if (hl_device_disabled_or_in_reset(hdev)) {
123 		dev_err_ratelimited(hdev->dev,
124 			"Can't open %s because it is disabled or in reset\n",
125 			dev_name(hdev->dev));
126 		rc = -EPERM;
127 		goto out_err;
128 	}
129 
130 	if (hdev->in_debug) {
131 		dev_err_ratelimited(hdev->dev,
132 			"Can't open %s because it is being debugged by another user\n",
133 			dev_name(hdev->dev));
134 		rc = -EPERM;
135 		goto out_err;
136 	}
137 
138 	if (hdev->compute_ctx) {
139 		dev_dbg_ratelimited(hdev->dev,
140 			"Can't open %s because another user is working on it\n",
141 			dev_name(hdev->dev));
142 		rc = -EBUSY;
143 		goto out_err;
144 	}
145 
146 	rc = hl_ctx_create(hdev, hpriv);
147 	if (rc) {
148 		dev_err(hdev->dev, "Failed to create context %d\n", rc);
149 		goto out_err;
150 	}
151 
152 	/* Device is IDLE at this point so it is legal to change PLLs.
153 	 * There is no need to check anything because if the PLL is
154 	 * already HIGH, the set function will return without doing
155 	 * anything
156 	 */
157 	hl_device_set_frequency(hdev, PLL_HIGH);
158 
159 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
160 	mutex_unlock(&hdev->fpriv_list_lock);
161 
162 	hl_debugfs_add_file(hpriv);
163 
164 	return 0;
165 
166 out_err:
167 	mutex_unlock(&hdev->fpriv_list_lock);
168 
169 	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
170 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
171 	filp->private_data = NULL;
172 	mutex_destroy(&hpriv->restore_phase_mutex);
173 	put_pid(hpriv->taskpid);
174 
175 	kfree(hpriv);
176 
177 	return rc;
178 }
179 
hl_device_open_ctrl(struct inode * inode,struct file * filp)180 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
181 {
182 	struct hl_device *hdev;
183 	struct hl_fpriv *hpriv;
184 	int rc;
185 
186 	mutex_lock(&hl_devs_idr_lock);
187 	hdev = idr_find(&hl_devs_idr, iminor(inode));
188 	mutex_unlock(&hl_devs_idr_lock);
189 
190 	if (!hdev) {
191 		pr_err("Couldn't find device %d:%d\n",
192 			imajor(inode), iminor(inode));
193 		return -ENXIO;
194 	}
195 
196 	hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
197 	if (!hpriv)
198 		return -ENOMEM;
199 
200 	mutex_lock(&hdev->fpriv_list_lock);
201 
202 	if (hl_device_disabled_or_in_reset(hdev)) {
203 		dev_err_ratelimited(hdev->dev_ctrl,
204 			"Can't open %s because it is disabled or in reset\n",
205 			dev_name(hdev->dev_ctrl));
206 		rc = -EPERM;
207 		goto out_err;
208 	}
209 
210 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
211 	mutex_unlock(&hdev->fpriv_list_lock);
212 
213 	hpriv->hdev = hdev;
214 	filp->private_data = hpriv;
215 	hpriv->filp = filp;
216 	hpriv->is_control = true;
217 	nonseekable_open(inode, filp);
218 
219 	hpriv->taskpid = find_get_pid(current->pid);
220 
221 	return 0;
222 
223 out_err:
224 	mutex_unlock(&hdev->fpriv_list_lock);
225 	kfree(hpriv);
226 	return rc;
227 }
228 
set_driver_behavior_per_device(struct hl_device * hdev)229 static void set_driver_behavior_per_device(struct hl_device *hdev)
230 {
231 	hdev->mmu_enable = 1;
232 	hdev->cpu_enable = 1;
233 	hdev->fw_loading = 1;
234 	hdev->cpu_queues_enable = 1;
235 	hdev->heartbeat = 1;
236 	hdev->clock_gating_mask = ULONG_MAX;
237 
238 	hdev->reset_pcilink = 0;
239 	hdev->axi_drain = 0;
240 	hdev->sram_scrambler_enable = 1;
241 	hdev->dram_scrambler_enable = 1;
242 	hdev->bmc_enable = 1;
243 	hdev->hard_reset_on_fw_events = 1;
244 }
245 
246 /*
247  * create_hdev - create habanalabs device instance
248  *
249  * @dev: will hold the pointer to the new habanalabs device structure
250  * @pdev: pointer to the pci device
251  * @asic_type: in case of simulator device, which device is it
252  * @minor: in case of simulator device, the minor of the device
253  *
254  * Allocate memory for habanalabs device and initialize basic fields
255  * Identify the ASIC type
256  * Allocate ID (minor) for the device (only for real devices)
257  */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)258 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
259 		enum hl_asic_type asic_type, int minor)
260 {
261 	struct hl_device *hdev;
262 	int rc, main_id, ctrl_id = 0;
263 
264 	*dev = NULL;
265 
266 	hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
267 	if (!hdev)
268 		return -ENOMEM;
269 
270 	/* First, we must find out which ASIC are we handling. This is needed
271 	 * to configure the behavior of the driver (kernel parameters)
272 	 */
273 	if (pdev) {
274 		hdev->asic_type = get_asic_type(pdev->device);
275 		if (hdev->asic_type == ASIC_INVALID) {
276 			dev_err(&pdev->dev, "Unsupported ASIC\n");
277 			rc = -ENODEV;
278 			goto free_hdev;
279 		}
280 	} else {
281 		hdev->asic_type = asic_type;
282 	}
283 
284 	hdev->major = hl_major;
285 	hdev->reset_on_lockup = reset_on_lockup;
286 	hdev->pldm = 0;
287 
288 	set_driver_behavior_per_device(hdev);
289 
290 	if (timeout_locked)
291 		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
292 	else
293 		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
294 
295 	hdev->disabled = true;
296 	hdev->pdev = pdev; /* can be NULL in case of simulator device */
297 
298 	/* Set default DMA mask to 32 bits */
299 	hdev->dma_mask = 32;
300 
301 	mutex_lock(&hl_devs_idr_lock);
302 
303 	/* Always save 2 numbers, 1 for main device and 1 for control.
304 	 * They must be consecutive
305 	 */
306 	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
307 				GFP_KERNEL);
308 
309 	if (main_id >= 0)
310 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
311 					main_id + 2, GFP_KERNEL);
312 
313 	mutex_unlock(&hl_devs_idr_lock);
314 
315 	if ((main_id < 0) || (ctrl_id < 0)) {
316 		if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
317 			pr_err("too many devices in the system\n");
318 
319 		if (main_id >= 0) {
320 			mutex_lock(&hl_devs_idr_lock);
321 			idr_remove(&hl_devs_idr, main_id);
322 			mutex_unlock(&hl_devs_idr_lock);
323 		}
324 
325 		rc = -EBUSY;
326 		goto free_hdev;
327 	}
328 
329 	hdev->id = main_id;
330 	hdev->id_control = ctrl_id;
331 
332 	*dev = hdev;
333 
334 	return 0;
335 
336 free_hdev:
337 	kfree(hdev);
338 	return rc;
339 }
340 
341 /*
342  * destroy_hdev - destroy habanalabs device instance
343  *
344  * @dev: pointer to the habanalabs device structure
345  *
346  */
destroy_hdev(struct hl_device * hdev)347 void destroy_hdev(struct hl_device *hdev)
348 {
349 	/* Remove device from the device list */
350 	mutex_lock(&hl_devs_idr_lock);
351 	idr_remove(&hl_devs_idr, hdev->id);
352 	idr_remove(&hl_devs_idr, hdev->id_control);
353 	mutex_unlock(&hl_devs_idr_lock);
354 
355 	kfree(hdev);
356 }
357 
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: pointer to the generic device structure
 *
 * Return the result of hl_device_suspend(), or 0 (after logging an error)
 * when no habanalabs device is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
371 
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: pointer to the generic device structure
 *
 * Return the result of hl_device_resume(), or 0 (after logging an error)
 * when no habanalabs device is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");

	return 0;
}
385 
386 /*
387  * hl_pci_probe - probe PCI habanalabs devices
388  *
389  * @pdev: pointer to pci device
390  * @id: pointer to pci device id structure
391  *
392  * Standard PCI probe function for habanalabs device.
393  * Create a new habanalabs device and initialize it according to the
394  * device's type
395  */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)396 static int hl_pci_probe(struct pci_dev *pdev,
397 				const struct pci_device_id *id)
398 {
399 	struct hl_device *hdev;
400 	int rc;
401 
402 	dev_info(&pdev->dev, HL_NAME
403 		 " device found [%04x:%04x] (rev %x)\n",
404 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
405 
406 	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
407 	if (rc)
408 		return rc;
409 
410 	pci_set_drvdata(pdev, hdev);
411 
412 	pci_enable_pcie_error_reporting(pdev);
413 
414 	rc = hl_device_init(hdev, hl_class);
415 	if (rc) {
416 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
417 		rc = -ENODEV;
418 		goto disable_device;
419 	}
420 
421 	return 0;
422 
423 disable_device:
424 	pci_set_drvdata(pdev, NULL);
425 	destroy_hdev(hdev);
426 
427 	return rc;
428 }
429 
430 /*
431  * hl_pci_remove - remove PCI habanalabs devices
432  *
433  * @pdev: pointer to pci device
434  *
435  * Standard PCI remove function for habanalabs device
436  */
hl_pci_remove(struct pci_dev * pdev)437 static void hl_pci_remove(struct pci_dev *pdev)
438 {
439 	struct hl_device *hdev;
440 
441 	hdev = pci_get_drvdata(pdev);
442 	if (!hdev)
443 		return;
444 
445 	hl_device_fini(hdev);
446 	pci_disable_pcie_error_reporting(pdev);
447 	pci_set_drvdata(pdev, NULL);
448 	destroy_hdev(hdev);
449 }
450 
451 /**
452  * hl_pci_err_detected - a PCI bus error detected on this device
453  *
454  * @pdev: pointer to pci device
455  * @state: PCI error type
456  *
457  * Called by the PCI subsystem whenever a non-correctable
458  * PCI bus error is detected
459  */
460 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)461 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
462 {
463 	struct hl_device *hdev = pci_get_drvdata(pdev);
464 	enum pci_ers_result result;
465 
466 	switch (state) {
467 	case pci_channel_io_normal:
468 		return PCI_ERS_RESULT_CAN_RECOVER;
469 
470 	case pci_channel_io_frozen:
471 		dev_warn(hdev->dev, "frozen state error detected\n");
472 		result = PCI_ERS_RESULT_NEED_RESET;
473 		break;
474 
475 	case pci_channel_io_perm_failure:
476 		dev_warn(hdev->dev, "failure state error detected\n");
477 		result = PCI_ERS_RESULT_DISCONNECT;
478 		break;
479 
480 	default:
481 		result = PCI_ERS_RESULT_NONE;
482 	}
483 
484 	hdev->asic_funcs->halt_engines(hdev, true);
485 
486 	return result;
487 }
488 
489 /**
490  * hl_pci_err_resume - resume after a PCI slot reset
491  *
492  * @pdev: pointer to pci device
493  *
494  */
hl_pci_err_resume(struct pci_dev * pdev)495 static void hl_pci_err_resume(struct pci_dev *pdev)
496 {
497 	struct hl_device *hdev = pci_get_drvdata(pdev);
498 
499 	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
500 	hl_device_resume(hdev);
501 }
502 
503 /**
504  * hl_pci_err_slot_reset - a PCI slot reset has just happened
505  *
506  * @pdev: pointer to pci device
507  *
508  * Determine if the driver can recover from the PCI slot reset
509  */
hl_pci_err_slot_reset(struct pci_dev * pdev)510 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
511 {
512 	return PCI_ERS_RESULT_RECOVERED;
513 }
514 
515 static const struct dev_pm_ops hl_pm_ops = {
516 	.suspend = hl_pmops_suspend,
517 	.resume = hl_pmops_resume,
518 };
519 
520 static const struct pci_error_handlers hl_pci_err_handler = {
521 	.error_detected = hl_pci_err_detected,
522 	.slot_reset = hl_pci_err_slot_reset,
523 	.resume = hl_pci_err_resume,
524 };
525 
526 static struct pci_driver hl_pci_driver = {
527 	.name = HL_NAME,
528 	.id_table = ids,
529 	.probe = hl_pci_probe,
530 	.remove = hl_pci_remove,
531 	.driver.pm = &hl_pm_ops,
532 	.err_handler = &hl_pci_err_handler,
533 };
534 
535 /*
536  * hl_init - Initialize the habanalabs kernel driver
537  */
hl_init(void)538 static int __init hl_init(void)
539 {
540 	int rc;
541 	dev_t dev;
542 
543 	pr_info("loading driver\n");
544 
545 	rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
546 	if (rc < 0) {
547 		pr_err("unable to get major\n");
548 		return rc;
549 	}
550 
551 	hl_major = MAJOR(dev);
552 
553 	hl_class = class_create(THIS_MODULE, HL_NAME);
554 	if (IS_ERR(hl_class)) {
555 		pr_err("failed to allocate class\n");
556 		rc = PTR_ERR(hl_class);
557 		goto remove_major;
558 	}
559 
560 	hl_debugfs_init();
561 
562 	rc = pci_register_driver(&hl_pci_driver);
563 	if (rc) {
564 		pr_err("failed to register pci device\n");
565 		goto remove_debugfs;
566 	}
567 
568 	pr_debug("driver loaded\n");
569 
570 	return 0;
571 
572 remove_debugfs:
573 	hl_debugfs_fini();
574 	class_destroy(hl_class);
575 remove_major:
576 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
577 	return rc;
578 }
579 
580 /*
581  * hl_exit - Release all resources of the habanalabs kernel driver
582  */
hl_exit(void)583 static void __exit hl_exit(void)
584 {
585 	pci_unregister_driver(&hl_pci_driver);
586 
587 	/*
588 	 * Removing debugfs must be after all devices or simulator devices
589 	 * have been removed because otherwise we get a bug in the
590 	 * debugfs module for referencing NULL objects
591 	 */
592 	hl_debugfs_fini();
593 
594 	class_destroy(hl_class);
595 	unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
596 
597 	idr_destroy(&hl_devs_idr);
598 
599 	pr_debug("driver removed\n");
600 }
601 
/* Module entry and exit points */
module_init(hl_init);
module_exit(hl_exit);
604