// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 *
 */
8
/* Prefix for this file's pr_*() messages; defined ahead of the includes */
#define pr_fmt(fmt) "habanalabs: " fmt
10
#include "habanalabs.h"

#include <linux/aer.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/pid.h>
16
17 #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
18
19 #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
20
21 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
22 MODULE_DESCRIPTION(HL_DRIVER_DESC);
23 MODULE_LICENSE("GPL v2");
24
25 static int hl_major;
26 static struct class *hl_class;
27 static DEFINE_IDR(hl_devs_idr);
28 static DEFINE_MUTEX(hl_devs_idr_lock);
29
30 static int timeout_locked = 5;
31 static int reset_on_lockup = 1;
32
33 module_param(timeout_locked, int, 0444);
34 MODULE_PARM_DESC(timeout_locked,
35 "Device lockup timeout in seconds (0 = disabled, default 5s)");
36
37 module_param(reset_on_lockup, int, 0444);
38 MODULE_PARM_DESC(reset_on_lockup,
39 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
40
41 #define PCI_VENDOR_ID_HABANALABS 0x1da3
42
43 #define PCI_IDS_GOYA 0x0001
44 #define PCI_IDS_GAUDI 0x1000
45
46 static const struct pci_device_id ids[] = {
47 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
48 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
49 { 0, }
50 };
51 MODULE_DEVICE_TABLE(pci, ids);
52
53 /*
54 * get_asic_type - translate device id to asic type
55 *
56 * @device: id of the PCI device
57 *
58 * Translate device id to asic type.
59 * In case of unidentified device, return -1
60 */
get_asic_type(u16 device)61 static enum hl_asic_type get_asic_type(u16 device)
62 {
63 enum hl_asic_type asic_type;
64
65 switch (device) {
66 case PCI_IDS_GOYA:
67 asic_type = ASIC_GOYA;
68 break;
69 case PCI_IDS_GAUDI:
70 asic_type = ASIC_GAUDI;
71 break;
72 default:
73 asic_type = ASIC_INVALID;
74 break;
75 }
76
77 return asic_type;
78 }
79
80 /*
81 * hl_device_open - open function for habanalabs device
82 *
83 * @inode: pointer to inode structure
84 * @filp: pointer to file structure
85 *
86 * Called when process opens an habanalabs device.
87 */
hl_device_open(struct inode * inode,struct file * filp)88 int hl_device_open(struct inode *inode, struct file *filp)
89 {
90 struct hl_device *hdev;
91 struct hl_fpriv *hpriv;
92 int rc;
93
94 mutex_lock(&hl_devs_idr_lock);
95 hdev = idr_find(&hl_devs_idr, iminor(inode));
96 mutex_unlock(&hl_devs_idr_lock);
97
98 if (!hdev) {
99 pr_err("Couldn't find device %d:%d\n",
100 imajor(inode), iminor(inode));
101 return -ENXIO;
102 }
103
104 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
105 if (!hpriv)
106 return -ENOMEM;
107
108 hpriv->hdev = hdev;
109 filp->private_data = hpriv;
110 hpriv->filp = filp;
111 mutex_init(&hpriv->restore_phase_mutex);
112 kref_init(&hpriv->refcount);
113 nonseekable_open(inode, filp);
114
115 hl_cb_mgr_init(&hpriv->cb_mgr);
116 hl_ctx_mgr_init(&hpriv->ctx_mgr);
117
118 hpriv->taskpid = find_get_pid(current->pid);
119
120 mutex_lock(&hdev->fpriv_list_lock);
121
122 if (hl_device_disabled_or_in_reset(hdev)) {
123 dev_err_ratelimited(hdev->dev,
124 "Can't open %s because it is disabled or in reset\n",
125 dev_name(hdev->dev));
126 rc = -EPERM;
127 goto out_err;
128 }
129
130 if (hdev->in_debug) {
131 dev_err_ratelimited(hdev->dev,
132 "Can't open %s because it is being debugged by another user\n",
133 dev_name(hdev->dev));
134 rc = -EPERM;
135 goto out_err;
136 }
137
138 if (hdev->compute_ctx) {
139 dev_dbg_ratelimited(hdev->dev,
140 "Can't open %s because another user is working on it\n",
141 dev_name(hdev->dev));
142 rc = -EBUSY;
143 goto out_err;
144 }
145
146 rc = hl_ctx_create(hdev, hpriv);
147 if (rc) {
148 dev_err(hdev->dev, "Failed to create context %d\n", rc);
149 goto out_err;
150 }
151
152 /* Device is IDLE at this point so it is legal to change PLLs.
153 * There is no need to check anything because if the PLL is
154 * already HIGH, the set function will return without doing
155 * anything
156 */
157 hl_device_set_frequency(hdev, PLL_HIGH);
158
159 list_add(&hpriv->dev_node, &hdev->fpriv_list);
160 mutex_unlock(&hdev->fpriv_list_lock);
161
162 hl_debugfs_add_file(hpriv);
163
164 return 0;
165
166 out_err:
167 mutex_unlock(&hdev->fpriv_list_lock);
168
169 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
170 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
171 filp->private_data = NULL;
172 mutex_destroy(&hpriv->restore_phase_mutex);
173 put_pid(hpriv->taskpid);
174
175 kfree(hpriv);
176
177 return rc;
178 }
179
hl_device_open_ctrl(struct inode * inode,struct file * filp)180 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
181 {
182 struct hl_device *hdev;
183 struct hl_fpriv *hpriv;
184 int rc;
185
186 mutex_lock(&hl_devs_idr_lock);
187 hdev = idr_find(&hl_devs_idr, iminor(inode));
188 mutex_unlock(&hl_devs_idr_lock);
189
190 if (!hdev) {
191 pr_err("Couldn't find device %d:%d\n",
192 imajor(inode), iminor(inode));
193 return -ENXIO;
194 }
195
196 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
197 if (!hpriv)
198 return -ENOMEM;
199
200 mutex_lock(&hdev->fpriv_list_lock);
201
202 if (hl_device_disabled_or_in_reset(hdev)) {
203 dev_err_ratelimited(hdev->dev_ctrl,
204 "Can't open %s because it is disabled or in reset\n",
205 dev_name(hdev->dev_ctrl));
206 rc = -EPERM;
207 goto out_err;
208 }
209
210 list_add(&hpriv->dev_node, &hdev->fpriv_list);
211 mutex_unlock(&hdev->fpriv_list_lock);
212
213 hpriv->hdev = hdev;
214 filp->private_data = hpriv;
215 hpriv->filp = filp;
216 hpriv->is_control = true;
217 nonseekable_open(inode, filp);
218
219 hpriv->taskpid = find_get_pid(current->pid);
220
221 return 0;
222
223 out_err:
224 mutex_unlock(&hdev->fpriv_list_lock);
225 kfree(hpriv);
226 return rc;
227 }
228
set_driver_behavior_per_device(struct hl_device * hdev)229 static void set_driver_behavior_per_device(struct hl_device *hdev)
230 {
231 hdev->mmu_enable = 1;
232 hdev->cpu_enable = 1;
233 hdev->fw_loading = 1;
234 hdev->cpu_queues_enable = 1;
235 hdev->heartbeat = 1;
236 hdev->clock_gating_mask = ULONG_MAX;
237
238 hdev->reset_pcilink = 0;
239 hdev->axi_drain = 0;
240 hdev->sram_scrambler_enable = 1;
241 hdev->dram_scrambler_enable = 1;
242 hdev->bmc_enable = 1;
243 hdev->hard_reset_on_fw_events = 1;
244 }
245
246 /*
247 * create_hdev - create habanalabs device instance
248 *
249 * @dev: will hold the pointer to the new habanalabs device structure
250 * @pdev: pointer to the pci device
251 * @asic_type: in case of simulator device, which device is it
252 * @minor: in case of simulator device, the minor of the device
253 *
254 * Allocate memory for habanalabs device and initialize basic fields
255 * Identify the ASIC type
256 * Allocate ID (minor) for the device (only for real devices)
257 */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev,enum hl_asic_type asic_type,int minor)258 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
259 enum hl_asic_type asic_type, int minor)
260 {
261 struct hl_device *hdev;
262 int rc, main_id, ctrl_id = 0;
263
264 *dev = NULL;
265
266 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
267 if (!hdev)
268 return -ENOMEM;
269
270 /* First, we must find out which ASIC are we handling. This is needed
271 * to configure the behavior of the driver (kernel parameters)
272 */
273 if (pdev) {
274 hdev->asic_type = get_asic_type(pdev->device);
275 if (hdev->asic_type == ASIC_INVALID) {
276 dev_err(&pdev->dev, "Unsupported ASIC\n");
277 rc = -ENODEV;
278 goto free_hdev;
279 }
280 } else {
281 hdev->asic_type = asic_type;
282 }
283
284 hdev->major = hl_major;
285 hdev->reset_on_lockup = reset_on_lockup;
286 hdev->pldm = 0;
287
288 set_driver_behavior_per_device(hdev);
289
290 if (timeout_locked)
291 hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
292 else
293 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
294
295 hdev->disabled = true;
296 hdev->pdev = pdev; /* can be NULL in case of simulator device */
297
298 /* Set default DMA mask to 32 bits */
299 hdev->dma_mask = 32;
300
301 mutex_lock(&hl_devs_idr_lock);
302
303 /* Always save 2 numbers, 1 for main device and 1 for control.
304 * They must be consecutive
305 */
306 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
307 GFP_KERNEL);
308
309 if (main_id >= 0)
310 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
311 main_id + 2, GFP_KERNEL);
312
313 mutex_unlock(&hl_devs_idr_lock);
314
315 if ((main_id < 0) || (ctrl_id < 0)) {
316 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
317 pr_err("too many devices in the system\n");
318
319 if (main_id >= 0) {
320 mutex_lock(&hl_devs_idr_lock);
321 idr_remove(&hl_devs_idr, main_id);
322 mutex_unlock(&hl_devs_idr_lock);
323 }
324
325 rc = -EBUSY;
326 goto free_hdev;
327 }
328
329 hdev->id = main_id;
330 hdev->id_control = ctrl_id;
331
332 *dev = hdev;
333
334 return 0;
335
336 free_hdev:
337 kfree(hdev);
338 return rc;
339 }
340
341 /*
342 * destroy_hdev - destroy habanalabs device instance
343 *
344 * @dev: pointer to the habanalabs device structure
345 *
346 */
destroy_hdev(struct hl_device * hdev)347 void destroy_hdev(struct hl_device *hdev)
348 {
349 /* Remove device from the device list */
350 mutex_lock(&hl_devs_idr_lock);
351 idr_remove(&hl_devs_idr, hdev->id);
352 idr_remove(&hl_devs_idr, hdev->id_control);
353 mutex_unlock(&hl_devs_idr_lock);
354
355 kfree(hdev);
356 }
357
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: pointer to the device being suspended
 *
 * Return: the result of hl_device_suspend(), or 0 if no habanalabs device
 * is attached to @dev as driver data.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
371
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: pointer to the device being resumed
 *
 * Return: the result of hl_device_resume(), or 0 if no habanalabs device
 * is attached to @dev as driver data.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
385
386 /*
387 * hl_pci_probe - probe PCI habanalabs devices
388 *
389 * @pdev: pointer to pci device
390 * @id: pointer to pci device id structure
391 *
392 * Standard PCI probe function for habanalabs device.
393 * Create a new habanalabs device and initialize it according to the
394 * device's type
395 */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)396 static int hl_pci_probe(struct pci_dev *pdev,
397 const struct pci_device_id *id)
398 {
399 struct hl_device *hdev;
400 int rc;
401
402 dev_info(&pdev->dev, HL_NAME
403 " device found [%04x:%04x] (rev %x)\n",
404 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
405
406 rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
407 if (rc)
408 return rc;
409
410 pci_set_drvdata(pdev, hdev);
411
412 pci_enable_pcie_error_reporting(pdev);
413
414 rc = hl_device_init(hdev, hl_class);
415 if (rc) {
416 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
417 rc = -ENODEV;
418 goto disable_device;
419 }
420
421 return 0;
422
423 disable_device:
424 pci_set_drvdata(pdev, NULL);
425 destroy_hdev(hdev);
426
427 return rc;
428 }
429
430 /*
431 * hl_pci_remove - remove PCI habanalabs devices
432 *
433 * @pdev: pointer to pci device
434 *
435 * Standard PCI remove function for habanalabs device
436 */
hl_pci_remove(struct pci_dev * pdev)437 static void hl_pci_remove(struct pci_dev *pdev)
438 {
439 struct hl_device *hdev;
440
441 hdev = pci_get_drvdata(pdev);
442 if (!hdev)
443 return;
444
445 hl_device_fini(hdev);
446 pci_disable_pcie_error_reporting(pdev);
447 pci_set_drvdata(pdev, NULL);
448 destroy_hdev(hdev);
449 }
450
451 /**
452 * hl_pci_err_detected - a PCI bus error detected on this device
453 *
454 * @pdev: pointer to pci device
455 * @state: PCI error type
456 *
457 * Called by the PCI subsystem whenever a non-correctable
458 * PCI bus error is detected
459 */
460 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)461 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
462 {
463 struct hl_device *hdev = pci_get_drvdata(pdev);
464 enum pci_ers_result result;
465
466 switch (state) {
467 case pci_channel_io_normal:
468 return PCI_ERS_RESULT_CAN_RECOVER;
469
470 case pci_channel_io_frozen:
471 dev_warn(hdev->dev, "frozen state error detected\n");
472 result = PCI_ERS_RESULT_NEED_RESET;
473 break;
474
475 case pci_channel_io_perm_failure:
476 dev_warn(hdev->dev, "failure state error detected\n");
477 result = PCI_ERS_RESULT_DISCONNECT;
478 break;
479
480 default:
481 result = PCI_ERS_RESULT_NONE;
482 }
483
484 hdev->asic_funcs->halt_engines(hdev, true);
485
486 return result;
487 }
488
489 /**
490 * hl_pci_err_resume - resume after a PCI slot reset
491 *
492 * @pdev: pointer to pci device
493 *
494 */
hl_pci_err_resume(struct pci_dev * pdev)495 static void hl_pci_err_resume(struct pci_dev *pdev)
496 {
497 struct hl_device *hdev = pci_get_drvdata(pdev);
498
499 dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
500 hl_device_resume(hdev);
501 }
502
503 /**
504 * hl_pci_err_slot_reset - a PCI slot reset has just happened
505 *
506 * @pdev: pointer to pci device
507 *
508 * Determine if the driver can recover from the PCI slot reset
509 */
hl_pci_err_slot_reset(struct pci_dev * pdev)510 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
511 {
512 return PCI_ERS_RESULT_RECOVERED;
513 }
514
515 static const struct dev_pm_ops hl_pm_ops = {
516 .suspend = hl_pmops_suspend,
517 .resume = hl_pmops_resume,
518 };
519
520 static const struct pci_error_handlers hl_pci_err_handler = {
521 .error_detected = hl_pci_err_detected,
522 .slot_reset = hl_pci_err_slot_reset,
523 .resume = hl_pci_err_resume,
524 };
525
526 static struct pci_driver hl_pci_driver = {
527 .name = HL_NAME,
528 .id_table = ids,
529 .probe = hl_pci_probe,
530 .remove = hl_pci_remove,
531 .driver.pm = &hl_pm_ops,
532 .err_handler = &hl_pci_err_handler,
533 };
534
535 /*
536 * hl_init - Initialize the habanalabs kernel driver
537 */
hl_init(void)538 static int __init hl_init(void)
539 {
540 int rc;
541 dev_t dev;
542
543 pr_info("loading driver\n");
544
545 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
546 if (rc < 0) {
547 pr_err("unable to get major\n");
548 return rc;
549 }
550
551 hl_major = MAJOR(dev);
552
553 hl_class = class_create(THIS_MODULE, HL_NAME);
554 if (IS_ERR(hl_class)) {
555 pr_err("failed to allocate class\n");
556 rc = PTR_ERR(hl_class);
557 goto remove_major;
558 }
559
560 hl_debugfs_init();
561
562 rc = pci_register_driver(&hl_pci_driver);
563 if (rc) {
564 pr_err("failed to register pci device\n");
565 goto remove_debugfs;
566 }
567
568 pr_debug("driver loaded\n");
569
570 return 0;
571
572 remove_debugfs:
573 hl_debugfs_fini();
574 class_destroy(hl_class);
575 remove_major:
576 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
577 return rc;
578 }
579
580 /*
581 * hl_exit - Release all resources of the habanalabs kernel driver
582 */
hl_exit(void)583 static void __exit hl_exit(void)
584 {
585 pci_unregister_driver(&hl_pci_driver);
586
587 /*
588 * Removing debugfs must be after all devices or simulator devices
589 * have been removed because otherwise we get a bug in the
590 * debugfs module for referencing NULL objects
591 */
592 hl_debugfs_fini();
593
594 class_destroy(hl_class);
595 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
596
597 idr_destroy(&hl_devs_idr);
598
599 pr_debug("driver removed\n");
600 }
601
/* Module entry and exit points */
module_init(hl_init);
module_exit(hl_exit);
604