1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");
84
85 #define AMDGPU_RESUME_MS 2000
86
87 const char *amdgpu_asic_name[] = {
88 "TAHITI",
89 "PITCAIRN",
90 "VERDE",
91 "OLAND",
92 "HAINAN",
93 "BONAIRE",
94 "KAVERI",
95 "KABINI",
96 "HAWAII",
97 "MULLINS",
98 "TOPAZ",
99 "TONGA",
100 "FIJI",
101 "CARRIZO",
102 "STONEY",
103 "POLARIS10",
104 "POLARIS11",
105 "POLARIS12",
106 "VEGAM",
107 "VEGA10",
108 "VEGA12",
109 "VEGA20",
110 "RAVEN",
111 "ARCTURUS",
112 "RENOIR",
113 "NAVI10",
114 "NAVI14",
115 "NAVI12",
116 "SIENNA_CICHLID",
117 "NAVY_FLOUNDER",
118 "LAST",
119 };
120
121 /**
122 * DOC: pcie_replay_count
123 *
124 * The amdgpu driver provides a sysfs API for reporting the total number
125 * of PCIe replays (NAKs).
126 * The file pcie_replay_count is used for this and returns the total
127 * number of replays as a sum of the NAKs generated and NAKs received.
128 */
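/*
 * Illustrative usage sketch (not part of the driver): once the attribute is
 * registered, user space can read the counter through sysfs. The exact path
 * depends on the card index; a typical location would be
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 */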
129
130 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
131 struct device_attribute *attr, char *buf)
132 {
133 struct drm_device *ddev = dev_get_drvdata(dev);
134 struct amdgpu_device *adev = drm_to_adev(ddev);
135 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
136
137 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
138 }
139
140 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
141 amdgpu_device_get_pcie_replay_count, NULL);
142
143 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
144
145 /**
146 * DOC: product_name
147 *
148 * The amdgpu driver provides a sysfs API for reporting the product name
149 * for the device.
150 * The file product_name is used for this and returns the product name
151 * as returned from the FRU.
152 * NOTE: This is only available for certain server cards
153 */
154
155 static ssize_t amdgpu_device_get_product_name(struct device *dev,
156 struct device_attribute *attr, char *buf)
157 {
158 struct drm_device *ddev = dev_get_drvdata(dev);
159 struct amdgpu_device *adev = drm_to_adev(ddev);
160
161 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
162 }
163
164 static DEVICE_ATTR(product_name, S_IRUGO,
165 amdgpu_device_get_product_name, NULL);
166
167 /**
168 * DOC: product_number
169 *
170 * The amdgpu driver provides a sysfs API for reporting the part number
171 * for the device.
172 * The file product_number is used for this and returns the part number
173 * as returned from the FRU.
174 * NOTE: This is only available for certain server cards
175 */
176
177 static ssize_t amdgpu_device_get_product_number(struct device *dev,
178 struct device_attribute *attr, char *buf)
179 {
180 struct drm_device *ddev = dev_get_drvdata(dev);
181 struct amdgpu_device *adev = drm_to_adev(ddev);
182
183 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
184 }
185
186 static DEVICE_ATTR(product_number, S_IRUGO,
187 amdgpu_device_get_product_number, NULL);
188
189 /**
190 * DOC: serial_number
191 *
192 * The amdgpu driver provides a sysfs API for reporting the serial number
193 * for the device.
194 * The file serial_number is used for this and returns the serial number
195 * as returned from the FRU.
196 * NOTE: This is only available for certain server cards
197 */
198
199 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
200 struct device_attribute *attr, char *buf)
201 {
202 struct drm_device *ddev = dev_get_drvdata(dev);
203 struct amdgpu_device *adev = drm_to_adev(ddev);
204
205 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
206 }
207
208 static DEVICE_ATTR(serial_number, S_IRUGO,
209 amdgpu_device_get_serial_number, NULL);
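/*
 * Illustrative sketch, assuming a server card whose FRU EEPROM is populated:
 * the three FRU-backed attributes above can be read together from user space,
 * for example
 *
 *   for f in product_name product_number serial_number; do
 *           cat /sys/class/drm/card0/device/$f
 *   done
 */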
210
211 /**
212 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
213 *
214 * @dev: drm_device pointer
215 *
216 * Returns true if the device is a dGPU with HG/PX power control,
217 * otherwise returns false.
218 */
219 bool amdgpu_device_supports_boco(struct drm_device *dev)
220 {
221 struct amdgpu_device *adev = drm_to_adev(dev);
222
223 if (adev->flags & AMD_IS_PX)
224 return true;
225 return false;
226 }
227
228 /**
229 * amdgpu_device_supports_baco - Does the device support BACO
230 *
231 * @dev: drm_device pointer
232 *
233 * Returns true if the device supports BACO,
234 * otherwise returns false.
235 */
236 bool amdgpu_device_supports_baco(struct drm_device *dev)
237 {
238 struct amdgpu_device *adev = drm_to_adev(dev);
239
240 return amdgpu_asic_supports_baco(adev);
241 }
242
243 /*
244 * VRAM access helper functions
245 */
246
247 /**
248 * amdgpu_device_vram_access - read/write a buffer in vram
249 *
250 * @adev: amdgpu_device pointer
251 * @pos: offset of the buffer in vram
252 * @buf: virtual address of the buffer in system memory
253 * @size: read/write size in bytes; the buffer at @buf must hold at least @size bytes
254 * @write: true - write to vram, otherwise - read from vram
255 */
256 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
257 uint32_t *buf, size_t size, bool write)
258 {
259 unsigned long flags;
260 uint32_t hi = ~0;
261 uint64_t last;
262
263
264 #ifdef CONFIG_64BIT
265 last = min(pos + size, adev->gmc.visible_vram_size);
266 if (last > pos) {
267 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
268 size_t count = last - pos;
269
270 if (write) {
271 memcpy_toio(addr, buf, count);
272 mb();
273 amdgpu_asic_flush_hdp(adev, NULL);
274 } else {
275 amdgpu_asic_invalidate_hdp(adev, NULL);
276 mb();
277 memcpy_fromio(buf, addr, count);
278 }
279
280 if (count == size)
281 return;
282
283 pos += count;
284 buf += count / 4;
285 size -= count;
286 }
287 #endif
288
289 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
290 for (last = pos + size; pos < last; pos += 4) {
291 uint32_t tmp = pos >> 31;
292
293 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
294 if (tmp != hi) {
295 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
296 hi = tmp;
297 }
298 if (write)
299 WREG32_NO_KIQ(mmMM_DATA, *buf++);
300 else
301 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
302 }
303 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
304 }
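/*
 * Minimal usage sketch for amdgpu_device_vram_access() (illustrative only;
 * the offset and buffer are made up): copy 16 dwords from system memory into
 * VRAM at offset 0x1000, then read them back for verification.
 *
 *   uint32_t data[16] = { 0 };
 *
 *   amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), true);
 *   amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 */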
305
306 /*
307 * register access helper functions.
308 */
309 /**
310 * amdgpu_device_rreg - read a memory mapped IO or indirect register
311 *
312 * @adev: amdgpu_device pointer
313 * @reg: dword aligned register offset
314 * @acc_flags: access flags which require special behavior
315 *
316 * Returns the 32 bit value from the offset specified.
317 */
318 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
319 uint32_t reg, uint32_t acc_flags)
320 {
321 uint32_t ret;
322
323 if (adev->in_pci_err_recovery)
324 return 0;
325
326 if ((reg * 4) < adev->rmmio_size) {
327 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
328 amdgpu_sriov_runtime(adev) &&
329 down_read_trylock(&adev->reset_sem)) {
330 ret = amdgpu_kiq_rreg(adev, reg);
331 up_read(&adev->reset_sem);
332 } else {
333 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
334 }
335 } else {
336 ret = adev->pcie_rreg(adev, reg * 4);
337 }
338
339 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
340
341 return ret;
342 }
343
344 /*
345 * MMIO register byte read helper function
346 * @offset: byte offset from MMIO start
347 *
348 */
349
350 /**
351 * amdgpu_mm_rreg8 - read a memory mapped IO register
352 *
353 * @adev: amdgpu_device pointer
354 * @offset: byte aligned register offset
355 *
356 * Returns the 8 bit value from the offset specified.
357 */
358 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
359 {
360 if (adev->in_pci_err_recovery)
361 return 0;
362
363 if (offset < adev->rmmio_size)
364 return (readb(adev->rmmio + offset));
365 BUG();
366 }
367
368 /*
369 * MMIO register byte write helper function
370 * @offset: byte offset from MMIO start
371 * @value: the value to be written to the register
372 *
373 */
374 /**
375 * amdgpu_mm_wreg8 - write to a memory mapped IO register
376 *
377 * @adev: amdgpu_device pointer
378 * @offset: byte aligned register offset
379 * @value: 8 bit value to write
380 *
381 * Writes the value specified to the offset specified.
382 */
383 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
384 {
385 if (adev->in_pci_err_recovery)
386 return;
387
388 if (offset < adev->rmmio_size)
389 writeb(value, adev->rmmio + offset);
390 else
391 BUG();
392 }
393
394 /**
395 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
396 *
397 * @adev: amdgpu_device pointer
398 * @reg: dword aligned register offset
399 * @v: 32 bit value to write to the register
400 * @acc_flags: access flags which require special behavior
401 *
402 * Writes the value specified to the offset specified.
403 */
404 void amdgpu_device_wreg(struct amdgpu_device *adev,
405 uint32_t reg, uint32_t v,
406 uint32_t acc_flags)
407 {
408 if (adev->in_pci_err_recovery)
409 return;
410
411 if ((reg * 4) < adev->rmmio_size) {
412 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
413 amdgpu_sriov_runtime(adev) &&
414 down_read_trylock(&adev->reset_sem)) {
415 amdgpu_kiq_wreg(adev, reg, v);
416 up_read(&adev->reset_sem);
417 } else {
418 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
419 }
420 } else {
421 adev->pcie_wreg(adev, reg * 4, v);
422 }
423
424 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
425 }
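/*
 * Usage sketch (SOME_ENABLE_BIT is a hypothetical mask, not a real register
 * bit): a read-modify-write through these helpers looks like
 *
 *   uint32_t val = amdgpu_device_rreg(adev, reg, 0);
 *
 *   val |= SOME_ENABLE_BIT;
 *   amdgpu_device_wreg(adev, reg, val, 0);
 *
 * In practice most callers go through the RREG32()/WREG32() style macros,
 * which wrap these functions.
 */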
426
427 /*
428 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
429 *
430 * this function is invoked only for debugfs register access
431 */
432 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
433 uint32_t reg, uint32_t v)
434 {
435 if (adev->in_pci_err_recovery)
436 return;
437
438 if (amdgpu_sriov_fullaccess(adev) &&
439 adev->gfx.rlc.funcs &&
440 adev->gfx.rlc.funcs->is_rlcg_access_range) {
441 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
442 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
443 } else {
444 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
445 }
446 }
447
448 /**
449 * amdgpu_io_rreg - read an IO register
450 *
451 * @adev: amdgpu_device pointer
452 * @reg: dword aligned register offset
453 *
454 * Returns the 32 bit value from the offset specified.
455 */
456 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
457 {
458 if (adev->in_pci_err_recovery)
459 return 0;
460
461 if ((reg * 4) < adev->rio_mem_size)
462 return ioread32(adev->rio_mem + (reg * 4));
463 else {
464 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
465 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
466 }
467 }
468
469 /**
470 * amdgpu_io_wreg - write to an IO register
471 *
472 * @adev: amdgpu_device pointer
473 * @reg: dword aligned register offset
474 * @v: 32 bit value to write to the register
475 *
476 * Writes the value specified to the offset specified.
477 */
478 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
479 {
480 if (adev->in_pci_err_recovery)
481 return;
482
483 if ((reg * 4) < adev->rio_mem_size)
484 iowrite32(v, adev->rio_mem + (reg * 4));
485 else {
486 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
487 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
488 }
489 }
490
491 /**
492 * amdgpu_mm_rdoorbell - read a doorbell dword
493 *
494 * @adev: amdgpu_device pointer
495 * @index: doorbell index
496 *
497 * Returns the value in the doorbell aperture at the
498 * requested doorbell index (CIK).
499 */
500 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
501 {
502 if (adev->in_pci_err_recovery)
503 return 0;
504
505 if (index < adev->doorbell.num_doorbells) {
506 return readl(adev->doorbell.ptr + index);
507 } else {
508 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
509 return 0;
510 }
511 }
512
513 /**
514 * amdgpu_mm_wdoorbell - write a doorbell dword
515 *
516 * @adev: amdgpu_device pointer
517 * @index: doorbell index
518 * @v: value to write
519 *
520 * Writes @v to the doorbell aperture at the
521 * requested doorbell index (CIK).
522 */
523 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
524 {
525 if (adev->in_pci_err_recovery)
526 return;
527
528 if (index < adev->doorbell.num_doorbells) {
529 writel(v, adev->doorbell.ptr + index);
530 } else {
531 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
532 }
533 }
534
535 /**
536 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
537 *
538 * @adev: amdgpu_device pointer
539 * @index: doorbell index
540 *
541 * Returns the value in the doorbell aperture at the
542 * requested doorbell index (VEGA10+).
543 */
544 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
545 {
546 if (adev->in_pci_err_recovery)
547 return 0;
548
549 if (index < adev->doorbell.num_doorbells) {
550 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
551 } else {
552 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
553 return 0;
554 }
555 }
556
557 /**
558 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
559 *
560 * @adev: amdgpu_device pointer
561 * @index: doorbell index
562 * @v: value to write
563 *
564 * Writes @v to the doorbell aperture at the
565 * requested doorbell index (VEGA10+).
566 */
567 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
568 {
569 if (adev->in_pci_err_recovery)
570 return;
571
572 if (index < adev->doorbell.num_doorbells) {
573 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
574 } else {
575 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
576 }
577 }
578
579 /**
580 * amdgpu_device_indirect_rreg - read an indirect register
581 *
582 * @adev: amdgpu_device pointer
583 * @pcie_index: mmio register offset
584 * @pcie_data: mmio register offset
585 *
586 * Returns the value of indirect register @reg_addr
587 */
588 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
589 u32 pcie_index, u32 pcie_data,
590 u32 reg_addr)
591 {
592 unsigned long flags;
593 u32 r;
594 void __iomem *pcie_index_offset;
595 void __iomem *pcie_data_offset;
596
597 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
598 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
599 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
600
601 writel(reg_addr, pcie_index_offset);
602 readl(pcie_index_offset);
603 r = readl(pcie_data_offset);
604 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
605
606 return r;
607 }
608
609 /**
610 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
611 *
612 * @adev: amdgpu_device pointer
613 * @pcie_index: mmio register offset
614 * @pcie_data: mmio register offset
615 *
616 * Returns the value of indirect register @reg_addr
617 */
618 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
619 u32 pcie_index, u32 pcie_data,
620 u32 reg_addr)
621 {
622 unsigned long flags;
623 u64 r;
624 void __iomem *pcie_index_offset;
625 void __iomem *pcie_data_offset;
626
627 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
628 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
629 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
630
631 /* read low 32 bits */
632 writel(reg_addr, pcie_index_offset);
633 readl(pcie_index_offset);
634 r = readl(pcie_data_offset);
635 /* read high 32 bits */
636 writel(reg_addr + 4, pcie_index_offset);
637 readl(pcie_index_offset);
638 r |= ((u64)readl(pcie_data_offset) << 32);
639 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
640
641 return r;
642 }
643
644 /**
645 * amdgpu_device_indirect_wreg - write an indirect register address
646 *
647 * @adev: amdgpu_device pointer
648 * @pcie_index: mmio register offset
649 * @pcie_data: mmio register offset
650 * @reg_addr: indirect register offset
651 * @reg_data: indirect register data
652 *
653 */
654 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
655 u32 pcie_index, u32 pcie_data,
656 u32 reg_addr, u32 reg_data)
657 {
658 unsigned long flags;
659 void __iomem *pcie_index_offset;
660 void __iomem *pcie_data_offset;
661
662 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
663 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
664 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
665
666 writel(reg_addr, pcie_index_offset);
667 readl(pcie_index_offset);
668 writel(reg_data, pcie_data_offset);
669 readl(pcie_data_offset);
670 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
671 }
672
673 /**
674 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
675 *
676 * @adev: amdgpu_device pointer
677 * @pcie_index: mmio register offset
678 * @pcie_data: mmio register offset
679 * @reg_addr: indirect register offset
680 * @reg_data: indirect register data
681 *
682 */
683 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
684 u32 pcie_index, u32 pcie_data,
685 u32 reg_addr, u64 reg_data)
686 {
687 unsigned long flags;
688 void __iomem *pcie_index_offset;
689 void __iomem *pcie_data_offset;
690
691 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
692 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
693 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
694
695 /* write low 32 bits */
696 writel(reg_addr, pcie_index_offset);
697 readl(pcie_index_offset);
698 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
699 readl(pcie_data_offset);
700 /* write high 32 bits */
701 writel(reg_addr + 4, pcie_index_offset);
702 readl(pcie_index_offset);
703 writel((u32)(reg_data >> 32), pcie_data_offset);
704 readl(pcie_data_offset);
705 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
706 }
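/*
 * All four indirect helpers above follow the same index/data pattern: the
 * target address is written to the "index" MMIO register, a readback flushes
 * the posted write, and the payload moves through the "data" MMIO register.
 * Sketch of how an ASIC-specific callback might wrap them (EXAMPLE_PCIE_INDEX
 * and EXAMPLE_PCIE_DATA are placeholders, not real register offsets):
 *
 *   static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           return amdgpu_device_indirect_rreg(adev, EXAMPLE_PCIE_INDEX,
 *                                              EXAMPLE_PCIE_DATA, reg);
 *   }
 */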
707
708 /**
709 * amdgpu_invalid_rreg - dummy reg read function
710 *
711 * @adev: amdgpu_device pointer
712 * @reg: offset of register
713 *
714 * Dummy register read function. Used for register blocks
715 * that certain asics don't have (all asics).
716 * Returns the value in the register.
717 */
718 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
719 {
720 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
721 BUG();
722 return 0;
723 }
724
725 /**
726 * amdgpu_invalid_wreg - dummy reg write function
727 *
728 * @adev: amdgpu_device pointer
729 * @reg: offset of register
730 * @v: value to write to the register
731 *
732 * Dummy register write function. Used for register blocks
733 * that certain asics don't have (all asics).
734 */
735 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
736 {
737 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
738 reg, v);
739 BUG();
740 }
741
742 /**
743 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
744 *
745 * @adev: amdgpu_device pointer
746 * @reg: offset of register
747 *
748 * Dummy register read function. Used for register blocks
749 * that certain asics don't have (all asics).
750 * Returns the value in the register.
751 */
752 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
753 {
754 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
755 BUG();
756 return 0;
757 }
758
759 /**
760 * amdgpu_invalid_wreg64 - dummy reg write function
761 *
762 * @adev: amdgpu_device pointer
763 * @reg: offset of register
764 * @v: value to write to the register
765 *
766 * Dummy register write function. Used for register blocks
767 * that certain asics don't have (all asics).
768 */
769 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
770 {
771 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
772 reg, v);
773 BUG();
774 }
775
776 /**
777 * amdgpu_block_invalid_rreg - dummy reg read function
778 *
779 * @adev: amdgpu_device pointer
780 * @block: offset of instance
781 * @reg: offset of register
782 *
783 * Dummy register read function. Used for register blocks
784 * that certain asics don't have (all asics).
785 * Returns the value in the register.
786 */
787 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
788 uint32_t block, uint32_t reg)
789 {
790 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
791 reg, block);
792 BUG();
793 return 0;
794 }
795
796 /**
797 * amdgpu_block_invalid_wreg - dummy reg write function
798 *
799 * @adev: amdgpu_device pointer
800 * @block: offset of instance
801 * @reg: offset of register
802 * @v: value to write to the register
803 *
804 * Dummy register write function. Used for register blocks
805 * that certain asics don't have (all asics).
806 */
807 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
808 uint32_t block,
809 uint32_t reg, uint32_t v)
810 {
811 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
812 reg, block, v);
813 BUG();
814 }
815
816 /**
817 * amdgpu_device_asic_init - Wrapper for atom asic_init
818 *
819 * @adev: amdgpu_device pointer
820 *
821 * Does any asic specific work and then calls atom asic init.
822 */
823 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
824 {
825 amdgpu_asic_pre_asic_init(adev);
826
827 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
828 }
829
830 /**
831 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
832 *
833 * @adev: amdgpu_device pointer
834 *
835 * Allocates a scratch page of VRAM for use by various things in the
836 * driver.
837 */
838 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
839 {
840 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
841 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
842 &adev->vram_scratch.robj,
843 &adev->vram_scratch.gpu_addr,
844 (void **)&adev->vram_scratch.ptr);
845 }
846
847 /**
848 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
849 *
850 * @adev: amdgpu_device pointer
851 *
852 * Frees the VRAM scratch page.
853 */
854 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
855 {
856 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
857 }
858
859 /**
860 * amdgpu_device_program_register_sequence - program an array of registers.
861 *
862 * @adev: amdgpu_device pointer
863 * @registers: pointer to the register array
864 * @array_size: size of the register array
865 *
866 * Programs an array of registers with AND and OR masks.
867 * This is a helper for setting golden registers.
868 */
869 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
870 const u32 *registers,
871 const u32 array_size)
872 {
873 u32 tmp, reg, and_mask, or_mask;
874 int i;
875
876 if (array_size % 3)
877 return;
878
879 for (i = 0; i < array_size; i += 3) {
880 reg = registers[i + 0];
881 and_mask = registers[i + 1];
882 or_mask = registers[i + 2];
883
884 if (and_mask == 0xffffffff) {
885 tmp = or_mask;
886 } else {
887 tmp = RREG32(reg);
888 tmp &= ~and_mask;
889 if (adev->family >= AMDGPU_FAMILY_AI)
890 tmp |= (or_mask & and_mask);
891 else
892 tmp |= or_mask;
893 }
894 WREG32(reg, tmp);
895 }
896 }
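/*
 * The register array is consumed as { offset, and_mask, or_mask } triplets,
 * so array_size must be a multiple of three. Illustrative sketch with made-up
 * offsets and masks (an and_mask of 0xffffffff writes or_mask verbatim,
 * otherwise the and_mask bits are cleared before or_mask is ORed in):
 *
 *   static const u32 example_golden_settings[] = {
 *           0x1234, 0xffffffff, 0x00000001,
 *           0x5678, 0x0000000f, 0x00000002,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */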
897
898 /**
899 * amdgpu_device_pci_config_reset - reset the GPU
900 *
901 * @adev: amdgpu_device pointer
902 *
903 * Resets the GPU using the pci config reset sequence.
904 * Only applicable to asics prior to vega10.
905 */
906 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
907 {
908 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
909 }
910
911 /*
912 * GPU doorbell aperture helpers function.
913 */
914 /**
915 * amdgpu_device_doorbell_init - Init doorbell driver information.
916 *
917 * @adev: amdgpu_device pointer
918 *
919 * Init doorbell driver information (CIK)
920 * Returns 0 on success, error on failure.
921 */
922 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
923 {
924
925 /* No doorbell on SI hardware generation */
926 if (adev->asic_type < CHIP_BONAIRE) {
927 adev->doorbell.base = 0;
928 adev->doorbell.size = 0;
929 adev->doorbell.num_doorbells = 0;
930 adev->doorbell.ptr = NULL;
931 return 0;
932 }
933
934 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
935 return -EINVAL;
936
937 amdgpu_asic_init_doorbell_index(adev);
938
939 /* doorbell bar mapping */
940 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
941 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
942
943 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
944 adev->doorbell_index.max_assignment+1);
945 if (adev->doorbell.num_doorbells == 0)
946 return -EINVAL;
947
948 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
949 * paging queue doorbell use the second page. The
950 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
951 * doorbells are in the first page. So with paging queue enabled,
952 * the max num_doorbells should be extended by one extra page (0x400 in dwords)
953 */
954 if (adev->asic_type >= CHIP_VEGA10)
955 adev->doorbell.num_doorbells += 0x400;
956
957 adev->doorbell.ptr = ioremap(adev->doorbell.base,
958 adev->doorbell.num_doorbells *
959 sizeof(u32));
960 if (adev->doorbell.ptr == NULL)
961 return -ENOMEM;
962
963 return 0;
964 }
965
966 /**
967 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
968 *
969 * @adev: amdgpu_device pointer
970 *
971 * Tear down doorbell driver information (CIK)
972 */
973 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
974 {
975 iounmap(adev->doorbell.ptr);
976 adev->doorbell.ptr = NULL;
977 }
978
979
980
981 /*
982 * amdgpu_device_wb_*()
983 * Writeback is the method by which the GPU updates special pages in memory
984 * with the status of certain GPU events (fences, ring pointers, etc.).
985 */
986
987 /**
988 * amdgpu_device_wb_fini - Disable Writeback and free memory
989 *
990 * @adev: amdgpu_device pointer
991 *
992 * Disables Writeback and frees the Writeback memory (all asics).
993 * Used at driver shutdown.
994 */
995 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
996 {
997 if (adev->wb.wb_obj) {
998 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
999 &adev->wb.gpu_addr,
1000 (void **)&adev->wb.wb);
1001 adev->wb.wb_obj = NULL;
1002 }
1003 }
1004
1005 /**
1006 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1007 *
1008 * @adev: amdgpu_device pointer
1009 *
1010 * Initializes writeback and allocates writeback memory (all asics).
1011 * Used at driver startup.
1012 * Returns 0 on success or a negative error code on failure.
1013 */
1014 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1015 {
1016 int r;
1017
1018 if (adev->wb.wb_obj == NULL) {
1019 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1020 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1021 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1022 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1023 (void **)&adev->wb.wb);
1024 if (r) {
1025 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1026 return r;
1027 }
1028
1029 adev->wb.num_wb = AMDGPU_MAX_WB;
1030 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1031
1032 /* clear wb memory */
1033 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1034 }
1035
1036 return 0;
1037 }
1038
1039 /**
1040 * amdgpu_device_wb_get - Allocate a wb entry
1041 *
1042 * @adev: amdgpu_device pointer
1043 * @wb: wb index
1044 *
1045 * Allocate a wb slot for use by the driver (all asics).
1046 * Returns 0 on success or -EINVAL on failure.
1047 */
1048 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1049 {
1050 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1051
1052 if (offset < adev->wb.num_wb) {
1053 __set_bit(offset, adev->wb.used);
1054 *wb = offset << 3; /* convert to dw offset */
1055 return 0;
1056 } else {
1057 return -EINVAL;
1058 }
1059 }
1060
1061 /**
1062 * amdgpu_device_wb_free - Free a wb entry
1063 *
1064 * @adev: amdgpu_device pointer
1065 * @wb: wb index
1066 *
1067 * Free a wb slot allocated for use by the driver (all asics)
1068 */
1069 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1070 {
1071 wb >>= 3;
1072 if (wb < adev->wb.num_wb)
1073 __clear_bit(wb, adev->wb.used);
1074 }
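/*
 * Usage sketch (illustrative): callers pair the two helpers and treat the
 * returned value as a dword offset into adev->wb.wb; each 256-bit slot spans
 * eight dwords, which is why amdgpu_device_wb_get() shifts the bit index by 3.
 *
 *   u32 wb_idx;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb_idx)) {
 *           adev->wb.wb[wb_idx] = 0;
 *           ...
 *           amdgpu_device_wb_free(adev, wb_idx);
 *   }
 */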
1075
1076 /**
1077 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1078 *
1079 * @adev: amdgpu_device pointer
1080 *
1081 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1082 * to fail, but if any of the BARs is not accessible after the resize we abort
1083 * driver loading by returning -ENODEV.
1084 */
1085 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1086 {
1087 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1088 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1089 struct pci_bus *root;
1090 struct resource *res;
1091 unsigned i;
1092 u16 cmd;
1093 int r;
1094
1095 /* Bypass for VF */
1096 if (amdgpu_sriov_vf(adev))
1097 return 0;
1098
1099 /* skip if the bios has already enabled large BAR */
1100 if (adev->gmc.real_vram_size &&
1101 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1102 return 0;
1103
1104 /* Check if the root BUS has 64bit memory resources */
1105 root = adev->pdev->bus;
1106 while (root->parent)
1107 root = root->parent;
1108
1109 pci_bus_for_each_resource(root, res, i) {
1110 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1111 res->start > 0x100000000ull)
1112 break;
1113 }
1114
1115 /* Trying to resize is pointless without a root hub window above 4GB */
1116 if (!res)
1117 return 0;
1118
1119 /* Disable memory decoding while we change the BAR addresses and size */
1120 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1121 pci_write_config_word(adev->pdev, PCI_COMMAND,
1122 cmd & ~PCI_COMMAND_MEMORY);
1123
1124 /* Free the VRAM and doorbell BAR; we most likely need to move both. */
1125 amdgpu_device_doorbell_fini(adev);
1126 if (adev->asic_type >= CHIP_BONAIRE)
1127 pci_release_resource(adev->pdev, 2);
1128
1129 pci_release_resource(adev->pdev, 0);
1130
1131 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1132 if (r == -ENOSPC)
1133 DRM_INFO("Not enough PCI address space for a large BAR.");
1134 else if (r && r != -ENOTSUPP)
1135 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1136
1137 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1138
1139 /* When the doorbell or fb BAR isn't available we have no chance of
1140 * using the device.
1141 */
1142 r = amdgpu_device_doorbell_init(adev);
1143 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1144 return -ENODEV;
1145
1146 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1147
1148 return 0;
1149 }
1150
1151 /*
1152 * GPU helpers function.
1153 */
1154 /**
1155 * amdgpu_device_need_post - check if the hw needs post or not
1156 *
1157 * @adev: amdgpu_device pointer
1158 *
1159 * Check if the asic has been initialized (all asics) at driver startup,
1160 * or if post is needed because a hw reset was performed.
1161 * Returns true if post is needed, false if not.
1162 */
1163 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1164 {
1165 uint32_t reg;
1166
1167 if (amdgpu_sriov_vf(adev))
1168 return false;
1169
1170 if (amdgpu_passthrough(adev)) {
1171 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1172 * reboot some old SMC firmware still needs the driver to do a vPost or the
1173 * GPU hangs. SMC firmware versions above 22.15 do not have this flaw, so
1174 * force vPost for SMC firmware versions below 22.15.
1175 */
1176 if (adev->asic_type == CHIP_FIJI) {
1177 int err;
1178 uint32_t fw_ver;
1179 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1180 /* force vPost if an error occurred */
1181 if (err)
1182 return true;
1183
1184 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1185 if (fw_ver < 0x00160e00)
1186 return true;
1187 }
1188 }
1189
1190 if (adev->has_hw_reset) {
1191 adev->has_hw_reset = false;
1192 return true;
1193 }
1194
1195 /* bios scratch used on CIK+ */
1196 if (adev->asic_type >= CHIP_BONAIRE)
1197 return amdgpu_atombios_scratch_need_asic_init(adev);
1198
1199 /* check MEM_SIZE for older asics */
1200 reg = amdgpu_asic_get_config_memsize(adev);
1201
1202 if ((reg != 0) && (reg != 0xffffffff))
1203 return false;
1204
1205 return true;
1206 }
1207
1208 /* if we get transitioned to only one device, take VGA back */
1209 /**
1210 * amdgpu_device_vga_set_decode - enable/disable vga decode
1211 *
1212 * @cookie: amdgpu_device pointer
1213 * @state: enable/disable vga decode
1214 *
1215 * Enable/disable vga decode (all asics).
1216 * Returns VGA resource flags.
1217 */
1218 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1219 {
1220 struct amdgpu_device *adev = cookie;
1221 amdgpu_asic_set_vga_state(adev, state);
1222 if (state)
1223 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1224 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1225 else
1226 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1227 }
1228
1229 /**
1230 * amdgpu_device_check_block_size - validate the vm block size
1231 *
1232 * @adev: amdgpu_device pointer
1233 *
1234 * Validates the vm block size specified via module parameter.
1235 * The vm block size defines the number of bits in page table versus page directory,
1236 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1237 * page table and the remaining bits are in the page directory.
1238 */
1239 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1240 {
1241 /* defines number of bits in page table versus page directory,
1242 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1243 * page table and the remaining bits are in the page directory */
1244 if (amdgpu_vm_block_size == -1)
1245 return;
1246
1247 if (amdgpu_vm_block_size < 9) {
1248 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1249 amdgpu_vm_block_size);
1250 amdgpu_vm_block_size = -1;
1251 }
1252 }
1253
1254 /**
1255 * amdgpu_device_check_vm_size - validate the vm size
1256 *
1257 * @adev: amdgpu_device pointer
1258 *
1259 * Validates the vm size in GB specified via module parameter.
1260 * The VM size is the size of the GPU virtual memory space in GB.
1261 */
1262 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1263 {
1264 /* no need to check the default value */
1265 if (amdgpu_vm_size == -1)
1266 return;
1267
1268 if (amdgpu_vm_size < 1) {
1269 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1270 amdgpu_vm_size);
1271 amdgpu_vm_size = -1;
1272 }
1273 }
1274
1275 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1276 {
1277 struct sysinfo si;
1278 bool is_os_64 = (sizeof(void *) == 8);
1279 uint64_t total_memory;
1280 uint64_t dram_size_seven_GB = 0x1B8000000;
1281 uint64_t dram_size_three_GB = 0xB8000000;
1282
1283 if (amdgpu_smu_memory_pool_size == 0)
1284 return;
1285
1286 if (!is_os_64) {
1287 DRM_WARN("Not 64-bit OS, feature not supported\n");
1288 goto def_value;
1289 }
1290 si_meminfo(&si);
1291 total_memory = (uint64_t)si.totalram * si.mem_unit;
1292
1293 if ((amdgpu_smu_memory_pool_size == 1) ||
1294 (amdgpu_smu_memory_pool_size == 2)) {
1295 if (total_memory < dram_size_three_GB)
1296 goto def_value1;
1297 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1298 (amdgpu_smu_memory_pool_size == 8)) {
1299 if (total_memory < dram_size_seven_GB)
1300 goto def_value1;
1301 } else {
1302 DRM_WARN("Smu memory pool size not supported\n");
1303 goto def_value;
1304 }
1305 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1306
1307 return;
1308
1309 def_value1:
1310 DRM_WARN("No enough system memory\n");
1311 def_value:
1312 adev->pm.smu_prv_buffer_size = 0;
1313 }
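/*
 * Note on the conversion above: amdgpu_smu_memory_pool_size is given in units
 * of 256 MiB, so the shift by 28 turns a module parameter of 1 into a 256 MiB
 * pool and 8 into a 2 GiB pool.
 */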
1314
1315 /**
1316 * amdgpu_device_check_arguments - validate module params
1317 *
1318 * @adev: amdgpu_device pointer
1319 *
1320 * Validates certain module parameters and updates
1321 * the associated values used by the driver (all asics).
1322 */
1323 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1324 {
1325 if (amdgpu_sched_jobs < 4) {
1326 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1327 amdgpu_sched_jobs);
1328 amdgpu_sched_jobs = 4;
1329 } else if (!is_power_of_2(amdgpu_sched_jobs)){
1330 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1331 amdgpu_sched_jobs);
1332 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1333 }
1334
1335 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1336 /* gart size must be greater or equal to 32M */
1337 dev_warn(adev->dev, "gart size (%d) too small\n",
1338 amdgpu_gart_size);
1339 amdgpu_gart_size = -1;
1340 }
1341
1342 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1343 /* gtt size must be greater or equal to 32M */
1344 dev_warn(adev->dev, "gtt size (%d) too small\n",
1345 amdgpu_gtt_size);
1346 amdgpu_gtt_size = -1;
1347 }
1348
1349 /* valid range is between 4 and 9 inclusive */
1350 if (amdgpu_vm_fragment_size != -1 &&
1351 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1352 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1353 amdgpu_vm_fragment_size = -1;
1354 }
1355
1356 if (amdgpu_sched_hw_submission < 2) {
1357 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1358 amdgpu_sched_hw_submission);
1359 amdgpu_sched_hw_submission = 2;
1360 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1361 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1362 amdgpu_sched_hw_submission);
1363 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1364 }
1365
1366 amdgpu_device_check_smu_prv_buffer_size(adev);
1367
1368 amdgpu_device_check_vm_size(adev);
1369
1370 amdgpu_device_check_block_size(adev);
1371
1372 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1373
1374 amdgpu_gmc_tmz_set(adev);
1375
1376 if (amdgpu_num_kcq == -1) {
1377 amdgpu_num_kcq = 8;
1378 } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1379 amdgpu_num_kcq = 8;
1380 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1381 }
1382
1383 amdgpu_gmc_noretry_set(adev);
1384
1385 return 0;
1386 }
1387
1388 /**
1389 * amdgpu_switcheroo_set_state - set switcheroo state
1390 *
1391 * @pdev: pci dev pointer
1392 * @state: vga_switcheroo state
1393 *
1394 * Callback for the switcheroo driver. Suspends or resumes
1395 * the asic before or after it is powered up using ACPI methods.
1396 */
1397 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1398 enum vga_switcheroo_state state)
1399 {
1400 struct drm_device *dev = pci_get_drvdata(pdev);
1401 int r;
1402
1403 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1404 return;
1405
1406 if (state == VGA_SWITCHEROO_ON) {
1407 pr_info("switched on\n");
1408 /* don't suspend or resume card normally */
1409 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1410
1411 pci_set_power_state(dev->pdev, PCI_D0);
1412 amdgpu_device_load_pci_state(dev->pdev);
1413 r = pci_enable_device(dev->pdev);
1414 if (r)
1415 DRM_WARN("pci_enable_device failed (%d)\n", r);
1416 amdgpu_device_resume(dev, true);
1417
1418 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1419 drm_kms_helper_poll_enable(dev);
1420 } else {
1421 pr_info("switched off\n");
1422 drm_kms_helper_poll_disable(dev);
1423 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1424 amdgpu_device_suspend(dev, true);
1425 amdgpu_device_cache_pci_state(dev->pdev);
1426 /* Shut down the device */
1427 pci_disable_device(dev->pdev);
1428 pci_set_power_state(dev->pdev, PCI_D3cold);
1429 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1430 }
1431 }
1432
1433 /**
1434 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1435 *
1436 * @pdev: pci dev pointer
1437 *
1438 * Callback for the switcheroo driver. Checks if the switcheroo
1439 * state can be changed.
1440 * Returns true if the state can be changed, false if not.
1441 */
1442 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1443 {
1444 struct drm_device *dev = pci_get_drvdata(pdev);
1445
1446 /*
1447 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1448 * locking inversion with the driver load path. And the access here is
1449 * completely racy anyway. So don't bother with locking for now.
1450 */
1451 return atomic_read(&dev->open_count) == 0;
1452 }
1453
1454 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1455 .set_gpu_state = amdgpu_switcheroo_set_state,
1456 .reprobe = NULL,
1457 .can_switch = amdgpu_switcheroo_can_switch,
1458 };
1459
1460 /**
1461 * amdgpu_device_ip_set_clockgating_state - set the CG state
1462 *
1463 * @dev: amdgpu_device pointer
1464 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1465 * @state: clockgating state (gate or ungate)
1466 *
1467 * Sets the requested clockgating state for all instances of
1468 * the hardware IP specified.
1469 * Returns the error code from the last instance.
1470 */
1471 int amdgpu_device_ip_set_clockgating_state(void *dev,
1472 enum amd_ip_block_type block_type,
1473 enum amd_clockgating_state state)
1474 {
1475 struct amdgpu_device *adev = dev;
1476 int i, r = 0;
1477
1478 for (i = 0; i < adev->num_ip_blocks; i++) {
1479 if (!adev->ip_blocks[i].status.valid)
1480 continue;
1481 if (adev->ip_blocks[i].version->type != block_type)
1482 continue;
1483 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1484 continue;
1485 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1486 (void *)adev, state);
1487 if (r)
1488 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1489 adev->ip_blocks[i].version->funcs->name, r);
1490 }
1491 return r;
1492 }
1493
1494 /**
1495 * amdgpu_device_ip_set_powergating_state - set the PG state
1496 *
1497 * @dev: amdgpu_device pointer
1498 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1499 * @state: powergating state (gate or ungate)
1500 *
1501 * Sets the requested powergating state for all instances of
1502 * the hardware IP specified.
1503 * Returns the error code from the last instance.
1504 */
1505 int amdgpu_device_ip_set_powergating_state(void *dev,
1506 enum amd_ip_block_type block_type,
1507 enum amd_powergating_state state)
1508 {
1509 struct amdgpu_device *adev = dev;
1510 int i, r = 0;
1511
1512 for (i = 0; i < adev->num_ip_blocks; i++) {
1513 if (!adev->ip_blocks[i].status.valid)
1514 continue;
1515 if (adev->ip_blocks[i].version->type != block_type)
1516 continue;
1517 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1518 continue;
1519 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1520 (void *)adev, state);
1521 if (r)
1522 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1523 adev->ip_blocks[i].version->funcs->name, r);
1524 }
1525 return r;
1526 }
1527
1528 /**
1529 * amdgpu_device_ip_get_clockgating_state - get the CG state
1530 *
1531 * @adev: amdgpu_device pointer
1532 * @flags: clockgating feature flags
1533 *
1534 * Walks the list of IPs on the device and updates the clockgating
1535 * flags for each IP.
1536 * Updates @flags with the feature flags for each hardware IP where
1537 * clockgating is enabled.
1538 */
1539 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1540 u32 *flags)
1541 {
1542 int i;
1543
1544 for (i = 0; i < adev->num_ip_blocks; i++) {
1545 if (!adev->ip_blocks[i].status.valid)
1546 continue;
1547 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1548 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1549 }
1550 }
1551
1552 /**
1553 * amdgpu_device_ip_wait_for_idle - wait for idle
1554 *
1555 * @adev: amdgpu_device pointer
1556 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1557 *
1558 * Waits for the requested hardware IP to be idle.
1559 * Returns 0 for success or a negative error code on failure.
1560 */
1561 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1562 enum amd_ip_block_type block_type)
1563 {
1564 int i, r;
1565
1566 for (i = 0; i < adev->num_ip_blocks; i++) {
1567 if (!adev->ip_blocks[i].status.valid)
1568 continue;
1569 if (adev->ip_blocks[i].version->type == block_type) {
1570 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1571 if (r)
1572 return r;
1573 break;
1574 }
1575 }
1576 return 0;
1577
1578 }
1579
1580 /**
1581 * amdgpu_device_ip_is_idle - is the hardware IP idle
1582 *
1583 * @adev: amdgpu_device pointer
1584 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1585 *
1586 * Check if the hardware IP is idle or not.
1587 * Returns true if the IP is idle, false if not.
1588 */
1589 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1590 enum amd_ip_block_type block_type)
1591 {
1592 int i;
1593
1594 for (i = 0; i < adev->num_ip_blocks; i++) {
1595 if (!adev->ip_blocks[i].status.valid)
1596 continue;
1597 if (adev->ip_blocks[i].version->type == block_type)
1598 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1599 }
1600 return true;
1601
1602 }
1603
1604 /**
1605 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1606 *
1607 * @adev: amdgpu_device pointer
1608 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1609 *
1610 * Returns a pointer to the hardware IP block structure
1611 * if it exists for the asic, otherwise NULL.
1612 */
1613 struct amdgpu_ip_block *
1614 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1615 enum amd_ip_block_type type)
1616 {
1617 int i;
1618
1619 for (i = 0; i < adev->num_ip_blocks; i++)
1620 if (adev->ip_blocks[i].version->type == type)
1621 return &adev->ip_blocks[i];
1622
1623 return NULL;
1624 }
1625
1626 /**
1627 * amdgpu_device_ip_block_version_cmp
1628 *
1629 * @adev: amdgpu_device pointer
1630 * @type: enum amd_ip_block_type
1631 * @major: major version
1632 * @minor: minor version
1633 *
1634 * return 0 if equal or greater
1635 * return 1 if smaller or the ip_block doesn't exist
1636 */
1637 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1638 enum amd_ip_block_type type,
1639 u32 major, u32 minor)
1640 {
1641 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1642
1643 if (ip_block && ((ip_block->version->major > major) ||
1644 ((ip_block->version->major == major) &&
1645 (ip_block->version->minor >= minor))))
1646 return 0;
1647
1648 return 1;
1649 }
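/*
 * Usage sketch (illustrative): checking whether the GFX IP on this asic is at
 * least version 8.1 before taking a version-dependent path.
 *
 *   if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 1)) {
 *           ... GFX 8.1 or newer ...
 *   }
 */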
1650
1651 /**
1652 * amdgpu_device_ip_block_add
1653 *
1654 * @adev: amdgpu_device pointer
1655 * @ip_block_version: pointer to the IP to add
1656 *
1657 * Adds the IP block driver information to the collection of IPs
1658 * on the asic.
1659 */
1660 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1661 const struct amdgpu_ip_block_version *ip_block_version)
1662 {
1663 if (!ip_block_version)
1664 return -EINVAL;
1665
1666 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1667 ip_block_version->funcs->name);
1668
1669 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1670
1671 return 0;
1672 }
1673
1674 /**
1675 * amdgpu_device_enable_virtual_display - enable virtual display feature
1676 *
1677 * @adev: amdgpu_device pointer
1678 *
1679 * Enables the virtual display feature if the user has enabled it via
1680 * the module parameter virtual_display. This feature provides virtual
1681 * display hardware on headless boards or in virtualized environments.
1682 * This function parses and validates the configuration string specified by
1683 * the user and configures the virtual display configuration (number of
1684 * virtual connectors, crtcs, etc.) specified.
1685 */
1686 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1687 {
1688 adev->enable_virtual_display = false;
1689
1690 if (amdgpu_virtual_display) {
1691 struct drm_device *ddev = adev_to_drm(adev);
1692 const char *pci_address_name = pci_name(ddev->pdev);
1693 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1694
1695 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1696 pciaddstr_tmp = pciaddstr;
1697 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1698 pciaddname = strsep(&pciaddname_tmp, ",");
1699 if (!strcmp("all", pciaddname)
1700 || !strcmp(pci_address_name, pciaddname)) {
1701 long num_crtc;
1702 int res = -1;
1703
1704 adev->enable_virtual_display = true;
1705
1706 if (pciaddname_tmp)
1707 res = kstrtol(pciaddname_tmp, 10,
1708 &num_crtc);
1709
1710 if (!res) {
1711 if (num_crtc < 1)
1712 num_crtc = 1;
1713 if (num_crtc > 6)
1714 num_crtc = 6;
1715 adev->mode_info.num_crtc = num_crtc;
1716 } else {
1717 adev->mode_info.num_crtc = 1;
1718 }
1719 break;
1720 }
1721 }
1722
1723 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1724 amdgpu_virtual_display, pci_address_name,
1725 adev->enable_virtual_display, adev->mode_info.num_crtc);
1726
1727 kfree(pciaddstr);
1728 }
1729 }
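/*
 * Illustrative examples of the virtual_display module parameter format parsed
 * above (the PCI address is a placeholder): entries are separated by ';', and
 * an optional crtc count (clamped to 1..6) follows the address after a ',':
 *
 *   amdgpu.virtual_display=0000:01:00.0,2
 *   amdgpu.virtual_display=all
 */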
1730
1731 /**
1732 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1733 *
1734 * @adev: amdgpu_device pointer
1735 *
1736 * Parses the asic configuration parameters specified in the gpu info
1737 * firmware and makes them available to the driver for use in configuring
1738 * the asic.
1739 * Returns 0 on success, -EINVAL on failure.
1740 */
1741 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1742 {
1743 const char *chip_name;
1744 char fw_name[40];
1745 int err;
1746 const struct gpu_info_firmware_header_v1_0 *hdr;
1747
1748 adev->firmware.gpu_info_fw = NULL;
1749
1750 if (adev->mman.discovery_bin) {
1751 amdgpu_discovery_get_gfx_info(adev);
1752
1753 /*
1754 * FIXME: The bounding box is still needed by Navi12, so
1755 * temporarily read it from gpu_info firmware. Should be dropped
1756 * when DAL no longer needs it.
1757 */
1758 if (adev->asic_type != CHIP_NAVI12)
1759 return 0;
1760 }
1761
1762 switch (adev->asic_type) {
1763 #ifdef CONFIG_DRM_AMDGPU_SI
1764 case CHIP_VERDE:
1765 case CHIP_TAHITI:
1766 case CHIP_PITCAIRN:
1767 case CHIP_OLAND:
1768 case CHIP_HAINAN:
1769 #endif
1770 #ifdef CONFIG_DRM_AMDGPU_CIK
1771 case CHIP_BONAIRE:
1772 case CHIP_HAWAII:
1773 case CHIP_KAVERI:
1774 case CHIP_KABINI:
1775 case CHIP_MULLINS:
1776 #endif
1777 case CHIP_TOPAZ:
1778 case CHIP_TONGA:
1779 case CHIP_FIJI:
1780 case CHIP_POLARIS10:
1781 case CHIP_POLARIS11:
1782 case CHIP_POLARIS12:
1783 case CHIP_VEGAM:
1784 case CHIP_CARRIZO:
1785 case CHIP_STONEY:
1786 case CHIP_VEGA20:
1787 case CHIP_SIENNA_CICHLID:
1788 case CHIP_NAVY_FLOUNDER:
1789 default:
1790 return 0;
1791 case CHIP_VEGA10:
1792 chip_name = "vega10";
1793 break;
1794 case CHIP_VEGA12:
1795 chip_name = "vega12";
1796 break;
1797 case CHIP_RAVEN:
1798 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1799 chip_name = "raven2";
1800 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1801 chip_name = "picasso";
1802 else
1803 chip_name = "raven";
1804 break;
1805 case CHIP_ARCTURUS:
1806 chip_name = "arcturus";
1807 break;
1808 case CHIP_RENOIR:
1809 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1810 chip_name = "renoir";
1811 else
1812 chip_name = "green_sardine";
1813 break;
1814 case CHIP_NAVI10:
1815 chip_name = "navi10";
1816 break;
1817 case CHIP_NAVI14:
1818 chip_name = "navi14";
1819 break;
1820 case CHIP_NAVI12:
1821 chip_name = "navi12";
1822 break;
1823 }
1824
1825 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1826 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1827 if (err) {
1828 dev_err(adev->dev,
1829 "Failed to load gpu_info firmware \"%s\"\n",
1830 fw_name);
1831 goto out;
1832 }
1833 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1834 if (err) {
1835 dev_err(adev->dev,
1836 "Failed to validate gpu_info firmware \"%s\"\n",
1837 fw_name);
1838 goto out;
1839 }
1840
1841 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1842 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1843
1844 switch (hdr->version_major) {
1845 case 1:
1846 {
1847 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1848 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1849 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1850
1851 /*
1852 * Should be dropped when DAL no longer needs it.
1853 */
1854 if (adev->asic_type == CHIP_NAVI12)
1855 goto parse_soc_bounding_box;
1856
1857 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1858 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1859 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1860 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1861 adev->gfx.config.max_texture_channel_caches =
1862 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1863 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1864 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1865 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1866 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1867 adev->gfx.config.double_offchip_lds_buf =
1868 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1869 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1870 adev->gfx.cu_info.max_waves_per_simd =
1871 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1872 adev->gfx.cu_info.max_scratch_slots_per_cu =
1873 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1874 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1875 if (hdr->version_minor >= 1) {
1876 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1877 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1878 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1879 adev->gfx.config.num_sc_per_sh =
1880 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1881 adev->gfx.config.num_packer_per_sc =
1882 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1883 }
1884
1885 parse_soc_bounding_box:
1886 /*
1887 * soc bounding box info is not integrated into the discovery table,
1888 * so we still need to parse it from the gpu info firmware when needed.
1889 */
1890 if (hdr->version_minor == 2) {
1891 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1892 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1893 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1894 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1895 }
1896 break;
1897 }
1898 default:
1899 dev_err(adev->dev,
1900 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1901 err = -EINVAL;
1902 goto out;
1903 }
1904 out:
1905 return err;
1906 }
1907
1908 /**
1909 * amdgpu_device_ip_early_init - run early init for hardware IPs
1910 *
1911 * @adev: amdgpu_device pointer
1912 *
1913 * Early initialization pass for hardware IPs. The hardware IPs that make
1914 * up each asic are discovered and each IP's early_init callback is run. This
1915 * is the first stage in initializing the asic.
1916 * Returns 0 on success, negative error code on failure.
1917 */
1918 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1919 {
1920 int i, r;
1921
1922 amdgpu_device_enable_virtual_display(adev);
1923
1924 if (amdgpu_sriov_vf(adev)) {
1925 r = amdgpu_virt_request_full_gpu(adev, true);
1926 if (r)
1927 return r;
1928 }
1929
1930 switch (adev->asic_type) {
1931 #ifdef CONFIG_DRM_AMDGPU_SI
1932 case CHIP_VERDE:
1933 case CHIP_TAHITI:
1934 case CHIP_PITCAIRN:
1935 case CHIP_OLAND:
1936 case CHIP_HAINAN:
1937 adev->family = AMDGPU_FAMILY_SI;
1938 r = si_set_ip_blocks(adev);
1939 if (r)
1940 return r;
1941 break;
1942 #endif
1943 #ifdef CONFIG_DRM_AMDGPU_CIK
1944 case CHIP_BONAIRE:
1945 case CHIP_HAWAII:
1946 case CHIP_KAVERI:
1947 case CHIP_KABINI:
1948 case CHIP_MULLINS:
1949 if (adev->flags & AMD_IS_APU)
1950 adev->family = AMDGPU_FAMILY_KV;
1951 else
1952 adev->family = AMDGPU_FAMILY_CI;
1953
1954 r = cik_set_ip_blocks(adev);
1955 if (r)
1956 return r;
1957 break;
1958 #endif
1959 case CHIP_TOPAZ:
1960 case CHIP_TONGA:
1961 case CHIP_FIJI:
1962 case CHIP_POLARIS10:
1963 case CHIP_POLARIS11:
1964 case CHIP_POLARIS12:
1965 case CHIP_VEGAM:
1966 case CHIP_CARRIZO:
1967 case CHIP_STONEY:
1968 if (adev->flags & AMD_IS_APU)
1969 adev->family = AMDGPU_FAMILY_CZ;
1970 else
1971 adev->family = AMDGPU_FAMILY_VI;
1972
1973 r = vi_set_ip_blocks(adev);
1974 if (r)
1975 return r;
1976 break;
1977 case CHIP_VEGA10:
1978 case CHIP_VEGA12:
1979 case CHIP_VEGA20:
1980 case CHIP_RAVEN:
1981 case CHIP_ARCTURUS:
1982 case CHIP_RENOIR:
1983 if (adev->flags & AMD_IS_APU)
1984 adev->family = AMDGPU_FAMILY_RV;
1985 else
1986 adev->family = AMDGPU_FAMILY_AI;
1987
1988 r = soc15_set_ip_blocks(adev);
1989 if (r)
1990 return r;
1991 break;
1992 case CHIP_NAVI10:
1993 case CHIP_NAVI14:
1994 case CHIP_NAVI12:
1995 case CHIP_SIENNA_CICHLID:
1996 case CHIP_NAVY_FLOUNDER:
1997 adev->family = AMDGPU_FAMILY_NV;
1998
1999 r = nv_set_ip_blocks(adev);
2000 if (r)
2001 return r;
2002 break;
2003 default:
2004 /* FIXME: not supported yet */
2005 return -EINVAL;
2006 }
2007
2008 amdgpu_amdkfd_device_probe(adev);
2009
2010 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2011 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2012 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2013
2014 for (i = 0; i < adev->num_ip_blocks; i++) {
2015 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2016 DRM_ERROR("disabled ip block: %d <%s>\n",
2017 i, adev->ip_blocks[i].version->funcs->name);
2018 adev->ip_blocks[i].status.valid = false;
2019 } else {
2020 if (adev->ip_blocks[i].version->funcs->early_init) {
2021 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2022 if (r == -ENOENT) {
2023 adev->ip_blocks[i].status.valid = false;
2024 } else if (r) {
2025 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2026 adev->ip_blocks[i].version->funcs->name, r);
2027 return r;
2028 } else {
2029 adev->ip_blocks[i].status.valid = true;
2030 }
2031 } else {
2032 adev->ip_blocks[i].status.valid = true;
2033 }
2034 }
2035 /* get the vbios after the asic_funcs are set up */
2036 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2037 r = amdgpu_device_parse_gpu_info_fw(adev);
2038 if (r)
2039 return r;
2040
2041 /* Read BIOS */
2042 if (!amdgpu_get_bios(adev))
2043 return -EINVAL;
2044
2045 r = amdgpu_atombios_init(adev);
2046 if (r) {
2047 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2048 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2049 return r;
2050 }
2051 }
2052 }
2053
2054 adev->cg_flags &= amdgpu_cg_mask;
2055 adev->pg_flags &= amdgpu_pg_mask;
2056
2057 return 0;
2058 }
2059
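/**
 * amdgpu_device_ip_hw_init_phase1 - hw init for COMMON, IH and (SR-IOV) PSP
 *
 * @adev: amdgpu_device pointer
 *
 * Runs hw_init for the IP blocks that must come up first: the COMMON and IH
 * blocks, plus the PSP block when running as an SR-IOV VF. The remaining
 * blocks are brought up later in amdgpu_device_ip_hw_init_phase2().
 * Returns 0 on success, negative error code on failure.
 */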
2060 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2061 {
2062 int i, r;
2063
2064 for (i = 0; i < adev->num_ip_blocks; i++) {
2065 if (!adev->ip_blocks[i].status.sw)
2066 continue;
2067 if (adev->ip_blocks[i].status.hw)
2068 continue;
2069 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2070 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2071 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2072 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2073 if (r) {
2074 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2075 adev->ip_blocks[i].version->funcs->name, r);
2076 return r;
2077 }
2078 adev->ip_blocks[i].status.hw = true;
2079 }
2080 }
2081
2082 return 0;
2083 }
2084
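/**
 * amdgpu_device_ip_hw_init_phase2 - hw init for the remaining IP blocks
 *
 * @adev: amdgpu_device pointer
 *
 * Runs hw_init for every IP block that has completed sw_init but has not
 * been brought up yet, i.e. everything not handled in phase1 or by the
 * firmware loading step.
 * Returns 0 on success, negative error code on failure.
 */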
2085 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2086 {
2087 int i, r;
2088
2089 for (i = 0; i < adev->num_ip_blocks; i++) {
2090 if (!adev->ip_blocks[i].status.sw)
2091 continue;
2092 if (adev->ip_blocks[i].status.hw)
2093 continue;
2094 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2095 if (r) {
2096 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2097 adev->ip_blocks[i].version->funcs->name, r);
2098 return r;
2099 }
2100 adev->ip_blocks[i].status.hw = true;
2101 }
2102
2103 return 0;
2104 }
2105
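/**
 * amdgpu_device_fw_loading - bring up PSP and load SMU firmware
 *
 * @adev: amdgpu_device pointer
 *
 * On VEGA10 and newer ASICs, brings up the PSP block (hw_init on a fresh
 * init, resume when coming back from reset or suspend) so it can load
 * firmware for the other blocks, then loads the SMU firmware on bare metal
 * (and on Tonga even as a VF).
 * Returns 0 on success, negative error code on failure.
 */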
2106 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2107 {
2108 int r = 0;
2109 int i;
2110 uint32_t smu_version;
2111
2112 if (adev->asic_type >= CHIP_VEGA10) {
2113 for (i = 0; i < adev->num_ip_blocks; i++) {
2114 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2115 continue;
2116
2117 /* no need to do the fw loading again if already done */
2118 if (adev->ip_blocks[i].status.hw == true)
2119 break;
2120
2121 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2122 r = adev->ip_blocks[i].version->funcs->resume(adev);
2123 if (r) {
2124 DRM_ERROR("resume of IP block <%s> failed %d\n",
2125 adev->ip_blocks[i].version->funcs->name, r);
2126 return r;
2127 }
2128 } else {
2129 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2130 if (r) {
2131 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2132 adev->ip_blocks[i].version->funcs->name, r);
2133 return r;
2134 }
2135 }
2136
2137 adev->ip_blocks[i].status.hw = true;
2138 break;
2139 }
2140 }
2141
2142 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2143 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2144
2145 return r;
2146 }
2147
2148 /**
2149 * amdgpu_device_ip_init - run init for hardware IPs
2150 *
2151 * @adev: amdgpu_device pointer
2152 *
2153 * Main initialization pass for hardware IPs. The list of all the hardware
2154 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2155 * are run. sw_init initializes the software state associated with each IP
2156 * and hw_init initializes the hardware associated with each IP.
2157 * Returns 0 on success, negative error code on failure.
2158 */
2159 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2160 {
2161 int i, r;
2162
2163 r = amdgpu_ras_init(adev);
2164 if (r)
2165 return r;
2166
2167 for (i = 0; i < adev->num_ip_blocks; i++) {
2168 if (!adev->ip_blocks[i].status.valid)
2169 continue;
2170 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2171 if (r) {
2172 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2173 adev->ip_blocks[i].version->funcs->name, r);
2174 goto init_failed;
2175 }
2176 adev->ip_blocks[i].status.sw = true;
2177
2178 /* need to do gmc hw init early so we can allocate gpu mem */
2179 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2180 r = amdgpu_device_vram_scratch_init(adev);
2181 if (r) {
2182 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2183 goto init_failed;
2184 }
2185 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2186 if (r) {
2187 DRM_ERROR("hw_init %d failed %d\n", i, r);
2188 goto init_failed;
2189 }
2190 r = amdgpu_device_wb_init(adev);
2191 if (r) {
2192 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2193 goto init_failed;
2194 }
2195 adev->ip_blocks[i].status.hw = true;
2196
2197 /* right after GMC hw init, we create CSA */
2198 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2199 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2200 AMDGPU_GEM_DOMAIN_VRAM,
2201 AMDGPU_CSA_SIZE);
2202 if (r) {
2203 DRM_ERROR("allocate CSA failed %d\n", r);
2204 goto init_failed;
2205 }
2206 }
2207 }
2208 }
2209
2210 if (amdgpu_sriov_vf(adev))
2211 amdgpu_virt_init_data_exchange(adev);
2212
2213 r = amdgpu_ib_pool_init(adev);
2214 if (r) {
2215 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2216 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2217 goto init_failed;
2218 }
2219
2220 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2221 if (r)
2222 goto init_failed;
2223
2224 r = amdgpu_device_ip_hw_init_phase1(adev);
2225 if (r)
2226 goto init_failed;
2227
2228 r = amdgpu_device_fw_loading(adev);
2229 if (r)
2230 goto init_failed;
2231
2232 r = amdgpu_device_ip_hw_init_phase2(adev);
2233 if (r)
2234 goto init_failed;
2235
2236 /*
2237 * retired pages will be loaded from eeprom and reserved here,
2238 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2239 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2240 * functional for I2C communication, which is only true at this point.
2241 *
2242 * amdgpu_ras_recovery_init may fail, but the upper level only cares
2243 * about failures caused by a bad gpu state and stops the amdgpu init
2244 * process accordingly. For other failures, it still releases all the
2245 * resources and prints an error message rather than returning a
2246 * negative value to the upper level.
2247 *
2248 * Note: theoretically, this should be called before all vram allocations
2249 * to keep retired pages from being handed out to new allocations.
2250 */
2251 r = amdgpu_ras_recovery_init(adev);
2252 if (r)
2253 goto init_failed;
2254
2255 if (adev->gmc.xgmi.num_physical_nodes > 1)
2256 amdgpu_xgmi_add_device(adev);
2257 amdgpu_amdkfd_device_init(adev);
2258
2259 amdgpu_fru_get_product_info(adev);
2260
2261 init_failed:
2262 if (amdgpu_sriov_vf(adev))
2263 amdgpu_virt_release_full_gpu(adev, true);
2264
2265 return r;
2266 }
2267
2268 /**
2269 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2270 *
2271 * @adev: amdgpu_device pointer
2272 *
2273 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2274 * this function before a GPU reset. If the value is retained after a
2275 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2276 */
2277 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2278 {
2279 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2280 }
2281
2282 /**
2283 * amdgpu_device_check_vram_lost - check if vram is valid
2284 *
2285 * @adev: amdgpu_device pointer
2286 *
2287 * Checks the reset magic value written to the gart pointer in VRAM.
2288 * The driver calls this after a GPU reset to see if the contents of
2289 * VRAM has been lost or not.
2290 * Returns true if vram is lost, false if not.
2291 */
2292 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2293 {
2294 if (memcmp(adev->gart.ptr, adev->reset_magic,
2295 AMDGPU_RESET_MAGIC_NUM))
2296 return true;
2297
2298 if (!amdgpu_in_reset(adev))
2299 return false;
2300
2301 /*
2302 * For all ASICs with baco/mode1 reset, the VRAM is
2303 * always assumed to be lost.
2304 */
2305 switch (amdgpu_asic_reset_method(adev)) {
2306 case AMD_RESET_METHOD_BACO:
2307 case AMD_RESET_METHOD_MODE1:
2308 return true;
2309 default:
2310 return false;
2311 }
2312 }
2313
2314 /**
2315 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2316 *
2317 * @adev: amdgpu_device pointer
2318 * @state: clockgating state (gate or ungate)
2319 *
2320 * The list of all the hardware IPs that make up the asic is walked and the
2321 * set_clockgating_state callbacks are run.
2322 * The late init pass enables clockgating for hardware IPs; the fini or
2323 * suspend pass disables it.
2324 * Returns 0 on success, negative error code on failure.
2325 */
2326
2327 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2328 enum amd_clockgating_state state)
2329 {
2330 int i, j, r;
2331
2332 if (amdgpu_emu_mode == 1)
2333 return 0;
2334
2335 for (j = 0; j < adev->num_ip_blocks; j++) {
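/* gate in init order, ungate in reverse (fini/suspend) order */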
2336 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2337 if (!adev->ip_blocks[i].status.late_initialized)
2338 continue;
2339 /* skip CG for VCE/UVD, it's handled specially */
2340 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2341 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2342 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2343 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2344 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2345 /* enable clockgating to save power */
2346 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2347 state);
2348 if (r) {
2349 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2350 adev->ip_blocks[i].version->funcs->name, r);
2351 return r;
2352 }
2353 }
2354 }
2355
2356 return 0;
2357 }
2358
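/**
 * amdgpu_device_set_pg_state - set powergating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: powergating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_powergating_state callbacks are run: in init order when gating, in
 * reverse order when ungating. UVD/VCE/VCN/JPEG are handled separately.
 * Returns 0 on success, negative error code on failure.
 */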
2359 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2360 {
2361 int i, j, r;
2362
2363 if (amdgpu_emu_mode == 1)
2364 return 0;
2365
2366 for (j = 0; j < adev->num_ip_blocks; j++) {
2367 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2368 if (!adev->ip_blocks[i].status.late_initialized)
2369 continue;
2370 /* skip PG for VCE/UVD, it's handled specially */
2371 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2372 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2373 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2374 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2375 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2376 /* enable powergating to save power */
2377 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2378 state);
2379 if (r) {
2380 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2381 adev->ip_blocks[i].version->funcs->name, r);
2382 return r;
2383 }
2384 }
2385 }
2386 return 0;
2387 }
2388
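/**
 * amdgpu_device_enable_mgpu_fan_boost - enable fan boost on multi-dGPU systems
 *
 * Walks the global mgpu_info list and enables the fan boost feature on every
 * dGPU that does not have it enabled yet. Skipped when fewer than two dGPUs
 * are present in the system.
 * Returns 0 on success, negative error code on failure.
 */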
2389 static int amdgpu_device_enable_mgpu_fan_boost(void)
2390 {
2391 struct amdgpu_gpu_instance *gpu_ins;
2392 struct amdgpu_device *adev;
2393 int i, ret = 0;
2394
2395 mutex_lock(&mgpu_info.mutex);
2396
2397 /*
2398 * MGPU fan boost feature should be enabled
2399 * only when there are two or more dGPUs in
2400 * the system
2401 */
2402 if (mgpu_info.num_dgpu < 2)
2403 goto out;
2404
2405 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2406 gpu_ins = &(mgpu_info.gpu_ins[i]);
2407 adev = gpu_ins->adev;
2408 if (!(adev->flags & AMD_IS_APU) &&
2409 !gpu_ins->mgpu_fan_enabled) {
2410 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2411 if (ret)
2412 break;
2413
2414 gpu_ins->mgpu_fan_enabled = 1;
2415 }
2416 }
2417
2418 out:
2419 mutex_unlock(&mgpu_info.mutex);
2420
2421 return ret;
2422 }
2423
2424 /**
2425 * amdgpu_device_ip_late_init - run late init for hardware IPs
2426 *
2427 * @adev: amdgpu_device pointer
2428 *
2429 * Late initialization pass for hardware IPs. The list of all the hardware
2430 * IPs that make up the asic is walked and the late_init callbacks are run.
2431 * late_init covers any special initialization that an IP requires
2432 * after all of the IPs have been initialized or something that needs to happen
2433 * late in the init process.
2434 * Returns 0 on success, negative error code on failure.
2435 */
2436 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2437 {
2438 struct amdgpu_gpu_instance *gpu_instance;
2439 int i = 0, r;
2440
2441 for (i = 0; i < adev->num_ip_blocks; i++) {
2442 if (!adev->ip_blocks[i].status.hw)
2443 continue;
2444 if (adev->ip_blocks[i].version->funcs->late_init) {
2445 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2446 if (r) {
2447 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2448 adev->ip_blocks[i].version->funcs->name, r);
2449 return r;
2450 }
2451 }
2452 adev->ip_blocks[i].status.late_initialized = true;
2453 }
2454
2455 amdgpu_ras_set_error_query_ready(adev, true);
2456
2457 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2458 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2459
2460 amdgpu_device_fill_reset_magic(adev);
2461
2462 r = amdgpu_device_enable_mgpu_fan_boost();
2463 if (r)
2464 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2465
2466
2467 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2468 mutex_lock(&mgpu_info.mutex);
2469
2470 /*
2471 * Reset the device p-state to low, as it was booted with high.
2472 *
2473 * This should be performed only after all devices from the same
2474 * hive have been initialized.
2475 *
2476 * However, the number of devices in a hive is not known in advance;
2477 * it is counted one by one as the devices initialize.
2478 *
2479 * So we wait until all XGMI interlinked devices are initialized.
2480 * This may add some delay as those devices may come from
2481 * different hives, but that should be OK.
2482 */
2483 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2484 for (i = 0; i < mgpu_info.num_gpu; i++) {
2485 gpu_instance = &(mgpu_info.gpu_ins[i]);
2486 if (gpu_instance->adev->flags & AMD_IS_APU)
2487 continue;
2488
2489 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2490 AMDGPU_XGMI_PSTATE_MIN);
2491 if (r) {
2492 DRM_ERROR("pstate setting failed (%d).\n", r);
2493 break;
2494 }
2495 }
2496 }
2497
2498 mutex_unlock(&mgpu_info.mutex);
2499 }
2500
2501 return 0;
2502 }
2503
2504 /**
2505 * amdgpu_device_ip_fini - run fini for hardware IPs
2506 *
2507 * @adev: amdgpu_device pointer
2508 *
2509 * Main teardown pass for hardware IPs. The list of all the hardware
2510 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2511 * are run. hw_fini tears down the hardware associated with each IP
2512 * and sw_fini tears down any software state associated with each IP.
2513 * Returns 0 on success, negative error code on failure.
2514 */
2515 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2516 {
2517 int i, r;
2518
2519 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2520 amdgpu_virt_release_ras_err_handler_data(adev);
2521
2522 amdgpu_ras_pre_fini(adev);
2523
2524 if (adev->gmc.xgmi.num_physical_nodes > 1)
2525 amdgpu_xgmi_remove_device(adev);
2526
2527 amdgpu_amdkfd_device_fini(adev);
2528
2529 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2530 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2531
2532 /* need to disable SMC first */
2533 for (i = 0; i < adev->num_ip_blocks; i++) {
2534 if (!adev->ip_blocks[i].status.hw)
2535 continue;
2536 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2537 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2538 /* XXX handle errors */
2539 if (r) {
2540 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2541 adev->ip_blocks[i].version->funcs->name, r);
2542 }
2543 adev->ip_blocks[i].status.hw = false;
2544 break;
2545 }
2546 }
2547
2548 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2549 if (!adev->ip_blocks[i].status.hw)
2550 continue;
2551
2552 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2553 /* XXX handle errors */
2554 if (r) {
2555 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2556 adev->ip_blocks[i].version->funcs->name, r);
2557 }
2558
2559 adev->ip_blocks[i].status.hw = false;
2560 }
2561
2562
2563 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2564 if (!adev->ip_blocks[i].status.sw)
2565 continue;
2566
2567 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2568 amdgpu_ucode_free_bo(adev);
2569 amdgpu_free_static_csa(&adev->virt.csa_obj);
2570 amdgpu_device_wb_fini(adev);
2571 amdgpu_device_vram_scratch_fini(adev);
2572 amdgpu_ib_pool_fini(adev);
2573 }
2574
2575 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2576 /* XXX handle errors */
2577 if (r) {
2578 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2579 adev->ip_blocks[i].version->funcs->name, r);
2580 }
2581 adev->ip_blocks[i].status.sw = false;
2582 adev->ip_blocks[i].status.valid = false;
2583 }
2584
2585 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2586 if (!adev->ip_blocks[i].status.late_initialized)
2587 continue;
2588 if (adev->ip_blocks[i].version->funcs->late_fini)
2589 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2590 adev->ip_blocks[i].status.late_initialized = false;
2591 }
2592
2593 amdgpu_ras_fini(adev);
2594
2595 if (amdgpu_sriov_vf(adev))
2596 if (amdgpu_virt_release_full_gpu(adev, false))
2597 DRM_ERROR("failed to release exclusive mode on fini\n");
2598
2599 return 0;
2600 }
2601
2602 /**
2603 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2604 *
2605 * @work: work_struct.
2606 */
2607 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2608 {
2609 struct amdgpu_device *adev =
2610 container_of(work, struct amdgpu_device, delayed_init_work.work);
2611 int r;
2612
2613 r = amdgpu_ib_ring_tests(adev);
2614 if (r)
2615 DRM_ERROR("ib ring test failed (%d).\n", r);
2616 }
2617
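/**
 * amdgpu_device_delay_enable_gfx_off - delayed work handler to enter GFXOFF
 *
 * @work: work_struct.
 *
 * Asks the SMU to powergate the GFX block once no user holds a gfx_off
 * request anymore and GFXOFF has not been entered yet.
 */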
2618 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2619 {
2620 struct amdgpu_device *adev =
2621 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2622
2623 mutex_lock(&adev->gfx.gfx_off_mutex);
2624 if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2625 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2626 adev->gfx.gfx_off_state = true;
2627 }
2628 mutex_unlock(&adev->gfx.gfx_off_mutex);
2629 }
2630
2631 /**
2632 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2633 *
2634 * @adev: amdgpu_device pointer
2635 *
2636 * Main suspend function for hardware IPs. The list of all the hardware
2637 * IPs that make up the asic is walked, clockgating is disabled and the
2638 * suspend callbacks are run. suspend puts the hardware and software state
2639 * in each IP into a state suitable for suspend.
2640 * Returns 0 on success, negative error code on failure.
2641 */
2642 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2643 {
2644 int i, r;
2645
2646 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2647 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2648
2649 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2650 if (!adev->ip_blocks[i].status.valid)
2651 continue;
2652
2653 /* displays are handled separately */
2654 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2655 continue;
2656
2657 /* XXX handle errors */
2658 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2659 /* XXX handle errors */
2660 if (r) {
2661 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2662 adev->ip_blocks[i].version->funcs->name, r);
2663 return r;
2664 }
2665
2666 adev->ip_blocks[i].status.hw = false;
2667 }
2668
2669 return 0;
2670 }
2671
2672 /**
2673 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2674 *
2675 * @adev: amdgpu_device pointer
2676 *
2677 * Main suspend function for hardware IPs. The list of all the hardware
2678 * IPs that make up the asic is walked, clockgating is disabled and the
2679 * suspend callbacks are run. suspend puts the hardware and software state
2680 * in each IP into a state suitable for suspend.
2681 * Returns 0 on success, negative error code on failure.
2682 */
2683 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2684 {
2685 int i, r;
2686
2687 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2688 if (!adev->ip_blocks[i].status.valid)
2689 continue;
2690 /* displays are handled in phase1 */
2691 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2692 continue;
2693 /* PSP lost connection when err_event_athub occurs */
2694 if (amdgpu_ras_intr_triggered() &&
2695 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2696 adev->ip_blocks[i].status.hw = false;
2697 continue;
2698 }
2699 /* XXX handle errors */
2700 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2701 /* XXX handle errors */
2702 if (r) {
2703 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2704 adev->ip_blocks[i].version->funcs->name, r);
2705 }
2706 adev->ip_blocks[i].status.hw = false;
2707 /* handle putting the SMC in the appropriate state */
2708 if (!amdgpu_sriov_vf(adev)) {
2709 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2710 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2711 if (r) {
2712 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2713 adev->mp1_state, r);
2714 return r;
2715 }
2716 }
2717 }
2718 adev->ip_blocks[i].status.hw = false;
2719 }
2720
2721 return 0;
2722 }
2723
2724 /**
2725 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2726 *
2727 * @adev: amdgpu_device pointer
2728 *
2729 * Main suspend function for hardware IPs. The list of all the hardware
2730 * IPs that make up the asic is walked, clockgating is disabled and the
2731 * suspend callbacks are run. suspend puts the hardware and software state
2732 * in each IP into a state suitable for suspend.
2733 * Returns 0 on success, negative error code on failure.
2734 */
2735 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2736 {
2737 int r;
2738
2739 if (amdgpu_sriov_vf(adev))
2740 amdgpu_virt_request_full_gpu(adev, false);
2741
2742 r = amdgpu_device_ip_suspend_phase1(adev);
2743 if (r)
2744 return r;
2745 r = amdgpu_device_ip_suspend_phase2(adev);
2746
2747 if (amdgpu_sriov_vf(adev))
2748 amdgpu_virt_release_full_gpu(adev, false);
2749
2750 return r;
2751 }
2752
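/*
 * Re-init the hardware for the first group of IP blocks (GMC, COMMON, PSP,
 * IH) in a fixed order. Used on the SR-IOV reset/recovery path.
 */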
2753 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2754 {
2755 int i, r;
2756
2757 static enum amd_ip_block_type ip_order[] = {
2758 AMD_IP_BLOCK_TYPE_GMC,
2759 AMD_IP_BLOCK_TYPE_COMMON,
2760 AMD_IP_BLOCK_TYPE_PSP,
2761 AMD_IP_BLOCK_TYPE_IH,
2762 };
2763
2764 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2765 int j;
2766 struct amdgpu_ip_block *block;
2767
2768 block = &adev->ip_blocks[i];
2769 block->status.hw = false;
2770
2771 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2772
2773 if (block->version->type != ip_order[j] ||
2774 !block->status.valid)
2775 continue;
2776
2777 r = block->version->funcs->hw_init(adev);
2778 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2779 if (r)
2780 return r;
2781 block->status.hw = true;
2782 }
2783 }
2784
2785 return 0;
2786 }
2787
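/*
 * Re-init the remaining IP blocks in a fixed order on the SR-IOV
 * reset/recovery path. SMC is resumed rather than re-initialized; everything
 * else goes through hw_init.
 */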
2788 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2789 {
2790 int i, r;
2791
2792 static enum amd_ip_block_type ip_order[] = {
2793 AMD_IP_BLOCK_TYPE_SMC,
2794 AMD_IP_BLOCK_TYPE_DCE,
2795 AMD_IP_BLOCK_TYPE_GFX,
2796 AMD_IP_BLOCK_TYPE_SDMA,
2797 AMD_IP_BLOCK_TYPE_UVD,
2798 AMD_IP_BLOCK_TYPE_VCE,
2799 AMD_IP_BLOCK_TYPE_VCN
2800 };
2801
2802 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2803 int j;
2804 struct amdgpu_ip_block *block;
2805
2806 for (j = 0; j < adev->num_ip_blocks; j++) {
2807 block = &adev->ip_blocks[j];
2808
2809 if (block->version->type != ip_order[i] ||
2810 !block->status.valid ||
2811 block->status.hw)
2812 continue;
2813
2814 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2815 r = block->version->funcs->resume(adev);
2816 else
2817 r = block->version->funcs->hw_init(adev);
2818
2819 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2820 if (r)
2821 return r;
2822 block->status.hw = true;
2823 }
2824 }
2825
2826 return 0;
2827 }
2828
2829 /**
2830 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2831 *
2832 * @adev: amdgpu_device pointer
2833 *
2834 * First resume function for hardware IPs. The list of all the hardware
2835 * IPs that make up the asic is walked and the resume callbacks are run for
2836 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2837 * after a suspend and updates the software state as necessary. This
2838 * function is also used for restoring the GPU after a GPU reset.
2839 * Returns 0 on success, negative error code on failure.
2840 */
2841 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2842 {
2843 int i, r;
2844
2845 for (i = 0; i < adev->num_ip_blocks; i++) {
2846 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2847 continue;
2848 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2849 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2850 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2851
2852 r = adev->ip_blocks[i].version->funcs->resume(adev);
2853 if (r) {
2854 DRM_ERROR("resume of IP block <%s> failed %d\n",
2855 adev->ip_blocks[i].version->funcs->name, r);
2856 return r;
2857 }
2858 adev->ip_blocks[i].status.hw = true;
2859 }
2860 }
2861
2862 return 0;
2863 }
2864
2865 /**
2866 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2867 *
2868 * @adev: amdgpu_device pointer
2869 *
2870 * Second resume function for hardware IPs. The list of all the hardware
2871 * IPs that make up the asic is walked and the resume callbacks are run for
2872 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2873 * functional state after a suspend and updates the software state as
2874 * necessary. This function is also used for restoring the GPU after a GPU
2875 * reset.
2876 * Returns 0 on success, negative error code on failure.
2877 */
2878 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2879 {
2880 int i, r;
2881
2882 for (i = 0; i < adev->num_ip_blocks; i++) {
2883 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2884 continue;
2885 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2886 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2887 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2888 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2889 continue;
2890 r = adev->ip_blocks[i].version->funcs->resume(adev);
2891 if (r) {
2892 DRM_ERROR("resume of IP block <%s> failed %d\n",
2893 adev->ip_blocks[i].version->funcs->name, r);
2894 return r;
2895 }
2896 adev->ip_blocks[i].status.hw = true;
2897 }
2898
2899 return 0;
2900 }
2901
2902 /**
2903 * amdgpu_device_ip_resume - run resume for hardware IPs
2904 *
2905 * @adev: amdgpu_device pointer
2906 *
2907 * Main resume function for hardware IPs. The hardware IPs
2908 * are split into two resume functions because they are
2909 * also used in recovering from a GPU reset and some additional
2910 * steps need to be taken between them. In this case (S3/S4) they are
2911 * run sequentially.
2912 * Returns 0 on success, negative error code on failure.
2913 */
2914 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2915 {
2916 int r;
2917
2918 r = amdgpu_device_ip_resume_phase1(adev);
2919 if (r)
2920 return r;
2921
2922 r = amdgpu_device_fw_loading(adev);
2923 if (r)
2924 return r;
2925
2926 r = amdgpu_device_ip_resume_phase2(adev);
2927
2928 return r;
2929 }
2930
2931 /**
2932 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2933 *
2934 * @adev: amdgpu_device pointer
2935 *
2936 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2937 */
2938 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2939 {
2940 if (amdgpu_sriov_vf(adev)) {
2941 if (adev->is_atom_fw) {
2942 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2943 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2944 } else {
2945 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2946 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2947 }
2948
2949 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2950 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2951 }
2952 }
2953
2954 /**
2955 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2956 *
2957 * @asic_type: AMD asic type
2958 *
2959 * Check if there is DC (new modesetting infrastructure) support for an asic.
2960 * returns true if DC has support, false if not.
2961 */
2962 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2963 {
2964 switch (asic_type) {
2965 #if defined(CONFIG_DRM_AMD_DC)
2966 #if defined(CONFIG_DRM_AMD_DC_SI)
2967 case CHIP_TAHITI:
2968 case CHIP_PITCAIRN:
2969 case CHIP_VERDE:
2970 case CHIP_OLAND:
2971 #endif
2972 case CHIP_BONAIRE:
2973 case CHIP_KAVERI:
2974 case CHIP_KABINI:
2975 case CHIP_MULLINS:
2976 /*
2977 * We have systems in the wild with these ASICs that require
2978 * LVDS and VGA support which is not supported with DC.
2979 *
2980 * Fallback to the non-DC driver here by default so as not to
2981 * cause regressions.
2982 */
2983 return amdgpu_dc > 0;
2984 case CHIP_HAWAII:
2985 case CHIP_CARRIZO:
2986 case CHIP_STONEY:
2987 case CHIP_POLARIS10:
2988 case CHIP_POLARIS11:
2989 case CHIP_POLARIS12:
2990 case CHIP_VEGAM:
2991 case CHIP_TONGA:
2992 case CHIP_FIJI:
2993 case CHIP_VEGA10:
2994 case CHIP_VEGA12:
2995 case CHIP_VEGA20:
2996 #if defined(CONFIG_DRM_AMD_DC_DCN)
2997 case CHIP_RAVEN:
2998 case CHIP_NAVI10:
2999 case CHIP_NAVI14:
3000 case CHIP_NAVI12:
3001 case CHIP_RENOIR:
3002 #endif
3003 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3004 case CHIP_SIENNA_CICHLID:
3005 case CHIP_NAVY_FLOUNDER:
3006 #endif
3007 return amdgpu_dc != 0;
3008 #endif
3009 default:
3010 if (amdgpu_dc > 0)
3011 DRM_INFO("Display Core has been requested via kernel parameter "
3012 "but isn't supported by ASIC, ignoring\n");
3013 return false;
3014 }
3015 }
3016
3017 /**
3018 * amdgpu_device_has_dc_support - check if dc is supported
3019 *
3020 * @adev: amdgpu_device pointer
3021 *
3022 * Returns true for supported, false for not supported
3023 */
3024 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3025 {
3026 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3027 return false;
3028
3029 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3030 }
3031
3032
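/*
 * XGMI reset work handler: resets one device of an XGMI hive. For BACO the
 * task barrier is used to line up the enter/exit steps across all devices in
 * the hive; otherwise a full ASIC reset is performed after all workers arrive.
 */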
3033 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3034 {
3035 struct amdgpu_device *adev =
3036 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3037 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3038
3039 /* It's a bug to not have a hive within this function */
3040 if (WARN_ON(!hive))
3041 return;
3042
3043 /*
3044 * Use task barrier to synchronize all xgmi reset works across the
3045 * hive. task_barrier_enter and task_barrier_exit will block
3046 * until all the threads running the xgmi reset works reach
3047 * those points. task_barrier_full will do both blocks.
3048 */
3049 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3050
3051 task_barrier_enter(&hive->tb);
3052 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3053
3054 if (adev->asic_reset_res)
3055 goto fail;
3056
3057 task_barrier_exit(&hive->tb);
3058 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3059
3060 if (adev->asic_reset_res)
3061 goto fail;
3062
3063 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3064 adev->mmhub.funcs->reset_ras_error_count(adev);
3065 } else {
3066
3067 task_barrier_full(&hive->tb);
3068 adev->asic_reset_res = amdgpu_asic_reset(adev);
3069 }
3070
3071 fail:
3072 if (adev->asic_reset_res)
3073 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3074 adev->asic_reset_res, adev_to_drm(adev)->unique);
3075 amdgpu_put_xgmi_hive(hive);
3076 }
3077
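/*
 * Parse the amdgpu.lockup_timeout module parameter. Up to four comma
 * separated values apply to gfx, compute, sdma and video jobs in that order;
 * 0 keeps the default, a negative value disables the timeout, and a single
 * value applies to all non-compute jobs.
 */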
3078 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3079 {
3080 char *input = amdgpu_lockup_timeout;
3081 char *timeout_setting = NULL;
3082 int index = 0;
3083 long timeout;
3084 int ret = 0;
3085
3086 /*
3087 * By default, the timeout for non-compute jobs is 10000 ms
3088 * and there is no timeout enforced on compute jobs.
3089 * In SR-IOV or passthrough mode, the default timeout for
3090 * compute jobs is 60000 ms.
3091 */
3092 adev->gfx_timeout = msecs_to_jiffies(10000);
3093 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3094 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3095 adev->compute_timeout = msecs_to_jiffies(60000);
3096 else
3097 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3098
3099 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3100 while ((timeout_setting = strsep(&input, ",")) &&
3101 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3102 ret = kstrtol(timeout_setting, 0, &timeout);
3103 if (ret)
3104 return ret;
3105
3106 if (timeout == 0) {
3107 index++;
3108 continue;
3109 } else if (timeout < 0) {
3110 timeout = MAX_SCHEDULE_TIMEOUT;
3111 } else {
3112 timeout = msecs_to_jiffies(timeout);
3113 }
3114
3115 switch (index++) {
3116 case 0:
3117 adev->gfx_timeout = timeout;
3118 break;
3119 case 1:
3120 adev->compute_timeout = timeout;
3121 break;
3122 case 2:
3123 adev->sdma_timeout = timeout;
3124 break;
3125 case 3:
3126 adev->video_timeout = timeout;
3127 break;
3128 default:
3129 break;
3130 }
3131 }
3132 /*
3133 * If only one value is specified, it
3134 * applies to all non-compute jobs.
3135 */
3136 if (index == 1) {
3137 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3138 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3139 adev->compute_timeout = adev->gfx_timeout;
3140 }
3141 }
3142
3143 return ret;
3144 }
3145
3146 static const struct attribute *amdgpu_dev_attributes[] = {
3147 &dev_attr_product_name.attr,
3148 &dev_attr_product_number.attr,
3149 &dev_attr_serial_number.attr,
3150 &dev_attr_pcie_replay_count.attr,
3151 NULL
3152 };
3153
3154
3155 /**
3156 * amdgpu_device_init - initialize the driver
3157 *
3158 * @adev: amdgpu_device pointer
3159 * @flags: driver flags
3160 *
3161 * Initializes the driver info and hw (all asics).
3162 * Returns 0 for success or an error on failure.
3163 * Called at driver startup.
3164 */
3165 int amdgpu_device_init(struct amdgpu_device *adev,
3166 uint32_t flags)
3167 {
3168 struct drm_device *ddev = adev_to_drm(adev);
3169 struct pci_dev *pdev = adev->pdev;
3170 int r, i;
3171 bool boco = false;
3172 u32 max_MBps;
3173
3174 adev->shutdown = false;
3175 adev->flags = flags;
3176
3177 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3178 adev->asic_type = amdgpu_force_asic_type;
3179 else
3180 adev->asic_type = flags & AMD_ASIC_MASK;
3181
3182 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3183 if (amdgpu_emu_mode == 1)
3184 adev->usec_timeout *= 10;
3185 adev->gmc.gart_size = 512 * 1024 * 1024;
3186 adev->accel_working = false;
3187 adev->num_rings = 0;
3188 adev->mman.buffer_funcs = NULL;
3189 adev->mman.buffer_funcs_ring = NULL;
3190 adev->vm_manager.vm_pte_funcs = NULL;
3191 adev->vm_manager.vm_pte_num_scheds = 0;
3192 adev->gmc.gmc_funcs = NULL;
3193 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3194 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3195
3196 adev->smc_rreg = &amdgpu_invalid_rreg;
3197 adev->smc_wreg = &amdgpu_invalid_wreg;
3198 adev->pcie_rreg = &amdgpu_invalid_rreg;
3199 adev->pcie_wreg = &amdgpu_invalid_wreg;
3200 adev->pciep_rreg = &amdgpu_invalid_rreg;
3201 adev->pciep_wreg = &amdgpu_invalid_wreg;
3202 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3203 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3204 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3205 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3206 adev->didt_rreg = &amdgpu_invalid_rreg;
3207 adev->didt_wreg = &amdgpu_invalid_wreg;
3208 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3209 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3210 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3211 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3212
3213 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3214 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3215 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3216
3217 /* mutex initialization is all done here so we
3218 * can recall functions without having locking issues */
3219 atomic_set(&adev->irq.ih.lock, 0);
3220 mutex_init(&adev->firmware.mutex);
3221 mutex_init(&adev->pm.mutex);
3222 mutex_init(&adev->gfx.gpu_clock_mutex);
3223 mutex_init(&adev->srbm_mutex);
3224 mutex_init(&adev->gfx.pipe_reserve_mutex);
3225 mutex_init(&adev->gfx.gfx_off_mutex);
3226 mutex_init(&adev->grbm_idx_mutex);
3227 mutex_init(&adev->mn_lock);
3228 mutex_init(&adev->virt.vf_errors.lock);
3229 hash_init(adev->mn_hash);
3230 atomic_set(&adev->in_gpu_reset, 0);
3231 init_rwsem(&adev->reset_sem);
3232 mutex_init(&adev->psp.mutex);
3233 mutex_init(&adev->notifier_lock);
3234
3235 r = amdgpu_device_check_arguments(adev);
3236 if (r)
3237 return r;
3238
3239 spin_lock_init(&adev->mmio_idx_lock);
3240 spin_lock_init(&adev->smc_idx_lock);
3241 spin_lock_init(&adev->pcie_idx_lock);
3242 spin_lock_init(&adev->uvd_ctx_idx_lock);
3243 spin_lock_init(&adev->didt_idx_lock);
3244 spin_lock_init(&adev->gc_cac_idx_lock);
3245 spin_lock_init(&adev->se_cac_idx_lock);
3246 spin_lock_init(&adev->audio_endpt_idx_lock);
3247 spin_lock_init(&adev->mm_stats.lock);
3248
3249 INIT_LIST_HEAD(&adev->shadow_list);
3250 mutex_init(&adev->shadow_list_lock);
3251
3252 INIT_DELAYED_WORK(&adev->delayed_init_work,
3253 amdgpu_device_delayed_init_work_handler);
3254 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3255 amdgpu_device_delay_enable_gfx_off);
3256
3257 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3258
3259 adev->gfx.gfx_off_req_count = 1;
3260 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3261
3262 atomic_set(&adev->throttling_logging_enabled, 1);
3263 /*
3264 * If throttling continues, logging will be performed every minute
3265 * to avoid log flooding. "-1" is subtracted since the thermal
3266 * throttling interrupt comes every second. Thus, the total logging
3267 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3268 * for the throttling interrupt) = 60 seconds.
3269 */
3270 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3271 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3272
3273 /* Registers mapping */
3274 /* TODO: block userspace mapping of io register */
3275 if (adev->asic_type >= CHIP_BONAIRE) {
3276 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3277 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3278 } else {
3279 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3280 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3281 }
3282
3283 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3284 if (adev->rmmio == NULL) {
3285 return -ENOMEM;
3286 }
3287 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3288 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3289
3290 /* io port mapping */
3291 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3292 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3293 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3294 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3295 break;
3296 }
3297 }
3298 if (adev->rio_mem == NULL)
3299 DRM_INFO("PCI I/O BAR is not found.\n");
3300
3301 /* enable PCIE atomic ops */
3302 r = pci_enable_atomic_ops_to_root(adev->pdev,
3303 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3304 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3305 if (r) {
3306 adev->have_atomics_support = false;
3307 DRM_INFO("PCIE atomic ops is not supported\n");
3308 } else {
3309 adev->have_atomics_support = true;
3310 }
3311
3312 amdgpu_device_get_pcie_info(adev);
3313
3314 if (amdgpu_mcbp)
3315 DRM_INFO("MCBP is enabled\n");
3316
3317 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3318 adev->enable_mes = true;
3319
3320 /* detect hw virtualization here */
3321 amdgpu_detect_virtualization(adev);
3322
3323 r = amdgpu_device_get_job_timeout_settings(adev);
3324 if (r) {
3325 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3326 goto failed_unmap;
3327 }
3328
3329 /* early init functions */
3330 r = amdgpu_device_ip_early_init(adev);
3331 if (r)
3332 goto failed_unmap;
3333
3334 /* doorbell bar mapping and doorbell index init*/
3335 amdgpu_device_doorbell_init(adev);
3336
3337 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3338 /* this will fail for cards that aren't VGA class devices, just
3339 * ignore it */
3340 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3341
3342 if (amdgpu_device_supports_boco(ddev))
3343 boco = true;
3344 if (amdgpu_has_atpx() &&
3345 (amdgpu_is_atpx_hybrid() ||
3346 amdgpu_has_atpx_dgpu_power_cntl()) &&
3347 !pci_is_thunderbolt_attached(adev->pdev))
3348 vga_switcheroo_register_client(adev->pdev,
3349 &amdgpu_switcheroo_ops, boco);
3350 if (boco)
3351 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3352
3353 if (amdgpu_emu_mode == 1) {
3354 /* post the asic on emulation mode */
3355 emu_soc_asic_init(adev);
3356 goto fence_driver_init;
3357 }
3358
3359 /* detect if we are with an SRIOV vbios */
3360 amdgpu_device_detect_sriov_bios(adev);
3361
3362 /* check if we need to reset the asic
3363 * E.g., driver was not cleanly unloaded previously, etc.
3364 */
3365 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3366 r = amdgpu_asic_reset(adev);
3367 if (r) {
3368 dev_err(adev->dev, "asic reset on init failed\n");
3369 goto failed;
3370 }
3371 }
3372
3373 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3374
3375 /* Post card if necessary */
3376 if (amdgpu_device_need_post(adev)) {
3377 if (!adev->bios) {
3378 dev_err(adev->dev, "no vBIOS found\n");
3379 r = -EINVAL;
3380 goto failed;
3381 }
3382 DRM_INFO("GPU posting now...\n");
3383 r = amdgpu_device_asic_init(adev);
3384 if (r) {
3385 dev_err(adev->dev, "gpu post error!\n");
3386 goto failed;
3387 }
3388 }
3389
3390 if (adev->is_atom_fw) {
3391 /* Initialize clocks */
3392 r = amdgpu_atomfirmware_get_clock_info(adev);
3393 if (r) {
3394 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3395 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3396 goto failed;
3397 }
3398 } else {
3399 /* Initialize clocks */
3400 r = amdgpu_atombios_get_clock_info(adev);
3401 if (r) {
3402 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3403 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3404 goto failed;
3405 }
3406 /* init i2c buses */
3407 if (!amdgpu_device_has_dc_support(adev))
3408 amdgpu_atombios_i2c_init(adev);
3409 }
3410
3411 fence_driver_init:
3412 /* Fence driver */
3413 r = amdgpu_fence_driver_init(adev);
3414 if (r) {
3415 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3416 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3417 goto failed;
3418 }
3419
3420 /* init the mode config */
3421 drm_mode_config_init(adev_to_drm(adev));
3422
3423 r = amdgpu_device_ip_init(adev);
3424 if (r) {
3425 /* failed in exclusive mode due to timeout */
3426 if (amdgpu_sriov_vf(adev) &&
3427 !amdgpu_sriov_runtime(adev) &&
3428 amdgpu_virt_mmio_blocked(adev) &&
3429 !amdgpu_virt_wait_reset(adev)) {
3430 dev_err(adev->dev, "VF exclusive mode timeout\n");
3431 /* Don't send request since VF is inactive. */
3432 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3433 adev->virt.ops = NULL;
3434 r = -EAGAIN;
3435 goto failed;
3436 }
3437 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3438 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3439 goto failed;
3440 }
3441
3442 dev_info(adev->dev,
3443 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3444 adev->gfx.config.max_shader_engines,
3445 adev->gfx.config.max_sh_per_se,
3446 adev->gfx.config.max_cu_per_sh,
3447 adev->gfx.cu_info.number);
3448
3449 adev->accel_working = true;
3450
3451 amdgpu_vm_check_compute_bug(adev);
3452
3453 /* Initialize the buffer migration limit. */
3454 if (amdgpu_moverate >= 0)
3455 max_MBps = amdgpu_moverate;
3456 else
3457 max_MBps = 8; /* Allow 8 MB/s. */
3458 /* Get a log2 for easy divisions. */
3459 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3460
3461 amdgpu_fbdev_init(adev);
3462
3463 r = amdgpu_pm_sysfs_init(adev);
3464 if (r) {
3465 adev->pm_sysfs_en = false;
3466 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3467 } else
3468 adev->pm_sysfs_en = true;
3469
3470 r = amdgpu_ucode_sysfs_init(adev);
3471 if (r) {
3472 adev->ucode_sysfs_en = false;
3473 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3474 } else
3475 adev->ucode_sysfs_en = true;
3476
3477 if ((amdgpu_testing & 1)) {
3478 if (adev->accel_working)
3479 amdgpu_test_moves(adev);
3480 else
3481 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3482 }
3483 if (amdgpu_benchmarking) {
3484 if (adev->accel_working)
3485 amdgpu_benchmark(adev, amdgpu_benchmarking);
3486 else
3487 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3488 }
3489
3490 /*
3491 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3492 * Otherwise the mgpu fan boost feature will be skipped because the
3493 * gpu instance count would be too low.
3494 */
3495 amdgpu_register_gpu_instance(adev);
3496
3497 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3498 * explicit gating rather than handling it automatically.
3499 */
3500 r = amdgpu_device_ip_late_init(adev);
3501 if (r) {
3502 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3503 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3504 goto failed;
3505 }
3506
3507 /* must succeed. */
3508 amdgpu_ras_resume(adev);
3509
3510 queue_delayed_work(system_wq, &adev->delayed_init_work,
3511 msecs_to_jiffies(AMDGPU_RESUME_MS));
3512
3513 if (amdgpu_sriov_vf(adev))
3514 flush_delayed_work(&adev->delayed_init_work);
3515
3516 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3517 if (r)
3518 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3519
3520 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3521 r = amdgpu_pmu_init(adev);
3522 if (r)
3523 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3524
3525 /* Have stored pci confspace at hand for restore in sudden PCI error */
3526 if (amdgpu_device_cache_pci_state(adev->pdev))
3527 pci_restore_state(pdev);
3528
3529 return 0;
3530
3531 failed:
3532 amdgpu_vf_error_trans_all(adev);
3533 if (boco)
3534 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3535
3536 failed_unmap:
3537 iounmap(adev->rmmio);
3538 adev->rmmio = NULL;
3539
3540 return r;
3541 }
3542
3543 /**
3544 * amdgpu_device_fini - tear down the driver
3545 *
3546 * @adev: amdgpu_device pointer
3547 *
3548 * Tear down the driver info (all asics).
3549 * Called at driver shutdown.
3550 */
3551 void amdgpu_device_fini(struct amdgpu_device *adev)
3552 {
3553 dev_info(adev->dev, "amdgpu: finishing device.\n");
3554 flush_delayed_work(&adev->delayed_init_work);
3555 adev->shutdown = true;
3556
3557 kfree(adev->pci_state);
3558
3559 	/* Make sure the IB tests have finished before entering exclusive mode
3560 	 * to avoid preempting the IB tests.
3561 	 */
3562 if (amdgpu_sriov_vf(adev)) {
3563 amdgpu_virt_request_full_gpu(adev, false);
3564 amdgpu_virt_fini_data_exchange(adev);
3565 }
3566
3567 /* disable all interrupts */
3568 amdgpu_irq_disable_all(adev);
3569 	if (adev->mode_info.mode_config_initialized) {
3570 if (!amdgpu_device_has_dc_support(adev))
3571 drm_helper_force_disable_all(adev_to_drm(adev));
3572 else
3573 drm_atomic_helper_shutdown(adev_to_drm(adev));
3574 }
3575 amdgpu_fence_driver_fini(adev);
3576 if (adev->pm_sysfs_en)
3577 amdgpu_pm_sysfs_fini(adev);
3578 amdgpu_fbdev_fini(adev);
3579 amdgpu_device_ip_fini(adev);
3580 release_firmware(adev->firmware.gpu_info_fw);
3581 adev->firmware.gpu_info_fw = NULL;
3582 adev->accel_working = false;
3583 /* free i2c buses */
3584 if (!amdgpu_device_has_dc_support(adev))
3585 amdgpu_i2c_fini(adev);
3586
3587 if (amdgpu_emu_mode != 1)
3588 amdgpu_atombios_fini(adev);
3589
3590 kfree(adev->bios);
3591 adev->bios = NULL;
3592 if (amdgpu_has_atpx() &&
3593 (amdgpu_is_atpx_hybrid() ||
3594 amdgpu_has_atpx_dgpu_power_cntl()) &&
3595 !pci_is_thunderbolt_attached(adev->pdev))
3596 vga_switcheroo_unregister_client(adev->pdev);
3597 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3598 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3599 vga_client_register(adev->pdev, NULL, NULL, NULL);
3600 if (adev->rio_mem)
3601 pci_iounmap(adev->pdev, adev->rio_mem);
3602 adev->rio_mem = NULL;
3603 iounmap(adev->rmmio);
3604 adev->rmmio = NULL;
3605 amdgpu_device_doorbell_fini(adev);
3606
3607 if (adev->ucode_sysfs_en)
3608 amdgpu_ucode_sysfs_fini(adev);
3609
3610 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3611 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3612 amdgpu_pmu_fini(adev);
3613 if (adev->mman.discovery_bin)
3614 amdgpu_discovery_fini(adev);
3615 }
3616
3617
3618 /*
3619 * Suspend & resume.
3620 */
3621 /**
3622 * amdgpu_device_suspend - initiate device suspend
3623 *
3624 * @dev: drm dev pointer
3625  * @fbcon: notify the fbdev of suspend
3626 *
3627 * Puts the hw in the suspend state (all asics).
3628 * Returns 0 for success or an error on failure.
3629 * Called at driver suspend.
3630 */
3631 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3632 {
3633 struct amdgpu_device *adev;
3634 struct drm_crtc *crtc;
3635 struct drm_connector *connector;
3636 struct drm_connector_list_iter iter;
3637 int r;
3638
3639 adev = drm_to_adev(dev);
3640
3641 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3642 return 0;
3643
3644 adev->in_suspend = true;
3645 drm_kms_helper_poll_disable(dev);
3646
3647 if (fbcon)
3648 amdgpu_fbdev_set_suspend(adev, 1);
3649
3650 cancel_delayed_work_sync(&adev->delayed_init_work);
3651
3652 if (!amdgpu_device_has_dc_support(adev)) {
3653 /* turn off display hw */
3654 drm_modeset_lock_all(dev);
3655 drm_connector_list_iter_begin(dev, &iter);
3656 drm_for_each_connector_iter(connector, &iter)
3657 drm_helper_connector_dpms(connector,
3658 DRM_MODE_DPMS_OFF);
3659 drm_connector_list_iter_end(&iter);
3660 drm_modeset_unlock_all(dev);
3661 /* unpin the front buffers and cursors */
3662 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3663 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3664 struct drm_framebuffer *fb = crtc->primary->fb;
3665 struct amdgpu_bo *robj;
3666
3667 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3668 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3669 r = amdgpu_bo_reserve(aobj, true);
3670 if (r == 0) {
3671 amdgpu_bo_unpin(aobj);
3672 amdgpu_bo_unreserve(aobj);
3673 }
3674 }
3675
3676 if (fb == NULL || fb->obj[0] == NULL) {
3677 continue;
3678 }
3679 robj = gem_to_amdgpu_bo(fb->obj[0]);
3680 /* don't unpin kernel fb objects */
3681 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3682 r = amdgpu_bo_reserve(robj, true);
3683 if (r == 0) {
3684 amdgpu_bo_unpin(robj);
3685 amdgpu_bo_unreserve(robj);
3686 }
3687 }
3688 }
3689 }
3690
3691 amdgpu_ras_suspend(adev);
3692
3693 r = amdgpu_device_ip_suspend_phase1(adev);
3694
3695 amdgpu_amdkfd_suspend(adev, !fbcon);
3696
3697 /* evict vram memory */
3698 amdgpu_bo_evict_vram(adev);
3699
3700 amdgpu_fence_driver_suspend(adev);
3701
3702 r = amdgpu_device_ip_suspend_phase2(adev);
3703
3704 /* evict remaining vram memory
3705 * This second call to evict vram is to evict the gart page table
3706 * using the CPU.
3707 */
3708 amdgpu_bo_evict_vram(adev);
3709
3710 return 0;
3711 }
3712
3713 /**
3714 * amdgpu_device_resume - initiate device resume
3715 *
3716 * @dev: drm dev pointer
3717  * @fbcon: notify the fbdev of resume
3718 *
3719 * Bring the hw back to operating state (all asics).
3720 * Returns 0 for success or an error on failure.
3721 * Called at driver resume.
3722 */
3723 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3724 {
3725 struct drm_connector *connector;
3726 struct drm_connector_list_iter iter;
3727 struct amdgpu_device *adev = drm_to_adev(dev);
3728 struct drm_crtc *crtc;
3729 int r = 0;
3730
3731 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3732 return 0;
3733
3734 /* post card */
3735 if (amdgpu_device_need_post(adev)) {
3736 r = amdgpu_device_asic_init(adev);
3737 if (r)
3738 dev_err(adev->dev, "amdgpu asic init failed\n");
3739 }
3740
3741 r = amdgpu_device_ip_resume(adev);
3742 if (r) {
3743 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3744 return r;
3745 }
3746 amdgpu_fence_driver_resume(adev);
3747
3748
3749 r = amdgpu_device_ip_late_init(adev);
3750 if (r)
3751 return r;
3752
3753 queue_delayed_work(system_wq, &adev->delayed_init_work,
3754 msecs_to_jiffies(AMDGPU_RESUME_MS));
3755
3756 if (!amdgpu_device_has_dc_support(adev)) {
3757 /* pin cursors */
3758 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3759 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3760
3761 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3762 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3763 r = amdgpu_bo_reserve(aobj, true);
3764 if (r == 0) {
3765 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3766 if (r != 0)
3767 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3768 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3769 amdgpu_bo_unreserve(aobj);
3770 }
3771 }
3772 }
3773 }
3774 r = amdgpu_amdkfd_resume(adev, !fbcon);
3775 if (r)
3776 return r;
3777
3778 /* Make sure IB tests flushed */
3779 flush_delayed_work(&adev->delayed_init_work);
3780
3781 /* blat the mode back in */
3782 if (fbcon) {
3783 if (!amdgpu_device_has_dc_support(adev)) {
3784 /* pre DCE11 */
3785 drm_helper_resume_force_mode(dev);
3786
3787 /* turn on display hw */
3788 drm_modeset_lock_all(dev);
3789
3790 drm_connector_list_iter_begin(dev, &iter);
3791 drm_for_each_connector_iter(connector, &iter)
3792 drm_helper_connector_dpms(connector,
3793 DRM_MODE_DPMS_ON);
3794 drm_connector_list_iter_end(&iter);
3795
3796 drm_modeset_unlock_all(dev);
3797 }
3798 amdgpu_fbdev_set_suspend(adev, 0);
3799 }
3800
3801 drm_kms_helper_poll_enable(dev);
3802
3803 amdgpu_ras_resume(adev);
3804
3805 /*
3806 * Most of the connector probing functions try to acquire runtime pm
3807 * refs to ensure that the GPU is powered on when connector polling is
3808 * performed. Since we're calling this from a runtime PM callback,
3809 * trying to acquire rpm refs will cause us to deadlock.
3810 *
3811 * Since we're guaranteed to be holding the rpm lock, it's safe to
3812 * temporarily disable the rpm helpers so this doesn't deadlock us.
3813 */
3814 #ifdef CONFIG_PM
3815 dev->dev->power.disable_depth++;
3816 #endif
3817 if (!amdgpu_device_has_dc_support(adev))
3818 drm_helper_hpd_irq_event(dev);
3819 else
3820 drm_kms_helper_hotplug_event(dev);
3821 #ifdef CONFIG_PM
3822 dev->dev->power.disable_depth--;
3823 #endif
3824 adev->in_suspend = false;
3825
3826 return 0;
3827 }
3828
3829 /**
3830 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3831 *
3832 * @adev: amdgpu_device pointer
3833 *
3834 * The list of all the hardware IPs that make up the asic is walked and
3835 * the check_soft_reset callbacks are run. check_soft_reset determines
3836 * if the asic is still hung or not.
3837 * Returns true if any of the IPs are still in a hung state, false if not.
3838 */
3839 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3840 {
3841 int i;
3842 bool asic_hang = false;
3843
3844 if (amdgpu_sriov_vf(adev))
3845 return true;
3846
3847 if (amdgpu_asic_need_full_reset(adev))
3848 return true;
3849
3850 for (i = 0; i < adev->num_ip_blocks; i++) {
3851 if (!adev->ip_blocks[i].status.valid)
3852 continue;
3853 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3854 adev->ip_blocks[i].status.hang =
3855 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3856 if (adev->ip_blocks[i].status.hang) {
3857 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3858 asic_hang = true;
3859 }
3860 }
3861 return asic_hang;
3862 }
3863
3864 /**
3865 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3866 *
3867 * @adev: amdgpu_device pointer
3868 *
3869 * The list of all the hardware IPs that make up the asic is walked and the
3870 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3871 * handles any IP specific hardware or software state changes that are
3872 * necessary for a soft reset to succeed.
3873 * Returns 0 on success, negative error code on failure.
3874 */
3875 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3876 {
3877 int i, r = 0;
3878
3879 for (i = 0; i < adev->num_ip_blocks; i++) {
3880 if (!adev->ip_blocks[i].status.valid)
3881 continue;
3882 if (adev->ip_blocks[i].status.hang &&
3883 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3884 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3885 if (r)
3886 return r;
3887 }
3888 }
3889
3890 return 0;
3891 }
3892
3893 /**
3894 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3895 *
3896 * @adev: amdgpu_device pointer
3897 *
3898 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3899 * reset is necessary to recover.
3900 * Returns true if a full asic reset is required, false if not.
3901 */
3902 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3903 {
3904 int i;
3905
3906 if (amdgpu_asic_need_full_reset(adev))
3907 return true;
3908
3909 for (i = 0; i < adev->num_ip_blocks; i++) {
3910 if (!adev->ip_blocks[i].status.valid)
3911 continue;
3912 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3913 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3914 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3915 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3916 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3917 if (adev->ip_blocks[i].status.hang) {
3918 dev_info(adev->dev, "Some block need full reset!\n");
3919 return true;
3920 }
3921 }
3922 }
3923 return false;
3924 }
3925
3926 /**
3927 * amdgpu_device_ip_soft_reset - do a soft reset
3928 *
3929 * @adev: amdgpu_device pointer
3930 *
3931 * The list of all the hardware IPs that make up the asic is walked and the
3932 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3933 * IP specific hardware or software state changes that are necessary to soft
3934 * reset the IP.
3935 * Returns 0 on success, negative error code on failure.
3936 */
3937 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3938 {
3939 int i, r = 0;
3940
3941 for (i = 0; i < adev->num_ip_blocks; i++) {
3942 if (!adev->ip_blocks[i].status.valid)
3943 continue;
3944 if (adev->ip_blocks[i].status.hang &&
3945 adev->ip_blocks[i].version->funcs->soft_reset) {
3946 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3947 if (r)
3948 return r;
3949 }
3950 }
3951
3952 return 0;
3953 }
3954
3955 /**
3956 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3957 *
3958 * @adev: amdgpu_device pointer
3959 *
3960 * The list of all the hardware IPs that make up the asic is walked and the
3961 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3962 * handles any IP specific hardware or software state changes that are
3963 * necessary after the IP has been soft reset.
3964 * Returns 0 on success, negative error code on failure.
3965 */
3966 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3967 {
3968 int i, r = 0;
3969
3970 for (i = 0; i < adev->num_ip_blocks; i++) {
3971 if (!adev->ip_blocks[i].status.valid)
3972 continue;
3973 if (adev->ip_blocks[i].status.hang &&
3974 adev->ip_blocks[i].version->funcs->post_soft_reset)
3975 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3976 if (r)
3977 return r;
3978 }
3979
3980 return 0;
3981 }
3982
3983 /**
3984 * amdgpu_device_recover_vram - Recover some VRAM contents
3985 *
3986 * @adev: amdgpu_device pointer
3987 *
3988 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3989 * restore things like GPUVM page tables after a GPU reset where
3990 * the contents of VRAM might be lost.
3991 *
3992 * Returns:
3993 * 0 on success, negative error code on failure.
3994 */
3995 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3996 {
3997 struct dma_fence *fence = NULL, *next = NULL;
3998 struct amdgpu_bo *shadow;
3999 long r = 1, tmo;
4000
4001 if (amdgpu_sriov_runtime(adev))
4002 tmo = msecs_to_jiffies(8000);
4003 else
4004 tmo = msecs_to_jiffies(100);
4005
4006 dev_info(adev->dev, "recover vram bo from shadow start\n");
4007 mutex_lock(&adev->shadow_list_lock);
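	/*
	 * The loop below pipelines the restores: the copy for the current
	 * shadow is issued before waiting on the previous copy's fence,
	 * overlapping submission with completion.
	 */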
4008 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4009
4010 /* No need to recover an evicted BO */
4011 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4012 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4013 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4014 continue;
4015
4016 r = amdgpu_bo_restore_shadow(shadow, &next);
4017 if (r)
4018 break;
4019
4020 if (fence) {
4021 tmo = dma_fence_wait_timeout(fence, false, tmo);
4022 dma_fence_put(fence);
4023 fence = next;
4024 if (tmo == 0) {
4025 r = -ETIMEDOUT;
4026 break;
4027 } else if (tmo < 0) {
4028 r = tmo;
4029 break;
4030 }
4031 } else {
4032 fence = next;
4033 }
4034 }
4035 mutex_unlock(&adev->shadow_list_lock);
4036
4037 if (fence)
4038 tmo = dma_fence_wait_timeout(fence, false, tmo);
4039 dma_fence_put(fence);
4040
4041 if (r < 0 || tmo <= 0) {
4042 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4043 return -EIO;
4044 }
4045
4046 dev_info(adev->dev, "recover vram bo from shadow done\n");
4047 return 0;
4048 }
4049
4050
4051 /**
4052 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4053 *
4054 * @adev: amdgpu_device pointer
4055 * @from_hypervisor: request from hypervisor
4056 *
4057  * Do a VF FLR and reinitialize the ASIC.
4058  * Returns 0 on success, negative error code on failure.
4059 */
4060 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4061 bool from_hypervisor)
4062 {
4063 int r;
4064
4065 if (from_hypervisor)
4066 r = amdgpu_virt_request_full_gpu(adev, true);
4067 else
4068 r = amdgpu_virt_reset_gpu(adev);
4069 if (r)
4070 return r;
4071
4072 amdgpu_amdkfd_pre_reset(adev);
4073
4074 /* Resume IP prior to SMC */
4075 r = amdgpu_device_ip_reinit_early_sriov(adev);
4076 if (r)
4077 goto error;
4078
4079 amdgpu_virt_init_data_exchange(adev);
4080 	/* we need to recover the GART table prior to resuming SMC/CP/SDMA */
4081 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4082
4083 r = amdgpu_device_fw_loading(adev);
4084 if (r)
4085 return r;
4086
4087 /* now we are okay to resume SMC/CP/SDMA */
4088 r = amdgpu_device_ip_reinit_late_sriov(adev);
4089 if (r)
4090 goto error;
4091
4092 amdgpu_irq_gpu_reset_resume_helper(adev);
4093 r = amdgpu_ib_ring_tests(adev);
4094 amdgpu_amdkfd_post_reset(adev);
4095
4096 error:
4097 amdgpu_virt_release_full_gpu(adev, true);
4098 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4099 amdgpu_inc_vram_lost(adev);
4100 r = amdgpu_device_recover_vram(adev);
4101 }
4102
4103 return r;
4104 }
4105
4106 /**
4107 * amdgpu_device_has_job_running - check if there is any job in mirror list
4108 *
4109 * @adev: amdgpu_device pointer
4110 *
4111  * Check whether any job is still pending in a scheduler's mirror list.
4112 */
4113 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4114 {
4115 int i;
4116 struct drm_sched_job *job;
4117
4118 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4119 struct amdgpu_ring *ring = adev->rings[i];
4120
4121 if (!ring || !ring->sched.thread)
4122 continue;
4123
4124 spin_lock(&ring->sched.job_list_lock);
4125 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4126 struct drm_sched_job, node);
4127 spin_unlock(&ring->sched.job_list_lock);
4128 if (job)
4129 return true;
4130 }
4131 return false;
4132 }
4133
4134 /**
4135 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4136 *
4137 * @adev: amdgpu_device pointer
4138 *
4139 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4140 * a hung GPU.
4141 */
4142 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4143 {
4144 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4145 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4146 return false;
4147 }
4148
4149 if (amdgpu_gpu_recovery == 0)
4150 goto disabled;
4151
4152 if (amdgpu_sriov_vf(adev))
4153 return true;
4154
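	/*
	 * amdgpu_gpu_recovery == -1 selects the default policy: recovery
	 * is only attempted on the ASICs listed below, everything else
	 * falls through to the disabled path.
	 */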
4155 if (amdgpu_gpu_recovery == -1) {
4156 switch (adev->asic_type) {
4157 case CHIP_BONAIRE:
4158 case CHIP_HAWAII:
4159 case CHIP_TOPAZ:
4160 case CHIP_TONGA:
4161 case CHIP_FIJI:
4162 case CHIP_POLARIS10:
4163 case CHIP_POLARIS11:
4164 case CHIP_POLARIS12:
4165 case CHIP_VEGAM:
4166 case CHIP_VEGA20:
4167 case CHIP_VEGA10:
4168 case CHIP_VEGA12:
4169 case CHIP_RAVEN:
4170 case CHIP_ARCTURUS:
4171 case CHIP_RENOIR:
4172 case CHIP_NAVI10:
4173 case CHIP_NAVI14:
4174 case CHIP_NAVI12:
4175 case CHIP_SIENNA_CICHLID:
4176 break;
4177 default:
4178 goto disabled;
4179 }
4180 }
4181
4182 return true;
4183
4184 disabled:
4185 dev_info(adev->dev, "GPU recovery disabled.\n");
4186 return false;
4187 }
4188
4189
4190 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4191 struct amdgpu_job *job,
4192 bool *need_full_reset_arg)
4193 {
4194 int i, r = 0;
4195 bool need_full_reset = *need_full_reset_arg;
4196
4197 amdgpu_debugfs_wait_dump(adev);
4198
4199 if (amdgpu_sriov_vf(adev)) {
4200 /* stop the data exchange thread */
4201 amdgpu_virt_fini_data_exchange(adev);
4202 }
4203
4204 /* block all schedulers and reset given job's ring */
4205 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4206 struct amdgpu_ring *ring = adev->rings[i];
4207
4208 if (!ring || !ring->sched.thread)
4209 continue;
4210
4211 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4212 amdgpu_fence_driver_force_completion(ring);
4213 }
4214
4215 	if (job)
4216 drm_sched_increase_karma(&job->base);
4217
4218 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4219 if (!amdgpu_sriov_vf(adev)) {
4220
4221 if (!need_full_reset)
4222 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4223
4224 if (!need_full_reset) {
4225 amdgpu_device_ip_pre_soft_reset(adev);
4226 r = amdgpu_device_ip_soft_reset(adev);
4227 amdgpu_device_ip_post_soft_reset(adev);
4228 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4229 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4230 need_full_reset = true;
4231 }
4232 }
4233
4234 if (need_full_reset)
4235 r = amdgpu_device_ip_suspend(adev);
4236
4237 *need_full_reset_arg = need_full_reset;
4238 }
4239
4240 return r;
4241 }
4242
4243 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4244 struct list_head *device_list_handle,
4245 bool *need_full_reset_arg,
4246 bool skip_hw_reset)
4247 {
4248 struct amdgpu_device *tmp_adev = NULL;
4249 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4250 int r = 0;
4251
4252 /*
4253 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4254 * to allow proper links negotiation in FW (within 1 sec)
4255 */
4256 if (!skip_hw_reset && need_full_reset) {
4257 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4258 /* For XGMI run all resets in parallel to speed up the process */
4259 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4260 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4261 r = -EALREADY;
4262 } else
4263 r = amdgpu_asic_reset(tmp_adev);
4264
4265 if (r) {
4266 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4267 r, adev_to_drm(tmp_adev)->unique);
4268 break;
4269 }
4270 }
4271
4272 		/* For XGMI wait for all resets to complete before proceeding */
4273 if (!r) {
4274 list_for_each_entry(tmp_adev, device_list_handle,
4275 gmc.xgmi.head) {
4276 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4277 flush_work(&tmp_adev->xgmi_reset_work);
4278 r = tmp_adev->asic_reset_res;
4279 if (r)
4280 break;
4281 }
4282 }
4283 }
4284 }
4285
4286 if (!r && amdgpu_ras_intr_triggered()) {
4287 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4288 if (tmp_adev->mmhub.funcs &&
4289 tmp_adev->mmhub.funcs->reset_ras_error_count)
4290 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4291 }
4292
4293 amdgpu_ras_intr_cleared();
4294 }
4295
4296 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4297 if (need_full_reset) {
4298 /* post card */
4299 if (amdgpu_device_asic_init(tmp_adev))
4300 dev_warn(tmp_adev->dev, "asic atom init failed!");
4301
4302 if (!r) {
4303 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4304 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4305 if (r)
4306 goto out;
4307
4308 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4309 if (vram_lost) {
4310 DRM_INFO("VRAM is lost due to GPU reset!\n");
4311 amdgpu_inc_vram_lost(tmp_adev);
4312 }
4313
4314 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4315 if (r)
4316 goto out;
4317
4318 r = amdgpu_device_fw_loading(tmp_adev);
4319 if (r)
4320 return r;
4321
4322 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4323 if (r)
4324 goto out;
4325
4326 if (vram_lost)
4327 amdgpu_device_fill_reset_magic(tmp_adev);
4328
4329 /*
4330 				 * Add this ASIC back as tracked since the reset
4331 				 * has already completed successfully.
4332 */
4333 amdgpu_register_gpu_instance(tmp_adev);
4334
4335 r = amdgpu_device_ip_late_init(tmp_adev);
4336 if (r)
4337 goto out;
4338
4339 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4340
4341 /*
4342 			 * The GPU enters a bad state once the number of faulty
4343 			 * pages flagged by ECC reaches the threshold, and RAS
4344 			 * recovery is scheduled next. So check here and abort
4345 			 * recovery if the bad page threshold has indeed been
4346 			 * exceeded, reminding the user to either retire this
4347 			 * GPU or set a bigger bad_page_threshold value the
4348 			 * next time the driver is probed.
4350 */
4351 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4352 /* must succeed. */
4353 amdgpu_ras_resume(tmp_adev);
4354 } else {
4355 r = -EINVAL;
4356 goto out;
4357 }
4358
4359 /* Update PSP FW topology after reset */
4360 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4361 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4362 }
4363 }
4364
4365 out:
4366 if (!r) {
4367 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4368 r = amdgpu_ib_ring_tests(tmp_adev);
4369 if (r) {
4370 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4371 r = amdgpu_device_ip_suspend(tmp_adev);
4372 need_full_reset = true;
4373 r = -EAGAIN;
4374 goto end;
4375 }
4376 }
4377
4378 if (!r)
4379 r = amdgpu_device_recover_vram(tmp_adev);
4380 else
4381 tmp_adev->asic_reset_res = r;
4382 }
4383
4384 end:
4385 *need_full_reset_arg = need_full_reset;
4386 return r;
4387 }
4388
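/*
 * Take ownership of the device for a reset. The atomic cmpxchg on
 * in_gpu_reset makes this a trylock, and reset_sem is then taken for
 * write (nested against the hive lock when the device is part of an
 * XGMI hive) so concurrent users of the device are held off until
 * amdgpu_device_unlock_adev().
 */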
4389 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4390 struct amdgpu_hive_info *hive)
4391 {
4392 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4393 return false;
4394
4395 if (hive) {
4396 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4397 } else {
4398 down_write(&adev->reset_sem);
4399 }
4400
4401 atomic_inc(&adev->gpu_reset_counter);
4402 switch (amdgpu_asic_reset_method(adev)) {
4403 case AMD_RESET_METHOD_MODE1:
4404 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4405 break;
4406 case AMD_RESET_METHOD_MODE2:
4407 adev->mp1_state = PP_MP1_STATE_RESET;
4408 break;
4409 default:
4410 adev->mp1_state = PP_MP1_STATE_NONE;
4411 break;
4412 }
4413
4414 return true;
4415 }
4416
4417 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4418 {
4419 amdgpu_vf_error_trans_all(adev);
4420 adev->mp1_state = PP_MP1_STATE_NONE;
4421 atomic_set(&adev->in_gpu_reset, 0);
4422 up_write(&adev->reset_sem);
4423 }
4424
4425 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4426 {
4427 struct pci_dev *p = NULL;
4428
4429 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4430 adev->pdev->bus->number, 1);
4431 if (p) {
4432 pm_runtime_enable(&(p->dev));
4433 pm_runtime_resume(&(p->dev));
4434 }
4435 }
4436
4437 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4438 {
4439 enum amd_reset_method reset_method;
4440 struct pci_dev *p = NULL;
4441 u64 expires;
4442
4443 /*
4444 	 * For now, only BACO and mode1 reset are confirmed to
4445 	 * suffer the audio issue if the codec is not properly suspended.
4446 */
4447 reset_method = amdgpu_asic_reset_method(adev);
4448 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4449 (reset_method != AMD_RESET_METHOD_MODE1))
4450 return -EINVAL;
4451
4452 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4453 adev->pdev->bus->number, 1);
4454 if (!p)
4455 return -ENODEV;
4456
4457 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4458 if (!expires)
4459 /*
4460 		 * If we cannot get the audio device autosuspend delay,
4461 		 * a fixed 4s interval is used. Since 3s is the audio
4462 		 * controller's default autosuspend delay, the 4s used
4463 		 * here is guaranteed to cover it.
4464 */
4465 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4466
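	/*
	 * Try to runtime-suspend the audio controller, retrying until it
	 * either succeeds or the deadline computed above has passed.
	 */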
4467 while (!pm_runtime_status_suspended(&(p->dev))) {
4468 if (!pm_runtime_suspend(&(p->dev)))
4469 break;
4470
4471 if (expires < ktime_get_mono_fast_ns()) {
4472 dev_warn(adev->dev, "failed to suspend display audio\n");
4473 			/* TODO: abort the subsequent gpu reset? */
4474 return -ETIMEDOUT;
4475 }
4476 }
4477
4478 pm_runtime_disable(&(p->dev));
4479
4480 return 0;
4481 }
4482
4483 /**
4484 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4485 *
4486 * @adev: amdgpu_device pointer
4487  * @job: the job that triggered the hang, or NULL if none
4488 *
4489 * Attempt to reset the GPU if it has hung (all asics).
4490  * Attempt a soft reset or a full reset and reinitialize the ASIC.
4491 * Returns 0 for success or an error on failure.
4492 */
4493
4494 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4495 struct amdgpu_job *job)
4496 {
4497 struct list_head device_list, *device_list_handle = NULL;
4498 bool need_full_reset = false;
4499 bool job_signaled = false;
4500 struct amdgpu_hive_info *hive = NULL;
4501 struct amdgpu_device *tmp_adev = NULL;
4502 int i, r = 0;
4503 bool need_emergency_restart = false;
4504 bool audio_suspended = false;
4505
4506 /*
4507 * Special case: RAS triggered and full reset isn't supported
4508 */
4509 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4510
4511 /*
4512 * Flush RAM to disk so that after reboot
4513 	 * the user can read the log and see why the system rebooted.
4514 */
4515 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4516 DRM_WARN("Emergency reboot.");
4517
4518 ksys_sync_helper();
4519 emergency_restart();
4520 }
4521
4522 dev_info(adev->dev, "GPU %s begin!\n",
4523 need_emergency_restart ? "jobs stop":"reset");
4524
4525 /*
4526 	 * Here we trylock to avoid a chain of resets executing from either
4527 	 * jobs triggered on different adevs in an XGMI hive or jobs on
4528 	 * different schedulers for the same device while this TO handler is running.
4529 	 * We always reset all schedulers for a device and all devices of an XGMI
4530 	 * hive, so that should take care of them too.
4531 */
4532 hive = amdgpu_get_xgmi_hive(adev);
4533 if (hive) {
4534 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4535 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4536 job ? job->base.id : -1, hive->hive_id);
4537 amdgpu_put_xgmi_hive(hive);
4538 return 0;
4539 }
4540 mutex_lock(&hive->hive_lock);
4541 }
4542
4543 /*
4544 * Build list of devices to reset.
4545 	 * In case we are in XGMI hive mode, re-sort the device list
4546 * to put adev in the 1st position.
4547 */
4548 INIT_LIST_HEAD(&device_list);
4549 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4550 if (!hive)
4551 return -ENODEV;
4552 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4553 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4554 device_list_handle = &hive->device_list;
4555 } else {
4556 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4557 device_list_handle = &device_list;
4558 }
4559
4560 /* block all schedulers and reset given job's ring */
4561 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4562 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4563 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4564 job ? job->base.id : -1);
4565 r = 0;
4566 goto skip_recovery;
4567 }
4568
4569 /*
4570 		 * Try to put the audio codec into suspend state
4571 		 * before the gpu reset starts.
4572 		 *
4573 		 * The power domain of the graphics device is shared
4574 		 * with the AZ power domain. Without this, we may
4575 		 * change the audio hardware behind the audio
4576 		 * driver's back, which will trigger audio codec
4577 		 * errors.
4578 */
4579 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4580 audio_suspended = true;
4581
4582 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4583
4584 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4585
4586 if (!amdgpu_sriov_vf(tmp_adev))
4587 amdgpu_amdkfd_pre_reset(tmp_adev);
4588
4589 /*
4590 		 * Mark the ASICs to be reset as untracked first,
4591 		 * and add them back after the reset has completed.
4592 */
4593 amdgpu_unregister_gpu_instance(tmp_adev);
4594
4595 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4596
4597 /* disable ras on ALL IPs */
4598 if (!need_emergency_restart &&
4599 amdgpu_device_ip_need_full_reset(tmp_adev))
4600 amdgpu_ras_suspend(tmp_adev);
4601
4602 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4603 struct amdgpu_ring *ring = tmp_adev->rings[i];
4604
4605 if (!ring || !ring->sched.thread)
4606 continue;
4607
4608 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4609
4610 if (need_emergency_restart)
4611 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4612 }
4613 }
4614
4615 if (need_emergency_restart)
4616 goto skip_sched_resume;
4617
4618 /*
4619 * Must check guilty signal here since after this point all old
4620 * HW fences are force signaled.
4621 *
4622 * job->base holds a reference to parent fence
4623 */
4624 if (job && job->base.s_fence->parent &&
4625 dma_fence_is_signaled(job->base.s_fence->parent)) {
4626 job_signaled = true;
4627 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4628 goto skip_hw_reset;
4629 }
4630
4631 retry:	/* Pre-ASIC-reset for the rest of the adevs in the XGMI hive. */
4632 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4633 r = amdgpu_device_pre_asic_reset(tmp_adev,
4634 (tmp_adev == adev) ? job : NULL,
4635 &need_full_reset);
4636 		/* TODO: should we stop? */
4637 if (r) {
4638 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4639 r, adev_to_drm(tmp_adev)->unique);
4640 tmp_adev->asic_reset_res = r;
4641 }
4642 }
4643
4644 	/* Actual ASIC resets if needed. */
4645 /* TODO Implement XGMI hive reset logic for SRIOV */
4646 if (amdgpu_sriov_vf(adev)) {
4647 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4648 if (r)
4649 adev->asic_reset_res = r;
4650 } else {
4651 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
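		/*
		 * -EAGAIN means the IB tests failed after the reset;
		 * need_full_reset has been set by amdgpu_do_asic_reset(),
		 * so go around again and perform a full reset.
		 */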
4652 if (r && r == -EAGAIN)
4653 goto retry;
4654 }
4655
4656 skip_hw_reset:
4657
4658 	/* Post ASIC reset for all devs. */
4659 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4660
4661 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4662 struct amdgpu_ring *ring = tmp_adev->rings[i];
4663
4664 if (!ring || !ring->sched.thread)
4665 continue;
4666
4667 			/* No point in resubmitting jobs if we didn't do a HW reset */
4668 if (!tmp_adev->asic_reset_res && !job_signaled)
4669 drm_sched_resubmit_jobs(&ring->sched);
4670
4671 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4672 }
4673
4674 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4675 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4676 }
4677
4678 tmp_adev->asic_reset_res = 0;
4679
4680 if (r) {
4681 			/* bad news, how do we tell userspace about this? */
4682 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4683 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4684 } else {
4685 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4686 }
4687 }
4688
4689 skip_sched_resume:
4690 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4691 		/* unlock kfd: SR-IOV does it separately */
4692 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4693 amdgpu_amdkfd_post_reset(tmp_adev);
4694 if (audio_suspended)
4695 amdgpu_device_resume_display_audio(tmp_adev);
4696 amdgpu_device_unlock_adev(tmp_adev);
4697 }
4698
4699 skip_recovery:
4700 if (hive) {
4701 atomic_set(&hive->in_reset, 0);
4702 mutex_unlock(&hive->hive_lock);
4703 amdgpu_put_xgmi_hive(hive);
4704 }
4705
4706 if (r)
4707 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4708 return r;
4709 }
4710
4711 /**
4712  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4713 *
4714 * @adev: amdgpu_device pointer
4715 *
4716  * Fetches and stores in the driver the PCIE capabilities (gen speed
4717 * and lanes) of the slot the device is in. Handles APUs and
4718 * virtualized environments where PCIE config space may not be available.
4719 */
4720 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4721 {
4722 struct pci_dev *pdev;
4723 enum pci_bus_speed speed_cap, platform_speed_cap;
4724 enum pcie_link_width platform_link_width;
4725
4726 if (amdgpu_pcie_gen_cap)
4727 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4728
4729 if (amdgpu_pcie_lane_cap)
4730 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4731
4732 /* covers APUs as well */
4733 if (pci_is_root_bus(adev->pdev->bus)) {
4734 if (adev->pm.pcie_gen_mask == 0)
4735 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4736 if (adev->pm.pcie_mlw_mask == 0)
4737 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4738 return;
4739 }
4740
4741 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4742 return;
4743
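	/*
	 * pcie_bandwidth_available() walks up the hierarchy from the
	 * device and reports the speed/width of the most limiting link,
	 * which is what the platform caps below are derived from.
	 */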
4744 pcie_bandwidth_available(adev->pdev, NULL,
4745 &platform_speed_cap, &platform_link_width);
4746
4747 if (adev->pm.pcie_gen_mask == 0) {
4748 /* asic caps */
4749 pdev = adev->pdev;
4750 speed_cap = pcie_get_speed_cap(pdev);
4751 if (speed_cap == PCI_SPEED_UNKNOWN) {
4752 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4753 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4754 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4755 } else {
4756 if (speed_cap == PCIE_SPEED_16_0GT)
4757 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4758 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4759 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4760 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4761 else if (speed_cap == PCIE_SPEED_8_0GT)
4762 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4763 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4764 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4765 else if (speed_cap == PCIE_SPEED_5_0GT)
4766 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4767 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4768 else
4769 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4770 }
4771 /* platform caps */
4772 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4773 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4774 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4775 } else {
4776 if (platform_speed_cap == PCIE_SPEED_16_0GT)
4777 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4779 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4780 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4781 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4782 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4783 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4784 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4785 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4786 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4787 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4788 else
4789 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4790
4791 }
4792 }
4793 if (adev->pm.pcie_mlw_mask == 0) {
4794 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4795 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4796 } else {
4797 switch (platform_link_width) {
4798 case PCIE_LNK_X32:
4799 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4800 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4801 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4802 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4803 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4804 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4805 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4806 break;
4807 case PCIE_LNK_X16:
4808 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4809 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4810 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4811 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4812 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4813 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4814 break;
4815 case PCIE_LNK_X12:
4816 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4821 break;
4822 case PCIE_LNK_X8:
4823 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4824 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4826 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4827 break;
4828 case PCIE_LNK_X4:
4829 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4830 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4832 break;
4833 case PCIE_LNK_X2:
4834 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4835 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4836 break;
4837 case PCIE_LNK_X1:
4838 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4839 break;
4840 default:
4841 break;
4842 }
4843 }
4844 }
4845 }
4846
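/*
 * BACO (Bus Active, Chip Off) powers the chip down while keeping the
 * PCI bus interface responsive. When RAS is supported, doorbell
 * interrupts are disabled across entry and re-enabled again in
 * amdgpu_device_baco_exit().
 */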
4847 int amdgpu_device_baco_enter(struct drm_device *dev)
4848 {
4849 struct amdgpu_device *adev = drm_to_adev(dev);
4850 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4851
4852 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4853 return -ENOTSUPP;
4854
4855 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4856 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4857
4858 return amdgpu_dpm_baco_enter(adev);
4859 }
4860
4861 int amdgpu_device_baco_exit(struct drm_device *dev)
4862 {
4863 struct amdgpu_device *adev = drm_to_adev(dev);
4864 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4865 int ret = 0;
4866
4867 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4868 return -ENOTSUPP;
4869
4870 ret = amdgpu_dpm_baco_exit(adev);
4871 if (ret)
4872 return ret;
4873
4874 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4875 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4876
4877 return 0;
4878 }
4879
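/*
 * Cancel the pending timeout (TDR) work on every scheduler ring and
 * wait for any handler that is already running to finish.
 */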
4880 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4881 {
4882 int i;
4883
4884 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4885 struct amdgpu_ring *ring = adev->rings[i];
4886
4887 if (!ring || !ring->sched.thread)
4888 continue;
4889
4890 cancel_delayed_work_sync(&ring->sched.work_tdr);
4891 }
4892 }
4893
4894 /**
4895 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4896 * @pdev: PCI device struct
4897 * @state: PCI channel state
4898 *
4899 * Description: Called when a PCI error is detected.
4900 *
4901 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4902 */
4903 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4904 {
4905 struct drm_device *dev = pci_get_drvdata(pdev);
4906 struct amdgpu_device *adev = drm_to_adev(dev);
4907 int i;
4908
4909 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4910
4911 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4912 DRM_WARN("No support for XGMI hive yet...");
4913 return PCI_ERS_RESULT_DISCONNECT;
4914 }
4915
4916 switch (state) {
4917 case pci_channel_io_normal:
4918 return PCI_ERS_RESULT_CAN_RECOVER;
4919 /* Fatal error, prepare for slot reset */
4920 case pci_channel_io_frozen:
4921 /*
4922 		 * Cancel and wait for all TDRs in progress if we fail to
4923 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4924 *
4925 * Locking adev->reset_sem will prevent any external access
4926 * to GPU during PCI error recovery
4927 */
4928 while (!amdgpu_device_lock_adev(adev, NULL))
4929 amdgpu_cancel_all_tdr(adev);
4930
4931 /*
4932 * Block any work scheduling as we do for regular GPU reset
4933 * for the duration of the recovery
4934 */
4935 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4936 struct amdgpu_ring *ring = adev->rings[i];
4937
4938 if (!ring || !ring->sched.thread)
4939 continue;
4940
4941 drm_sched_stop(&ring->sched, NULL);
4942 }
4943 return PCI_ERS_RESULT_NEED_RESET;
4944 case pci_channel_io_perm_failure:
4945 /* Permanent error, prepare for device removal */
4946 return PCI_ERS_RESULT_DISCONNECT;
4947 }
4948
4949 return PCI_ERS_RESULT_NEED_RESET;
4950 }
4951
4952 /**
4953 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4954 * @pdev: pointer to PCI device
4955 */
4956 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4957 {
4958
4959 DRM_INFO("PCI error: mmio enabled callback!!\n");
4960
4961 /* TODO - dump whatever for debugging purposes */
4962
4963 	/* This is called only if amdgpu_pci_error_detected returns
4964 	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
4965 	 * work, so there is no need to reset the slot.
4966 */
4967
4968 return PCI_ERS_RESULT_RECOVERED;
4969 }
4970
4971 /**
4972 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4973 * @pdev: PCI device struct
4974 *
4975 * Description: This routine is called by the pci error recovery
4976 * code after the PCI slot has been reset, just before we
4977 * should resume normal operations.
4978 */
4979 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4980 {
4981 struct drm_device *dev = pci_get_drvdata(pdev);
4982 struct amdgpu_device *adev = drm_to_adev(dev);
4983 int r, i;
4984 bool need_full_reset = true;
4985 u32 memsize;
4986 struct list_head device_list;
4987
4988 DRM_INFO("PCI error: slot reset callback!!\n");
4989
4990 INIT_LIST_HEAD(&device_list);
4991 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4992
4993 /* wait for asic to come out of reset */
4994 msleep(500);
4995
4996 /* Restore PCI confspace */
4997 amdgpu_device_load_pci_state(pdev);
4998
4999 /* confirm ASIC came out of reset */
5000 for (i = 0; i < adev->usec_timeout; i++) {
5001 memsize = amdgpu_asic_get_config_memsize(adev);
5002
5003 if (memsize != 0xffffffff)
5004 break;
5005 udelay(1);
5006 }
5007 if (memsize == 0xffffffff) {
5008 r = -ETIME;
5009 goto out;
5010 }
5011
5012 adev->in_pci_err_recovery = true;
5013 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5014 adev->in_pci_err_recovery = false;
5015 if (r)
5016 goto out;
5017
5018 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5019
5020 out:
5021 if (!r) {
5022 if (amdgpu_device_cache_pci_state(adev->pdev))
5023 pci_restore_state(adev->pdev);
5024
5025 DRM_INFO("PCIe error recovery succeeded\n");
5026 } else {
5027 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5028 amdgpu_device_unlock_adev(adev);
5029 }
5030
5031 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5032 }
5033
5034 /**
5035 * amdgpu_pci_resume() - resume normal ops after PCI reset
5036 * @pdev: pointer to PCI device
5037 *
5038  * Called when the error recovery driver tells us that it is
5039  * OK to resume normal operation. Restart the schedulers and
5040  * resubmit the jobs that were halted during error detection.
5041 */
5042 void amdgpu_pci_resume(struct pci_dev *pdev)
5043 {
5044 struct drm_device *dev = pci_get_drvdata(pdev);
5045 struct amdgpu_device *adev = drm_to_adev(dev);
5046 int i;
5047
5048
5049 DRM_INFO("PCI error: resume callback!!\n");
5050
5051 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5052 struct amdgpu_ring *ring = adev->rings[i];
5053
5054 if (!ring || !ring->sched.thread)
5055 continue;
5056
5057
5058 drm_sched_resubmit_jobs(&ring->sched);
5059 drm_sched_start(&ring->sched, true);
5060 }
5061
5062 amdgpu_device_unlock_adev(adev);
5063 }
5064
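/*
 * Snapshot the current PCI config space into adev->pci_state so it can
 * later be re-applied via amdgpu_device_load_pci_state(), e.g. after a
 * sudden PCI error; any previously cached snapshot is freed first.
 */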
5065 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5066 {
5067 struct drm_device *dev = pci_get_drvdata(pdev);
5068 struct amdgpu_device *adev = drm_to_adev(dev);
5069 int r;
5070
5071 r = pci_save_state(pdev);
5072 if (!r) {
5073 kfree(adev->pci_state);
5074
5075 adev->pci_state = pci_store_saved_state(pdev);
5076
5077 if (!adev->pci_state) {
5078 DRM_ERROR("Failed to store PCI saved state");
5079 return false;
5080 }
5081 } else {
5082 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5083 return false;
5084 }
5085
5086 return true;
5087 }
5088
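/*
 * Load the config space snapshot cached by amdgpu_device_cache_pci_state()
 * back into the PCI core and restore it to the device.
 */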
5089 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5090 {
5091 struct drm_device *dev = pci_get_drvdata(pdev);
5092 struct amdgpu_device *adev = drm_to_adev(dev);
5093 int r;
5094
5095 if (!adev->pci_state)
5096 return false;
5097
5098 r = pci_load_saved_state(pdev, adev->pci_state);
5099
5100 if (!r) {
5101 pci_restore_state(pdev);
5102 } else {
5103 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5104 return false;
5105 }
5106
5107 return true;
5108 }
5109
5110
5111