1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72 
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");
84 
85 #define AMDGPU_RESUME_MS		2000
86 
87 const char *amdgpu_asic_name[] = {
88 	"TAHITI",
89 	"PITCAIRN",
90 	"VERDE",
91 	"OLAND",
92 	"HAINAN",
93 	"BONAIRE",
94 	"KAVERI",
95 	"KABINI",
96 	"HAWAII",
97 	"MULLINS",
98 	"TOPAZ",
99 	"TONGA",
100 	"FIJI",
101 	"CARRIZO",
102 	"STONEY",
103 	"POLARIS10",
104 	"POLARIS11",
105 	"POLARIS12",
106 	"VEGAM",
107 	"VEGA10",
108 	"VEGA12",
109 	"VEGA20",
110 	"RAVEN",
111 	"ARCTURUS",
112 	"RENOIR",
113 	"NAVI10",
114 	"NAVI14",
115 	"NAVI12",
116 	"SIENNA_CICHLID",
117 	"NAVY_FLOUNDER",
118 	"LAST",
119 };
120 
121 /**
122  * DOC: pcie_replay_count
123  *
124  * The amdgpu driver provides a sysfs API for reporting the total number
125  * of PCIe replays (NAKs).
126  * The file pcie_replay_count is used for this and returns the total
127  * number of replays as the sum of the NAKs generated and NAKs received.
128  */
129 
130 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
131 		struct device_attribute *attr, char *buf)
132 {
133 	struct drm_device *ddev = dev_get_drvdata(dev);
134 	struct amdgpu_device *adev = drm_to_adev(ddev);
135 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
136 
137 	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
138 }
139 
140 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
141 		amdgpu_device_get_pcie_replay_count, NULL);
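
/*
 * Usage sketch (not part of the driver): the attribute above is read from
 * userspace through sysfs. Assuming the device is card0, something like
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *
 * prints the accumulated replay count; the exact sysfs path is an assumption
 * and depends on the system.
 */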
142 
143 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
144 
145 /**
146  * DOC: product_name
147  *
148  * The amdgpu driver provides a sysfs API for reporting the product name
149  * for the device.
150  * The file product_name is used for this and returns the product name
151  * as returned from the FRU.
152  * NOTE: This is only available for certain server cards
153  */
154 
155 static ssize_t amdgpu_device_get_product_name(struct device *dev,
156 		struct device_attribute *attr, char *buf)
157 {
158 	struct drm_device *ddev = dev_get_drvdata(dev);
159 	struct amdgpu_device *adev = drm_to_adev(ddev);
160 
161 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
162 }
163 
164 static DEVICE_ATTR(product_name, S_IRUGO,
165 		amdgpu_device_get_product_name, NULL);
166 
167 /**
168  * DOC: product_number
169  *
170  * The amdgpu driver provides a sysfs API for reporting the part number
171  * for the device.
172  * The file product_number is used for this and returns the part number
173  * as returned from the FRU.
174  * NOTE: This is only available for certain server cards
175  */
176 
177 static ssize_t amdgpu_device_get_product_number(struct device *dev,
178 		struct device_attribute *attr, char *buf)
179 {
180 	struct drm_device *ddev = dev_get_drvdata(dev);
181 	struct amdgpu_device *adev = drm_to_adev(ddev);
182 
183 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
184 }
185 
186 static DEVICE_ATTR(product_number, S_IRUGO,
187 		amdgpu_device_get_product_number, NULL);
188 
189 /**
190  * DOC: serial_number
191  *
192  * The amdgpu driver provides a sysfs API for reporting the serial number
193  * for the device.
194  * The file serial_number is used for this and returns the serial number
195  * as returned from the FRU.
196  * NOTE: This is only available for certain server cards
197  */
198 
199 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
200 		struct device_attribute *attr, char *buf)
201 {
202 	struct drm_device *ddev = dev_get_drvdata(dev);
203 	struct amdgpu_device *adev = drm_to_adev(ddev);
204 
205 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
206 }
207 
208 static DEVICE_ATTR(serial_number, S_IRUGO,
209 		amdgpu_device_get_serial_number, NULL);
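
/*
 * Usage sketch (not part of the driver): the three FRU-backed attributes
 * above are read the same way. Assuming the device is card0:
 *
 *   cat /sys/class/drm/card0/device/product_name
 *   cat /sys/class/drm/card0/device/product_number
 *   cat /sys/class/drm/card0/device/serial_number
 *
 * On cards without FRU data these files may be empty; the path shown is an
 * assumption and varies per system.
 */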
210 
211 /**
212  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
213  *
214  * @dev: drm_device pointer
215  *
216  * Returns true if the device is a dGPU with HG/PX power control,
217  * otherwise return false.
218  */
219 bool amdgpu_device_supports_boco(struct drm_device *dev)
220 {
221 	struct amdgpu_device *adev = drm_to_adev(dev);
222 
223 	if (adev->flags & AMD_IS_PX)
224 		return true;
225 	return false;
226 }
227 
228 /**
229  * amdgpu_device_supports_baco - Does the device support BACO
230  *
231  * @dev: drm_device pointer
232  *
233  * Returns true if the device supports BACO,
234  * otherwise return false.
235  */
236 bool amdgpu_device_supports_baco(struct drm_device *dev)
237 {
238 	struct amdgpu_device *adev = drm_to_adev(dev);
239 
240 	return amdgpu_asic_supports_baco(adev);
241 }
242 
243 /*
244  * VRAM access helper functions
245  */
246 
247 /**
248  * amdgpu_device_vram_access - read/write a buffer in vram
249  *
250  * @adev: amdgpu_device pointer
251  * @pos: offset of the buffer in vram
252  * @buf: virtual address of the buffer in system memory
253  * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
254  * @write: true - write to vram, otherwise - read from vram
255  */
256 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
257 			       uint32_t *buf, size_t size, bool write)
258 {
259 	unsigned long flags;
260 	uint32_t hi = ~0;
261 	uint64_t last;
262 
263 
264 #ifdef CONFIG_64BIT
265 	last = min(pos + size, adev->gmc.visible_vram_size);
266 	if (last > pos) {
267 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
268 		size_t count = last - pos;
269 
270 		if (write) {
271 			memcpy_toio(addr, buf, count);
272 			mb();
273 			amdgpu_asic_flush_hdp(adev, NULL);
274 		} else {
275 			amdgpu_asic_invalidate_hdp(adev, NULL);
276 			mb();
277 			memcpy_fromio(buf, addr, count);
278 		}
279 
280 		if (count == size)
281 			return;
282 
283 		pos += count;
284 		buf += count / 4;
285 		size -= count;
286 	}
287 #endif
288 
289 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
290 	for (last = pos + size; pos < last; pos += 4) {
291 		uint32_t tmp = pos >> 31;
292 
293 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
294 		if (tmp != hi) {
295 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
296 			hi = tmp;
297 		}
298 		if (write)
299 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
300 		else
301 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
302 	}
303 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
304 }
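
/*
 * Illustrative sketch (hypothetical caller, not part of this file): reading a
 * dword-aligned VRAM region into a local buffer with the helper above.
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, vram_offset, data, sizeof(data), false);
 *
 * vram_offset stands in for an assumed dword-aligned offset into VRAM; a
 * write would pass true as the last argument instead.
 */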
305 
306 /*
307  * register access helper functions.
308  */
309 /**
310  * amdgpu_device_rreg - read a memory mapped IO or indirect register
311  *
312  * @adev: amdgpu_device pointer
313  * @reg: dword aligned register offset
314  * @acc_flags: access flags which require special behavior
315  *
316  * Returns the 32 bit value from the offset specified.
317  */
318 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
319 			    uint32_t reg, uint32_t acc_flags)
320 {
321 	uint32_t ret;
322 
323 	if (adev->in_pci_err_recovery)
324 		return 0;
325 
326 	if ((reg * 4) < adev->rmmio_size) {
327 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
328 		    amdgpu_sriov_runtime(adev) &&
329 		    down_read_trylock(&adev->reset_sem)) {
330 			ret = amdgpu_kiq_rreg(adev, reg);
331 			up_read(&adev->reset_sem);
332 		} else {
333 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
334 		}
335 	} else {
336 		ret = adev->pcie_rreg(adev, reg * 4);
337 	}
338 
339 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
340 
341 	return ret;
342 }
343 
344 /*
345  * MMIO register read with byte offset helper function
346  * @offset: byte offset from MMIO start
347  *
348 */
349 
350 /**
351  * amdgpu_mm_rreg8 - read a memory mapped IO register
352  *
353  * @adev: amdgpu_device pointer
354  * @offset: byte aligned register offset
355  *
356  * Returns the 8 bit value from the offset specified.
357  */
358 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
359 {
360 	if (adev->in_pci_err_recovery)
361 		return 0;
362 
363 	if (offset < adev->rmmio_size)
364 		return (readb(adev->rmmio + offset));
365 	BUG();
366 }
367 
368 /*
369  * MMIO register write with byte offset helper function
370  * @offset: byte offset from MMIO start
371  * @value: the value to be written to the register
372  *
373 */
374 /**
375  * amdgpu_mm_wreg8 - write to a memory mapped IO register
376  *
377  * @adev: amdgpu_device pointer
378  * @offset: byte aligned register offset
379  * @value: 8 bit value to write
380  *
381  * Writes the value specified to the offset specified.
382  */
383 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
384 {
385 	if (adev->in_pci_err_recovery)
386 		return;
387 
388 	if (offset < adev->rmmio_size)
389 		writeb(value, adev->rmmio + offset);
390 	else
391 		BUG();
392 }
393 
394 /**
395  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
396  *
397  * @adev: amdgpu_device pointer
398  * @reg: dword aligned register offset
399  * @v: 32 bit value to write to the register
400  * @acc_flags: access flags which require special behavior
401  *
402  * Writes the value specified to the offset specified.
403  */
404 void amdgpu_device_wreg(struct amdgpu_device *adev,
405 			uint32_t reg, uint32_t v,
406 			uint32_t acc_flags)
407 {
408 	if (adev->in_pci_err_recovery)
409 		return;
410 
411 	if ((reg * 4) < adev->rmmio_size) {
412 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
413 		    amdgpu_sriov_runtime(adev) &&
414 		    down_read_trylock(&adev->reset_sem)) {
415 			amdgpu_kiq_wreg(adev, reg, v);
416 			up_read(&adev->reset_sem);
417 		} else {
418 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
419 		}
420 	} else {
421 		adev->pcie_wreg(adev, reg * 4, v);
422 	}
423 
424 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
425 }
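
/*
 * Note: most driver code does not call amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly but goes through the RREG32()/WREG32() style macros from amdgpu.h,
 * which route to these helpers. A hedged sketch of the usual read-modify-write
 * pattern (reg_offset and some_mask are placeholders, not real registers):
 *
 *   uint32_t tmp = RREG32(reg_offset);
 *   tmp |= some_mask;
 *   WREG32(reg_offset, tmp);
 */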
426 
427 /*
428  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if in range
429  *
430  * This function is invoked only for debugfs register access.
431  */
432 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
433 			     uint32_t reg, uint32_t v)
434 {
435 	if (adev->in_pci_err_recovery)
436 		return;
437 
438 	if (amdgpu_sriov_fullaccess(adev) &&
439 	    adev->gfx.rlc.funcs &&
440 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
441 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
442 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
443 	} else {
444 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
445 	}
446 }
447 
448 /**
449  * amdgpu_io_rreg - read an IO register
450  *
451  * @adev: amdgpu_device pointer
452  * @reg: dword aligned register offset
453  *
454  * Returns the 32 bit value from the offset specified.
455  */
456 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
457 {
458 	if (adev->in_pci_err_recovery)
459 		return 0;
460 
461 	if ((reg * 4) < adev->rio_mem_size)
462 		return ioread32(adev->rio_mem + (reg * 4));
463 	else {
464 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
465 		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
466 	}
467 }
468 
469 /**
470  * amdgpu_io_wreg - write to an IO register
471  *
472  * @adev: amdgpu_device pointer
473  * @reg: dword aligned register offset
474  * @v: 32 bit value to write to the register
475  *
476  * Writes the value specified to the offset specified.
477  */
478 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
479 {
480 	if (adev->in_pci_err_recovery)
481 		return;
482 
483 	if ((reg * 4) < adev->rio_mem_size)
484 		iowrite32(v, adev->rio_mem + (reg * 4));
485 	else {
486 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
487 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
488 	}
489 }
490 
491 /**
492  * amdgpu_mm_rdoorbell - read a doorbell dword
493  *
494  * @adev: amdgpu_device pointer
495  * @index: doorbell index
496  *
497  * Returns the value in the doorbell aperture at the
498  * requested doorbell index (CIK).
499  */
500 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
501 {
502 	if (adev->in_pci_err_recovery)
503 		return 0;
504 
505 	if (index < adev->doorbell.num_doorbells) {
506 		return readl(adev->doorbell.ptr + index);
507 	} else {
508 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
509 		return 0;
510 	}
511 }
512 
513 /**
514  * amdgpu_mm_wdoorbell - write a doorbell dword
515  *
516  * @adev: amdgpu_device pointer
517  * @index: doorbell index
518  * @v: value to write
519  *
520  * Writes @v to the doorbell aperture at the
521  * requested doorbell index (CIK).
522  */
523 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
524 {
525 	if (adev->in_pci_err_recovery)
526 		return;
527 
528 	if (index < adev->doorbell.num_doorbells) {
529 		writel(v, adev->doorbell.ptr + index);
530 	} else {
531 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
532 	}
533 }
534 
535 /**
536  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
537  *
538  * @adev: amdgpu_device pointer
539  * @index: doorbell index
540  *
541  * Returns the value in the doorbell aperture at the
542  * requested doorbell index (VEGA10+).
543  */
544 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
545 {
546 	if (adev->in_pci_err_recovery)
547 		return 0;
548 
549 	if (index < adev->doorbell.num_doorbells) {
550 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
551 	} else {
552 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
553 		return 0;
554 	}
555 }
556 
557 /**
558  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
559  *
560  * @adev: amdgpu_device pointer
561  * @index: doorbell index
562  * @v: value to write
563  *
564  * Writes @v to the doorbell aperture at the
565  * requested doorbell index (VEGA10+).
566  */
567 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
568 {
569 	if (adev->in_pci_err_recovery)
570 		return;
571 
572 	if (index < adev->doorbell.num_doorbells) {
573 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
574 	} else {
575 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
576 	}
577 }
578 
579 /**
580  * amdgpu_device_indirect_rreg - read an indirect register
581  *
582  * @adev: amdgpu_device pointer
583  * @pcie_index: mmio register offset
584  * @pcie_data: mmio register offset
585  *
586  * Returns the value of indirect register @reg_addr
587  */
588 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
589 				u32 pcie_index, u32 pcie_data,
590 				u32 reg_addr)
591 {
592 	unsigned long flags;
593 	u32 r;
594 	void __iomem *pcie_index_offset;
595 	void __iomem *pcie_data_offset;
596 
597 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
598 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
599 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
600 
601 	writel(reg_addr, pcie_index_offset);
602 	readl(pcie_index_offset);
603 	r = readl(pcie_data_offset);
604 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
605 
606 	return r;
607 }
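
/*
 * Illustrative sketch (assumed register names, not from this file): ASIC code
 * typically wraps this helper to implement adev->pcie_rreg, passing the chip's
 * PCIE index/data register offsets, e.g.:
 *
 *   static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *   	return amdgpu_device_indirect_rreg(adev, example_PCIE_INDEX2,
 *   					   example_PCIE_DATA2, reg);
 *   }
 *
 * example_PCIE_INDEX2/example_PCIE_DATA2 stand in for the real ASIC-specific
 * mmio offsets.
 */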
608 
609 /**
610  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
611  *
612  * @adev: amdgpu_device pointer
613  * @pcie_index: mmio register offset
614  * @pcie_data: mmio register offset
615  *
616  * Returns the value of indirect register @reg_addr
617  */
618 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
619 				  u32 pcie_index, u32 pcie_data,
620 				  u32 reg_addr)
621 {
622 	unsigned long flags;
623 	u64 r;
624 	void __iomem *pcie_index_offset;
625 	void __iomem *pcie_data_offset;
626 
627 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
628 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
629 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
630 
631 	/* read low 32 bits */
632 	writel(reg_addr, pcie_index_offset);
633 	readl(pcie_index_offset);
634 	r = readl(pcie_data_offset);
635 	/* read high 32 bits */
636 	writel(reg_addr + 4, pcie_index_offset);
637 	readl(pcie_index_offset);
638 	r |= ((u64)readl(pcie_data_offset) << 32);
639 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
640 
641 	return r;
642 }
643 
644 /**
645  * amdgpu_device_indirect_wreg - write an indirect register address
646  *
647  * @adev: amdgpu_device pointer
648  * @pcie_index: mmio register offset
649  * @pcie_data: mmio register offset
650  * @reg_addr: indirect register offset
651  * @reg_data: indirect register data
652  *
653  */
654 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
655 				 u32 pcie_index, u32 pcie_data,
656 				 u32 reg_addr, u32 reg_data)
657 {
658 	unsigned long flags;
659 	void __iomem *pcie_index_offset;
660 	void __iomem *pcie_data_offset;
661 
662 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
663 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
664 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
665 
666 	writel(reg_addr, pcie_index_offset);
667 	readl(pcie_index_offset);
668 	writel(reg_data, pcie_data_offset);
669 	readl(pcie_data_offset);
670 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
671 }
672 
673 /**
674  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
675  *
676  * @adev: amdgpu_device pointer
677  * @pcie_index: mmio register offset
678  * @pcie_data: mmio register offset
679  * @reg_addr: indirect register offset
680  * @reg_data: indirect register data
681  *
682  */
683 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
684 				   u32 pcie_index, u32 pcie_data,
685 				   u32 reg_addr, u64 reg_data)
686 {
687 	unsigned long flags;
688 	void __iomem *pcie_index_offset;
689 	void __iomem *pcie_data_offset;
690 
691 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
692 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
693 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
694 
695 	/* write low 32 bits */
696 	writel(reg_addr, pcie_index_offset);
697 	readl(pcie_index_offset);
698 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
699 	readl(pcie_data_offset);
700 	/* write high 32 bits */
701 	writel(reg_addr + 4, pcie_index_offset);
702 	readl(pcie_index_offset);
703 	writel((u32)(reg_data >> 32), pcie_data_offset);
704 	readl(pcie_data_offset);
705 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
706 }
707 
708 /**
709  * amdgpu_invalid_rreg - dummy reg read function
710  *
711  * @adev: amdgpu_device pointer
712  * @reg: offset of register
713  *
714  * Dummy register read function.  Used for register blocks
715  * that certain asics don't have (all asics).
716  * Returns the value in the register.
717  */
718 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
719 {
720 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
721 	BUG();
722 	return 0;
723 }
724 
725 /**
726  * amdgpu_invalid_wreg - dummy reg write function
727  *
728  * @adev: amdgpu_device pointer
729  * @reg: offset of register
730  * @v: value to write to the register
731  *
732  * Dummy register write function.  Used for register blocks
733  * that certain asics don't have (all asics).
734  */
735 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
736 {
737 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
738 		  reg, v);
739 	BUG();
740 }
741 
742 /**
743  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
744  *
745  * @adev: amdgpu_device pointer
746  * @reg: offset of register
747  *
748  * Dummy register read function.  Used for register blocks
749  * that certain asics don't have (all asics).
750  * Returns the value in the register.
751  */
752 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
753 {
754 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
755 	BUG();
756 	return 0;
757 }
758 
759 /**
760  * amdgpu_invalid_wreg64 - dummy reg write function
761  *
762  * @adev: amdgpu_device pointer
763  * @reg: offset of register
764  * @v: value to write to the register
765  *
766  * Dummy register write function.  Used for register blocks
767  * that certain asics don't have (all asics).
768  */
769 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
770 {
771 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
772 		  reg, v);
773 	BUG();
774 }
775 
776 /**
777  * amdgpu_block_invalid_rreg - dummy reg read function
778  *
779  * @adev: amdgpu_device pointer
780  * @block: offset of instance
781  * @reg: offset of register
782  *
783  * Dummy register read function.  Used for register blocks
784  * that certain asics don't have (all asics).
785  * Returns the value in the register.
786  */
787 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
788 					  uint32_t block, uint32_t reg)
789 {
790 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
791 		  reg, block);
792 	BUG();
793 	return 0;
794 }
795 
796 /**
797  * amdgpu_block_invalid_wreg - dummy reg write function
798  *
799  * @adev: amdgpu_device pointer
800  * @block: offset of instance
801  * @reg: offset of register
802  * @v: value to write to the register
803  *
804  * Dummy register write function.  Used for register blocks
805  * that certain asics don't have (all asics).
806  */
807 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
808 				      uint32_t block,
809 				      uint32_t reg, uint32_t v)
810 {
811 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
812 		  reg, block, v);
813 	BUG();
814 }
815 
816 /**
817  * amdgpu_device_asic_init - Wrapper for atom asic_init
818  *
819  * @adev: amdgpu_device pointer
820  *
821  * Does any asic specific work and then calls atom asic init.
822  */
823 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
824 {
825 	amdgpu_asic_pre_asic_init(adev);
826 
827 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
828 }
829 
830 /**
831  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
832  *
833  * @adev: amdgpu_device pointer
834  *
835  * Allocates a scratch page of VRAM for use by various things in the
836  * driver.
837  */
838 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
839 {
840 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
841 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
842 				       &adev->vram_scratch.robj,
843 				       &adev->vram_scratch.gpu_addr,
844 				       (void **)&adev->vram_scratch.ptr);
845 }
846 
847 /**
848  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
849  *
850  * @adev: amdgpu_device pointer
851  *
852  * Frees the VRAM scratch page.
853  */
854 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
855 {
856 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
857 }
858 
859 /**
860  * amdgpu_device_program_register_sequence - program an array of registers.
861  *
862  * @adev: amdgpu_device pointer
863  * @registers: pointer to the register array
864  * @array_size: size of the register array
865  *
866  * Programs an array of registers with AND and OR masks.
867  * This is a helper for setting golden registers.
868  */
869 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
870 					     const u32 *registers,
871 					     const u32 array_size)
872 {
873 	u32 tmp, reg, and_mask, or_mask;
874 	int i;
875 
876 	if (array_size % 3)
877 		return;
878 
879 	for (i = 0; i < array_size; i += 3) {
880 		reg = registers[i + 0];
881 		and_mask = registers[i + 1];
882 		or_mask = registers[i + 2];
883 
884 		if (and_mask == 0xffffffff) {
885 			tmp = or_mask;
886 		} else {
887 			tmp = RREG32(reg);
888 			tmp &= ~and_mask;
889 			if (adev->family >= AMDGPU_FAMILY_AI)
890 				tmp |= (or_mask & and_mask);
891 			else
892 				tmp |= or_mask;
893 		}
894 		WREG32(reg, tmp);
895 	}
896 }
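
/*
 * Illustrative sketch (made-up values): the register array consumed above is a
 * flat list of {offset, and_mask, or_mask} triples, e.g.:
 *
 *   static const u32 example_golden_settings[] = {
 *   	0x1234, 0xffffffff, 0x00000001,	// and_mask == ~0: full overwrite
 *   	0x5678, 0x0000ff00, 0x00001200,	// read-modify-write of masked bits
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *   					     ARRAY_SIZE(example_golden_settings));
 *
 * The offsets and masks here are placeholders, not real golden register values.
 */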
897 
898 /**
899  * amdgpu_device_pci_config_reset - reset the GPU
900  *
901  * @adev: amdgpu_device pointer
902  *
903  * Resets the GPU using the pci config reset sequence.
904  * Only applicable to asics prior to vega10.
905  */
906 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
907 {
908 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
909 }
910 
911 /*
912  * GPU doorbell aperture helper functions.
913  */
914 /**
915  * amdgpu_device_doorbell_init - Init doorbell driver information.
916  *
917  * @adev: amdgpu_device pointer
918  *
919  * Init doorbell driver information (CIK)
920  * Returns 0 on success, error on failure.
921  */
922 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
923 {
924 
925 	/* No doorbell on SI hardware generation */
926 	if (adev->asic_type < CHIP_BONAIRE) {
927 		adev->doorbell.base = 0;
928 		adev->doorbell.size = 0;
929 		adev->doorbell.num_doorbells = 0;
930 		adev->doorbell.ptr = NULL;
931 		return 0;
932 	}
933 
934 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
935 		return -EINVAL;
936 
937 	amdgpu_asic_init_doorbell_index(adev);
938 
939 	/* doorbell bar mapping */
940 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
941 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
942 
943 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
944 					     adev->doorbell_index.max_assignment+1);
945 	if (adev->doorbell.num_doorbells == 0)
946 		return -EINVAL;
947 
948 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
949 	 * paging queue doorbell use the second page. The
950 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
951 	 * doorbells are in the first page. So with paging queue enabled,
952 	 * the max num_doorbells should be increased by one page (0x400 in dwords).
953 	 */
954 	if (adev->asic_type >= CHIP_VEGA10)
955 		adev->doorbell.num_doorbells += 0x400;
956 
957 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
958 				     adev->doorbell.num_doorbells *
959 				     sizeof(u32));
960 	if (adev->doorbell.ptr == NULL)
961 		return -ENOMEM;
962 
963 	return 0;
964 }
965 
966 /**
967  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
968  *
969  * @adev: amdgpu_device pointer
970  *
971  * Tear down doorbell driver information (CIK)
972  */
973 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
974 {
975 	iounmap(adev->doorbell.ptr);
976 	adev->doorbell.ptr = NULL;
977 }
978 
979 
980 
981 /*
982  * amdgpu_device_wb_*()
983  * Writeback is the method by which the GPU updates special pages in memory
984  * with the status of certain GPU events (fences, ring pointers, etc.).
985  */
986 
987 /**
988  * amdgpu_device_wb_fini - Disable Writeback and free memory
989  *
990  * @adev: amdgpu_device pointer
991  *
992  * Disables Writeback and frees the Writeback memory (all asics).
993  * Used at driver shutdown.
994  */
995 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
996 {
997 	if (adev->wb.wb_obj) {
998 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
999 				      &adev->wb.gpu_addr,
1000 				      (void **)&adev->wb.wb);
1001 		adev->wb.wb_obj = NULL;
1002 	}
1003 }
1004 
1005 /**
1006  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1007  *
1008  * @adev: amdgpu_device pointer
1009  *
1010  * Initializes writeback and allocates writeback memory (all asics).
1011  * Used at driver startup.
1012  * Returns 0 on success or a negative error code on failure.
1013  */
1014 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1015 {
1016 	int r;
1017 
1018 	if (adev->wb.wb_obj == NULL) {
1019 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1020 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1021 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1022 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1023 					    (void **)&adev->wb.wb);
1024 		if (r) {
1025 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1026 			return r;
1027 		}
1028 
1029 		adev->wb.num_wb = AMDGPU_MAX_WB;
1030 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1031 
1032 		/* clear wb memory */
1033 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1034 	}
1035 
1036 	return 0;
1037 }
1038 
1039 /**
1040  * amdgpu_device_wb_get - Allocate a wb entry
1041  *
1042  * @adev: amdgpu_device pointer
1043  * @wb: wb index
1044  *
1045  * Allocate a wb slot for use by the driver (all asics).
1046  * Returns 0 on success or -EINVAL on failure.
1047  */
1048 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1049 {
1050 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1051 
1052 	if (offset < adev->wb.num_wb) {
1053 		__set_bit(offset, adev->wb.used);
1054 		*wb = offset << 3; /* convert to dw offset */
1055 		return 0;
1056 	} else {
1057 		return -EINVAL;
1058 	}
1059 }
1060 
1061 /**
1062  * amdgpu_device_wb_free - Free a wb entry
1063  *
1064  * @adev: amdgpu_device pointer
1065  * @wb: wb index
1066  *
1067  * Free a wb slot allocated for use by the driver (all asics)
1068  */
1069 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1070 {
1071 	wb >>= 3;
1072 	if (wb < adev->wb.num_wb)
1073 		__clear_bit(wb, adev->wb.used);
1074 }
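
/*
 * Illustrative sketch (hypothetical caller): the usual writeback slot life
 * cycle built on the two helpers above.
 *
 *   u32 wb;
 *   int r;
 *
 *   r = amdgpu_device_wb_get(adev, &wb);
 *   if (r)
 *   	return r;
 *   // ... hand the dword offset in wb to a ring/fence consumer ...
 *   amdgpu_device_wb_free(adev, wb);
 *
 * The error handling and the consumer of the slot are assumptions here.
 */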
1075 
1076 /**
1077  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1078  *
1079  * @adev: amdgpu_device pointer
1080  *
1081  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1082  * to fail, but if any of the BARs is not accessible after the resize we abort
1083  * driver loading by returning -ENODEV.
1084  */
1085 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1086 {
1087 	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1088 	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1089 	struct pci_bus *root;
1090 	struct resource *res;
1091 	unsigned i;
1092 	u16 cmd;
1093 	int r;
1094 
1095 	/* Bypass for VF */
1096 	if (amdgpu_sriov_vf(adev))
1097 		return 0;
1098 
1099 	/* skip if the bios has already enabled large BAR */
1100 	if (adev->gmc.real_vram_size &&
1101 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1102 		return 0;
1103 
1104 	/* Check if the root BUS has 64bit memory resources */
1105 	root = adev->pdev->bus;
1106 	while (root->parent)
1107 		root = root->parent;
1108 
1109 	pci_bus_for_each_resource(root, res, i) {
1110 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1111 		    res->start > 0x100000000ull)
1112 			break;
1113 	}
1114 
1115 	/* Trying to resize is pointless without a root hub window above 4GB */
1116 	if (!res)
1117 		return 0;
1118 
1119 	/* Disable memory decoding while we change the BAR addresses and size */
1120 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1121 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1122 			      cmd & ~PCI_COMMAND_MEMORY);
1123 
1124 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1125 	amdgpu_device_doorbell_fini(adev);
1126 	if (adev->asic_type >= CHIP_BONAIRE)
1127 		pci_release_resource(adev->pdev, 2);
1128 
1129 	pci_release_resource(adev->pdev, 0);
1130 
1131 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1132 	if (r == -ENOSPC)
1133 		DRM_INFO("Not enough PCI address space for a large BAR.");
1134 	else if (r && r != -ENOTSUPP)
1135 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1136 
1137 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1138 
1139 	/* When the doorbell or fb BAR isn't available we have no chance of
1140 	 * using the device.
1141 	 */
1142 	r = amdgpu_device_doorbell_init(adev);
1143 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1144 		return -ENODEV;
1145 
1146 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1147 
1148 	return 0;
1149 }
1150 
1151 /*
1152  * GPU helper functions.
1153  */
1154 /**
1155  * amdgpu_device_need_post - check if the hw need post or not
1156  *
1157  * @adev: amdgpu_device pointer
1158  *
1159  * Check if the asic has been initialized (all asics) at driver startup,
1160  * or if post is needed because a hw reset was performed.
1161  * Returns true if post is needed, false if not.
1162  */
1163 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1164 {
1165 	uint32_t reg;
1166 
1167 	if (amdgpu_sriov_vf(adev))
1168 		return false;
1169 
1170 	if (amdgpu_passthrough(adev)) {
1171 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1172 		 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
1173 		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1174 		 * force vPost to be executed for SMC versions below 22.15.
1175 		 */
1176 		if (adev->asic_type == CHIP_FIJI) {
1177 			int err;
1178 			uint32_t fw_ver;
1179 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1180 			/* force vPost if an error occurred */
1181 			if (err)
1182 				return true;
1183 
1184 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1185 			if (fw_ver < 0x00160e00)
1186 				return true;
1187 		}
1188 	}
1189 
1190 	if (adev->has_hw_reset) {
1191 		adev->has_hw_reset = false;
1192 		return true;
1193 	}
1194 
1195 	/* bios scratch used on CIK+ */
1196 	if (adev->asic_type >= CHIP_BONAIRE)
1197 		return amdgpu_atombios_scratch_need_asic_init(adev);
1198 
1199 	/* check MEM_SIZE for older asics */
1200 	reg = amdgpu_asic_get_config_memsize(adev);
1201 
1202 	if ((reg != 0) && (reg != 0xffffffff))
1203 		return false;
1204 
1205 	return true;
1206 }
1207 
1208 /* if we get transitioned to only one device, take VGA back */
1209 /**
1210  * amdgpu_device_vga_set_decode - enable/disable vga decode
1211  *
1212  * @cookie: amdgpu_device pointer
1213  * @state: enable/disable vga decode
1214  *
1215  * Enable/disable vga decode (all asics).
1216  * Returns VGA resource flags.
1217  */
1218 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1219 {
1220 	struct amdgpu_device *adev = cookie;
1221 	amdgpu_asic_set_vga_state(adev, state);
1222 	if (state)
1223 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1224 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1225 	else
1226 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1227 }
1228 
1229 /**
1230  * amdgpu_device_check_block_size - validate the vm block size
1231  *
1232  * @adev: amdgpu_device pointer
1233  *
1234  * Validates the vm block size specified via module parameter.
1235  * The vm block size defines number of bits in page table versus page directory,
1236  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1237  * page table and the remaining bits are in the page directory.
1238  */
1239 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1240 {
1241 	/* defines number of bits in page table versus page directory,
1242 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1243 	 * page table and the remaining bits are in the page directory */
1244 	if (amdgpu_vm_block_size == -1)
1245 		return;
1246 
1247 	if (amdgpu_vm_block_size < 9) {
1248 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1249 			 amdgpu_vm_block_size);
1250 		amdgpu_vm_block_size = -1;
1251 	}
1252 }
1253 
1254 /**
1255  * amdgpu_device_check_vm_size - validate the vm size
1256  *
1257  * @adev: amdgpu_device pointer
1258  *
1259  * Validates the vm size in GB specified via module parameter.
1260  * The VM size is the size of the GPU virtual memory space in GB.
1261  */
1262 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1263 {
1264 	/* no need to check the default value */
1265 	if (amdgpu_vm_size == -1)
1266 		return;
1267 
1268 	if (amdgpu_vm_size < 1) {
1269 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1270 			 amdgpu_vm_size);
1271 		amdgpu_vm_size = -1;
1272 	}
1273 }
1274 
1275 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1276 {
1277 	struct sysinfo si;
1278 	bool is_os_64 = (sizeof(void *) == 8);
1279 	uint64_t total_memory;
1280 	uint64_t dram_size_seven_GB = 0x1B8000000;
1281 	uint64_t dram_size_three_GB = 0xB8000000;
1282 
1283 	if (amdgpu_smu_memory_pool_size == 0)
1284 		return;
1285 
1286 	if (!is_os_64) {
1287 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1288 		goto def_value;
1289 	}
1290 	si_meminfo(&si);
1291 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1292 
1293 	if ((amdgpu_smu_memory_pool_size == 1) ||
1294 		(amdgpu_smu_memory_pool_size == 2)) {
1295 		if (total_memory < dram_size_three_GB)
1296 			goto def_value1;
1297 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1298 		(amdgpu_smu_memory_pool_size == 8)) {
1299 		if (total_memory < dram_size_seven_GB)
1300 			goto def_value1;
1301 	} else {
1302 		DRM_WARN("Smu memory pool size not supported\n");
1303 		goto def_value;
1304 	}
1305 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1306 
1307 	return;
1308 
1309 def_value1:
1310 	DRM_WARN("Not enough system memory\n");
1311 def_value:
1312 	adev->pm.smu_prv_buffer_size = 0;
1313 }
1314 
1315 /**
1316  * amdgpu_device_check_arguments - validate module params
1317  *
1318  * @adev: amdgpu_device pointer
1319  *
1320  * Validates certain module parameters and updates
1321  * the associated values used by the driver (all asics).
1322  */
1323 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1324 {
1325 	if (amdgpu_sched_jobs < 4) {
1326 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1327 			 amdgpu_sched_jobs);
1328 		amdgpu_sched_jobs = 4;
1329 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1330 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1331 			 amdgpu_sched_jobs);
1332 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1333 	}
1334 
1335 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1336 		/* gart size must be greater or equal to 32M */
1337 		dev_warn(adev->dev, "gart size (%d) too small\n",
1338 			 amdgpu_gart_size);
1339 		amdgpu_gart_size = -1;
1340 	}
1341 
1342 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1343 		/* gtt size must be greater or equal to 32M */
1344 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1345 				 amdgpu_gtt_size);
1346 		amdgpu_gtt_size = -1;
1347 	}
1348 
1349 	/* valid range is between 4 and 9 inclusive */
1350 	if (amdgpu_vm_fragment_size != -1 &&
1351 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1352 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1353 		amdgpu_vm_fragment_size = -1;
1354 	}
1355 
1356 	if (amdgpu_sched_hw_submission < 2) {
1357 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1358 			 amdgpu_sched_hw_submission);
1359 		amdgpu_sched_hw_submission = 2;
1360 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1361 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1362 			 amdgpu_sched_hw_submission);
1363 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1364 	}
1365 
1366 	amdgpu_device_check_smu_prv_buffer_size(adev);
1367 
1368 	amdgpu_device_check_vm_size(adev);
1369 
1370 	amdgpu_device_check_block_size(adev);
1371 
1372 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1373 
1374 	amdgpu_gmc_tmz_set(adev);
1375 
1376 	if (amdgpu_num_kcq == -1) {
1377 		amdgpu_num_kcq = 8;
1378 	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1379 		amdgpu_num_kcq = 8;
1380 		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1381 	}
1382 
1383 	amdgpu_gmc_noretry_set(adev);
1384 
1385 	return 0;
1386 }
1387 
1388 /**
1389  * amdgpu_switcheroo_set_state - set switcheroo state
1390  *
1391  * @pdev: pci dev pointer
1392  * @state: vga_switcheroo state
1393  *
1394  * Callback for the switcheroo driver.  Suspends or resumes
1395  * the asic before or after it is powered up using ACPI methods.
1396  */
1397 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1398 					enum vga_switcheroo_state state)
1399 {
1400 	struct drm_device *dev = pci_get_drvdata(pdev);
1401 	int r;
1402 
1403 	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1404 		return;
1405 
1406 	if (state == VGA_SWITCHEROO_ON) {
1407 		pr_info("switched on\n");
1408 		/* don't suspend or resume card normally */
1409 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1410 
1411 		pci_set_power_state(dev->pdev, PCI_D0);
1412 		amdgpu_device_load_pci_state(dev->pdev);
1413 		r = pci_enable_device(dev->pdev);
1414 		if (r)
1415 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1416 		amdgpu_device_resume(dev, true);
1417 
1418 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1419 		drm_kms_helper_poll_enable(dev);
1420 	} else {
1421 		pr_info("switched off\n");
1422 		drm_kms_helper_poll_disable(dev);
1423 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1424 		amdgpu_device_suspend(dev, true);
1425 		amdgpu_device_cache_pci_state(dev->pdev);
1426 		/* Shut down the device */
1427 		pci_disable_device(dev->pdev);
1428 		pci_set_power_state(dev->pdev, PCI_D3cold);
1429 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1430 	}
1431 }
1432 
1433 /**
1434  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1435  *
1436  * @pdev: pci dev pointer
1437  *
1438  * Callback for the switcheroo driver.  Checks if the switcheroo
1439  * state can be changed.
1440  * Returns true if the state can be changed, false if not.
1441  */
1442 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1443 {
1444 	struct drm_device *dev = pci_get_drvdata(pdev);
1445 
1446 	/*
1447 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1448 	* locking inversion with the driver load path. And the access here is
1449 	* completely racy anyway. So don't bother with locking for now.
1450 	*/
1451 	return atomic_read(&dev->open_count) == 0;
1452 }
1453 
1454 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1455 	.set_gpu_state = amdgpu_switcheroo_set_state,
1456 	.reprobe = NULL,
1457 	.can_switch = amdgpu_switcheroo_can_switch,
1458 };
1459 
1460 /**
1461  * amdgpu_device_ip_set_clockgating_state - set the CG state
1462  *
1463  * @dev: amdgpu_device pointer
1464  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1465  * @state: clockgating state (gate or ungate)
1466  *
1467  * Sets the requested clockgating state for all instances of
1468  * the hardware IP specified.
1469  * Returns the error code from the last instance.
1470  */
1471 int amdgpu_device_ip_set_clockgating_state(void *dev,
1472 					   enum amd_ip_block_type block_type,
1473 					   enum amd_clockgating_state state)
1474 {
1475 	struct amdgpu_device *adev = dev;
1476 	int i, r = 0;
1477 
1478 	for (i = 0; i < adev->num_ip_blocks; i++) {
1479 		if (!adev->ip_blocks[i].status.valid)
1480 			continue;
1481 		if (adev->ip_blocks[i].version->type != block_type)
1482 			continue;
1483 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1484 			continue;
1485 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1486 			(void *)adev, state);
1487 		if (r)
1488 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1489 				  adev->ip_blocks[i].version->funcs->name, r);
1490 	}
1491 	return r;
1492 }
1493 
1494 /**
1495  * amdgpu_device_ip_set_powergating_state - set the PG state
1496  *
1497  * @dev: amdgpu_device pointer
1498  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1499  * @state: powergating state (gate or ungate)
1500  *
1501  * Sets the requested powergating state for all instances of
1502  * the hardware IP specified.
1503  * Returns the error code from the last instance.
1504  */
1505 int amdgpu_device_ip_set_powergating_state(void *dev,
1506 					   enum amd_ip_block_type block_type,
1507 					   enum amd_powergating_state state)
1508 {
1509 	struct amdgpu_device *adev = dev;
1510 	int i, r = 0;
1511 
1512 	for (i = 0; i < adev->num_ip_blocks; i++) {
1513 		if (!adev->ip_blocks[i].status.valid)
1514 			continue;
1515 		if (adev->ip_blocks[i].version->type != block_type)
1516 			continue;
1517 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1518 			continue;
1519 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1520 			(void *)adev, state);
1521 		if (r)
1522 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1523 				  adev->ip_blocks[i].version->funcs->name, r);
1524 	}
1525 	return r;
1526 }
1527 
1528 /**
1529  * amdgpu_device_ip_get_clockgating_state - get the CG state
1530  *
1531  * @adev: amdgpu_device pointer
1532  * @flags: clockgating feature flags
1533  *
1534  * Walks the list of IPs on the device and updates the clockgating
1535  * flags for each IP.
1536  * Updates @flags with the feature flags for each hardware IP where
1537  * clockgating is enabled.
1538  */
1539 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1540 					    u32 *flags)
1541 {
1542 	int i;
1543 
1544 	for (i = 0; i < adev->num_ip_blocks; i++) {
1545 		if (!adev->ip_blocks[i].status.valid)
1546 			continue;
1547 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1548 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1549 	}
1550 }
1551 
1552 /**
1553  * amdgpu_device_ip_wait_for_idle - wait for idle
1554  *
1555  * @adev: amdgpu_device pointer
1556  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1557  *
1558  * Waits for the requested hardware IP to be idle.
1559  * Returns 0 for success or a negative error code on failure.
1560  */
1561 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1562 				   enum amd_ip_block_type block_type)
1563 {
1564 	int i, r;
1565 
1566 	for (i = 0; i < adev->num_ip_blocks; i++) {
1567 		if (!adev->ip_blocks[i].status.valid)
1568 			continue;
1569 		if (adev->ip_blocks[i].version->type == block_type) {
1570 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1571 			if (r)
1572 				return r;
1573 			break;
1574 		}
1575 	}
1576 	return 0;
1577 
1578 }
1579 
1580 /**
1581  * amdgpu_device_ip_is_idle - is the hardware IP idle
1582  *
1583  * @adev: amdgpu_device pointer
1584  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1585  *
1586  * Check if the hardware IP is idle or not.
1587  * Returns true if the IP is idle, false if not.
1588  */
1589 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1590 			      enum amd_ip_block_type block_type)
1591 {
1592 	int i;
1593 
1594 	for (i = 0; i < adev->num_ip_blocks; i++) {
1595 		if (!adev->ip_blocks[i].status.valid)
1596 			continue;
1597 		if (adev->ip_blocks[i].version->type == block_type)
1598 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1599 	}
1600 	return true;
1601 
1602 }
1603 
1604 /**
1605  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1606  *
1607  * @adev: amdgpu_device pointer
1608  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1609  *
1610  * Returns a pointer to the hardware IP block structure
1611  * if it exists for the asic, otherwise NULL.
1612  */
1613 struct amdgpu_ip_block *
1614 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1615 			      enum amd_ip_block_type type)
1616 {
1617 	int i;
1618 
1619 	for (i = 0; i < adev->num_ip_blocks; i++)
1620 		if (adev->ip_blocks[i].version->type == type)
1621 			return &adev->ip_blocks[i];
1622 
1623 	return NULL;
1624 }
1625 
1626 /**
1627  * amdgpu_device_ip_block_version_cmp
1628  *
1629  * @adev: amdgpu_device pointer
1630  * @type: enum amd_ip_block_type
1631  * @major: major version
1632  * @minor: minor version
1633  *
1634  * Returns 0 if the IP block version is equal to or greater than the
1635  * given version, 1 if it is smaller or the ip_block doesn't exist.
1636  */
1637 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1638 				       enum amd_ip_block_type type,
1639 				       u32 major, u32 minor)
1640 {
1641 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1642 
1643 	if (ip_block && ((ip_block->version->major > major) ||
1644 			((ip_block->version->major == major) &&
1645 			(ip_block->version->minor >= minor))))
1646 		return 0;
1647 
1648 	return 1;
1649 }
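
/*
 * Illustrative sketch (assumed block type and version): callers use the
 * comparison helper above to gate features on a minimum IP version, e.g.:
 *
 *   if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC, 7, 0)) {
 *   	// SMU IP block is at least version 7.0, take the newer code path
 *   }
 *
 * AMD_IP_BLOCK_TYPE_SMC and version 7.0 are only examples.
 */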
1650 
1651 /**
1652  * amdgpu_device_ip_block_add
1653  *
1654  * @adev: amdgpu_device pointer
1655  * @ip_block_version: pointer to the IP to add
1656  *
1657  * Adds the IP block driver information to the collection of IPs
1658  * on the asic.
1659  */
1660 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1661 			       const struct amdgpu_ip_block_version *ip_block_version)
1662 {
1663 	if (!ip_block_version)
1664 		return -EINVAL;
1665 
1666 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1667 		  ip_block_version->funcs->name);
1668 
1669 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1670 
1671 	return 0;
1672 }
1673 
1674 /**
1675  * amdgpu_device_enable_virtual_display - enable virtual display feature
1676  *
1677  * @adev: amdgpu_device pointer
1678  *
1679  * Enables the virtual display feature if the user has enabled it via
1680  * the module parameter virtual_display.  This feature provides a virtual
1681  * display hardware on headless boards or in virtualized environments.
1682  * This function parses and validates the configuration string specified by
1683  * the user and configures the virtual display configuration (number of
1684  * virtual connectors, crtcs, etc.) specified.
1685  */
1686 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1687 {
1688 	adev->enable_virtual_display = false;
1689 
1690 	if (amdgpu_virtual_display) {
1691 		struct drm_device *ddev = adev_to_drm(adev);
1692 		const char *pci_address_name = pci_name(ddev->pdev);
1693 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1694 
1695 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1696 		pciaddstr_tmp = pciaddstr;
1697 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1698 			pciaddname = strsep(&pciaddname_tmp, ",");
1699 			if (!strcmp("all", pciaddname)
1700 			    || !strcmp(pci_address_name, pciaddname)) {
1701 				long num_crtc;
1702 				int res = -1;
1703 
1704 				adev->enable_virtual_display = true;
1705 
1706 				if (pciaddname_tmp)
1707 					res = kstrtol(pciaddname_tmp, 10,
1708 						      &num_crtc);
1709 
1710 				if (!res) {
1711 					if (num_crtc < 1)
1712 						num_crtc = 1;
1713 					if (num_crtc > 6)
1714 						num_crtc = 6;
1715 					adev->mode_info.num_crtc = num_crtc;
1716 				} else {
1717 					adev->mode_info.num_crtc = 1;
1718 				}
1719 				break;
1720 			}
1721 		}
1722 
1723 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1724 			 amdgpu_virtual_display, pci_address_name,
1725 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1726 
1727 		kfree(pciaddstr);
1728 	}
1729 }
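/*
 * Example of the accepted parameter format (derived from the parsing above;
 * the PCI addresses are made up): entries are separated by ';' and each one
 * is "<pci address>,<num_crtc>", or "all" to match every device:
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2
 *	modprobe amdgpu virtual_display=all,4
 *
 * num_crtc is clamped to the range 1..6 and falls back to 1 when it is
 * omitted or cannot be parsed.
 */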
1730 
1731 /**
1732  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1733  *
1734  * @adev: amdgpu_device pointer
1735  *
1736  * Parses the asic configuration parameters specified in the gpu info
1737  * firmware and makes them available to the driver for use in configuring
1738  * the asic.
1739  * Returns 0 on success, -EINVAL on failure.
1740  */
1741 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1742 {
1743 	const char *chip_name;
1744 	char fw_name[40];
1745 	int err;
1746 	const struct gpu_info_firmware_header_v1_0 *hdr;
1747 
1748 	adev->firmware.gpu_info_fw = NULL;
1749 
1750 	if (adev->mman.discovery_bin) {
1751 		amdgpu_discovery_get_gfx_info(adev);
1752 
1753 		/*
1754 		 * FIXME: The bounding box is still needed by Navi12, so
1755 		 * temporarily read it from gpu_info firmware. Should be dropped
1756 		 * when DAL no longer needs it.
1757 		 */
1758 		if (adev->asic_type != CHIP_NAVI12)
1759 			return 0;
1760 	}
1761 
1762 	switch (adev->asic_type) {
1763 #ifdef CONFIG_DRM_AMDGPU_SI
1764 	case CHIP_VERDE:
1765 	case CHIP_TAHITI:
1766 	case CHIP_PITCAIRN:
1767 	case CHIP_OLAND:
1768 	case CHIP_HAINAN:
1769 #endif
1770 #ifdef CONFIG_DRM_AMDGPU_CIK
1771 	case CHIP_BONAIRE:
1772 	case CHIP_HAWAII:
1773 	case CHIP_KAVERI:
1774 	case CHIP_KABINI:
1775 	case CHIP_MULLINS:
1776 #endif
1777 	case CHIP_TOPAZ:
1778 	case CHIP_TONGA:
1779 	case CHIP_FIJI:
1780 	case CHIP_POLARIS10:
1781 	case CHIP_POLARIS11:
1782 	case CHIP_POLARIS12:
1783 	case CHIP_VEGAM:
1784 	case CHIP_CARRIZO:
1785 	case CHIP_STONEY:
1786 	case CHIP_VEGA20:
1787 	case CHIP_SIENNA_CICHLID:
1788 	case CHIP_NAVY_FLOUNDER:
1789 	default:
1790 		return 0;
1791 	case CHIP_VEGA10:
1792 		chip_name = "vega10";
1793 		break;
1794 	case CHIP_VEGA12:
1795 		chip_name = "vega12";
1796 		break;
1797 	case CHIP_RAVEN:
1798 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1799 			chip_name = "raven2";
1800 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1801 			chip_name = "picasso";
1802 		else
1803 			chip_name = "raven";
1804 		break;
1805 	case CHIP_ARCTURUS:
1806 		chip_name = "arcturus";
1807 		break;
1808 	case CHIP_RENOIR:
1809 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1810 			chip_name = "renoir";
1811 		else
1812 			chip_name = "green_sardine";
1813 		break;
1814 	case CHIP_NAVI10:
1815 		chip_name = "navi10";
1816 		break;
1817 	case CHIP_NAVI14:
1818 		chip_name = "navi14";
1819 		break;
1820 	case CHIP_NAVI12:
1821 		chip_name = "navi12";
1822 		break;
1823 	}
1824 
1825 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1826 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1827 	if (err) {
1828 		dev_err(adev->dev,
1829 			"Failed to load gpu_info firmware \"%s\"\n",
1830 			fw_name);
1831 		goto out;
1832 	}
1833 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1834 	if (err) {
1835 		dev_err(adev->dev,
1836 			"Failed to validate gpu_info firmware \"%s\"\n",
1837 			fw_name);
1838 		goto out;
1839 	}
1840 
1841 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1842 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1843 
1844 	switch (hdr->version_major) {
1845 	case 1:
1846 	{
1847 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1848 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1849 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1850 
1851 		/*
1852 		 * Should be dropped when DAL no longer needs it.
1853 		 */
1854 		if (adev->asic_type == CHIP_NAVI12)
1855 			goto parse_soc_bounding_box;
1856 
1857 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1858 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1859 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1860 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1861 		adev->gfx.config.max_texture_channel_caches =
1862 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1863 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1864 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1865 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1866 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1867 		adev->gfx.config.double_offchip_lds_buf =
1868 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1869 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1870 		adev->gfx.cu_info.max_waves_per_simd =
1871 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1872 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1873 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1874 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1875 		if (hdr->version_minor >= 1) {
1876 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1877 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1878 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1879 			adev->gfx.config.num_sc_per_sh =
1880 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1881 			adev->gfx.config.num_packer_per_sc =
1882 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1883 		}
1884 
1885 parse_soc_bounding_box:
1886 		/*
1887 		 * soc bounding box info is not integrated in the discovery table,
1888 		 * so we always need to parse it from the gpu info firmware when needed.
1889 		 */
1890 		if (hdr->version_minor == 2) {
1891 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1892 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1893 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1894 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1895 		}
1896 		break;
1897 	}
1898 	default:
1899 		dev_err(adev->dev,
1900 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1901 		err = -EINVAL;
1902 		goto out;
1903 	}
1904 out:
1905 	return err;
1906 }
1907 
1908 /**
1909  * amdgpu_device_ip_early_init - run early init for hardware IPs
1910  *
1911  * @adev: amdgpu_device pointer
1912  *
1913  * Early initialization pass for hardware IPs.  The hardware IPs that make
1914  * up each asic are discovered and each IP's early_init callback is run.  This
1915  * is the first stage in initializing the asic.
1916  * Returns 0 on success, negative error code on failure.
1917  */
1918 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1919 {
1920 	int i, r;
1921 
1922 	amdgpu_device_enable_virtual_display(adev);
1923 
1924 	if (amdgpu_sriov_vf(adev)) {
1925 		r = amdgpu_virt_request_full_gpu(adev, true);
1926 		if (r)
1927 			return r;
1928 	}
1929 
1930 	switch (adev->asic_type) {
1931 #ifdef CONFIG_DRM_AMDGPU_SI
1932 	case CHIP_VERDE:
1933 	case CHIP_TAHITI:
1934 	case CHIP_PITCAIRN:
1935 	case CHIP_OLAND:
1936 	case CHIP_HAINAN:
1937 		adev->family = AMDGPU_FAMILY_SI;
1938 		r = si_set_ip_blocks(adev);
1939 		if (r)
1940 			return r;
1941 		break;
1942 #endif
1943 #ifdef CONFIG_DRM_AMDGPU_CIK
1944 	case CHIP_BONAIRE:
1945 	case CHIP_HAWAII:
1946 	case CHIP_KAVERI:
1947 	case CHIP_KABINI:
1948 	case CHIP_MULLINS:
1949 		if (adev->flags & AMD_IS_APU)
1950 			adev->family = AMDGPU_FAMILY_KV;
1951 		else
1952 			adev->family = AMDGPU_FAMILY_CI;
1953 
1954 		r = cik_set_ip_blocks(adev);
1955 		if (r)
1956 			return r;
1957 		break;
1958 #endif
1959 	case CHIP_TOPAZ:
1960 	case CHIP_TONGA:
1961 	case CHIP_FIJI:
1962 	case CHIP_POLARIS10:
1963 	case CHIP_POLARIS11:
1964 	case CHIP_POLARIS12:
1965 	case CHIP_VEGAM:
1966 	case CHIP_CARRIZO:
1967 	case CHIP_STONEY:
1968 		if (adev->flags & AMD_IS_APU)
1969 			adev->family = AMDGPU_FAMILY_CZ;
1970 		else
1971 			adev->family = AMDGPU_FAMILY_VI;
1972 
1973 		r = vi_set_ip_blocks(adev);
1974 		if (r)
1975 			return r;
1976 		break;
1977 	case CHIP_VEGA10:
1978 	case CHIP_VEGA12:
1979 	case CHIP_VEGA20:
1980 	case CHIP_RAVEN:
1981 	case CHIP_ARCTURUS:
1982 	case CHIP_RENOIR:
1983 		if (adev->flags & AMD_IS_APU)
1984 			adev->family = AMDGPU_FAMILY_RV;
1985 		else
1986 			adev->family = AMDGPU_FAMILY_AI;
1987 
1988 		r = soc15_set_ip_blocks(adev);
1989 		if (r)
1990 			return r;
1991 		break;
1992 	case  CHIP_NAVI10:
1993 	case  CHIP_NAVI14:
1994 	case  CHIP_NAVI12:
1995 	case  CHIP_SIENNA_CICHLID:
1996 	case  CHIP_NAVY_FLOUNDER:
1997 		adev->family = AMDGPU_FAMILY_NV;
1998 
1999 		r = nv_set_ip_blocks(adev);
2000 		if (r)
2001 			return r;
2002 		break;
2003 	default:
2004 		/* FIXME: not supported yet */
2005 		return -EINVAL;
2006 	}
2007 
2008 	amdgpu_amdkfd_device_probe(adev);
2009 
2010 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2011 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2012 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2013 
2014 	for (i = 0; i < adev->num_ip_blocks; i++) {
2015 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2016 			DRM_ERROR("disabled ip block: %d <%s>\n",
2017 				  i, adev->ip_blocks[i].version->funcs->name);
2018 			adev->ip_blocks[i].status.valid = false;
2019 		} else {
2020 			if (adev->ip_blocks[i].version->funcs->early_init) {
2021 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2022 				if (r == -ENOENT) {
2023 					adev->ip_blocks[i].status.valid = false;
2024 				} else if (r) {
2025 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2026 						  adev->ip_blocks[i].version->funcs->name, r);
2027 					return r;
2028 				} else {
2029 					adev->ip_blocks[i].status.valid = true;
2030 				}
2031 			} else {
2032 				adev->ip_blocks[i].status.valid = true;
2033 			}
2034 		}
2035 		/* get the vbios after the asic_funcs are set up */
2036 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2037 			r = amdgpu_device_parse_gpu_info_fw(adev);
2038 			if (r)
2039 				return r;
2040 
2041 			/* Read BIOS */
2042 			if (!amdgpu_get_bios(adev))
2043 				return -EINVAL;
2044 
2045 			r = amdgpu_atombios_init(adev);
2046 			if (r) {
2047 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2048 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2049 				return r;
2050 			}
2051 		}
2052 	}
2053 
2054 	adev->cg_flags &= amdgpu_cg_mask;
2055 	adev->pg_flags &= amdgpu_pg_mask;
2056 
2057 	return 0;
2058 }
2059 
2060 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2061 {
2062 	int i, r;
2063 
2064 	for (i = 0; i < adev->num_ip_blocks; i++) {
2065 		if (!adev->ip_blocks[i].status.sw)
2066 			continue;
2067 		if (adev->ip_blocks[i].status.hw)
2068 			continue;
2069 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2070 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2071 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2072 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2073 			if (r) {
2074 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2075 					  adev->ip_blocks[i].version->funcs->name, r);
2076 				return r;
2077 			}
2078 			adev->ip_blocks[i].status.hw = true;
2079 		}
2080 	}
2081 
2082 	return 0;
2083 }
2084 
2085 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2086 {
2087 	int i, r;
2088 
2089 	for (i = 0; i < adev->num_ip_blocks; i++) {
2090 		if (!adev->ip_blocks[i].status.sw)
2091 			continue;
2092 		if (adev->ip_blocks[i].status.hw)
2093 			continue;
2094 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2095 		if (r) {
2096 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2097 				  adev->ip_blocks[i].version->funcs->name, r);
2098 			return r;
2099 		}
2100 		adev->ip_blocks[i].status.hw = true;
2101 	}
2102 
2103 	return 0;
2104 }
2105 
2106 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2107 {
2108 	int r = 0;
2109 	int i;
2110 	uint32_t smu_version;
2111 
2112 	if (adev->asic_type >= CHIP_VEGA10) {
2113 		for (i = 0; i < adev->num_ip_blocks; i++) {
2114 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2115 				continue;
2116 
2117 			/* no need to do the fw loading again if already done */
2118 			if (adev->ip_blocks[i].status.hw == true)
2119 				break;
2120 
2121 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2122 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2123 				if (r) {
2124 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2125 							  adev->ip_blocks[i].version->funcs->name, r);
2126 					return r;
2127 				}
2128 			} else {
2129 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2130 				if (r) {
2131 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2132 							  adev->ip_blocks[i].version->funcs->name, r);
2133 					return r;
2134 				}
2135 			}
2136 
2137 			adev->ip_blocks[i].status.hw = true;
2138 			break;
2139 		}
2140 	}
2141 
2142 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2143 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2144 
2145 	return r;
2146 }
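/*
 * Ordering sketch (see amdgpu_device_ip_init() below): hardware bring-up is
 * a three step sequence so that firmware is loaded before the bulk of the
 * IP blocks touch the hardware:
 *
 *	amdgpu_device_ip_hw_init_phase1(adev);	// COMMON, IH (and PSP on SR-IOV)
 *	amdgpu_device_fw_loading(adev);		// PSP hw_init/resume + SMU firmware
 *	amdgpu_device_ip_hw_init_phase2(adev);	// all remaining IP blocks
 */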
2147 
2148 /**
2149  * amdgpu_device_ip_init - run init for hardware IPs
2150  *
2151  * @adev: amdgpu_device pointer
2152  *
2153  * Main initialization pass for hardware IPs.  The list of all the hardware
2154  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2155  * are run.  sw_init initializes the software state associated with each IP
2156  * and hw_init initializes the hardware associated with each IP.
2157  * Returns 0 on success, negative error code on failure.
2158  */
2159 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2160 {
2161 	int i, r;
2162 
2163 	r = amdgpu_ras_init(adev);
2164 	if (r)
2165 		return r;
2166 
2167 	for (i = 0; i < adev->num_ip_blocks; i++) {
2168 		if (!adev->ip_blocks[i].status.valid)
2169 			continue;
2170 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2171 		if (r) {
2172 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2173 				  adev->ip_blocks[i].version->funcs->name, r);
2174 			goto init_failed;
2175 		}
2176 		adev->ip_blocks[i].status.sw = true;
2177 
2178 		/* need to do gmc hw init early so we can allocate gpu mem */
2179 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2180 			r = amdgpu_device_vram_scratch_init(adev);
2181 			if (r) {
2182 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2183 				goto init_failed;
2184 			}
2185 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2186 			if (r) {
2187 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2188 				goto init_failed;
2189 			}
2190 			r = amdgpu_device_wb_init(adev);
2191 			if (r) {
2192 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2193 				goto init_failed;
2194 			}
2195 			adev->ip_blocks[i].status.hw = true;
2196 
2197 			/* right after GMC hw init, we create CSA */
2198 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2199 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2200 								AMDGPU_GEM_DOMAIN_VRAM,
2201 								AMDGPU_CSA_SIZE);
2202 				if (r) {
2203 					DRM_ERROR("allocate CSA failed %d\n", r);
2204 					goto init_failed;
2205 				}
2206 			}
2207 		}
2208 	}
2209 
2210 	if (amdgpu_sriov_vf(adev))
2211 		amdgpu_virt_init_data_exchange(adev);
2212 
2213 	r = amdgpu_ib_pool_init(adev);
2214 	if (r) {
2215 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2216 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2217 		goto init_failed;
2218 	}
2219 
2220 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2221 	if (r)
2222 		goto init_failed;
2223 
2224 	r = amdgpu_device_ip_hw_init_phase1(adev);
2225 	if (r)
2226 		goto init_failed;
2227 
2228 	r = amdgpu_device_fw_loading(adev);
2229 	if (r)
2230 		goto init_failed;
2231 
2232 	r = amdgpu_device_ip_hw_init_phase2(adev);
2233 	if (r)
2234 		goto init_failed;
2235 
2236 	/*
2237 	 * Retired pages will be loaded from eeprom and reserved here;
2238 	 * this must be called after amdgpu_device_ip_hw_init_phase2 since
2239 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2240 	 * functional for I2C communication, which is only true at this point.
2241 	 *
2242 	 * amdgpu_ras_recovery_init may fail, but the caller only cares about
2243 	 * failures caused by a bad gpu state and stops the amdgpu init process
2244 	 * accordingly. For other failures, it still releases all the resources
2245 	 * and prints an error message, rather than returning a negative value
2246 	 * to the upper level.
2247 	 *
2248 	 * Note: theoretically, this should be called before all vram allocations
2249 	 * to protect retired pages from being abused.
2250 	 */
2251 	r = amdgpu_ras_recovery_init(adev);
2252 	if (r)
2253 		goto init_failed;
2254 
2255 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2256 		amdgpu_xgmi_add_device(adev);
2257 	amdgpu_amdkfd_device_init(adev);
2258 
2259 	amdgpu_fru_get_product_info(adev);
2260 
2261 init_failed:
2262 	if (amdgpu_sriov_vf(adev))
2263 		amdgpu_virt_release_full_gpu(adev, true);
2264 
2265 	return r;
2266 }
2267 
2268 /**
2269  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2270  *
2271  * @adev: amdgpu_device pointer
2272  *
2273  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2274  * this function before a GPU reset.  If the value is retained after a
2275  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2276  */
2277 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2278 {
2279 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2280 }
2281 
2282 /**
2283  * amdgpu_device_check_vram_lost - check if vram is valid
2284  *
2285  * @adev: amdgpu_device pointer
2286  *
2287  * Checks the reset magic value written to the gart pointer in VRAM.
2288  * The driver calls this after a GPU reset to see if the contents of
2289  * VRAM have been lost or not.
2290  * Returns true if vram is lost, false if not.
2291  */
2292 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2293 {
2294 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2295 			AMDGPU_RESET_MAGIC_NUM))
2296 		return true;
2297 
2298 	if (!amdgpu_in_reset(adev))
2299 		return false;
2300 
2301 	/*
2302 	 * For all ASICs with baco/mode1 reset, the VRAM is
2303 	 * always assumed to be lost.
2304 	 */
2305 	switch (amdgpu_asic_reset_method(adev)) {
2306 	case AMD_RESET_METHOD_BACO:
2307 	case AMD_RESET_METHOD_MODE1:
2308 		return true;
2309 	default:
2310 		return false;
2311 	}
2312 }
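/*
 * The two helpers above pair up around a reset.  A simplified sketch of the
 * intended flow (illustrative only; the variable name is hypothetical):
 *
 *	amdgpu_device_fill_reset_magic(adev);		// before the reset
 *	...						// perform the ASIC reset
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost)
 *		;	// VRAM contents must be restored/re-validated
 */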
2313 
2314 /**
2315  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2316  *
2317  * @adev: amdgpu_device pointer
2318  * @state: clockgating state (gate or ungate)
2319  *
2320  * The list of all the hardware IPs that make up the asic is walked and the
2321  * set_clockgating_state callbacks are run.
2322  * During the late initialization pass this enables clockgating for hardware IPs;
2323  * during the fini or suspend passes it disables clockgating for hardware IPs.
2324  * Returns 0 on success, negative error code on failure.
2325  */
2326 
2327 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2328 						enum amd_clockgating_state state)
2329 {
2330 	int i, j, r;
2331 
2332 	if (amdgpu_emu_mode == 1)
2333 		return 0;
2334 
2335 	for (j = 0; j < adev->num_ip_blocks; j++) {
2336 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2337 		if (!adev->ip_blocks[i].status.late_initialized)
2338 			continue;
2339 		/* skip CG for VCE/UVD, it's handled specially */
2340 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2341 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2342 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2343 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2344 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2345 			/* enable clockgating to save power */
2346 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2347 										     state);
2348 			if (r) {
2349 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2350 					  adev->ip_blocks[i].version->funcs->name, r);
2351 				return r;
2352 			}
2353 		}
2354 	}
2355 
2356 	return 0;
2357 }
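/*
 * Note on the index computation above: when gating, blocks are visited
 * front to back; when ungating, the walk is reversed so that dependent
 * blocks are handled before the blocks they depend on.  For example, with
 * three IP blocks and state == AMD_CG_STATE_UNGATE the visit order is
 * 2, 1, 0.  amdgpu_device_set_pg_state() below uses the same scheme.
 */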
2358 
2359 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2360 {
2361 	int i, j, r;
2362 
2363 	if (amdgpu_emu_mode == 1)
2364 		return 0;
2365 
2366 	for (j = 0; j < adev->num_ip_blocks; j++) {
2367 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2368 		if (!adev->ip_blocks[i].status.late_initialized)
2369 			continue;
2370 		/* skip PG for VCE/UVD, it's handled specially */
2371 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2372 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2373 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2374 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2375 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2376 			/* enable powergating to save power */
2377 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2378 											state);
2379 			if (r) {
2380 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2381 					  adev->ip_blocks[i].version->funcs->name, r);
2382 				return r;
2383 			}
2384 		}
2385 	}
2386 	return 0;
2387 }
2388 
2389 static int amdgpu_device_enable_mgpu_fan_boost(void)
2390 {
2391 	struct amdgpu_gpu_instance *gpu_ins;
2392 	struct amdgpu_device *adev;
2393 	int i, ret = 0;
2394 
2395 	mutex_lock(&mgpu_info.mutex);
2396 
2397 	/*
2398 	 * MGPU fan boost feature should be enabled
2399 	 * only when there are two or more dGPUs in
2400 	 * the system
2401 	 */
2402 	if (mgpu_info.num_dgpu < 2)
2403 		goto out;
2404 
2405 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2406 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2407 		adev = gpu_ins->adev;
2408 		if (!(adev->flags & AMD_IS_APU) &&
2409 		    !gpu_ins->mgpu_fan_enabled) {
2410 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2411 			if (ret)
2412 				break;
2413 
2414 			gpu_ins->mgpu_fan_enabled = 1;
2415 		}
2416 	}
2417 
2418 out:
2419 	mutex_unlock(&mgpu_info.mutex);
2420 
2421 	return ret;
2422 }
2423 
2424 /**
2425  * amdgpu_device_ip_late_init - run late init for hardware IPs
2426  *
2427  * @adev: amdgpu_device pointer
2428  *
2429  * Late initialization pass for hardware IPs.  The list of all the hardware
2430  * IPs that make up the asic is walked and the late_init callbacks are run.
2431  * late_init covers any special initialization that an IP requires
2432  * after all of the IPs have been initialized or something that needs to happen
2433  * late in the init process.
2434  * Returns 0 on success, negative error code on failure.
2435  */
2436 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2437 {
2438 	struct amdgpu_gpu_instance *gpu_instance;
2439 	int i = 0, r;
2440 
2441 	for (i = 0; i < adev->num_ip_blocks; i++) {
2442 		if (!adev->ip_blocks[i].status.hw)
2443 			continue;
2444 		if (adev->ip_blocks[i].version->funcs->late_init) {
2445 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2446 			if (r) {
2447 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2448 					  adev->ip_blocks[i].version->funcs->name, r);
2449 				return r;
2450 			}
2451 		}
2452 		adev->ip_blocks[i].status.late_initialized = true;
2453 	}
2454 
2455 	amdgpu_ras_set_error_query_ready(adev, true);
2456 
2457 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2458 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2459 
2460 	amdgpu_device_fill_reset_magic(adev);
2461 
2462 	r = amdgpu_device_enable_mgpu_fan_boost();
2463 	if (r)
2464 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2465 
2466 
2467 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2468 		mutex_lock(&mgpu_info.mutex);
2469 
2470 		/*
2471 		 * Reset the device p-state to low, as it boots with the p-state high.
2472 		 *
2473 		 * This should be performed only after all devices from the same
2474 		 * hive get initialized.
2475 		 *
2476 		 * However, the number of devices in a hive is not known in
2477 		 * advance; it is counted one by one as each device initializes.
2478 		 *
2479 		 * So, we wait for all XGMI interlinked devices to be initialized.
2480 		 * This may bring some delays as those devices may come from
2481 		 * different hives. But that should be OK.
2482 		 */
2483 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2484 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2485 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2486 				if (gpu_instance->adev->flags & AMD_IS_APU)
2487 					continue;
2488 
2489 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2490 						AMDGPU_XGMI_PSTATE_MIN);
2491 				if (r) {
2492 					DRM_ERROR("pstate setting failed (%d).\n", r);
2493 					break;
2494 				}
2495 			}
2496 		}
2497 
2498 		mutex_unlock(&mgpu_info.mutex);
2499 	}
2500 
2501 	return 0;
2502 }
2503 
2504 /**
2505  * amdgpu_device_ip_fini - run fini for hardware IPs
2506  *
2507  * @adev: amdgpu_device pointer
2508  *
2509  * Main teardown pass for hardware IPs.  The list of all the hardware
2510  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2511  * are run.  hw_fini tears down the hardware associated with each IP
2512  * and sw_fini tears down any software state associated with each IP.
2513  * Returns 0 on success, negative error code on failure.
2514  */
2515 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2516 {
2517 	int i, r;
2518 
2519 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2520 		amdgpu_virt_release_ras_err_handler_data(adev);
2521 
2522 	amdgpu_ras_pre_fini(adev);
2523 
2524 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2525 		amdgpu_xgmi_remove_device(adev);
2526 
2527 	amdgpu_amdkfd_device_fini(adev);
2528 
2529 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2530 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2531 
2532 	/* need to disable SMC first */
2533 	for (i = 0; i < adev->num_ip_blocks; i++) {
2534 		if (!adev->ip_blocks[i].status.hw)
2535 			continue;
2536 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2537 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2538 			/* XXX handle errors */
2539 			if (r) {
2540 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2541 					  adev->ip_blocks[i].version->funcs->name, r);
2542 			}
2543 			adev->ip_blocks[i].status.hw = false;
2544 			break;
2545 		}
2546 	}
2547 
2548 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2549 		if (!adev->ip_blocks[i].status.hw)
2550 			continue;
2551 
2552 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2553 		/* XXX handle errors */
2554 		if (r) {
2555 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2556 				  adev->ip_blocks[i].version->funcs->name, r);
2557 		}
2558 
2559 		adev->ip_blocks[i].status.hw = false;
2560 	}
2561 
2562 
2563 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2564 		if (!adev->ip_blocks[i].status.sw)
2565 			continue;
2566 
2567 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2568 			amdgpu_ucode_free_bo(adev);
2569 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2570 			amdgpu_device_wb_fini(adev);
2571 			amdgpu_device_vram_scratch_fini(adev);
2572 			amdgpu_ib_pool_fini(adev);
2573 		}
2574 
2575 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2576 		/* XXX handle errors */
2577 		if (r) {
2578 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2579 				  adev->ip_blocks[i].version->funcs->name, r);
2580 		}
2581 		adev->ip_blocks[i].status.sw = false;
2582 		adev->ip_blocks[i].status.valid = false;
2583 	}
2584 
2585 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2586 		if (!adev->ip_blocks[i].status.late_initialized)
2587 			continue;
2588 		if (adev->ip_blocks[i].version->funcs->late_fini)
2589 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2590 		adev->ip_blocks[i].status.late_initialized = false;
2591 	}
2592 
2593 	amdgpu_ras_fini(adev);
2594 
2595 	if (amdgpu_sriov_vf(adev))
2596 		if (amdgpu_virt_release_full_gpu(adev, false))
2597 			DRM_ERROR("failed to release exclusive mode on fini\n");
2598 
2599 	return 0;
2600 }
2601 
2602 /**
2603  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2604  *
2605  * @work: work_struct.
2606  */
2607 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2608 {
2609 	struct amdgpu_device *adev =
2610 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2611 	int r;
2612 
2613 	r = amdgpu_ib_ring_tests(adev);
2614 	if (r)
2615 		DRM_ERROR("ib ring test failed (%d).\n", r);
2616 }
2617 
2618 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2619 {
2620 	struct amdgpu_device *adev =
2621 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2622 
2623 	mutex_lock(&adev->gfx.gfx_off_mutex);
2624 	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2625 		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2626 			adev->gfx.gfx_off_state = true;
2627 	}
2628 	mutex_unlock(&adev->gfx.gfx_off_mutex);
2629 }
2630 
2631 /**
2632  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2633  *
2634  * @adev: amdgpu_device pointer
2635  *
2636  * Main suspend function for hardware IPs.  The list of all the hardware
2637  * IPs that make up the asic is walked, clockgating is disabled and the
2638  * suspend callbacks are run.  suspend puts the hardware and software state
2639  * in each IP into a state suitable for suspend.
2640  * Returns 0 on success, negative error code on failure.
2641  */
2642 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2643 {
2644 	int i, r;
2645 
2646 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2647 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2648 
2649 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2650 		if (!adev->ip_blocks[i].status.valid)
2651 			continue;
2652 
2653 		/* displays are handled separately */
2654 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2655 			continue;
2656 
2657 		/* XXX handle errors */
2658 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2659 		/* XXX handle errors */
2660 		if (r) {
2661 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2662 				  adev->ip_blocks[i].version->funcs->name, r);
2663 			return r;
2664 		}
2665 
2666 		adev->ip_blocks[i].status.hw = false;
2667 	}
2668 
2669 	return 0;
2670 }
2671 
2672 /**
2673  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2674  *
2675  * @adev: amdgpu_device pointer
2676  *
2677  * Main suspend function for hardware IPs.  The list of all the hardware
2678  * IPs that make up the asic is walked, clockgating is disabled and the
2679  * suspend callbacks are run.  suspend puts the hardware and software state
2680  * in each IP into a state suitable for suspend.
2681  * Returns 0 on success, negative error code on failure.
2682  */
2683 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2684 {
2685 	int i, r;
2686 
2687 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2688 		if (!adev->ip_blocks[i].status.valid)
2689 			continue;
2690 		/* displays are handled in phase1 */
2691 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2692 			continue;
2693 		/* PSP lost connection when err_event_athub occurs */
2694 		if (amdgpu_ras_intr_triggered() &&
2695 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2696 			adev->ip_blocks[i].status.hw = false;
2697 			continue;
2698 		}
2699 		/* XXX handle errors */
2700 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2701 		/* XXX handle errors */
2702 		if (r) {
2703 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2704 				  adev->ip_blocks[i].version->funcs->name, r);
2705 		}
2706 		adev->ip_blocks[i].status.hw = false;
2707 		/* handle putting the SMC in the appropriate state */
2708 		if (!amdgpu_sriov_vf(adev)) {
2709 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2710 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2711 				if (r) {
2712 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2713 							adev->mp1_state, r);
2714 					return r;
2715 				}
2716 			}
2717 		}
2718 		adev->ip_blocks[i].status.hw = false;
2719 	}
2720 
2721 	return 0;
2722 }
2723 
2724 /**
2725  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2726  *
2727  * @adev: amdgpu_device pointer
2728  *
2729  * Main suspend function for hardware IPs.  The list of all the hardware
2730  * IPs that make up the asic is walked, clockgating is disabled and the
2731  * suspend callbacks are run.  suspend puts the hardware and software state
2732  * in each IP into a state suitable for suspend.
2733  * Returns 0 on success, negative error code on failure.
2734  */
2735 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2736 {
2737 	int r;
2738 
2739 	if (amdgpu_sriov_vf(adev))
2740 		amdgpu_virt_request_full_gpu(adev, false);
2741 
2742 	r = amdgpu_device_ip_suspend_phase1(adev);
2743 	if (r)
2744 		return r;
2745 	r = amdgpu_device_ip_suspend_phase2(adev);
2746 
2747 	if (amdgpu_sriov_vf(adev))
2748 		amdgpu_virt_release_full_gpu(adev, false);
2749 
2750 	return r;
2751 }
2752 
2753 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2754 {
2755 	int i, r;
2756 
2757 	static enum amd_ip_block_type ip_order[] = {
2758 		AMD_IP_BLOCK_TYPE_GMC,
2759 		AMD_IP_BLOCK_TYPE_COMMON,
2760 		AMD_IP_BLOCK_TYPE_PSP,
2761 		AMD_IP_BLOCK_TYPE_IH,
2762 	};
2763 
2764 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2765 		int j;
2766 		struct amdgpu_ip_block *block;
2767 
2768 		block = &adev->ip_blocks[i];
2769 		block->status.hw = false;
2770 
2771 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2772 
2773 			if (block->version->type != ip_order[j] ||
2774 				!block->status.valid)
2775 				continue;
2776 
2777 			r = block->version->funcs->hw_init(adev);
2778 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2779 			if (r)
2780 				return r;
2781 			block->status.hw = true;
2782 		}
2783 	}
2784 
2785 	return 0;
2786 }
2787 
2788 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2789 {
2790 	int i, r;
2791 
2792 	static enum amd_ip_block_type ip_order[] = {
2793 		AMD_IP_BLOCK_TYPE_SMC,
2794 		AMD_IP_BLOCK_TYPE_DCE,
2795 		AMD_IP_BLOCK_TYPE_GFX,
2796 		AMD_IP_BLOCK_TYPE_SDMA,
2797 		AMD_IP_BLOCK_TYPE_UVD,
2798 		AMD_IP_BLOCK_TYPE_VCE,
2799 		AMD_IP_BLOCK_TYPE_VCN
2800 	};
2801 
2802 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2803 		int j;
2804 		struct amdgpu_ip_block *block;
2805 
2806 		for (j = 0; j < adev->num_ip_blocks; j++) {
2807 			block = &adev->ip_blocks[j];
2808 
2809 			if (block->version->type != ip_order[i] ||
2810 				!block->status.valid ||
2811 				block->status.hw)
2812 				continue;
2813 
2814 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2815 				r = block->version->funcs->resume(adev);
2816 			else
2817 				r = block->version->funcs->hw_init(adev);
2818 
2819 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2820 			if (r)
2821 				return r;
2822 			block->status.hw = true;
2823 		}
2824 	}
2825 
2826 	return 0;
2827 }
2828 
2829 /**
2830  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2831  *
2832  * @adev: amdgpu_device pointer
2833  *
2834  * First resume function for hardware IPs.  The list of all the hardware
2835  * IPs that make up the asic is walked and the resume callbacks are run for
2836  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2837  * after a suspend and updates the software state as necessary.  This
2838  * function is also used for restoring the GPU after a GPU reset.
2839  * Returns 0 on success, negative error code on failure.
2840  */
2841 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2842 {
2843 	int i, r;
2844 
2845 	for (i = 0; i < adev->num_ip_blocks; i++) {
2846 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2847 			continue;
2848 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2849 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2850 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2851 
2852 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2853 			if (r) {
2854 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2855 					  adev->ip_blocks[i].version->funcs->name, r);
2856 				return r;
2857 			}
2858 			adev->ip_blocks[i].status.hw = true;
2859 		}
2860 	}
2861 
2862 	return 0;
2863 }
2864 
2865 /**
2866  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2867  *
2868  * @adev: amdgpu_device pointer
2869  *
2870  * Second resume function for hardware IPs.  The list of all the hardware
2871  * IPs that make up the asic is walked and the resume callbacks are run for
2872  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2873  * functional state after a suspend and updates the software state as
2874  * necessary.  This function is also used for restoring the GPU after a GPU
2875  * reset.
2876  * Returns 0 on success, negative error code on failure.
2877  */
2878 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2879 {
2880 	int i, r;
2881 
2882 	for (i = 0; i < adev->num_ip_blocks; i++) {
2883 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2884 			continue;
2885 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2886 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2887 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2888 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2889 			continue;
2890 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2891 		if (r) {
2892 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2893 				  adev->ip_blocks[i].version->funcs->name, r);
2894 			return r;
2895 		}
2896 		adev->ip_blocks[i].status.hw = true;
2897 	}
2898 
2899 	return 0;
2900 }
2901 
2902 /**
2903  * amdgpu_device_ip_resume - run resume for hardware IPs
2904  *
2905  * @adev: amdgpu_device pointer
2906  *
2907  * Main resume function for hardware IPs.  The hardware IPs
2908  * are split into two resume functions because they are
2909  * also used in recovering from a GPU reset and some additional
2910  * steps need to be taken between them.  In this case (S3/S4) they are
2911  * run sequentially.
2912  * Returns 0 on success, negative error code on failure.
2913  */
2914 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2915 {
2916 	int r;
2917 
2918 	r = amdgpu_device_ip_resume_phase1(adev);
2919 	if (r)
2920 		return r;
2921 
2922 	r = amdgpu_device_fw_loading(adev);
2923 	if (r)
2924 		return r;
2925 
2926 	r = amdgpu_device_ip_resume_phase2(adev);
2927 
2928 	return r;
2929 }
2930 
2931 /**
2932  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2933  *
2934  * @adev: amdgpu_device pointer
2935  *
2936  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2937  */
2938 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2939 {
2940 	if (amdgpu_sriov_vf(adev)) {
2941 		if (adev->is_atom_fw) {
2942 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2943 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2944 		} else {
2945 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2946 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2947 		}
2948 
2949 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2950 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2951 	}
2952 }
2953 
2954 /**
2955  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2956  *
2957  * @asic_type: AMD asic type
2958  *
2959  * Check if there is DC (new modesetting infrastructure) support for an asic.
2960  * returns true if DC has support, false if not.
2961  */
2962 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2963 {
2964 	switch (asic_type) {
2965 #if defined(CONFIG_DRM_AMD_DC)
2966 #if defined(CONFIG_DRM_AMD_DC_SI)
2967 	case CHIP_TAHITI:
2968 	case CHIP_PITCAIRN:
2969 	case CHIP_VERDE:
2970 	case CHIP_OLAND:
2971 #endif
2972 	case CHIP_BONAIRE:
2973 	case CHIP_KAVERI:
2974 	case CHIP_KABINI:
2975 	case CHIP_MULLINS:
2976 		/*
2977 		 * We have systems in the wild with these ASICs that require
2978 		 * LVDS and VGA support which is not supported with DC.
2979 		 *
2980 		 * Fallback to the non-DC driver here by default so as not to
2981 		 * cause regressions.
2982 		 */
2983 		return amdgpu_dc > 0;
2984 	case CHIP_HAWAII:
2985 	case CHIP_CARRIZO:
2986 	case CHIP_STONEY:
2987 	case CHIP_POLARIS10:
2988 	case CHIP_POLARIS11:
2989 	case CHIP_POLARIS12:
2990 	case CHIP_VEGAM:
2991 	case CHIP_TONGA:
2992 	case CHIP_FIJI:
2993 	case CHIP_VEGA10:
2994 	case CHIP_VEGA12:
2995 	case CHIP_VEGA20:
2996 #if defined(CONFIG_DRM_AMD_DC_DCN)
2997 	case CHIP_RAVEN:
2998 	case CHIP_NAVI10:
2999 	case CHIP_NAVI14:
3000 	case CHIP_NAVI12:
3001 	case CHIP_RENOIR:
3002 #endif
3003 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3004 	case CHIP_SIENNA_CICHLID:
3005 	case CHIP_NAVY_FLOUNDER:
3006 #endif
3007 		return amdgpu_dc != 0;
3008 #endif
3009 	default:
3010 		if (amdgpu_dc > 0)
3011 			DRM_INFO("Display Core has been requested via kernel parameter "
3012 					 "but isn't supported by ASIC, ignoring\n");
3013 		return false;
3014 	}
3015 }
3016 
3017 /**
3018  * amdgpu_device_has_dc_support - check if dc is supported
3019  *
3020  * @adev: amdgpu_device pointer
3021  *
3022  * Returns true for supported, false for not supported
3023  */
3024 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3025 {
3026 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3027 		return false;
3028 
3029 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3030 }
3031 
3032 
3033 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3034 {
3035 	struct amdgpu_device *adev =
3036 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3037 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3038 
3039 	/* It's a bug to not have a hive within this function */
3040 	if (WARN_ON(!hive))
3041 		return;
3042 
3043 	/*
3044 	 * Use task barrier to synchronize all xgmi reset works across the
3045 	 * hive. task_barrier_enter and task_barrier_exit will block
3046 	 * until all the threads running the xgmi reset works reach
3047 	 * those points. task_barrier_full will do both blocks.
3048 	 */
3049 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3050 
3051 		task_barrier_enter(&hive->tb);
3052 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3053 
3054 		if (adev->asic_reset_res)
3055 			goto fail;
3056 
3057 		task_barrier_exit(&hive->tb);
3058 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3059 
3060 		if (adev->asic_reset_res)
3061 			goto fail;
3062 
3063 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3064 			adev->mmhub.funcs->reset_ras_error_count(adev);
3065 	} else {
3066 
3067 		task_barrier_full(&hive->tb);
3068 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3069 	}
3070 
3071 fail:
3072 	if (adev->asic_reset_res)
3073 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3074 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3075 	amdgpu_put_xgmi_hive(hive);
3076 }
3077 
3078 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3079 {
3080 	char *input = amdgpu_lockup_timeout;
3081 	char *timeout_setting = NULL;
3082 	int index = 0;
3083 	long timeout;
3084 	int ret = 0;
3085 
3086 	/*
3087 	 * By default the timeout for non-compute jobs is 10000 ms
3088 	 * and there is no timeout enforced on compute jobs.
3089 	 * In SR-IOV or passthrough mode, the timeout for compute
3090 	 * jobs is 60000 ms by default.
3091 	 */
3092 	adev->gfx_timeout = msecs_to_jiffies(10000);
3093 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3094 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3095 		adev->compute_timeout =  msecs_to_jiffies(60000);
3096 	else
3097 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3098 
3099 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3100 		while ((timeout_setting = strsep(&input, ",")) &&
3101 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3102 			ret = kstrtol(timeout_setting, 0, &timeout);
3103 			if (ret)
3104 				return ret;
3105 
3106 			if (timeout == 0) {
3107 				index++;
3108 				continue;
3109 			} else if (timeout < 0) {
3110 				timeout = MAX_SCHEDULE_TIMEOUT;
3111 			} else {
3112 				timeout = msecs_to_jiffies(timeout);
3113 			}
3114 
3115 			switch (index++) {
3116 			case 0:
3117 				adev->gfx_timeout = timeout;
3118 				break;
3119 			case 1:
3120 				adev->compute_timeout = timeout;
3121 				break;
3122 			case 2:
3123 				adev->sdma_timeout = timeout;
3124 				break;
3125 			case 3:
3126 				adev->video_timeout = timeout;
3127 				break;
3128 			default:
3129 				break;
3130 			}
3131 		}
3132 		/*
3133 		 * There is only one value specified and
3134 		 * it should apply to all non-compute jobs.
3135 		 */
3136 		if (index == 1) {
3137 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3138 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3139 				adev->compute_timeout = adev->gfx_timeout;
3140 		}
3141 	}
3142 
3143 	return ret;
3144 }
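/*
 * Example values for the lockup_timeout parameter (the numbers are only
 * illustrative; the format follows the parsing above): up to four
 * comma-separated values in milliseconds, in the order gfx,compute,sdma,video.
 * A single value applies to all non-compute jobs (and to compute jobs under
 * SR-IOV/passthrough), 0 keeps the default for that slot and a negative value
 * disables the timeout:
 *
 *	modprobe amdgpu lockup_timeout=10000
 *	modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 */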
3145 
3146 static const struct attribute *amdgpu_dev_attributes[] = {
3147 	&dev_attr_product_name.attr,
3148 	&dev_attr_product_number.attr,
3149 	&dev_attr_serial_number.attr,
3150 	&dev_attr_pcie_replay_count.attr,
3151 	NULL
3152 };
3153 
3154 
3155 /**
3156  * amdgpu_device_init - initialize the driver
3157  *
3158  * @adev: amdgpu_device pointer
3159  * @flags: driver flags
3160  *
3161  * Initializes the driver info and hw (all asics).
3162  * Returns 0 for success or an error on failure.
3163  * Called at driver startup.
3164  */
3165 int amdgpu_device_init(struct amdgpu_device *adev,
3166 		       uint32_t flags)
3167 {
3168 	struct drm_device *ddev = adev_to_drm(adev);
3169 	struct pci_dev *pdev = adev->pdev;
3170 	int r, i;
3171 	bool boco = false;
3172 	u32 max_MBps;
3173 
3174 	adev->shutdown = false;
3175 	adev->flags = flags;
3176 
3177 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3178 		adev->asic_type = amdgpu_force_asic_type;
3179 	else
3180 		adev->asic_type = flags & AMD_ASIC_MASK;
3181 
3182 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3183 	if (amdgpu_emu_mode == 1)
3184 		adev->usec_timeout *= 10;
3185 	adev->gmc.gart_size = 512 * 1024 * 1024;
3186 	adev->accel_working = false;
3187 	adev->num_rings = 0;
3188 	adev->mman.buffer_funcs = NULL;
3189 	adev->mman.buffer_funcs_ring = NULL;
3190 	adev->vm_manager.vm_pte_funcs = NULL;
3191 	adev->vm_manager.vm_pte_num_scheds = 0;
3192 	adev->gmc.gmc_funcs = NULL;
3193 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3194 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3195 
3196 	adev->smc_rreg = &amdgpu_invalid_rreg;
3197 	adev->smc_wreg = &amdgpu_invalid_wreg;
3198 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3199 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3200 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3201 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3202 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3203 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3204 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3205 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3206 	adev->didt_rreg = &amdgpu_invalid_rreg;
3207 	adev->didt_wreg = &amdgpu_invalid_wreg;
3208 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3209 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3210 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3211 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3212 
3213 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3214 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3215 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3216 
3217 	/* mutex initialization is done here so we
3218 	 * can call these functions without worrying about locking issues */
3219 	atomic_set(&adev->irq.ih.lock, 0);
3220 	mutex_init(&adev->firmware.mutex);
3221 	mutex_init(&adev->pm.mutex);
3222 	mutex_init(&adev->gfx.gpu_clock_mutex);
3223 	mutex_init(&adev->srbm_mutex);
3224 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3225 	mutex_init(&adev->gfx.gfx_off_mutex);
3226 	mutex_init(&adev->grbm_idx_mutex);
3227 	mutex_init(&adev->mn_lock);
3228 	mutex_init(&adev->virt.vf_errors.lock);
3229 	hash_init(adev->mn_hash);
3230 	atomic_set(&adev->in_gpu_reset, 0);
3231 	init_rwsem(&adev->reset_sem);
3232 	mutex_init(&adev->psp.mutex);
3233 	mutex_init(&adev->notifier_lock);
3234 
3235 	r = amdgpu_device_check_arguments(adev);
3236 	if (r)
3237 		return r;
3238 
3239 	spin_lock_init(&adev->mmio_idx_lock);
3240 	spin_lock_init(&adev->smc_idx_lock);
3241 	spin_lock_init(&adev->pcie_idx_lock);
3242 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3243 	spin_lock_init(&adev->didt_idx_lock);
3244 	spin_lock_init(&adev->gc_cac_idx_lock);
3245 	spin_lock_init(&adev->se_cac_idx_lock);
3246 	spin_lock_init(&adev->audio_endpt_idx_lock);
3247 	spin_lock_init(&adev->mm_stats.lock);
3248 
3249 	INIT_LIST_HEAD(&adev->shadow_list);
3250 	mutex_init(&adev->shadow_list_lock);
3251 
3252 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3253 			  amdgpu_device_delayed_init_work_handler);
3254 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3255 			  amdgpu_device_delay_enable_gfx_off);
3256 
3257 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3258 
3259 	adev->gfx.gfx_off_req_count = 1;
3260 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3261 
3262 	atomic_set(&adev->throttling_logging_enabled, 1);
3263 	/*
3264 	 * If throttling continues, logging will be performed every minute
3265 	 * to avoid log flooding. "-1" is subtracted since the thermal
3266 	 * throttling interrupt comes every second. Thus, the total logging
3267 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3268 	 * for throttling interrupt) = 60 seconds.
3269 	 */
3270 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3271 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3272 
3273 	/* Registers mapping */
3274 	/* TODO: block userspace mapping of io register */
3275 	if (adev->asic_type >= CHIP_BONAIRE) {
3276 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3277 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3278 	} else {
3279 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3280 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3281 	}
3282 
3283 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3284 	if (adev->rmmio == NULL) {
3285 		return -ENOMEM;
3286 	}
3287 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3288 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3289 
3290 	/* io port mapping */
3291 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3292 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3293 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3294 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3295 			break;
3296 		}
3297 	}
3298 	if (adev->rio_mem == NULL)
3299 		DRM_INFO("PCI I/O BAR is not found.\n");
3300 
3301 	/* enable PCIE atomic ops */
3302 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3303 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3304 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3305 	if (r) {
3306 		adev->have_atomics_support = false;
3307 		DRM_INFO("PCIE atomic ops is not supported\n");
3308 	} else {
3309 		adev->have_atomics_support = true;
3310 	}
3311 
3312 	amdgpu_device_get_pcie_info(adev);
3313 
3314 	if (amdgpu_mcbp)
3315 		DRM_INFO("MCBP is enabled\n");
3316 
3317 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3318 		adev->enable_mes = true;
3319 
3320 	/* detect hw virtualization here */
3321 	amdgpu_detect_virtualization(adev);
3322 
3323 	r = amdgpu_device_get_job_timeout_settings(adev);
3324 	if (r) {
3325 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3326 		goto failed_unmap;
3327 	}
3328 
3329 	/* early init functions */
3330 	r = amdgpu_device_ip_early_init(adev);
3331 	if (r)
3332 		goto failed_unmap;
3333 
3334 	/* doorbell bar mapping and doorbell index init*/
3335 	amdgpu_device_doorbell_init(adev);
3336 
3337 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3338 	/* this will fail for cards that aren't VGA class devices, just
3339 	 * ignore it */
3340 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3341 
3342 	if (amdgpu_device_supports_boco(ddev))
3343 		boco = true;
3344 	if (amdgpu_has_atpx() &&
3345 	    (amdgpu_is_atpx_hybrid() ||
3346 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3347 	    !pci_is_thunderbolt_attached(adev->pdev))
3348 		vga_switcheroo_register_client(adev->pdev,
3349 					       &amdgpu_switcheroo_ops, boco);
3350 	if (boco)
3351 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3352 
3353 	if (amdgpu_emu_mode == 1) {
3354 		/* post the asic on emulation mode */
3355 		emu_soc_asic_init(adev);
3356 		goto fence_driver_init;
3357 	}
3358 
3359 	/* detect if we are with an SRIOV vbios */
3360 	amdgpu_device_detect_sriov_bios(adev);
3361 
3362 	/* check if we need to reset the asic
3363 	 *  E.g., driver was not cleanly unloaded previously, etc.
3364 	 */
3365 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3366 		r = amdgpu_asic_reset(adev);
3367 		if (r) {
3368 			dev_err(adev->dev, "asic reset on init failed\n");
3369 			goto failed;
3370 		}
3371 	}
3372 
3373 	pci_enable_pcie_error_reporting(adev->ddev.pdev);
3374 
3375 	/* Post card if necessary */
3376 	if (amdgpu_device_need_post(adev)) {
3377 		if (!adev->bios) {
3378 			dev_err(adev->dev, "no vBIOS found\n");
3379 			r = -EINVAL;
3380 			goto failed;
3381 		}
3382 		DRM_INFO("GPU posting now...\n");
3383 		r = amdgpu_device_asic_init(adev);
3384 		if (r) {
3385 			dev_err(adev->dev, "gpu post error!\n");
3386 			goto failed;
3387 		}
3388 	}
3389 
3390 	if (adev->is_atom_fw) {
3391 		/* Initialize clocks */
3392 		r = amdgpu_atomfirmware_get_clock_info(adev);
3393 		if (r) {
3394 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3395 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3396 			goto failed;
3397 		}
3398 	} else {
3399 		/* Initialize clocks */
3400 		r = amdgpu_atombios_get_clock_info(adev);
3401 		if (r) {
3402 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3403 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3404 			goto failed;
3405 		}
3406 		/* init i2c buses */
3407 		if (!amdgpu_device_has_dc_support(adev))
3408 			amdgpu_atombios_i2c_init(adev);
3409 	}
3410 
3411 fence_driver_init:
3412 	/* Fence driver */
3413 	r = amdgpu_fence_driver_init(adev);
3414 	if (r) {
3415 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3416 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3417 		goto failed;
3418 	}
3419 
3420 	/* init the mode config */
3421 	drm_mode_config_init(adev_to_drm(adev));
3422 
3423 	r = amdgpu_device_ip_init(adev);
3424 	if (r) {
3425 		/* failed in exclusive mode due to timeout */
3426 		if (amdgpu_sriov_vf(adev) &&
3427 		    !amdgpu_sriov_runtime(adev) &&
3428 		    amdgpu_virt_mmio_blocked(adev) &&
3429 		    !amdgpu_virt_wait_reset(adev)) {
3430 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3431 			/* Don't send request since VF is inactive. */
3432 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3433 			adev->virt.ops = NULL;
3434 			r = -EAGAIN;
3435 			goto failed;
3436 		}
3437 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3438 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3439 		goto failed;
3440 	}
3441 
3442 	dev_info(adev->dev,
3443 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3444 			adev->gfx.config.max_shader_engines,
3445 			adev->gfx.config.max_sh_per_se,
3446 			adev->gfx.config.max_cu_per_sh,
3447 			adev->gfx.cu_info.number);
3448 
3449 	adev->accel_working = true;
3450 
3451 	amdgpu_vm_check_compute_bug(adev);
3452 
3453 	/* Initialize the buffer migration limit. */
3454 	if (amdgpu_moverate >= 0)
3455 		max_MBps = amdgpu_moverate;
3456 	else
3457 		max_MBps = 8; /* Allow 8 MB/s. */
3458 	/* Get a log2 for easy divisions. */
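	/* For example, the default max_MBps of 8 gives log2_max_MBps = 3. */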
3459 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3460 
3461 	amdgpu_fbdev_init(adev);
3462 
3463 	r = amdgpu_pm_sysfs_init(adev);
3464 	if (r) {
3465 		adev->pm_sysfs_en = false;
3466 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3467 	} else
3468 		adev->pm_sysfs_en = true;
3469 
3470 	r = amdgpu_ucode_sysfs_init(adev);
3471 	if (r) {
3472 		adev->ucode_sysfs_en = false;
3473 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3474 	} else
3475 		adev->ucode_sysfs_en = true;
3476 
3477 	if ((amdgpu_testing & 1)) {
3478 		if (adev->accel_working)
3479 			amdgpu_test_moves(adev);
3480 		else
3481 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3482 	}
3483 	if (amdgpu_benchmarking) {
3484 		if (adev->accel_working)
3485 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3486 		else
3487 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3488 	}
3489 
3490 	/*
3491 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3492 	 * Otherwise the mgpu fan boost feature will be skipped because the
3493 	 * gpu instance count would be too low.
3494 	 */
3495 	amdgpu_register_gpu_instance(adev);
3496 
3497 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3498 	 * explicit gating rather than handling it automatically.
3499 	 */
3500 	r = amdgpu_device_ip_late_init(adev);
3501 	if (r) {
3502 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3503 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3504 		goto failed;
3505 	}
3506 
3507 	/* must succeed. */
3508 	amdgpu_ras_resume(adev);
3509 
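	/*
	 * Defer the remaining delayed init work so it does not block probe;
	 * for SR-IOV it is flushed immediately below.
	 */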
3510 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3511 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3512 
3513 	if (amdgpu_sriov_vf(adev))
3514 		flush_delayed_work(&adev->delayed_init_work);
3515 
3516 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3517 	if (r)
3518 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3519 
3520 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3521 		r = amdgpu_pmu_init(adev);
3522 	if (r)
3523 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3524 
3525 	/* Have stored pci confspace at hand for restore in sudden PCI error */
3526 	if (amdgpu_device_cache_pci_state(adev->pdev))
3527 		pci_restore_state(pdev);
3528 
3529 	return 0;
3530 
3531 failed:
3532 	amdgpu_vf_error_trans_all(adev);
3533 	if (boco)
3534 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3535 
3536 failed_unmap:
3537 	iounmap(adev->rmmio);
3538 	adev->rmmio = NULL;
3539 
3540 	return r;
3541 }
3542 
3543 /**
3544  * amdgpu_device_fini - tear down the driver
3545  *
3546  * @adev: amdgpu_device pointer
3547  *
3548  * Tear down the driver info (all asics).
3549  * Called at driver shutdown.
3550  */
3551 void amdgpu_device_fini(struct amdgpu_device *adev)
3552 {
3553 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3554 	flush_delayed_work(&adev->delayed_init_work);
3555 	adev->shutdown = true;
3556 
3557 	kfree(adev->pci_state);
3558 
3559 	/* make sure IB tests have finished before entering exclusive mode
3560 	 * to avoid preemption on the IB tests
3561 	 */
3562 	if (amdgpu_sriov_vf(adev)) {
3563 		amdgpu_virt_request_full_gpu(adev, false);
3564 		amdgpu_virt_fini_data_exchange(adev);
3565 	}
3566 
3567 	/* disable all interrupts */
3568 	amdgpu_irq_disable_all(adev);
3569 	if (adev->mode_info.mode_config_initialized) {
3570 		if (!amdgpu_device_has_dc_support(adev))
3571 			drm_helper_force_disable_all(adev_to_drm(adev));
3572 		else
3573 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3574 	}
3575 	amdgpu_fence_driver_fini(adev);
3576 	if (adev->pm_sysfs_en)
3577 		amdgpu_pm_sysfs_fini(adev);
3578 	amdgpu_fbdev_fini(adev);
3579 	amdgpu_device_ip_fini(adev);
3580 	release_firmware(adev->firmware.gpu_info_fw);
3581 	adev->firmware.gpu_info_fw = NULL;
3582 	adev->accel_working = false;
3583 	/* free i2c buses */
3584 	if (!amdgpu_device_has_dc_support(adev))
3585 		amdgpu_i2c_fini(adev);
3586 
3587 	if (amdgpu_emu_mode != 1)
3588 		amdgpu_atombios_fini(adev);
3589 
3590 	kfree(adev->bios);
3591 	adev->bios = NULL;
3592 	if (amdgpu_has_atpx() &&
3593 	    (amdgpu_is_atpx_hybrid() ||
3594 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3595 	    !pci_is_thunderbolt_attached(adev->pdev))
3596 		vga_switcheroo_unregister_client(adev->pdev);
3597 	if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3598 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3599 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3600 	if (adev->rio_mem)
3601 		pci_iounmap(adev->pdev, adev->rio_mem);
3602 	adev->rio_mem = NULL;
3603 	iounmap(adev->rmmio);
3604 	adev->rmmio = NULL;
3605 	amdgpu_device_doorbell_fini(adev);
3606 
3607 	if (adev->ucode_sysfs_en)
3608 		amdgpu_ucode_sysfs_fini(adev);
3609 
3610 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3611 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3612 		amdgpu_pmu_fini(adev);
3613 	if (adev->mman.discovery_bin)
3614 		amdgpu_discovery_fini(adev);
3615 }
3616 
3617 
3618 /*
3619  * Suspend & resume.
3620  */
3621 /**
3622  * amdgpu_device_suspend - initiate device suspend
3623  *
3624  * @dev: drm dev pointer
3625  * @fbcon: notify the fbdev of suspend
3626  *
3627  * Puts the hw in the suspend state (all asics).
3628  * Returns 0 for success or an error on failure.
3629  * Called at driver suspend.
3630  */
3631 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3632 {
3633 	struct amdgpu_device *adev;
3634 	struct drm_crtc *crtc;
3635 	struct drm_connector *connector;
3636 	struct drm_connector_list_iter iter;
3637 	int r;
3638 
3639 	adev = drm_to_adev(dev);
3640 
3641 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3642 		return 0;
3643 
3644 	adev->in_suspend = true;
3645 	drm_kms_helper_poll_disable(dev);
3646 
3647 	if (fbcon)
3648 		amdgpu_fbdev_set_suspend(adev, 1);
3649 
3650 	cancel_delayed_work_sync(&adev->delayed_init_work);
3651 
3652 	if (!amdgpu_device_has_dc_support(adev)) {
3653 		/* turn off display hw */
3654 		drm_modeset_lock_all(dev);
3655 		drm_connector_list_iter_begin(dev, &iter);
3656 		drm_for_each_connector_iter(connector, &iter)
3657 			drm_helper_connector_dpms(connector,
3658 						  DRM_MODE_DPMS_OFF);
3659 		drm_connector_list_iter_end(&iter);
3660 		drm_modeset_unlock_all(dev);
3661 			/* unpin the front buffers and cursors */
3662 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3663 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3664 			struct drm_framebuffer *fb = crtc->primary->fb;
3665 			struct amdgpu_bo *robj;
3666 
3667 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3668 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3669 				r = amdgpu_bo_reserve(aobj, true);
3670 				if (r == 0) {
3671 					amdgpu_bo_unpin(aobj);
3672 					amdgpu_bo_unreserve(aobj);
3673 				}
3674 			}
3675 
3676 			if (fb == NULL || fb->obj[0] == NULL) {
3677 				continue;
3678 			}
3679 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3680 			/* don't unpin kernel fb objects */
3681 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3682 				r = amdgpu_bo_reserve(robj, true);
3683 				if (r == 0) {
3684 					amdgpu_bo_unpin(robj);
3685 					amdgpu_bo_unreserve(robj);
3686 				}
3687 			}
3688 		}
3689 	}
3690 
3691 	amdgpu_ras_suspend(adev);
3692 
3693 	r = amdgpu_device_ip_suspend_phase1(adev);
3694 
3695 	amdgpu_amdkfd_suspend(adev, !fbcon);
3696 
3697 	/* evict vram memory */
3698 	amdgpu_bo_evict_vram(adev);
3699 
3700 	amdgpu_fence_driver_suspend(adev);
3701 
3702 	r = amdgpu_device_ip_suspend_phase2(adev);
3703 
3704 	/* evict remaining vram memory
3705 	 * This second call to evict vram is to evict the gart page table
3706 	 * using the CPU.
3707 	 */
3708 	amdgpu_bo_evict_vram(adev);
3709 
3710 	return 0;
3711 }
3712 
3713 /**
3714  * amdgpu_device_resume - initiate device resume
3715  *
3716  * @dev: drm dev pointer
3717  * @fbcon: notify the fbdev of resume
3718  *
3719  * Bring the hw back to operating state (all asics).
3720  * Returns 0 for success or an error on failure.
3721  * Called at driver resume.
3722  */
3723 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3724 {
3725 	struct drm_connector *connector;
3726 	struct drm_connector_list_iter iter;
3727 	struct amdgpu_device *adev = drm_to_adev(dev);
3728 	struct drm_crtc *crtc;
3729 	int r = 0;
3730 
3731 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3732 		return 0;
3733 
3734 	/* post card */
3735 	if (amdgpu_device_need_post(adev)) {
3736 		r = amdgpu_device_asic_init(adev);
3737 		if (r)
3738 			dev_err(adev->dev, "amdgpu asic init failed\n");
3739 	}
3740 
3741 	r = amdgpu_device_ip_resume(adev);
3742 	if (r) {
3743 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3744 		return r;
3745 	}
3746 	amdgpu_fence_driver_resume(adev);
3747 
3748 
3749 	r = amdgpu_device_ip_late_init(adev);
3750 	if (r)
3751 		return r;
3752 
3753 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3754 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3755 
3756 	if (!amdgpu_device_has_dc_support(adev)) {
3757 		/* pin cursors */
3758 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3759 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3760 
3761 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3762 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3763 				r = amdgpu_bo_reserve(aobj, true);
3764 				if (r == 0) {
3765 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3766 					if (r != 0)
3767 						dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3768 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3769 					amdgpu_bo_unreserve(aobj);
3770 				}
3771 			}
3772 		}
3773 	}
3774 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3775 	if (r)
3776 		return r;
3777 
3778 	/* Make sure IB tests flushed */
3779 	flush_delayed_work(&adev->delayed_init_work);
3780 
3781 	/* blat the mode back in */
3782 	if (fbcon) {
3783 		if (!amdgpu_device_has_dc_support(adev)) {
3784 			/* pre DCE11 */
3785 			drm_helper_resume_force_mode(dev);
3786 
3787 			/* turn on display hw */
3788 			drm_modeset_lock_all(dev);
3789 
3790 			drm_connector_list_iter_begin(dev, &iter);
3791 			drm_for_each_connector_iter(connector, &iter)
3792 				drm_helper_connector_dpms(connector,
3793 							  DRM_MODE_DPMS_ON);
3794 			drm_connector_list_iter_end(&iter);
3795 
3796 			drm_modeset_unlock_all(dev);
3797 		}
3798 		amdgpu_fbdev_set_suspend(adev, 0);
3799 	}
3800 
3801 	drm_kms_helper_poll_enable(dev);
3802 
3803 	amdgpu_ras_resume(adev);
3804 
3805 	/*
3806 	 * Most of the connector probing functions try to acquire runtime pm
3807 	 * refs to ensure that the GPU is powered on when connector polling is
3808 	 * performed. Since we're calling this from a runtime PM callback,
3809 	 * trying to acquire rpm refs will cause us to deadlock.
3810 	 *
3811 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3812 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3813 	 */
3814 #ifdef CONFIG_PM
3815 	dev->dev->power.disable_depth++;
3816 #endif
3817 	if (!amdgpu_device_has_dc_support(adev))
3818 		drm_helper_hpd_irq_event(dev);
3819 	else
3820 		drm_kms_helper_hotplug_event(dev);
3821 #ifdef CONFIG_PM
3822 	dev->dev->power.disable_depth--;
3823 #endif
3824 	adev->in_suspend = false;
3825 
3826 	return 0;
3827 }
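
/*
 * Usage sketch (hypothetical caller, not part of this file): the driver's
 * system-sleep PM callbacks typically wrap the two helpers above, e.g.
 *
 *	r = amdgpu_device_suspend(drm_dev, true);	// on system sleep
 *	...
 *	r = amdgpu_device_resume(drm_dev, true);	// on wake
 */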
3828 
3829 /**
3830  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3831  *
3832  * @adev: amdgpu_device pointer
3833  *
3834  * The list of all the hardware IPs that make up the asic is walked and
3835  * the check_soft_reset callbacks are run.  check_soft_reset determines
3836  * if the asic is still hung or not.
3837  * Returns true if any of the IPs are still in a hung state, false if not.
3838  */
3839 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3840 {
3841 	int i;
3842 	bool asic_hang = false;
3843 
3844 	if (amdgpu_sriov_vf(adev))
3845 		return true;
3846 
3847 	if (amdgpu_asic_need_full_reset(adev))
3848 		return true;
3849 
3850 	for (i = 0; i < adev->num_ip_blocks; i++) {
3851 		if (!adev->ip_blocks[i].status.valid)
3852 			continue;
3853 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3854 			adev->ip_blocks[i].status.hang =
3855 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3856 		if (adev->ip_blocks[i].status.hang) {
3857 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3858 			asic_hang = true;
3859 		}
3860 	}
3861 	return asic_hang;
3862 }
3863 
3864 /**
3865  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3866  *
3867  * @adev: amdgpu_device pointer
3868  *
3869  * The list of all the hardware IPs that make up the asic is walked and the
3870  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3871  * handles any IP specific hardware or software state changes that are
3872  * necessary for a soft reset to succeed.
3873  * Returns 0 on success, negative error code on failure.
3874  */
3875 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3876 {
3877 	int i, r = 0;
3878 
3879 	for (i = 0; i < adev->num_ip_blocks; i++) {
3880 		if (!adev->ip_blocks[i].status.valid)
3881 			continue;
3882 		if (adev->ip_blocks[i].status.hang &&
3883 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3884 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3885 			if (r)
3886 				return r;
3887 		}
3888 	}
3889 
3890 	return 0;
3891 }
3892 
3893 /**
3894  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3895  *
3896  * @adev: amdgpu_device pointer
3897  *
3898  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3899  * reset is necessary to recover.
3900  * Returns true if a full asic reset is required, false if not.
3901  */
3902 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3903 {
3904 	int i;
3905 
3906 	if (amdgpu_asic_need_full_reset(adev))
3907 		return true;
3908 
3909 	for (i = 0; i < adev->num_ip_blocks; i++) {
3910 		if (!adev->ip_blocks[i].status.valid)
3911 			continue;
3912 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3913 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3914 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3915 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3916 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3917 			if (adev->ip_blocks[i].status.hang) {
3918 				dev_info(adev->dev, "Some blocks need full reset!\n");
3919 				return true;
3920 			}
3921 		}
3922 	}
3923 	return false;
3924 }
3925 
3926 /**
3927  * amdgpu_device_ip_soft_reset - do a soft reset
3928  *
3929  * @adev: amdgpu_device pointer
3930  *
3931  * The list of all the hardware IPs that make up the asic is walked and the
3932  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3933  * IP specific hardware or software state changes that are necessary to soft
3934  * reset the IP.
3935  * Returns 0 on success, negative error code on failure.
3936  */
3937 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3938 {
3939 	int i, r = 0;
3940 
3941 	for (i = 0; i < adev->num_ip_blocks; i++) {
3942 		if (!adev->ip_blocks[i].status.valid)
3943 			continue;
3944 		if (adev->ip_blocks[i].status.hang &&
3945 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3946 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3947 			if (r)
3948 				return r;
3949 		}
3950 	}
3951 
3952 	return 0;
3953 }
3954 
3955 /**
3956  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3957  *
3958  * @adev: amdgpu_device pointer
3959  *
3960  * The list of all the hardware IPs that make up the asic is walked and the
3961  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3962  * handles any IP specific hardware or software state changes that are
3963  * necessary after the IP has been soft reset.
3964  * Returns 0 on success, negative error code on failure.
3965  */
3966 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3967 {
3968 	int i, r = 0;
3969 
3970 	for (i = 0; i < adev->num_ip_blocks; i++) {
3971 		if (!adev->ip_blocks[i].status.valid)
3972 			continue;
3973 		if (adev->ip_blocks[i].status.hang &&
3974 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3975 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3976 		if (r)
3977 			return r;
3978 	}
3979 
3980 	return 0;
3981 }
3982 
3983 /**
3984  * amdgpu_device_recover_vram - Recover some VRAM contents
3985  *
3986  * @adev: amdgpu_device pointer
3987  *
3988  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3989  * restore things like GPUVM page tables after a GPU reset where
3990  * the contents of VRAM might be lost.
3991  *
3992  * Returns:
3993  * 0 on success, negative error code on failure.
3994  */
3995 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3996 {
3997 	struct dma_fence *fence = NULL, *next = NULL;
3998 	struct amdgpu_bo *shadow;
3999 	long r = 1, tmo;
4000 
4001 	if (amdgpu_sriov_runtime(adev))
4002 		tmo = msecs_to_jiffies(8000);
4003 	else
4004 		tmo = msecs_to_jiffies(100);
4005 
4006 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4007 	mutex_lock(&adev->shadow_list_lock);
4008 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4009 
4010 		/* No need to recover an evicted BO */
4011 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4012 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4013 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4014 			continue;
4015 
4016 		r = amdgpu_bo_restore_shadow(shadow, &next);
4017 		if (r)
4018 			break;
4019 
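		/*
		 * Pipeline the restores: wait on the previously issued fence
		 * while the next shadow copy is already queued.
		 */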
4020 		if (fence) {
4021 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4022 			dma_fence_put(fence);
4023 			fence = next;
4024 			if (tmo == 0) {
4025 				r = -ETIMEDOUT;
4026 				break;
4027 			} else if (tmo < 0) {
4028 				r = tmo;
4029 				break;
4030 			}
4031 		} else {
4032 			fence = next;
4033 		}
4034 	}
4035 	mutex_unlock(&adev->shadow_list_lock);
4036 
4037 	if (fence)
4038 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4039 	dma_fence_put(fence);
4040 
4041 	if (r < 0 || tmo <= 0) {
4042 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4043 		return -EIO;
4044 	}
4045 
4046 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4047 	return 0;
4048 }
4049 
4050 
4051 /**
4052  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4053  *
4054  * @adev: amdgpu_device pointer
4055  * @from_hypervisor: request from hypervisor
4056  *
4057  * Do a VF FLR and reinitialize the ASIC.
4058  * Returns 0 on success, negative error code on failure.
4059  */
4060 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4061 				     bool from_hypervisor)
4062 {
4063 	int r;
4064 
4065 	if (from_hypervisor)
4066 		r = amdgpu_virt_request_full_gpu(adev, true);
4067 	else
4068 		r = amdgpu_virt_reset_gpu(adev);
4069 	if (r)
4070 		return r;
4071 
4072 	amdgpu_amdkfd_pre_reset(adev);
4073 
4074 	/* Resume IP prior to SMC */
4075 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4076 	if (r)
4077 		goto error;
4078 
4079 	amdgpu_virt_init_data_exchange(adev);
4080 	/* we need recover gart prior to run SMC/CP/SDMA resume */
4081 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4082 
4083 	r = amdgpu_device_fw_loading(adev);
4084 	if (r)
4085 		return r;
4086 
4087 	/* now we are okay to resume SMC/CP/SDMA */
4088 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4089 	if (r)
4090 		goto error;
4091 
4092 	amdgpu_irq_gpu_reset_resume_helper(adev);
4093 	r = amdgpu_ib_ring_tests(adev);
4094 	amdgpu_amdkfd_post_reset(adev);
4095 
4096 error:
4097 	amdgpu_virt_release_full_gpu(adev, true);
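	/*
	 * If the hypervisor (GIM) reports VRAM lost across the FLR, bump the
	 * lost counter and restore buffers from their GTT shadows.
	 */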
4098 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4099 		amdgpu_inc_vram_lost(adev);
4100 		r = amdgpu_device_recover_vram(adev);
4101 	}
4102 
4103 	return r;
4104 }
4105 
4106 /**
4107  * amdgpu_device_has_job_running - check if there is any job in mirror list
4108  *
4109  * @adev: amdgpu_device pointer
4110  *
4111  * check if there is any job in mirror list
4112  */
4113 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4114 {
4115 	int i;
4116 	struct drm_sched_job *job;
4117 
4118 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4119 		struct amdgpu_ring *ring = adev->rings[i];
4120 
4121 		if (!ring || !ring->sched.thread)
4122 			continue;
4123 
4124 		spin_lock(&ring->sched.job_list_lock);
4125 		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4126 				struct drm_sched_job, node);
4127 		spin_unlock(&ring->sched.job_list_lock);
4128 		if (job)
4129 			return true;
4130 	}
4131 	return false;
4132 }
4133 
4134 /**
4135  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4136  *
4137  * @adev: amdgpu_device pointer
4138  *
4139  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4140  * a hung GPU.
4141  */
4142 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4143 {
4144 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4145 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4146 		return false;
4147 	}
4148 
4149 	if (amdgpu_gpu_recovery == 0)
4150 		goto disabled;
4151 
4152 	if (amdgpu_sriov_vf(adev))
4153 		return true;
4154 
4155 	if (amdgpu_gpu_recovery == -1) {
4156 		switch (adev->asic_type) {
4157 		case CHIP_BONAIRE:
4158 		case CHIP_HAWAII:
4159 		case CHIP_TOPAZ:
4160 		case CHIP_TONGA:
4161 		case CHIP_FIJI:
4162 		case CHIP_POLARIS10:
4163 		case CHIP_POLARIS11:
4164 		case CHIP_POLARIS12:
4165 		case CHIP_VEGAM:
4166 		case CHIP_VEGA20:
4167 		case CHIP_VEGA10:
4168 		case CHIP_VEGA12:
4169 		case CHIP_RAVEN:
4170 		case CHIP_ARCTURUS:
4171 		case CHIP_RENOIR:
4172 		case CHIP_NAVI10:
4173 		case CHIP_NAVI14:
4174 		case CHIP_NAVI12:
4175 		case CHIP_SIENNA_CICHLID:
4176 			break;
4177 		default:
4178 			goto disabled;
4179 		}
4180 	}
4181 
4182 	return true;
4183 
4184 disabled:
4185 		dev_info(adev->dev, "GPU recovery disabled.\n");
4186 		return false;
4187 }
4188 
4189 
4190 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4191 					struct amdgpu_job *job,
4192 					bool *need_full_reset_arg)
4193 {
4194 	int i, r = 0;
4195 	bool need_full_reset  = *need_full_reset_arg;
4196 
4197 	amdgpu_debugfs_wait_dump(adev);
4198 
4199 	if (amdgpu_sriov_vf(adev)) {
4200 		/* stop the data exchange thread */
4201 		amdgpu_virt_fini_data_exchange(adev);
4202 	}
4203 
4204 	/* block all schedulers and reset given job's ring */
4205 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4206 		struct amdgpu_ring *ring = adev->rings[i];
4207 
4208 		if (!ring || !ring->sched.thread)
4209 			continue;
4210 
4211 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4212 		amdgpu_fence_driver_force_completion(ring);
4213 	}
4214 
4215 	if (job)
4216 		drm_sched_increase_karma(&job->base);
4217 
4218 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4219 	if (!amdgpu_sriov_vf(adev)) {
4220 
4221 		if (!need_full_reset)
4222 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4223 
4224 		if (!need_full_reset) {
4225 			amdgpu_device_ip_pre_soft_reset(adev);
4226 			r = amdgpu_device_ip_soft_reset(adev);
4227 			amdgpu_device_ip_post_soft_reset(adev);
4228 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4229 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4230 				need_full_reset = true;
4231 			}
4232 		}
4233 
4234 		if (need_full_reset)
4235 			r = amdgpu_device_ip_suspend(adev);
4236 
4237 		*need_full_reset_arg = need_full_reset;
4238 	}
4239 
4240 	return r;
4241 }
4242 
4243 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4244 			       struct list_head *device_list_handle,
4245 			       bool *need_full_reset_arg,
4246 			       bool skip_hw_reset)
4247 {
4248 	struct amdgpu_device *tmp_adev = NULL;
4249 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4250 	int r = 0;
4251 
4252 	/*
4253 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4254 	 * to allow proper links negotiation in FW (within 1 sec)
4255 	 */
4256 	if (!skip_hw_reset && need_full_reset) {
4257 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4258 			/* For XGMI run all resets in parallel to speed up the process */
4259 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4260 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4261 					r = -EALREADY;
4262 			} else
4263 				r = amdgpu_asic_reset(tmp_adev);
4264 
4265 			if (r) {
4266 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4267 					 r, adev_to_drm(tmp_adev)->unique);
4268 				break;
4269 			}
4270 		}
4271 
4272 		/* For XGMI wait for all resets to complete before proceed */
4273 		if (!r) {
4274 			list_for_each_entry(tmp_adev, device_list_handle,
4275 					    gmc.xgmi.head) {
4276 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4277 					flush_work(&tmp_adev->xgmi_reset_work);
4278 					r = tmp_adev->asic_reset_res;
4279 					if (r)
4280 						break;
4281 				}
4282 			}
4283 		}
4284 	}
4285 
4286 	if (!r && amdgpu_ras_intr_triggered()) {
4287 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4288 			if (tmp_adev->mmhub.funcs &&
4289 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4290 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4291 		}
4292 
4293 		amdgpu_ras_intr_cleared();
4294 	}
4295 
4296 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4297 		if (need_full_reset) {
4298 			/* post card */
4299 			if (amdgpu_device_asic_init(tmp_adev))
4300 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4301 
4302 			if (!r) {
4303 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4304 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4305 				if (r)
4306 					goto out;
4307 
4308 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4309 				if (vram_lost) {
4310 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4311 					amdgpu_inc_vram_lost(tmp_adev);
4312 				}
4313 
4314 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4315 				if (r)
4316 					goto out;
4317 
4318 				r = amdgpu_device_fw_loading(tmp_adev);
4319 				if (r)
4320 					return r;
4321 
4322 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4323 				if (r)
4324 					goto out;
4325 
4326 				if (vram_lost)
4327 					amdgpu_device_fill_reset_magic(tmp_adev);
4328 
4329 				/*
4330 				 * Add this ASIC as tracked as reset was already
4331 				 * complete successfully.
4332 				 */
4333 				amdgpu_register_gpu_instance(tmp_adev);
4334 
4335 				r = amdgpu_device_ip_late_init(tmp_adev);
4336 				if (r)
4337 					goto out;
4338 
4339 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4340 
4341 				/*
4342 				 * The GPU enters a bad state once the number of
4343 				 * faulty pages flagged by ECC reaches the
4344 				 * threshold, and RAS recovery is scheduled next.
4345 				 * Check here to abort recovery if the bad page
4346 				 * threshold has indeed been exceeded, and remind
4347 				 * the user to either retire this GPU or set a
4348 				 * larger bad_page_threshold before probing the
4349 				 * driver again.
4350 				 */
4351 				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4352 					/* must succeed. */
4353 					amdgpu_ras_resume(tmp_adev);
4354 				} else {
4355 					r = -EINVAL;
4356 					goto out;
4357 				}
4358 
4359 				/* Update PSP FW topology after reset */
4360 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4361 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4362 			}
4363 		}
4364 
4365 out:
4366 		if (!r) {
4367 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4368 			r = amdgpu_ib_ring_tests(tmp_adev);
4369 			if (r) {
4370 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4371 				r = amdgpu_device_ip_suspend(tmp_adev);
4372 				need_full_reset = true;
4373 				r = -EAGAIN;
4374 				goto end;
4375 			}
4376 		}
4377 
4378 		if (!r)
4379 			r = amdgpu_device_recover_vram(tmp_adev);
4380 		else
4381 			tmp_adev->asic_reset_res = r;
4382 	}
4383 
4384 end:
4385 	*need_full_reset_arg = need_full_reset;
4386 	return r;
4387 }
4388 
4389 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4390 				struct amdgpu_hive_info *hive)
4391 {
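	/* Allow only one reset in flight per device; a second caller bails out here. */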
4392 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4393 		return false;
4394 
4395 	if (hive) {
4396 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4397 	} else {
4398 		down_write(&adev->reset_sem);
4399 	}
4400 
4401 	atomic_inc(&adev->gpu_reset_counter);
4402 	switch (amdgpu_asic_reset_method(adev)) {
4403 	case AMD_RESET_METHOD_MODE1:
4404 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4405 		break;
4406 	case AMD_RESET_METHOD_MODE2:
4407 		adev->mp1_state = PP_MP1_STATE_RESET;
4408 		break;
4409 	default:
4410 		adev->mp1_state = PP_MP1_STATE_NONE;
4411 		break;
4412 	}
4413 
4414 	return true;
4415 }
4416 
4417 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4418 {
4419 	amdgpu_vf_error_trans_all(adev);
4420 	adev->mp1_state = PP_MP1_STATE_NONE;
4421 	atomic_set(&adev->in_gpu_reset, 0);
4422 	up_write(&adev->reset_sem);
4423 }
4424 
4425 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4426 {
4427 	struct pci_dev *p = NULL;
4428 
4429 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4430 			adev->pdev->bus->number, 1);
4431 	if (p) {
4432 		pm_runtime_enable(&(p->dev));
4433 		pm_runtime_resume(&(p->dev));
4434 	}
4435 }
4436 
4437 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4438 {
4439 	enum amd_reset_method reset_method;
4440 	struct pci_dev *p = NULL;
4441 	u64 expires;
4442 
4443 	/*
4444 	 * For now, only BACO and mode1 reset are confirmed to
4445 	 * suffer the audio issue if the audio device is not properly suspended.
4446 	 */
4447 	reset_method = amdgpu_asic_reset_method(adev);
4448 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4449 	     (reset_method != AMD_RESET_METHOD_MODE1))
4450 		return -EINVAL;
4451 
4452 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4453 			adev->pdev->bus->number, 1);
4454 	if (!p)
4455 		return -ENODEV;
4456 
4457 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4458 	if (!expires)
4459 		/*
4460 		 * If we cannot get the audio device autosuspend delay,
4461 		 * a fixed 4S interval is used. Since 3S is the audio
4462 		 * controller's default autosuspend delay setting, the 4S
4463 		 * used here is guaranteed to cover it.
4464 		 */
4465 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4466 
4467 	while (!pm_runtime_status_suspended(&(p->dev))) {
4468 		if (!pm_runtime_suspend(&(p->dev)))
4469 			break;
4470 
4471 		if (expires < ktime_get_mono_fast_ns()) {
4472 			dev_warn(adev->dev, "failed to suspend display audio\n");
4473 			/* TODO: abort the succeeding gpu reset? */
4474 			return -ETIMEDOUT;
4475 		}
4476 	}
4477 
4478 	pm_runtime_disable(&(p->dev));
4479 
4480 	return 0;
4481 }
4482 
4483 /**
4484  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4485  *
4486  * @adev: amdgpu_device pointer
4487  * @job: which job triggered the hang
4488  *
4489  * Attempt to reset the GPU if it has hung (all asics).
4490  * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
4491  * Returns 0 for success or an error on failure.
4492  */
4493 
4494 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4495 			      struct amdgpu_job *job)
4496 {
4497 	struct list_head device_list, *device_list_handle =  NULL;
4498 	bool need_full_reset = false;
4499 	bool job_signaled = false;
4500 	struct amdgpu_hive_info *hive = NULL;
4501 	struct amdgpu_device *tmp_adev = NULL;
4502 	int i, r = 0;
4503 	bool need_emergency_restart = false;
4504 	bool audio_suspended = false;
4505 
4506 	/*
4507 	 * Special case: RAS triggered and full reset isn't supported
4508 	 */
4509 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4510 
4511 	/*
4512 	 * Flush RAM to disk so that after reboot
4513 	 * the user can read the log and see why the system rebooted.
4514 	 */
4515 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4516 		DRM_WARN("Emergency reboot.");
4517 
4518 		ksys_sync_helper();
4519 		emergency_restart();
4520 	}
4521 
4522 	dev_info(adev->dev, "GPU %s begin!\n",
4523 		need_emergency_restart ? "jobs stop":"reset");
4524 
4525 	/*
4526 	 * Here we trylock to avoid a chain of resets executing from
4527 	 * either jobs triggered on different adevs in an XGMI hive or jobs on
4528 	 * different schedulers for the same device while this TO handler is running.
4529 	 * We always reset all schedulers for a device and all devices in an XGMI
4530 	 * hive, so that should take care of them too.
4531 	 */
4532 	hive = amdgpu_get_xgmi_hive(adev);
4533 	if (hive) {
4534 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4535 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4536 				job ? job->base.id : -1, hive->hive_id);
4537 			amdgpu_put_xgmi_hive(hive);
4538 			return 0;
4539 		}
4540 		mutex_lock(&hive->hive_lock);
4541 	}
4542 
4543 	/*
4544 	 * Build list of devices to reset.
4545 	 * In case we are in XGMI hive mode, resort the device list
4546 	 * to put adev in the 1st position.
4547 	 */
4548 	INIT_LIST_HEAD(&device_list);
4549 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4550 		if (!hive)
4551 			return -ENODEV;
4552 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4553 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4554 		device_list_handle = &hive->device_list;
4555 	} else {
4556 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4557 		device_list_handle = &device_list;
4558 	}
4559 
4560 	/* block all schedulers and reset given job's ring */
4561 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4562 		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4563 			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4564 				  job ? job->base.id : -1);
4565 			r = 0;
4566 			goto skip_recovery;
4567 		}
4568 
4569 		/*
4570 		 * Try to put the audio codec into suspend state
4571 		 * before the gpu reset starts.
4572 		 *
4573 		 * The power domain of the graphics device is
4574 		 * shared with the AZ (audio) power domain. Without this,
4575 		 * we may change the audio hardware from behind
4576 		 * the audio driver's back, which triggers
4577 		 * audio codec errors.
4578 		 */
4579 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4580 			audio_suspended = true;
4581 
4582 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4583 
4584 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4585 
4586 		if (!amdgpu_sriov_vf(tmp_adev))
4587 			amdgpu_amdkfd_pre_reset(tmp_adev);
4588 
4589 		/*
4590 		 * Mark these ASICs as untracked before the reset,
4591 		 * and add them back after the reset has completed.
4592 		 */
4593 		amdgpu_unregister_gpu_instance(tmp_adev);
4594 
4595 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4596 
4597 		/* disable ras on ALL IPs */
4598 		if (!need_emergency_restart &&
4599 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4600 			amdgpu_ras_suspend(tmp_adev);
4601 
4602 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4603 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4604 
4605 			if (!ring || !ring->sched.thread)
4606 				continue;
4607 
4608 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4609 
4610 			if (need_emergency_restart)
4611 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4612 		}
4613 	}
4614 
4615 	if (need_emergency_restart)
4616 		goto skip_sched_resume;
4617 
4618 	/*
4619 	 * Must check guilty signal here since after this point all old
4620 	 * HW fences are force signaled.
4621 	 *
4622 	 * job->base holds a reference to parent fence
4623 	 */
4624 	if (job && job->base.s_fence->parent &&
4625 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4626 		job_signaled = true;
4627 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4628 		goto skip_hw_reset;
4629 	}
4630 
4631 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4632 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4633 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4634 						 (tmp_adev == adev) ? job : NULL,
4635 						 &need_full_reset);
4636 		/* TODO: Should we stop? */
4637 		if (r) {
4638 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4639 				  r, adev_to_drm(tmp_adev)->unique);
4640 			tmp_adev->asic_reset_res = r;
4641 		}
4642 	}
4643 
4644 	/* Actual ASIC resets if needed.*/
4645 	/* TODO Implement XGMI hive reset logic for SRIOV */
4646 	if (amdgpu_sriov_vf(adev)) {
4647 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4648 		if (r)
4649 			adev->asic_reset_res = r;
4650 	} else {
4651 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4652 		if (r && r == -EAGAIN)
4653 			goto retry;
4654 	}
4655 
4656 skip_hw_reset:
4657 
4658 	/* Post ASIC reset for all devs .*/
4659 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4660 
4661 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4662 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4663 
4664 			if (!ring || !ring->sched.thread)
4665 				continue;
4666 
4667 			/* No point in resubmitting jobs if we didn't HW reset */
4668 			if (!tmp_adev->asic_reset_res && !job_signaled)
4669 				drm_sched_resubmit_jobs(&ring->sched);
4670 
4671 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4672 		}
4673 
4674 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4675 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4676 		}
4677 
4678 		tmp_adev->asic_reset_res = 0;
4679 
4680 		if (r) {
4681 			/* bad news, how to tell it to userspace ? */
4682 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4683 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4684 		} else {
4685 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4686 		}
4687 	}
4688 
4689 skip_sched_resume:
4690 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4691 		/* unlock kfd: SRIOV would do it separately */
4692 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4693 			amdgpu_amdkfd_post_reset(tmp_adev);
4694 		if (audio_suspended)
4695 			amdgpu_device_resume_display_audio(tmp_adev);
4696 		amdgpu_device_unlock_adev(tmp_adev);
4697 	}
4698 
4699 skip_recovery:
4700 	if (hive) {
4701 		atomic_set(&hive->in_reset, 0);
4702 		mutex_unlock(&hive->hive_lock);
4703 		amdgpu_put_xgmi_hive(hive);
4704 	}
4705 
4706 	if (r)
4707 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4708 	return r;
4709 }
4710 
4711 /**
4712  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4713  *
4714  * @adev: amdgpu_device pointer
4715  *
4716  * Fetches and stores in the driver the PCIE capabilities (gen speed
4717  * and lanes) of the slot the device is in. Handles APUs and
4718  * virtualized environments where PCIE config space may not be available.
4719  */
4720 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4721 {
4722 	struct pci_dev *pdev;
4723 	enum pci_bus_speed speed_cap, platform_speed_cap;
4724 	enum pcie_link_width platform_link_width;
4725 
4726 	if (amdgpu_pcie_gen_cap)
4727 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4728 
4729 	if (amdgpu_pcie_lane_cap)
4730 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4731 
4732 	/* covers APUs as well */
4733 	if (pci_is_root_bus(adev->pdev->bus)) {
4734 		if (adev->pm.pcie_gen_mask == 0)
4735 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4736 		if (adev->pm.pcie_mlw_mask == 0)
4737 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4738 		return;
4739 	}
4740 
4741 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4742 		return;
4743 
4744 	pcie_bandwidth_available(adev->pdev, NULL,
4745 				 &platform_speed_cap, &platform_link_width);
4746 
4747 	if (adev->pm.pcie_gen_mask == 0) {
4748 		/* asic caps */
4749 		pdev = adev->pdev;
4750 		speed_cap = pcie_get_speed_cap(pdev);
4751 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4752 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4753 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4754 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4755 		} else {
4756 			if (speed_cap == PCIE_SPEED_16_0GT)
4757 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4758 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4759 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4760 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4761 			else if (speed_cap == PCIE_SPEED_8_0GT)
4762 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4763 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4764 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4765 			else if (speed_cap == PCIE_SPEED_5_0GT)
4766 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4767 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4768 			else
4769 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4770 		}
4771 		/* platform caps */
4772 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4773 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4774 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4775 		} else {
4776 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4777 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4779 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4780 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4781 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4782 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4783 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4784 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4785 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4786 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4787 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4788 			else
4789 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4790 
4791 		}
4792 	}
4793 	if (adev->pm.pcie_mlw_mask == 0) {
4794 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4795 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4796 		} else {
4797 			switch (platform_link_width) {
4798 			case PCIE_LNK_X32:
4799 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4800 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4801 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4802 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4803 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4804 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4805 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4806 				break;
4807 			case PCIE_LNK_X16:
4808 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4809 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4810 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4811 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4812 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4813 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4814 				break;
4815 			case PCIE_LNK_X12:
4816 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4817 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4818 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4819 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4820 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4821 				break;
4822 			case PCIE_LNK_X8:
4823 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4824 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4825 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4826 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4827 				break;
4828 			case PCIE_LNK_X4:
4829 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4830 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4831 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4832 				break;
4833 			case PCIE_LNK_X2:
4834 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4835 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4836 				break;
4837 			case PCIE_LNK_X1:
4838 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4839 				break;
4840 			default:
4841 				break;
4842 			}
4843 		}
4844 	}
4845 }
4846 
4847 int amdgpu_device_baco_enter(struct drm_device *dev)
4848 {
4849 	struct amdgpu_device *adev = drm_to_adev(dev);
4850 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4851 
4852 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4853 		return -ENOTSUPP;
4854 
4855 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4856 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4857 
4858 	return amdgpu_dpm_baco_enter(adev);
4859 }
4860 
4861 int amdgpu_device_baco_exit(struct drm_device *dev)
4862 {
4863 	struct amdgpu_device *adev = drm_to_adev(dev);
4864 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4865 	int ret = 0;
4866 
4867 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4868 		return -ENOTSUPP;
4869 
4870 	ret = amdgpu_dpm_baco_exit(adev);
4871 	if (ret)
4872 		return ret;
4873 
4874 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4875 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4876 
4877 	return 0;
4878 }
4879 
4880 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4881 {
4882 	int i;
4883 
4884 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4885 		struct amdgpu_ring *ring = adev->rings[i];
4886 
4887 		if (!ring || !ring->sched.thread)
4888 			continue;
4889 
4890 		cancel_delayed_work_sync(&ring->sched.work_tdr);
4891 	}
4892 }
4893 
4894 /**
4895  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4896  * @pdev: PCI device struct
4897  * @state: PCI channel state
4898  *
4899  * Description: Called when a PCI error is detected.
4900  *
4901  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4902  */
4903 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4904 {
4905 	struct drm_device *dev = pci_get_drvdata(pdev);
4906 	struct amdgpu_device *adev = drm_to_adev(dev);
4907 	int i;
4908 
4909 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4910 
4911 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4912 		DRM_WARN("No support for XGMI hive yet...");
4913 		return PCI_ERS_RESULT_DISCONNECT;
4914 	}
4915 
4916 	switch (state) {
4917 	case pci_channel_io_normal:
4918 		return PCI_ERS_RESULT_CAN_RECOVER;
4919 	/* Fatal error, prepare for slot reset */
4920 	case pci_channel_io_frozen:
4921 		/*
4922 		 * Cancel and wait for all TDRs in progress if we fail to
4923 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev.
4924 		 *
4925 		 * Locking adev->reset_sem will prevent any external access
4926 		 * to GPU during PCI error recovery
4927 		 */
4928 		while (!amdgpu_device_lock_adev(adev, NULL))
4929 			amdgpu_cancel_all_tdr(adev);
4930 
4931 		/*
4932 		 * Block any work scheduling as we do for regular GPU reset
4933 		 * for the duration of the recovery
4934 		 */
4935 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4936 			struct amdgpu_ring *ring = adev->rings[i];
4937 
4938 			if (!ring || !ring->sched.thread)
4939 				continue;
4940 
4941 			drm_sched_stop(&ring->sched, NULL);
4942 		}
4943 		return PCI_ERS_RESULT_NEED_RESET;
4944 	case pci_channel_io_perm_failure:
4945 		/* Permanent error, prepare for device removal */
4946 		return PCI_ERS_RESULT_DISCONNECT;
4947 	}
4948 
4949 	return PCI_ERS_RESULT_NEED_RESET;
4950 }
4951 
4952 /**
4953  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4954  * @pdev: pointer to PCI device
4955  */
4956 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4957 {
4958 
4959 	DRM_INFO("PCI error: mmio enabled callback!!\n");
4960 
4961 	/* TODO - dump whatever for debugging purposes */
4962 
4963 	/* This is called only if amdgpu_pci_error_detected returns
4964 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4965 	 * works, no need to reset slot.
4966 	 */
4967 
4968 	return PCI_ERS_RESULT_RECOVERED;
4969 }
4970 
4971 /**
4972  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4973  * @pdev: PCI device struct
4974  *
4975  * Description: This routine is called by the pci error recovery
4976  * code after the PCI slot has been reset, just before we
4977  * should resume normal operations.
4978  */
4979 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4980 {
4981 	struct drm_device *dev = pci_get_drvdata(pdev);
4982 	struct amdgpu_device *adev = drm_to_adev(dev);
4983 	int r, i;
4984 	bool need_full_reset = true;
4985 	u32 memsize;
4986 	struct list_head device_list;
4987 
4988 	DRM_INFO("PCI error: slot reset callback!!\n");
4989 
4990 	INIT_LIST_HEAD(&device_list);
4991 	list_add_tail(&adev->gmc.xgmi.head, &device_list);
4992 
4993 	/* wait for asic to come out of reset */
4994 	msleep(500);
4995 
4996 	/* Restore PCI confspace */
4997 	amdgpu_device_load_pci_state(pdev);
4998 
4999 	/* confirm  ASIC came out of reset */
5000 	for (i = 0; i < adev->usec_timeout; i++) {
5001 		memsize = amdgpu_asic_get_config_memsize(adev);
5002 
5003 		if (memsize != 0xffffffff)
5004 			break;
5005 		udelay(1);
5006 	}
5007 	if (memsize == 0xffffffff) {
5008 		r = -ETIME;
5009 		goto out;
5010 	}
5011 
5012 	adev->in_pci_err_recovery = true;
5013 	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5014 	adev->in_pci_err_recovery = false;
5015 	if (r)
5016 		goto out;
5017 
5018 	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5019 
5020 out:
5021 	if (!r) {
5022 		if (amdgpu_device_cache_pci_state(adev->pdev))
5023 			pci_restore_state(adev->pdev);
5024 
5025 		DRM_INFO("PCIe error recovery succeeded\n");
5026 	} else {
5027 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5028 		amdgpu_device_unlock_adev(adev);
5029 	}
5030 
5031 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5032 }
5033 
5034 /**
5035  * amdgpu_pci_resume() - resume normal ops after PCI reset
5036  * @pdev: pointer to PCI device
5037  *
5038  * Called when the error recovery driver tells us that it is
5039  * OK to resume normal operation. Restart the schedulers so
5040  * that halted jobs can resume.
5041  */
5042 void amdgpu_pci_resume(struct pci_dev *pdev)
5043 {
5044 	struct drm_device *dev = pci_get_drvdata(pdev);
5045 	struct amdgpu_device *adev = drm_to_adev(dev);
5046 	int i;
5047 
5048 
5049 	DRM_INFO("PCI error: resume callback!!\n");
5050 
5051 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5052 		struct amdgpu_ring *ring = adev->rings[i];
5053 
5054 		if (!ring || !ring->sched.thread)
5055 			continue;
5056 
5057 
5058 		drm_sched_resubmit_jobs(&ring->sched);
5059 		drm_sched_start(&ring->sched, true);
5060 	}
5061 
5062 	amdgpu_device_unlock_adev(adev);
5063 }
5064 
5065 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5066 {
5067 	struct drm_device *dev = pci_get_drvdata(pdev);
5068 	struct amdgpu_device *adev = drm_to_adev(dev);
5069 	int r;
5070 
5071 	r = pci_save_state(pdev);
5072 	if (!r) {
5073 		kfree(adev->pci_state);
5074 
5075 		adev->pci_state = pci_store_saved_state(pdev);
5076 
5077 		if (!adev->pci_state) {
5078 			DRM_ERROR("Failed to store PCI saved state");
5079 			return false;
5080 		}
5081 	} else {
5082 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5083 		return false;
5084 	}
5085 
5086 	return true;
5087 }
5088 
5089 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5090 {
5091 	struct drm_device *dev = pci_get_drvdata(pdev);
5092 	struct amdgpu_device *adev = drm_to_adev(dev);
5093 	int r;
5094 
5095 	if (!adev->pci_state)
5096 		return false;
5097 
5098 	r = pci_load_saved_state(pdev, adev->pci_state);
5099 
5100 	if (!r) {
5101 		pci_restore_state(pdev);
5102 	} else {
5103 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5104 		return false;
5105 	}
5106 
5107 	return true;
5108 }
5109 
5110 
5111