1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/devcoredump.h>
36 #include <generated/utsrelease.h>
37 #include <linux/pci-p2pdma.h>
38 #include <linux/apple-gmux.h>
39
40 #include <drm/drm_aperture.h>
41 #include <drm/drm_atomic_helper.h>
42 #include <drm/drm_crtc_helper.h>
43 #include <drm/drm_fb_helper.h>
44 #include <drm/drm_probe_helper.h>
45 #include <drm/amdgpu_drm.h>
46 #include <linux/vgaarb.h>
47 #include <linux/vga_switcheroo.h>
48 #include <linux/efi.h>
49 #include "amdgpu.h"
50 #include "amdgpu_trace.h"
51 #include "amdgpu_i2c.h"
52 #include "atom.h"
53 #include "amdgpu_atombios.h"
54 #include "amdgpu_atomfirmware.h"
55 #include "amd_pcie.h"
56 #ifdef CONFIG_DRM_AMDGPU_SI
57 #include "si.h"
58 #endif
59 #ifdef CONFIG_DRM_AMDGPU_CIK
60 #include "cik.h"
61 #endif
62 #include "vi.h"
63 #include "soc15.h"
64 #include "nv.h"
65 #include "bif/bif_4_1_d.h"
66 #include <linux/firmware.h>
67 #include "amdgpu_vf_error.h"
68
69 #include "amdgpu_amdkfd.h"
70 #include "amdgpu_pm.h"
71
72 #include "amdgpu_xgmi.h"
73 #include "amdgpu_ras.h"
74 #include "amdgpu_pmu.h"
75 #include "amdgpu_fru_eeprom.h"
76 #include "amdgpu_reset.h"
77
78 #include <linux/suspend.h>
79 #include <drm/task_barrier.h>
80 #include <linux/pm_runtime.h>
81
82 #include <drm/drm_drv.h>
83
84 #if IS_ENABLED(CONFIG_X86)
85 #include <asm/intel-family.h>
86 #endif
87
88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
95
96 #define AMDGPU_RESUME_MS 2000
97 #define AMDGPU_MAX_RETRY_LIMIT 2
98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
99
100 static const struct drm_driver amdgpu_kms_driver;
101
102 const char *amdgpu_asic_name[] = {
103 "TAHITI",
104 "PITCAIRN",
105 "VERDE",
106 "OLAND",
107 "HAINAN",
108 "BONAIRE",
109 "KAVERI",
110 "KABINI",
111 "HAWAII",
112 "MULLINS",
113 "TOPAZ",
114 "TONGA",
115 "FIJI",
116 "CARRIZO",
117 "STONEY",
118 "POLARIS10",
119 "POLARIS11",
120 "POLARIS12",
121 "VEGAM",
122 "VEGA10",
123 "VEGA12",
124 "VEGA20",
125 "RAVEN",
126 "ARCTURUS",
127 "RENOIR",
128 "ALDEBARAN",
129 "NAVI10",
130 "CYAN_SKILLFISH",
131 "NAVI14",
132 "NAVI12",
133 "SIENNA_CICHLID",
134 "NAVY_FLOUNDER",
135 "VANGOGH",
136 "DIMGREY_CAVEFISH",
137 "BEIGE_GOBY",
138 "YELLOW_CARP",
139 "IP DISCOVERY",
140 "LAST",
141 };
142
143 /**
144 * DOC: pcie_replay_count
145 *
146 * The amdgpu driver provides a sysfs API for reporting the total number
147 * of PCIe replays (NAKs)
148 * The file pcie_replay_count is used for this and returns the total
149 * number of replays as a sum of the NAKs generated and NAKs received
150 */
151
amdgpu_device_get_pcie_replay_count(struct device * dev,struct device_attribute * attr,char * buf)152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
153 struct device_attribute *attr, char *buf)
154 {
155 struct drm_device *ddev = dev_get_drvdata(dev);
156 struct amdgpu_device *adev = drm_to_adev(ddev);
157 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
158
159 return sysfs_emit(buf, "%llu\n", cnt);
160 }
161
162 static DEVICE_ATTR(pcie_replay_count, 0444,
163 amdgpu_device_get_pcie_replay_count, NULL);
164
165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
166
167
168 /**
169 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
170 *
171 * @dev: drm_device pointer
172 *
173 * Returns true if the device is a dGPU with ATPX power control,
174 * otherwise return false.
175 */
amdgpu_device_supports_px(struct drm_device * dev)176 bool amdgpu_device_supports_px(struct drm_device *dev)
177 {
178 struct amdgpu_device *adev = drm_to_adev(dev);
179
180 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
181 return true;
182 return false;
183 }
184
185 /**
186 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
187 *
188 * @dev: drm_device pointer
189 *
190 * Returns true if the device is a dGPU with ACPI power control,
191 * otherwise return false.
192 */
amdgpu_device_supports_boco(struct drm_device * dev)193 bool amdgpu_device_supports_boco(struct drm_device *dev)
194 {
195 struct amdgpu_device *adev = drm_to_adev(dev);
196
197 if (adev->has_pr3 ||
198 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
199 return true;
200 return false;
201 }
202
203 /**
204 * amdgpu_device_supports_baco - Does the device support BACO
205 *
206 * @dev: drm_device pointer
207 *
208 * Returns true if the device supporte BACO,
209 * otherwise return false.
210 */
amdgpu_device_supports_baco(struct drm_device * dev)211 bool amdgpu_device_supports_baco(struct drm_device *dev)
212 {
213 struct amdgpu_device *adev = drm_to_adev(dev);
214
215 return amdgpu_asic_supports_baco(adev);
216 }
217
218 /**
219 * amdgpu_device_supports_smart_shift - Is the device dGPU with
220 * smart shift support
221 *
222 * @dev: drm_device pointer
223 *
224 * Returns true if the device is a dGPU with Smart Shift support,
225 * otherwise returns false.
226 */
amdgpu_device_supports_smart_shift(struct drm_device * dev)227 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
228 {
229 return (amdgpu_device_supports_boco(dev) &&
230 amdgpu_acpi_is_power_shift_control_supported());
231 }
232
233 /*
234 * VRAM access helper functions
235 */
236
237 /**
238 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
239 *
240 * @adev: amdgpu_device pointer
241 * @pos: offset of the buffer in vram
242 * @buf: virtual address of the buffer in system memory
243 * @size: read/write size, sizeof(@buf) must > @size
244 * @write: true - write to vram, otherwise - read from vram
245 */
amdgpu_device_mm_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)246 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
247 void *buf, size_t size, bool write)
248 {
249 unsigned long flags;
250 uint32_t hi = ~0, tmp = 0;
251 uint32_t *data = buf;
252 uint64_t last;
253 int idx;
254
255 if (!drm_dev_enter(adev_to_drm(adev), &idx))
256 return;
257
258 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
259
260 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
261 for (last = pos + size; pos < last; pos += 4) {
262 tmp = pos >> 31;
263
264 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
265 if (tmp != hi) {
266 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
267 hi = tmp;
268 }
269 if (write)
270 WREG32_NO_KIQ(mmMM_DATA, *data++);
271 else
272 *data++ = RREG32_NO_KIQ(mmMM_DATA);
273 }
274
275 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
276 drm_dev_exit(idx);
277 }
278
279 /**
280 * amdgpu_device_aper_access - access vram by vram aperature
281 *
282 * @adev: amdgpu_device pointer
283 * @pos: offset of the buffer in vram
284 * @buf: virtual address of the buffer in system memory
285 * @size: read/write size, sizeof(@buf) must > @size
286 * @write: true - write to vram, otherwise - read from vram
287 *
288 * The return value means how many bytes have been transferred.
289 */
amdgpu_device_aper_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)290 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
291 void *buf, size_t size, bool write)
292 {
293 #ifdef CONFIG_64BIT
294 void __iomem *addr;
295 size_t count = 0;
296 uint64_t last;
297
298 if (!adev->mman.aper_base_kaddr)
299 return 0;
300
301 last = min(pos + size, adev->gmc.visible_vram_size);
302 if (last > pos) {
303 addr = adev->mman.aper_base_kaddr + pos;
304 count = last - pos;
305
306 if (write) {
307 memcpy_toio(addr, buf, count);
308 /* Make sure HDP write cache flush happens without any reordering
309 * after the system memory contents are sent over PCIe device
310 */
311 mb();
312 amdgpu_device_flush_hdp(adev, NULL);
313 } else {
314 amdgpu_device_invalidate_hdp(adev, NULL);
315 /* Make sure HDP read cache is invalidated before issuing a read
316 * to the PCIe device
317 */
318 mb();
319 memcpy_fromio(buf, addr, count);
320 }
321
322 }
323
324 return count;
325 #else
326 return 0;
327 #endif
328 }
329
330 /**
331 * amdgpu_device_vram_access - read/write a buffer in vram
332 *
333 * @adev: amdgpu_device pointer
334 * @pos: offset of the buffer in vram
335 * @buf: virtual address of the buffer in system memory
336 * @size: read/write size, sizeof(@buf) must > @size
337 * @write: true - write to vram, otherwise - read from vram
338 */
amdgpu_device_vram_access(struct amdgpu_device * adev,loff_t pos,void * buf,size_t size,bool write)339 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
340 void *buf, size_t size, bool write)
341 {
342 size_t count;
343
344 /* try to using vram apreature to access vram first */
345 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
346 size -= count;
347 if (size) {
348 /* using MM to access rest vram */
349 pos += count;
350 buf += count;
351 amdgpu_device_mm_access(adev, pos, buf, size, write);
352 }
353 }
354
355 /*
356 * register access helper functions.
357 */
358
359 /* Check if hw access should be skipped because of hotplug or device error */
amdgpu_device_skip_hw_access(struct amdgpu_device * adev)360 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
361 {
362 if (adev->no_hw_access)
363 return true;
364
365 #ifdef CONFIG_LOCKDEP
366 /*
367 * This is a bit complicated to understand, so worth a comment. What we assert
368 * here is that the GPU reset is not running on another thread in parallel.
369 *
370 * For this we trylock the read side of the reset semaphore, if that succeeds
371 * we know that the reset is not running in paralell.
372 *
373 * If the trylock fails we assert that we are either already holding the read
374 * side of the lock or are the reset thread itself and hold the write side of
375 * the lock.
376 */
377 if (in_task()) {
378 if (down_read_trylock(&adev->reset_domain->sem))
379 up_read(&adev->reset_domain->sem);
380 else
381 lockdep_assert_held(&adev->reset_domain->sem);
382 }
383 #endif
384 return false;
385 }
386
387 /**
388 * amdgpu_device_rreg - read a memory mapped IO or indirect register
389 *
390 * @adev: amdgpu_device pointer
391 * @reg: dword aligned register offset
392 * @acc_flags: access flags which require special behavior
393 *
394 * Returns the 32 bit value from the offset specified.
395 */
amdgpu_device_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags)396 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
397 uint32_t reg, uint32_t acc_flags)
398 {
399 uint32_t ret;
400
401 if (amdgpu_device_skip_hw_access(adev))
402 return 0;
403
404 if ((reg * 4) < adev->rmmio_size) {
405 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
406 amdgpu_sriov_runtime(adev) &&
407 down_read_trylock(&adev->reset_domain->sem)) {
408 ret = amdgpu_kiq_rreg(adev, reg);
409 up_read(&adev->reset_domain->sem);
410 } else {
411 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
412 }
413 } else {
414 ret = adev->pcie_rreg(adev, reg * 4);
415 }
416
417 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
418
419 return ret;
420 }
421
422 /*
423 * MMIO register read with bytes helper functions
424 * @offset:bytes offset from MMIO start
425 */
426
427 /**
428 * amdgpu_mm_rreg8 - read a memory mapped IO register
429 *
430 * @adev: amdgpu_device pointer
431 * @offset: byte aligned register offset
432 *
433 * Returns the 8 bit value from the offset specified.
434 */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)435 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
436 {
437 if (amdgpu_device_skip_hw_access(adev))
438 return 0;
439
440 if (offset < adev->rmmio_size)
441 return (readb(adev->rmmio + offset));
442 BUG();
443 }
444
445 /*
446 * MMIO register write with bytes helper functions
447 * @offset:bytes offset from MMIO start
448 * @value: the value want to be written to the register
449 */
450
451 /**
452 * amdgpu_mm_wreg8 - read a memory mapped IO register
453 *
454 * @adev: amdgpu_device pointer
455 * @offset: byte aligned register offset
456 * @value: 8 bit value to write
457 *
458 * Writes the value specified to the offset specified.
459 */
amdgpu_mm_wreg8(struct amdgpu_device * adev,uint32_t offset,uint8_t value)460 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
461 {
462 if (amdgpu_device_skip_hw_access(adev))
463 return;
464
465 if (offset < adev->rmmio_size)
466 writeb(value, adev->rmmio + offset);
467 else
468 BUG();
469 }
470
471 /**
472 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
473 *
474 * @adev: amdgpu_device pointer
475 * @reg: dword aligned register offset
476 * @v: 32 bit value to write to the register
477 * @acc_flags: access flags which require special behavior
478 *
479 * Writes the value specified to the offset specified.
480 */
amdgpu_device_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags)481 void amdgpu_device_wreg(struct amdgpu_device *adev,
482 uint32_t reg, uint32_t v,
483 uint32_t acc_flags)
484 {
485 if (amdgpu_device_skip_hw_access(adev))
486 return;
487
488 if ((reg * 4) < adev->rmmio_size) {
489 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
490 amdgpu_sriov_runtime(adev) &&
491 down_read_trylock(&adev->reset_domain->sem)) {
492 amdgpu_kiq_wreg(adev, reg, v);
493 up_read(&adev->reset_domain->sem);
494 } else {
495 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
496 }
497 } else {
498 adev->pcie_wreg(adev, reg * 4, v);
499 }
500
501 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
502 }
503
504 /**
505 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
506 *
507 * @adev: amdgpu_device pointer
508 * @reg: mmio/rlc register
509 * @v: value to write
510 *
511 * this function is invoked only for the debugfs register access
512 */
amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t xcc_id)513 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
514 uint32_t reg, uint32_t v,
515 uint32_t xcc_id)
516 {
517 if (amdgpu_device_skip_hw_access(adev))
518 return;
519
520 if (amdgpu_sriov_fullaccess(adev) &&
521 adev->gfx.rlc.funcs &&
522 adev->gfx.rlc.funcs->is_rlcg_access_range) {
523 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
524 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
525 } else if ((reg * 4) >= adev->rmmio_size) {
526 adev->pcie_wreg(adev, reg * 4, v);
527 } else {
528 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
529 }
530 }
531
532 /**
533 * amdgpu_device_indirect_rreg - read an indirect register
534 *
535 * @adev: amdgpu_device pointer
536 * @reg_addr: indirect register address to read from
537 *
538 * Returns the value of indirect register @reg_addr
539 */
amdgpu_device_indirect_rreg(struct amdgpu_device * adev,u32 reg_addr)540 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
541 u32 reg_addr)
542 {
543 unsigned long flags, pcie_index, pcie_data;
544 void __iomem *pcie_index_offset;
545 void __iomem *pcie_data_offset;
546 u32 r;
547
548 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
549 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
550
551 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
552 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
553 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
554
555 writel(reg_addr, pcie_index_offset);
556 readl(pcie_index_offset);
557 r = readl(pcie_data_offset);
558 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
559
560 return r;
561 }
562
amdgpu_device_indirect_rreg_ext(struct amdgpu_device * adev,u64 reg_addr)563 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
564 u64 reg_addr)
565 {
566 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
567 u32 r;
568 void __iomem *pcie_index_offset;
569 void __iomem *pcie_index_hi_offset;
570 void __iomem *pcie_data_offset;
571
572 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
573 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
574 if (adev->nbio.funcs->get_pcie_index_hi_offset)
575 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
576 else
577 pcie_index_hi = 0;
578
579 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
580 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
581 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
582 if (pcie_index_hi != 0)
583 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
584 pcie_index_hi * 4;
585
586 writel(reg_addr, pcie_index_offset);
587 readl(pcie_index_offset);
588 if (pcie_index_hi != 0) {
589 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
590 readl(pcie_index_hi_offset);
591 }
592 r = readl(pcie_data_offset);
593
594 /* clear the high bits */
595 if (pcie_index_hi != 0) {
596 writel(0, pcie_index_hi_offset);
597 readl(pcie_index_hi_offset);
598 }
599
600 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
601
602 return r;
603 }
604
605 /**
606 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
607 *
608 * @adev: amdgpu_device pointer
609 * @reg_addr: indirect register address to read from
610 *
611 * Returns the value of indirect register @reg_addr
612 */
amdgpu_device_indirect_rreg64(struct amdgpu_device * adev,u32 reg_addr)613 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
614 u32 reg_addr)
615 {
616 unsigned long flags, pcie_index, pcie_data;
617 void __iomem *pcie_index_offset;
618 void __iomem *pcie_data_offset;
619 u64 r;
620
621 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
622 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
623
624 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
625 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
626 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
627
628 /* read low 32 bits */
629 writel(reg_addr, pcie_index_offset);
630 readl(pcie_index_offset);
631 r = readl(pcie_data_offset);
632 /* read high 32 bits */
633 writel(reg_addr + 4, pcie_index_offset);
634 readl(pcie_index_offset);
635 r |= ((u64)readl(pcie_data_offset) << 32);
636 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
637
638 return r;
639 }
640
641 /**
642 * amdgpu_device_indirect_wreg - write an indirect register address
643 *
644 * @adev: amdgpu_device pointer
645 * @reg_addr: indirect register offset
646 * @reg_data: indirect register data
647 *
648 */
amdgpu_device_indirect_wreg(struct amdgpu_device * adev,u32 reg_addr,u32 reg_data)649 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
650 u32 reg_addr, u32 reg_data)
651 {
652 unsigned long flags, pcie_index, pcie_data;
653 void __iomem *pcie_index_offset;
654 void __iomem *pcie_data_offset;
655
656 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
657 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
658
659 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
660 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
661 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
662
663 writel(reg_addr, pcie_index_offset);
664 readl(pcie_index_offset);
665 writel(reg_data, pcie_data_offset);
666 readl(pcie_data_offset);
667 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
668 }
669
amdgpu_device_indirect_wreg_ext(struct amdgpu_device * adev,u64 reg_addr,u32 reg_data)670 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
671 u64 reg_addr, u32 reg_data)
672 {
673 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
674 void __iomem *pcie_index_offset;
675 void __iomem *pcie_index_hi_offset;
676 void __iomem *pcie_data_offset;
677
678 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
679 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
680 if (adev->nbio.funcs->get_pcie_index_hi_offset)
681 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
682 else
683 pcie_index_hi = 0;
684
685 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
688 if (pcie_index_hi != 0)
689 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
690 pcie_index_hi * 4;
691
692 writel(reg_addr, pcie_index_offset);
693 readl(pcie_index_offset);
694 if (pcie_index_hi != 0) {
695 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
696 readl(pcie_index_hi_offset);
697 }
698 writel(reg_data, pcie_data_offset);
699 readl(pcie_data_offset);
700
701 /* clear the high bits */
702 if (pcie_index_hi != 0) {
703 writel(0, pcie_index_hi_offset);
704 readl(pcie_index_hi_offset);
705 }
706
707 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
708 }
709
710 /**
711 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
712 *
713 * @adev: amdgpu_device pointer
714 * @reg_addr: indirect register offset
715 * @reg_data: indirect register data
716 *
717 */
amdgpu_device_indirect_wreg64(struct amdgpu_device * adev,u32 reg_addr,u64 reg_data)718 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
719 u32 reg_addr, u64 reg_data)
720 {
721 unsigned long flags, pcie_index, pcie_data;
722 void __iomem *pcie_index_offset;
723 void __iomem *pcie_data_offset;
724
725 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
726 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
727
728 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
729 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
730 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
731
732 /* write low 32 bits */
733 writel(reg_addr, pcie_index_offset);
734 readl(pcie_index_offset);
735 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
736 readl(pcie_data_offset);
737 /* write high 32 bits */
738 writel(reg_addr + 4, pcie_index_offset);
739 readl(pcie_index_offset);
740 writel((u32)(reg_data >> 32), pcie_data_offset);
741 readl(pcie_data_offset);
742 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
743 }
744
745 /**
746 * amdgpu_device_get_rev_id - query device rev_id
747 *
748 * @adev: amdgpu_device pointer
749 *
750 * Return device rev_id
751 */
amdgpu_device_get_rev_id(struct amdgpu_device * adev)752 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
753 {
754 return adev->nbio.funcs->get_rev_id(adev);
755 }
756
757 /**
758 * amdgpu_invalid_rreg - dummy reg read function
759 *
760 * @adev: amdgpu_device pointer
761 * @reg: offset of register
762 *
763 * Dummy register read function. Used for register blocks
764 * that certain asics don't have (all asics).
765 * Returns the value in the register.
766 */
amdgpu_invalid_rreg(struct amdgpu_device * adev,uint32_t reg)767 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
768 {
769 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
770 BUG();
771 return 0;
772 }
773
amdgpu_invalid_rreg_ext(struct amdgpu_device * adev,uint64_t reg)774 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
775 {
776 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
777 BUG();
778 return 0;
779 }
780
781 /**
782 * amdgpu_invalid_wreg - dummy reg write function
783 *
784 * @adev: amdgpu_device pointer
785 * @reg: offset of register
786 * @v: value to write to the register
787 *
788 * Dummy register read function. Used for register blocks
789 * that certain asics don't have (all asics).
790 */
amdgpu_invalid_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v)791 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
792 {
793 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
794 reg, v);
795 BUG();
796 }
797
amdgpu_invalid_wreg_ext(struct amdgpu_device * adev,uint64_t reg,uint32_t v)798 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
799 {
800 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
801 reg, v);
802 BUG();
803 }
804
805 /**
806 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
807 *
808 * @adev: amdgpu_device pointer
809 * @reg: offset of register
810 *
811 * Dummy register read function. Used for register blocks
812 * that certain asics don't have (all asics).
813 * Returns the value in the register.
814 */
amdgpu_invalid_rreg64(struct amdgpu_device * adev,uint32_t reg)815 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
816 {
817 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
818 BUG();
819 return 0;
820 }
821
822 /**
823 * amdgpu_invalid_wreg64 - dummy reg write function
824 *
825 * @adev: amdgpu_device pointer
826 * @reg: offset of register
827 * @v: value to write to the register
828 *
829 * Dummy register read function. Used for register blocks
830 * that certain asics don't have (all asics).
831 */
amdgpu_invalid_wreg64(struct amdgpu_device * adev,uint32_t reg,uint64_t v)832 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
833 {
834 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
835 reg, v);
836 BUG();
837 }
838
839 /**
840 * amdgpu_block_invalid_rreg - dummy reg read function
841 *
842 * @adev: amdgpu_device pointer
843 * @block: offset of instance
844 * @reg: offset of register
845 *
846 * Dummy register read function. Used for register blocks
847 * that certain asics don't have (all asics).
848 * Returns the value in the register.
849 */
amdgpu_block_invalid_rreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg)850 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
851 uint32_t block, uint32_t reg)
852 {
853 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
854 reg, block);
855 BUG();
856 return 0;
857 }
858
859 /**
860 * amdgpu_block_invalid_wreg - dummy reg write function
861 *
862 * @adev: amdgpu_device pointer
863 * @block: offset of instance
864 * @reg: offset of register
865 * @v: value to write to the register
866 *
867 * Dummy register read function. Used for register blocks
868 * that certain asics don't have (all asics).
869 */
amdgpu_block_invalid_wreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg,uint32_t v)870 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
871 uint32_t block,
872 uint32_t reg, uint32_t v)
873 {
874 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
875 reg, block, v);
876 BUG();
877 }
878
879 /**
880 * amdgpu_device_asic_init - Wrapper for atom asic_init
881 *
882 * @adev: amdgpu_device pointer
883 *
884 * Does any asic specific work and then calls atom asic init.
885 */
amdgpu_device_asic_init(struct amdgpu_device * adev)886 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
887 {
888 int ret;
889
890 amdgpu_asic_pre_asic_init(adev);
891
892 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
893 adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) {
894 amdgpu_psp_wait_for_bootloader(adev);
895 ret = amdgpu_atomfirmware_asic_init(adev, true);
896 return ret;
897 } else {
898 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
899 }
900
901 return 0;
902 }
903
904 /**
905 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
906 *
907 * @adev: amdgpu_device pointer
908 *
909 * Allocates a scratch page of VRAM for use by various things in the
910 * driver.
911 */
amdgpu_device_mem_scratch_init(struct amdgpu_device * adev)912 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
913 {
914 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
915 AMDGPU_GEM_DOMAIN_VRAM |
916 AMDGPU_GEM_DOMAIN_GTT,
917 &adev->mem_scratch.robj,
918 &adev->mem_scratch.gpu_addr,
919 (void **)&adev->mem_scratch.ptr);
920 }
921
922 /**
923 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
924 *
925 * @adev: amdgpu_device pointer
926 *
927 * Frees the VRAM scratch page.
928 */
amdgpu_device_mem_scratch_fini(struct amdgpu_device * adev)929 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
930 {
931 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
932 }
933
934 /**
935 * amdgpu_device_program_register_sequence - program an array of registers.
936 *
937 * @adev: amdgpu_device pointer
938 * @registers: pointer to the register array
939 * @array_size: size of the register array
940 *
941 * Programs an array or registers with and or masks.
942 * This is a helper for setting golden registers.
943 */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)944 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
945 const u32 *registers,
946 const u32 array_size)
947 {
948 u32 tmp, reg, and_mask, or_mask;
949 int i;
950
951 if (array_size % 3)
952 return;
953
954 for (i = 0; i < array_size; i += 3) {
955 reg = registers[i + 0];
956 and_mask = registers[i + 1];
957 or_mask = registers[i + 2];
958
959 if (and_mask == 0xffffffff) {
960 tmp = or_mask;
961 } else {
962 tmp = RREG32(reg);
963 tmp &= ~and_mask;
964 if (adev->family >= AMDGPU_FAMILY_AI)
965 tmp |= (or_mask & and_mask);
966 else
967 tmp |= or_mask;
968 }
969 WREG32(reg, tmp);
970 }
971 }
972
973 /**
974 * amdgpu_device_pci_config_reset - reset the GPU
975 *
976 * @adev: amdgpu_device pointer
977 *
978 * Resets the GPU using the pci config reset sequence.
979 * Only applicable to asics prior to vega10.
980 */
amdgpu_device_pci_config_reset(struct amdgpu_device * adev)981 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
982 {
983 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
984 }
985
986 /**
987 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
988 *
989 * @adev: amdgpu_device pointer
990 *
991 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
992 */
amdgpu_device_pci_reset(struct amdgpu_device * adev)993 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
994 {
995 return pci_reset_function(adev->pdev);
996 }
997
998 /*
999 * amdgpu_device_wb_*()
1000 * Writeback is the method by which the GPU updates special pages in memory
1001 * with the status of certain GPU events (fences, ring pointers,etc.).
1002 */
1003
1004 /**
1005 * amdgpu_device_wb_fini - Disable Writeback and free memory
1006 *
1007 * @adev: amdgpu_device pointer
1008 *
1009 * Disables Writeback and frees the Writeback memory (all asics).
1010 * Used at driver shutdown.
1011 */
amdgpu_device_wb_fini(struct amdgpu_device * adev)1012 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1013 {
1014 if (adev->wb.wb_obj) {
1015 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1016 &adev->wb.gpu_addr,
1017 (void **)&adev->wb.wb);
1018 adev->wb.wb_obj = NULL;
1019 }
1020 }
1021
1022 /**
1023 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1024 *
1025 * @adev: amdgpu_device pointer
1026 *
1027 * Initializes writeback and allocates writeback memory (all asics).
1028 * Used at driver startup.
1029 * Returns 0 on success or an -error on failure.
1030 */
amdgpu_device_wb_init(struct amdgpu_device * adev)1031 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1032 {
1033 int r;
1034
1035 if (adev->wb.wb_obj == NULL) {
1036 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1037 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1038 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1039 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1040 (void **)&adev->wb.wb);
1041 if (r) {
1042 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1043 return r;
1044 }
1045
1046 adev->wb.num_wb = AMDGPU_MAX_WB;
1047 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1048
1049 /* clear wb memory */
1050 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1051 }
1052
1053 return 0;
1054 }
1055
1056 /**
1057 * amdgpu_device_wb_get - Allocate a wb entry
1058 *
1059 * @adev: amdgpu_device pointer
1060 * @wb: wb index
1061 *
1062 * Allocate a wb slot for use by the driver (all asics).
1063 * Returns 0 on success or -EINVAL on failure.
1064 */
amdgpu_device_wb_get(struct amdgpu_device * adev,u32 * wb)1065 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1066 {
1067 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1068
1069 if (offset < adev->wb.num_wb) {
1070 __set_bit(offset, adev->wb.used);
1071 *wb = offset << 3; /* convert to dw offset */
1072 return 0;
1073 } else {
1074 return -EINVAL;
1075 }
1076 }
1077
1078 /**
1079 * amdgpu_device_wb_free - Free a wb entry
1080 *
1081 * @adev: amdgpu_device pointer
1082 * @wb: wb index
1083 *
1084 * Free a wb slot allocated for use by the driver (all asics)
1085 */
amdgpu_device_wb_free(struct amdgpu_device * adev,u32 wb)1086 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1087 {
1088 wb >>= 3;
1089 if (wb < adev->wb.num_wb)
1090 __clear_bit(wb, adev->wb.used);
1091 }
1092
1093 /**
1094 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1095 *
1096 * @adev: amdgpu_device pointer
1097 *
1098 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1099 * to fail, but if any of the BARs is not accessible after the size we abort
1100 * driver loading by returning -ENODEV.
1101 */
amdgpu_device_resize_fb_bar(struct amdgpu_device * adev)1102 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1103 {
1104 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1105 struct pci_bus *root;
1106 struct resource *res;
1107 unsigned int i;
1108 u16 cmd;
1109 int r;
1110
1111 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1112 return 0;
1113
1114 /* Bypass for VF */
1115 if (amdgpu_sriov_vf(adev))
1116 return 0;
1117
1118 /* skip if the bios has already enabled large BAR */
1119 if (adev->gmc.real_vram_size &&
1120 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1121 return 0;
1122
1123 /* Check if the root BUS has 64bit memory resources */
1124 root = adev->pdev->bus;
1125 while (root->parent)
1126 root = root->parent;
1127
1128 pci_bus_for_each_resource(root, res, i) {
1129 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1130 res->start > 0x100000000ull)
1131 break;
1132 }
1133
1134 /* Trying to resize is pointless without a root hub window above 4GB */
1135 if (!res)
1136 return 0;
1137
1138 /* Limit the BAR size to what is available */
1139 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1140 rbar_size);
1141
1142 /* Disable memory decoding while we change the BAR addresses and size */
1143 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1144 pci_write_config_word(adev->pdev, PCI_COMMAND,
1145 cmd & ~PCI_COMMAND_MEMORY);
1146
1147 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1148 amdgpu_doorbell_fini(adev);
1149 if (adev->asic_type >= CHIP_BONAIRE)
1150 pci_release_resource(adev->pdev, 2);
1151
1152 pci_release_resource(adev->pdev, 0);
1153
1154 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1155 if (r == -ENOSPC)
1156 DRM_INFO("Not enough PCI address space for a large BAR.");
1157 else if (r && r != -ENOTSUPP)
1158 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1159
1160 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1161
1162 /* When the doorbell or fb BAR isn't available we have no chance of
1163 * using the device.
1164 */
1165 r = amdgpu_doorbell_init(adev);
1166 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1167 return -ENODEV;
1168
1169 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1170
1171 return 0;
1172 }
1173
amdgpu_device_read_bios(struct amdgpu_device * adev)1174 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1175 {
1176 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1177 return false;
1178
1179 return true;
1180 }
1181
1182 /*
1183 * GPU helpers function.
1184 */
1185 /**
1186 * amdgpu_device_need_post - check if the hw need post or not
1187 *
1188 * @adev: amdgpu_device pointer
1189 *
1190 * Check if the asic has been initialized (all asics) at driver startup
1191 * or post is needed if hw reset is performed.
1192 * Returns true if need or false if not.
1193 */
amdgpu_device_need_post(struct amdgpu_device * adev)1194 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1195 {
1196 uint32_t reg;
1197
1198 if (amdgpu_sriov_vf(adev))
1199 return false;
1200
1201 if (!amdgpu_device_read_bios(adev))
1202 return false;
1203
1204 if (amdgpu_passthrough(adev)) {
1205 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1206 * some old smc fw still need driver do vPost otherwise gpu hang, while
1207 * those smc fw version above 22.15 doesn't have this flaw, so we force
1208 * vpost executed for smc version below 22.15
1209 */
1210 if (adev->asic_type == CHIP_FIJI) {
1211 int err;
1212 uint32_t fw_ver;
1213
1214 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1215 /* force vPost if error occured */
1216 if (err)
1217 return true;
1218
1219 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1220 if (fw_ver < 0x00160e00)
1221 return true;
1222 }
1223 }
1224
1225 /* Don't post if we need to reset whole hive on init */
1226 if (adev->gmc.xgmi.pending_reset)
1227 return false;
1228
1229 if (adev->has_hw_reset) {
1230 adev->has_hw_reset = false;
1231 return true;
1232 }
1233
1234 /* bios scratch used on CIK+ */
1235 if (adev->asic_type >= CHIP_BONAIRE)
1236 return amdgpu_atombios_scratch_need_asic_init(adev);
1237
1238 /* check MEM_SIZE for older asics */
1239 reg = amdgpu_asic_get_config_memsize(adev);
1240
1241 if ((reg != 0) && (reg != 0xffffffff))
1242 return false;
1243
1244 return true;
1245 }
1246
1247 /*
1248 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1249 * speed switching. Until we have confirmation from Intel that a specific host
1250 * supports it, it's safer that we keep it disabled for all.
1251 *
1252 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1253 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1254 */
amdgpu_device_pcie_dynamic_switching_supported(void)1255 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1256 {
1257 #if IS_ENABLED(CONFIG_X86)
1258 struct cpuinfo_x86 *c = &cpu_data(0);
1259
1260 if (c->x86_vendor == X86_VENDOR_INTEL)
1261 return false;
1262 #endif
1263 return true;
1264 }
1265
1266 /**
1267 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1268 *
1269 * @adev: amdgpu_device pointer
1270 *
1271 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1272 * be set for this device.
1273 *
1274 * Returns true if it should be used or false if not.
1275 */
amdgpu_device_should_use_aspm(struct amdgpu_device * adev)1276 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1277 {
1278 switch (amdgpu_aspm) {
1279 case -1:
1280 break;
1281 case 0:
1282 return false;
1283 case 1:
1284 return true;
1285 default:
1286 return false;
1287 }
1288 return pcie_aspm_enabled(adev->pdev);
1289 }
1290
amdgpu_device_aspm_support_quirk(void)1291 bool amdgpu_device_aspm_support_quirk(void)
1292 {
1293 #if IS_ENABLED(CONFIG_X86)
1294 struct cpuinfo_x86 *c = &cpu_data(0);
1295
1296 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1297 #else
1298 return true;
1299 #endif
1300 }
1301
1302 /* if we get transitioned to only one device, take VGA back */
1303 /**
1304 * amdgpu_device_vga_set_decode - enable/disable vga decode
1305 *
1306 * @pdev: PCI device pointer
1307 * @state: enable/disable vga decode
1308 *
1309 * Enable/disable vga decode (all asics).
1310 * Returns VGA resource flags.
1311 */
amdgpu_device_vga_set_decode(struct pci_dev * pdev,bool state)1312 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1313 bool state)
1314 {
1315 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1316
1317 amdgpu_asic_set_vga_state(adev, state);
1318 if (state)
1319 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1320 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1321 else
1322 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1323 }
1324
1325 /**
1326 * amdgpu_device_check_block_size - validate the vm block size
1327 *
1328 * @adev: amdgpu_device pointer
1329 *
1330 * Validates the vm block size specified via module parameter.
1331 * The vm block size defines number of bits in page table versus page directory,
1332 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1333 * page table and the remaining bits are in the page directory.
1334 */
amdgpu_device_check_block_size(struct amdgpu_device * adev)1335 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1336 {
1337 /* defines number of bits in page table versus page directory,
1338 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1339 * page table and the remaining bits are in the page directory
1340 */
1341 if (amdgpu_vm_block_size == -1)
1342 return;
1343
1344 if (amdgpu_vm_block_size < 9) {
1345 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1346 amdgpu_vm_block_size);
1347 amdgpu_vm_block_size = -1;
1348 }
1349 }
1350
1351 /**
1352 * amdgpu_device_check_vm_size - validate the vm size
1353 *
1354 * @adev: amdgpu_device pointer
1355 *
1356 * Validates the vm size in GB specified via module parameter.
1357 * The VM size is the size of the GPU virtual memory space in GB.
1358 */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)1359 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1360 {
1361 /* no need to check the default value */
1362 if (amdgpu_vm_size == -1)
1363 return;
1364
1365 if (amdgpu_vm_size < 1) {
1366 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1367 amdgpu_vm_size);
1368 amdgpu_vm_size = -1;
1369 }
1370 }
1371
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)1372 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1373 {
1374 struct sysinfo si;
1375 bool is_os_64 = (sizeof(void *) == 8);
1376 uint64_t total_memory;
1377 uint64_t dram_size_seven_GB = 0x1B8000000;
1378 uint64_t dram_size_three_GB = 0xB8000000;
1379
1380 if (amdgpu_smu_memory_pool_size == 0)
1381 return;
1382
1383 if (!is_os_64) {
1384 DRM_WARN("Not 64-bit OS, feature not supported\n");
1385 goto def_value;
1386 }
1387 si_meminfo(&si);
1388 total_memory = (uint64_t)si.totalram * si.mem_unit;
1389
1390 if ((amdgpu_smu_memory_pool_size == 1) ||
1391 (amdgpu_smu_memory_pool_size == 2)) {
1392 if (total_memory < dram_size_three_GB)
1393 goto def_value1;
1394 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1395 (amdgpu_smu_memory_pool_size == 8)) {
1396 if (total_memory < dram_size_seven_GB)
1397 goto def_value1;
1398 } else {
1399 DRM_WARN("Smu memory pool size not supported\n");
1400 goto def_value;
1401 }
1402 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1403
1404 return;
1405
1406 def_value1:
1407 DRM_WARN("No enough system memory\n");
1408 def_value:
1409 adev->pm.smu_prv_buffer_size = 0;
1410 }
1411
amdgpu_device_init_apu_flags(struct amdgpu_device * adev)1412 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1413 {
1414 if (!(adev->flags & AMD_IS_APU) ||
1415 adev->asic_type < CHIP_RAVEN)
1416 return 0;
1417
1418 switch (adev->asic_type) {
1419 case CHIP_RAVEN:
1420 if (adev->pdev->device == 0x15dd)
1421 adev->apu_flags |= AMD_APU_IS_RAVEN;
1422 if (adev->pdev->device == 0x15d8)
1423 adev->apu_flags |= AMD_APU_IS_PICASSO;
1424 break;
1425 case CHIP_RENOIR:
1426 if ((adev->pdev->device == 0x1636) ||
1427 (adev->pdev->device == 0x164c))
1428 adev->apu_flags |= AMD_APU_IS_RENOIR;
1429 else
1430 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1431 break;
1432 case CHIP_VANGOGH:
1433 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1434 break;
1435 case CHIP_YELLOW_CARP:
1436 break;
1437 case CHIP_CYAN_SKILLFISH:
1438 if ((adev->pdev->device == 0x13FE) ||
1439 (adev->pdev->device == 0x143F))
1440 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1441 break;
1442 default:
1443 break;
1444 }
1445
1446 return 0;
1447 }
1448
1449 /**
1450 * amdgpu_device_check_arguments - validate module params
1451 *
1452 * @adev: amdgpu_device pointer
1453 *
1454 * Validates certain module parameters and updates
1455 * the associated values used by the driver (all asics).
1456 */
amdgpu_device_check_arguments(struct amdgpu_device * adev)1457 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1458 {
1459 if (amdgpu_sched_jobs < 4) {
1460 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1461 amdgpu_sched_jobs);
1462 amdgpu_sched_jobs = 4;
1463 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1464 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1465 amdgpu_sched_jobs);
1466 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1467 }
1468
1469 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1470 /* gart size must be greater or equal to 32M */
1471 dev_warn(adev->dev, "gart size (%d) too small\n",
1472 amdgpu_gart_size);
1473 amdgpu_gart_size = -1;
1474 }
1475
1476 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1477 /* gtt size must be greater or equal to 32M */
1478 dev_warn(adev->dev, "gtt size (%d) too small\n",
1479 amdgpu_gtt_size);
1480 amdgpu_gtt_size = -1;
1481 }
1482
1483 /* valid range is between 4 and 9 inclusive */
1484 if (amdgpu_vm_fragment_size != -1 &&
1485 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1486 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1487 amdgpu_vm_fragment_size = -1;
1488 }
1489
1490 if (amdgpu_sched_hw_submission < 2) {
1491 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1492 amdgpu_sched_hw_submission);
1493 amdgpu_sched_hw_submission = 2;
1494 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1495 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1496 amdgpu_sched_hw_submission);
1497 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1498 }
1499
1500 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1501 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1502 amdgpu_reset_method = -1;
1503 }
1504
1505 amdgpu_device_check_smu_prv_buffer_size(adev);
1506
1507 amdgpu_device_check_vm_size(adev);
1508
1509 amdgpu_device_check_block_size(adev);
1510
1511 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1512
1513 return 0;
1514 }
1515
1516 /**
1517 * amdgpu_switcheroo_set_state - set switcheroo state
1518 *
1519 * @pdev: pci dev pointer
1520 * @state: vga_switcheroo state
1521 *
1522 * Callback for the switcheroo driver. Suspends or resumes
1523 * the asics before or after it is powered up using ACPI methods.
1524 */
amdgpu_switcheroo_set_state(struct pci_dev * pdev,enum vga_switcheroo_state state)1525 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1526 enum vga_switcheroo_state state)
1527 {
1528 struct drm_device *dev = pci_get_drvdata(pdev);
1529 int r;
1530
1531 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1532 return;
1533
1534 if (state == VGA_SWITCHEROO_ON) {
1535 pr_info("switched on\n");
1536 /* don't suspend or resume card normally */
1537 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1538
1539 pci_set_power_state(pdev, PCI_D0);
1540 amdgpu_device_load_pci_state(pdev);
1541 r = pci_enable_device(pdev);
1542 if (r)
1543 DRM_WARN("pci_enable_device failed (%d)\n", r);
1544 amdgpu_device_resume(dev, true);
1545
1546 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1547 } else {
1548 pr_info("switched off\n");
1549 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1550 amdgpu_device_suspend(dev, true);
1551 amdgpu_device_cache_pci_state(pdev);
1552 /* Shut down the device */
1553 pci_disable_device(pdev);
1554 pci_set_power_state(pdev, PCI_D3cold);
1555 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1556 }
1557 }
1558
1559 /**
1560 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1561 *
1562 * @pdev: pci dev pointer
1563 *
1564 * Callback for the switcheroo driver. Check of the switcheroo
1565 * state can be changed.
1566 * Returns true if the state can be changed, false if not.
1567 */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)1568 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1569 {
1570 struct drm_device *dev = pci_get_drvdata(pdev);
1571
1572 /*
1573 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1574 * locking inversion with the driver load path. And the access here is
1575 * completely racy anyway. So don't bother with locking for now.
1576 */
1577 return atomic_read(&dev->open_count) == 0;
1578 }
1579
1580 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1581 .set_gpu_state = amdgpu_switcheroo_set_state,
1582 .reprobe = NULL,
1583 .can_switch = amdgpu_switcheroo_can_switch,
1584 };
1585
1586 /**
1587 * amdgpu_device_ip_set_clockgating_state - set the CG state
1588 *
1589 * @dev: amdgpu_device pointer
1590 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1591 * @state: clockgating state (gate or ungate)
1592 *
1593 * Sets the requested clockgating state for all instances of
1594 * the hardware IP specified.
1595 * Returns the error code from the last instance.
1596 */
amdgpu_device_ip_set_clockgating_state(void * dev,enum amd_ip_block_type block_type,enum amd_clockgating_state state)1597 int amdgpu_device_ip_set_clockgating_state(void *dev,
1598 enum amd_ip_block_type block_type,
1599 enum amd_clockgating_state state)
1600 {
1601 struct amdgpu_device *adev = dev;
1602 int i, r = 0;
1603
1604 for (i = 0; i < adev->num_ip_blocks; i++) {
1605 if (!adev->ip_blocks[i].status.valid)
1606 continue;
1607 if (adev->ip_blocks[i].version->type != block_type)
1608 continue;
1609 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1610 continue;
1611 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1612 (void *)adev, state);
1613 if (r)
1614 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1615 adev->ip_blocks[i].version->funcs->name, r);
1616 }
1617 return r;
1618 }
1619
1620 /**
1621 * amdgpu_device_ip_set_powergating_state - set the PG state
1622 *
1623 * @dev: amdgpu_device pointer
1624 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1625 * @state: powergating state (gate or ungate)
1626 *
1627 * Sets the requested powergating state for all instances of
1628 * the hardware IP specified.
1629 * Returns the error code from the last instance.
1630 */
amdgpu_device_ip_set_powergating_state(void * dev,enum amd_ip_block_type block_type,enum amd_powergating_state state)1631 int amdgpu_device_ip_set_powergating_state(void *dev,
1632 enum amd_ip_block_type block_type,
1633 enum amd_powergating_state state)
1634 {
1635 struct amdgpu_device *adev = dev;
1636 int i, r = 0;
1637
1638 for (i = 0; i < adev->num_ip_blocks; i++) {
1639 if (!adev->ip_blocks[i].status.valid)
1640 continue;
1641 if (adev->ip_blocks[i].version->type != block_type)
1642 continue;
1643 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1644 continue;
1645 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1646 (void *)adev, state);
1647 if (r)
1648 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1649 adev->ip_blocks[i].version->funcs->name, r);
1650 }
1651 return r;
1652 }
1653
1654 /**
1655 * amdgpu_device_ip_get_clockgating_state - get the CG state
1656 *
1657 * @adev: amdgpu_device pointer
1658 * @flags: clockgating feature flags
1659 *
1660 * Walks the list of IPs on the device and updates the clockgating
1661 * flags for each IP.
1662 * Updates @flags with the feature flags for each hardware IP where
1663 * clockgating is enabled.
1664 */
amdgpu_device_ip_get_clockgating_state(struct amdgpu_device * adev,u64 * flags)1665 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1666 u64 *flags)
1667 {
1668 int i;
1669
1670 for (i = 0; i < adev->num_ip_blocks; i++) {
1671 if (!adev->ip_blocks[i].status.valid)
1672 continue;
1673 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1674 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1675 }
1676 }
1677
1678 /**
1679 * amdgpu_device_ip_wait_for_idle - wait for idle
1680 *
1681 * @adev: amdgpu_device pointer
1682 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1683 *
1684 * Waits for the request hardware IP to be idle.
1685 * Returns 0 for success or a negative error code on failure.
1686 */
amdgpu_device_ip_wait_for_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1687 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1688 enum amd_ip_block_type block_type)
1689 {
1690 int i, r;
1691
1692 for (i = 0; i < adev->num_ip_blocks; i++) {
1693 if (!adev->ip_blocks[i].status.valid)
1694 continue;
1695 if (adev->ip_blocks[i].version->type == block_type) {
1696 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1697 if (r)
1698 return r;
1699 break;
1700 }
1701 }
1702 return 0;
1703
1704 }
1705
1706 /**
1707 * amdgpu_device_ip_is_idle - is the hardware IP idle
1708 *
1709 * @adev: amdgpu_device pointer
1710 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1711 *
1712 * Check if the hardware IP is idle or not.
1713 * Returns true if it the IP is idle, false if not.
1714 */
amdgpu_device_ip_is_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1715 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1716 enum amd_ip_block_type block_type)
1717 {
1718 int i;
1719
1720 for (i = 0; i < adev->num_ip_blocks; i++) {
1721 if (!adev->ip_blocks[i].status.valid)
1722 continue;
1723 if (adev->ip_blocks[i].version->type == block_type)
1724 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1725 }
1726 return true;
1727
1728 }
1729
1730 /**
1731 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1732 *
1733 * @adev: amdgpu_device pointer
1734 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1735 *
1736 * Returns a pointer to the hardware IP block structure
1737 * if it exists for the asic, otherwise NULL.
1738 */
1739 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)1740 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1741 enum amd_ip_block_type type)
1742 {
1743 int i;
1744
1745 for (i = 0; i < adev->num_ip_blocks; i++)
1746 if (adev->ip_blocks[i].version->type == type)
1747 return &adev->ip_blocks[i];
1748
1749 return NULL;
1750 }
1751
1752 /**
1753 * amdgpu_device_ip_block_version_cmp
1754 *
1755 * @adev: amdgpu_device pointer
1756 * @type: enum amd_ip_block_type
1757 * @major: major version
1758 * @minor: minor version
1759 *
1760 * return 0 if equal or greater
1761 * return 1 if smaller or the ip_block doesn't exist
1762 */
amdgpu_device_ip_block_version_cmp(struct amdgpu_device * adev,enum amd_ip_block_type type,u32 major,u32 minor)1763 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1764 enum amd_ip_block_type type,
1765 u32 major, u32 minor)
1766 {
1767 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1768
1769 if (ip_block && ((ip_block->version->major > major) ||
1770 ((ip_block->version->major == major) &&
1771 (ip_block->version->minor >= minor))))
1772 return 0;
1773
1774 return 1;
1775 }
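
/*
 * Sketch of the intended calling pattern (the block type and version numbers
 * are arbitrary examples): a return of 0 means "this version or newer exists".
 *
 *	bool smc_v7_or_newer =
 *		!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC, 7, 0);
 */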
1776
1777 /**
1778 * amdgpu_device_ip_block_add
1779 *
1780 * @adev: amdgpu_device pointer
1781 * @ip_block_version: pointer to the IP to add
1782 *
1783 * Adds the IP block driver information to the collection of IPs
1784 * on the asic.
1785 */
1786 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1787 const struct amdgpu_ip_block_version *ip_block_version)
1788 {
1789 if (!ip_block_version)
1790 return -EINVAL;
1791
1792 switch (ip_block_version->type) {
1793 case AMD_IP_BLOCK_TYPE_VCN:
1794 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1795 return 0;
1796 break;
1797 case AMD_IP_BLOCK_TYPE_JPEG:
1798 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1799 return 0;
1800 break;
1801 default:
1802 break;
1803 }
1804
1805 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1806 ip_block_version->funcs->name);
1807
1808 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1809
1810 return 0;
1811 }
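
/*
 * Hedged sketch of how an ASIC setup routine registers its IP blocks in
 * bring-up order (the *_ip_block symbols are defined elsewhere in the driver
 * and are only illustrative here):
 *
 *	amdgpu_device_ip_block_add(adev, &soc15_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v9_0_ip_block);
 *	amdgpu_device_ip_block_add(adev, &vega10_ih_ip_block);
 *	...
 */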
1812
1813 /**
1814 * amdgpu_device_enable_virtual_display - enable virtual display feature
1815 *
1816 * @adev: amdgpu_device pointer
1817 *
1818 * Enables the virtual display feature if the user has enabled it via
1819 * the module parameter virtual_display. This feature provides virtual
1820 * display hardware on headless boards or in virtualized environments.
1821 * This function parses and validates the configuration string specified by
1822 * the user and configures the virtual display configuration (number of
1823 * virtual connectors, crtcs, etc.) specified.
1824 */
1825 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1826 {
1827 adev->enable_virtual_display = false;
1828
1829 if (amdgpu_virtual_display) {
1830 const char *pci_address_name = pci_name(adev->pdev);
1831 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1832
1833 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1834 pciaddstr_tmp = pciaddstr;
1835 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1836 pciaddname = strsep(&pciaddname_tmp, ",");
1837 if (!strcmp("all", pciaddname)
1838 || !strcmp(pci_address_name, pciaddname)) {
1839 long num_crtc;
1840 int res = -1;
1841
1842 adev->enable_virtual_display = true;
1843
1844 if (pciaddname_tmp)
1845 res = kstrtol(pciaddname_tmp, 10,
1846 &num_crtc);
1847
1848 if (!res) {
1849 if (num_crtc < 1)
1850 num_crtc = 1;
1851 if (num_crtc > 6)
1852 num_crtc = 6;
1853 adev->mode_info.num_crtc = num_crtc;
1854 } else {
1855 adev->mode_info.num_crtc = 1;
1856 }
1857 break;
1858 }
1859 }
1860
1861 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1862 amdgpu_virtual_display, pci_address_name,
1863 adev->enable_virtual_display, adev->mode_info.num_crtc);
1864
1865 kfree(pciaddstr);
1866 }
1867 }
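
/*
 * Example of the module parameter format the parser above expects (addresses
 * and CRTC counts are illustrative): entries are semicolon separated, each a
 * PCI bus id (or "all") optionally followed by a CRTC count (clamped to 1..6).
 *
 *	modprobe amdgpu virtual_display="0000:03:00.0,2;0000:04:00.0,1"
 *	modprobe amdgpu virtual_display=all
 */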
1868
1869 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1870 {
1871 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1872 adev->mode_info.num_crtc = 1;
1873 adev->enable_virtual_display = true;
1874 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1875 adev->enable_virtual_display, adev->mode_info.num_crtc);
1876 }
1877 }
1878
1879 /**
1880 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1881 *
1882 * @adev: amdgpu_device pointer
1883 *
1884 * Parses the asic configuration parameters specified in the gpu info
1885 * firmware and makes them available to the driver for use in configuring
1886 * the asic.
1887 * Returns 0 on success, -EINVAL on failure.
1888 */
1889 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1890 {
1891 const char *chip_name;
1892 char fw_name[40];
1893 int err;
1894 const struct gpu_info_firmware_header_v1_0 *hdr;
1895
1896 adev->firmware.gpu_info_fw = NULL;
1897
1898 if (adev->mman.discovery_bin) {
1899 /*
1900 * FIXME: The bounding box is still needed by Navi12, so
1901 * temporarily read it from gpu_info firmware. Should be dropped
1902 * when DAL no longer needs it.
1903 */
1904 if (adev->asic_type != CHIP_NAVI12)
1905 return 0;
1906 }
1907
1908 switch (adev->asic_type) {
1909 default:
1910 return 0;
1911 case CHIP_VEGA10:
1912 chip_name = "vega10";
1913 break;
1914 case CHIP_VEGA12:
1915 chip_name = "vega12";
1916 break;
1917 case CHIP_RAVEN:
1918 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1919 chip_name = "raven2";
1920 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1921 chip_name = "picasso";
1922 else
1923 chip_name = "raven";
1924 break;
1925 case CHIP_ARCTURUS:
1926 chip_name = "arcturus";
1927 break;
1928 case CHIP_NAVI12:
1929 chip_name = "navi12";
1930 break;
1931 }
1932
1933 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1934 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1935 if (err) {
1936 dev_err(adev->dev,
1937 "Failed to get gpu_info firmware \"%s\"\n",
1938 fw_name);
1939 goto out;
1940 }
1941
1942 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1943 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1944
1945 switch (hdr->version_major) {
1946 case 1:
1947 {
1948 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1949 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1950 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1951
1952 /*
1953 * Should be dropped when DAL no longer needs it.
1954 */
1955 if (adev->asic_type == CHIP_NAVI12)
1956 goto parse_soc_bounding_box;
1957
1958 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1959 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1960 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1961 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1962 adev->gfx.config.max_texture_channel_caches =
1963 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1964 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1965 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1966 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1967 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1968 adev->gfx.config.double_offchip_lds_buf =
1969 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1970 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1971 adev->gfx.cu_info.max_waves_per_simd =
1972 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1973 adev->gfx.cu_info.max_scratch_slots_per_cu =
1974 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1975 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1976 if (hdr->version_minor >= 1) {
1977 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1978 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1979 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1980 adev->gfx.config.num_sc_per_sh =
1981 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1982 adev->gfx.config.num_packer_per_sc =
1983 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1984 }
1985
1986 parse_soc_bounding_box:
1987 /*
1988 * soc bounding box info is not integrated in the discovery table,
1989 * we always need to parse it from gpu info firmware if needed.
1990 */
1991 if (hdr->version_minor == 2) {
1992 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1993 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1994 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1995 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1996 }
1997 break;
1998 }
1999 default:
2000 dev_err(adev->dev,
2001 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2002 err = -EINVAL;
2003 goto out;
2004 }
2005 out:
2006 return err;
2007 }
2008
2009 /**
2010 * amdgpu_device_ip_early_init - run early init for hardware IPs
2011 *
2012 * @adev: amdgpu_device pointer
2013 *
2014 * Early initialization pass for hardware IPs. The hardware IPs that make
2015 * up each asic are discovered and each IP's early_init callback is run. This
2016 * is the first stage in initializing the asic.
2017 * Returns 0 on success, negative error code on failure.
2018 */
2019 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2020 {
2021 struct drm_device *dev = adev_to_drm(adev);
2022 struct pci_dev *parent;
2023 int i, r;
2024 bool total;
2025
2026 amdgpu_device_enable_virtual_display(adev);
2027
2028 if (amdgpu_sriov_vf(adev)) {
2029 r = amdgpu_virt_request_full_gpu(adev, true);
2030 if (r)
2031 return r;
2032 }
2033
2034 switch (adev->asic_type) {
2035 #ifdef CONFIG_DRM_AMDGPU_SI
2036 case CHIP_VERDE:
2037 case CHIP_TAHITI:
2038 case CHIP_PITCAIRN:
2039 case CHIP_OLAND:
2040 case CHIP_HAINAN:
2041 adev->family = AMDGPU_FAMILY_SI;
2042 r = si_set_ip_blocks(adev);
2043 if (r)
2044 return r;
2045 break;
2046 #endif
2047 #ifdef CONFIG_DRM_AMDGPU_CIK
2048 case CHIP_BONAIRE:
2049 case CHIP_HAWAII:
2050 case CHIP_KAVERI:
2051 case CHIP_KABINI:
2052 case CHIP_MULLINS:
2053 if (adev->flags & AMD_IS_APU)
2054 adev->family = AMDGPU_FAMILY_KV;
2055 else
2056 adev->family = AMDGPU_FAMILY_CI;
2057
2058 r = cik_set_ip_blocks(adev);
2059 if (r)
2060 return r;
2061 break;
2062 #endif
2063 case CHIP_TOPAZ:
2064 case CHIP_TONGA:
2065 case CHIP_FIJI:
2066 case CHIP_POLARIS10:
2067 case CHIP_POLARIS11:
2068 case CHIP_POLARIS12:
2069 case CHIP_VEGAM:
2070 case CHIP_CARRIZO:
2071 case CHIP_STONEY:
2072 if (adev->flags & AMD_IS_APU)
2073 adev->family = AMDGPU_FAMILY_CZ;
2074 else
2075 adev->family = AMDGPU_FAMILY_VI;
2076
2077 r = vi_set_ip_blocks(adev);
2078 if (r)
2079 return r;
2080 break;
2081 default:
2082 r = amdgpu_discovery_set_ip_blocks(adev);
2083 if (r)
2084 return r;
2085 break;
2086 }
2087
2088 if (amdgpu_has_atpx() &&
2089 (amdgpu_is_atpx_hybrid() ||
2090 amdgpu_has_atpx_dgpu_power_cntl()) &&
2091 ((adev->flags & AMD_IS_APU) == 0) &&
2092 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2093 adev->flags |= AMD_IS_PX;
2094
2095 if (!(adev->flags & AMD_IS_APU)) {
2096 parent = pcie_find_root_port(adev->pdev);
2097 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2098 }
2099
2101 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2102 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2103 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2104 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2105 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2106
2107 total = true;
2108 for (i = 0; i < adev->num_ip_blocks; i++) {
2109 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2110 DRM_WARN("disabled ip block: %d <%s>\n",
2111 i, adev->ip_blocks[i].version->funcs->name);
2112 adev->ip_blocks[i].status.valid = false;
2113 } else {
2114 if (adev->ip_blocks[i].version->funcs->early_init) {
2115 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2116 if (r == -ENOENT) {
2117 adev->ip_blocks[i].status.valid = false;
2118 } else if (r) {
2119 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2120 adev->ip_blocks[i].version->funcs->name, r);
2121 total = false;
2122 } else {
2123 adev->ip_blocks[i].status.valid = true;
2124 }
2125 } else {
2126 adev->ip_blocks[i].status.valid = true;
2127 }
2128 }
2129 /* get the vbios after the asic_funcs are set up */
2130 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2131 r = amdgpu_device_parse_gpu_info_fw(adev);
2132 if (r)
2133 return r;
2134
2135 /* Read BIOS */
2136 if (amdgpu_device_read_bios(adev)) {
2137 if (!amdgpu_get_bios(adev))
2138 return -EINVAL;
2139
2140 r = amdgpu_atombios_init(adev);
2141 if (r) {
2142 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2143 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2144 return r;
2145 }
2146 }
2147
2148 /* get pf2vf msg info at its earliest time */
2149 if (amdgpu_sriov_vf(adev))
2150 amdgpu_virt_init_data_exchange(adev);
2151
2152 }
2153 }
2154 if (!total)
2155 return -ENODEV;
2156
2157 amdgpu_amdkfd_device_probe(adev);
2158 adev->cg_flags &= amdgpu_cg_mask;
2159 adev->pg_flags &= amdgpu_pg_mask;
2160
2161 return 0;
2162 }
2163
2164 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2165 {
2166 int i, r;
2167
2168 for (i = 0; i < adev->num_ip_blocks; i++) {
2169 if (!adev->ip_blocks[i].status.sw)
2170 continue;
2171 if (adev->ip_blocks[i].status.hw)
2172 continue;
2173 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2174 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2175 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2176 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2177 if (r) {
2178 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2179 adev->ip_blocks[i].version->funcs->name, r);
2180 return r;
2181 }
2182 adev->ip_blocks[i].status.hw = true;
2183 }
2184 }
2185
2186 return 0;
2187 }
2188
2189 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2190 {
2191 int i, r;
2192
2193 for (i = 0; i < adev->num_ip_blocks; i++) {
2194 if (!adev->ip_blocks[i].status.sw)
2195 continue;
2196 if (adev->ip_blocks[i].status.hw)
2197 continue;
2198 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2199 if (r) {
2200 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2201 adev->ip_blocks[i].version->funcs->name, r);
2202 return r;
2203 }
2204 adev->ip_blocks[i].status.hw = true;
2205 }
2206
2207 return 0;
2208 }
2209
2210 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2211 {
2212 int r = 0;
2213 int i;
2214 uint32_t smu_version;
2215
2216 if (adev->asic_type >= CHIP_VEGA10) {
2217 for (i = 0; i < adev->num_ip_blocks; i++) {
2218 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2219 continue;
2220
2221 if (!adev->ip_blocks[i].status.sw)
2222 continue;
2223
2224 /* no need to do the fw loading again if already done */
2225 if (adev->ip_blocks[i].status.hw == true)
2226 break;
2227
2228 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2229 r = adev->ip_blocks[i].version->funcs->resume(adev);
2230 if (r) {
2231 DRM_ERROR("resume of IP block <%s> failed %d\n",
2232 adev->ip_blocks[i].version->funcs->name, r);
2233 return r;
2234 }
2235 } else {
2236 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2237 if (r) {
2238 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2239 adev->ip_blocks[i].version->funcs->name, r);
2240 return r;
2241 }
2242 }
2243
2244 adev->ip_blocks[i].status.hw = true;
2245 break;
2246 }
2247 }
2248
2249 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2250 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2251
2252 return r;
2253 }
2254
2255 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2256 {
2257 long timeout;
2258 int r, i;
2259
2260 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2261 struct amdgpu_ring *ring = adev->rings[i];
2262
2263 /* No need to setup the GPU scheduler for rings that don't need it */
2264 if (!ring || ring->no_scheduler)
2265 continue;
2266
2267 switch (ring->funcs->type) {
2268 case AMDGPU_RING_TYPE_GFX:
2269 timeout = adev->gfx_timeout;
2270 break;
2271 case AMDGPU_RING_TYPE_COMPUTE:
2272 timeout = adev->compute_timeout;
2273 break;
2274 case AMDGPU_RING_TYPE_SDMA:
2275 timeout = adev->sdma_timeout;
2276 break;
2277 default:
2278 timeout = adev->video_timeout;
2279 break;
2280 }
2281
2282 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2283 ring->num_hw_submission, 0,
2284 timeout, adev->reset_domain->wq,
2285 ring->sched_score, ring->name,
2286 adev->dev);
2287 if (r) {
2288 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2289 ring->name);
2290 return r;
2291 }
2292 }
2293
2294 amdgpu_xcp_update_partition_sched_list(adev);
2295
2296 return 0;
2297 }
2298
2299
2300 /**
2301 * amdgpu_device_ip_init - run init for hardware IPs
2302 *
2303 * @adev: amdgpu_device pointer
2304 *
2305 * Main initialization pass for hardware IPs. The list of all the hardware
2306 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2307 * are run. sw_init initializes the software state associated with each IP
2308 * and hw_init initializes the hardware associated with each IP.
2309 * Returns 0 on success, negative error code on failure.
2310 */
2311 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2312 {
2313 int i, r;
2314
2315 r = amdgpu_ras_init(adev);
2316 if (r)
2317 return r;
2318
2319 for (i = 0; i < adev->num_ip_blocks; i++) {
2320 if (!adev->ip_blocks[i].status.valid)
2321 continue;
2322 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2323 if (r) {
2324 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2325 adev->ip_blocks[i].version->funcs->name, r);
2326 goto init_failed;
2327 }
2328 adev->ip_blocks[i].status.sw = true;
2329
2330 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2331 /* need to do common hw init early so everything is set up for gmc */
2332 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2333 if (r) {
2334 DRM_ERROR("hw_init %d failed %d\n", i, r);
2335 goto init_failed;
2336 }
2337 adev->ip_blocks[i].status.hw = true;
2338 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2339 /* need to do gmc hw init early so we can allocate gpu mem */
2340 /* Try to reserve bad pages early */
2341 if (amdgpu_sriov_vf(adev))
2342 amdgpu_virt_exchange_data(adev);
2343
2344 r = amdgpu_device_mem_scratch_init(adev);
2345 if (r) {
2346 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2347 goto init_failed;
2348 }
2349 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2350 if (r) {
2351 DRM_ERROR("hw_init %d failed %d\n", i, r);
2352 goto init_failed;
2353 }
2354 r = amdgpu_device_wb_init(adev);
2355 if (r) {
2356 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2357 goto init_failed;
2358 }
2359 adev->ip_blocks[i].status.hw = true;
2360
2361 /* right after GMC hw init, we create CSA */
2362 if (adev->gfx.mcbp) {
2363 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2364 AMDGPU_GEM_DOMAIN_VRAM |
2365 AMDGPU_GEM_DOMAIN_GTT,
2366 AMDGPU_CSA_SIZE);
2367 if (r) {
2368 DRM_ERROR("allocate CSA failed %d\n", r);
2369 goto init_failed;
2370 }
2371 }
2372 }
2373 }
2374
2375 if (amdgpu_sriov_vf(adev))
2376 amdgpu_virt_init_data_exchange(adev);
2377
2378 r = amdgpu_ib_pool_init(adev);
2379 if (r) {
2380 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2381 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2382 goto init_failed;
2383 }
2384
2385 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2386 if (r)
2387 goto init_failed;
2388
2389 r = amdgpu_device_ip_hw_init_phase1(adev);
2390 if (r)
2391 goto init_failed;
2392
2393 r = amdgpu_device_fw_loading(adev);
2394 if (r)
2395 goto init_failed;
2396
2397 r = amdgpu_device_ip_hw_init_phase2(adev);
2398 if (r)
2399 goto init_failed;
2400
2401 /*
2402 * retired pages will be loaded from eeprom and reserved here,
2403 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2404 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2405 * for I2C communication, which is only true at this point.
2406 *
2407 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2408 * failures caused by a bad gpu state and stop the amdgpu init process
2409 * accordingly. For other failure cases, it still releases all the
2410 * resources and prints an error message rather than returning a
2411 * negative value to the upper level.
2412 *
2413 * Note: theoretically, this should be called before all vram allocations
2414 * to protect retired pages from being abused.
2415 */
2416 r = amdgpu_ras_recovery_init(adev);
2417 if (r)
2418 goto init_failed;
2419
2420 /*
2421 * In case of XGMI, grab an extra reference on the reset domain for this device
2422 */
2423 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2424 if (amdgpu_xgmi_add_device(adev) == 0) {
2425 if (!amdgpu_sriov_vf(adev)) {
2426 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2427
2428 if (WARN_ON(!hive)) {
2429 r = -ENOENT;
2430 goto init_failed;
2431 }
2432
2433 if (!hive->reset_domain ||
2434 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2435 r = -ENOENT;
2436 amdgpu_put_xgmi_hive(hive);
2437 goto init_failed;
2438 }
2439
2440 /* Drop the early temporary reset domain we created for device */
2441 amdgpu_reset_put_reset_domain(adev->reset_domain);
2442 adev->reset_domain = hive->reset_domain;
2443 amdgpu_put_xgmi_hive(hive);
2444 }
2445 }
2446 }
2447
2448 r = amdgpu_device_init_schedulers(adev);
2449 if (r)
2450 goto init_failed;
2451
2452 /* Don't init kfd if whole hive need to be reset during init */
2453 if (!adev->gmc.xgmi.pending_reset) {
2454 kgd2kfd_init_zone_device(adev);
2455 amdgpu_amdkfd_device_init(adev);
2456 }
2457
2458 amdgpu_fru_get_product_info(adev);
2459
2460 init_failed:
2461
2462 return r;
2463 }
2464
2465 /**
2466 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2467 *
2468 * @adev: amdgpu_device pointer
2469 *
2470 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2471 * this function before a GPU reset. If the value is retained after a
2472 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2473 */
2474 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2475 {
2476 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2477 }
2478
2479 /**
2480 * amdgpu_device_check_vram_lost - check if vram is valid
2481 *
2482 * @adev: amdgpu_device pointer
2483 *
2484 * Checks the reset magic value written to the gart pointer in VRAM.
2485 * The driver calls this after a GPU reset to see if the contents of
2486 * VRAM were lost or not.
2487 * returns true if vram is lost, false if not.
2488 */
2489 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2490 {
2491 if (memcmp(adev->gart.ptr, adev->reset_magic,
2492 AMDGPU_RESET_MAGIC_NUM))
2493 return true;
2494
2495 if (!amdgpu_in_reset(adev))
2496 return false;
2497
2498 /*
2499 * For all ASICs with baco/mode1 reset, the VRAM is
2500 * always assumed to be lost.
2501 */
2502 switch (amdgpu_asic_reset_method(adev)) {
2503 case AMD_RESET_METHOD_BACO:
2504 case AMD_RESET_METHOD_MODE1:
2505 return true;
2506 default:
2507 return false;
2508 }
2509 }
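
/*
 * Hedged sketch of how the two helpers above pair up around a reset (error
 * handling elided); the reset paths later in this file follow this shape:
 *
 *	amdgpu_device_fill_reset_magic(adev);      written while VRAM is known good
 *	...ASIC reset and GMC re-init...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost)
 *		...restore or re-validate VRAM contents...
 */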
2510
2511 /**
2512 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2513 *
2514 * @adev: amdgpu_device pointer
2515 * @state: clockgating state (gate or ungate)
2516 *
2517 * The list of all the hardware IPs that make up the asic is walked and the
2518 * set_clockgating_state callbacks are run.
2519 * During the late init pass this enables clockgating for the hardware IPs;
2520 * during the fini or suspend passes it disables clockgating.
2521 * Returns 0 on success, negative error code on failure.
2522 */
2524 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2525 enum amd_clockgating_state state)
2526 {
2527 int i, j, r;
2528
2529 if (amdgpu_emu_mode == 1)
2530 return 0;
2531
2532 for (j = 0; j < adev->num_ip_blocks; j++) {
2533 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2534 if (!adev->ip_blocks[i].status.late_initialized)
2535 continue;
2536 /* skip CG for GFX, SDMA on S0ix */
2537 if (adev->in_s0ix &&
2538 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2539 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2540 continue;
2541 /* skip CG for VCE/UVD, it's handled specially */
2542 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2543 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2544 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2545 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2546 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2547 /* enable clockgating to save power */
2548 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2549 state);
2550 if (r) {
2551 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2552 adev->ip_blocks[i].version->funcs->name, r);
2553 return r;
2554 }
2555 }
2556 }
2557
2558 return 0;
2559 }
2560
2561 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2562 enum amd_powergating_state state)
2563 {
2564 int i, j, r;
2565
2566 if (amdgpu_emu_mode == 1)
2567 return 0;
2568
2569 for (j = 0; j < adev->num_ip_blocks; j++) {
2570 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2571 if (!adev->ip_blocks[i].status.late_initialized)
2572 continue;
2573 /* skip PG for GFX, SDMA on S0ix */
2574 if (adev->in_s0ix &&
2575 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2576 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2577 continue;
2578 /* skip CG for VCE/UVD, it's handled specially */
2579 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2580 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2581 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2582 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2583 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2584 /* enable powergating to save power */
2585 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2586 state);
2587 if (r) {
2588 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2589 adev->ip_blocks[i].version->funcs->name, r);
2590 return r;
2591 }
2592 }
2593 }
2594 return 0;
2595 }
2596
2597 static int amdgpu_device_enable_mgpu_fan_boost(void)
2598 {
2599 struct amdgpu_gpu_instance *gpu_ins;
2600 struct amdgpu_device *adev;
2601 int i, ret = 0;
2602
2603 mutex_lock(&mgpu_info.mutex);
2604
2605 /*
2606 * MGPU fan boost feature should be enabled
2607 * only when there are two or more dGPUs in
2608 * the system
2609 */
2610 if (mgpu_info.num_dgpu < 2)
2611 goto out;
2612
2613 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2614 gpu_ins = &(mgpu_info.gpu_ins[i]);
2615 adev = gpu_ins->adev;
2616 if (!(adev->flags & AMD_IS_APU) &&
2617 !gpu_ins->mgpu_fan_enabled) {
2618 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2619 if (ret)
2620 break;
2621
2622 gpu_ins->mgpu_fan_enabled = 1;
2623 }
2624 }
2625
2626 out:
2627 mutex_unlock(&mgpu_info.mutex);
2628
2629 return ret;
2630 }
2631
2632 /**
2633 * amdgpu_device_ip_late_init - run late init for hardware IPs
2634 *
2635 * @adev: amdgpu_device pointer
2636 *
2637 * Late initialization pass for hardware IPs. The list of all the hardware
2638 * IPs that make up the asic is walked and the late_init callbacks are run.
2639 * late_init covers any special initialization that an IP requires
2640 * after all of the IP blocks have been initialized or something that needs to happen
2641 * late in the init process.
2642 * Returns 0 on success, negative error code on failure.
2643 */
2644 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2645 {
2646 struct amdgpu_gpu_instance *gpu_instance;
2647 int i = 0, r;
2648
2649 for (i = 0; i < adev->num_ip_blocks; i++) {
2650 if (!adev->ip_blocks[i].status.hw)
2651 continue;
2652 if (adev->ip_blocks[i].version->funcs->late_init) {
2653 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2654 if (r) {
2655 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2656 adev->ip_blocks[i].version->funcs->name, r);
2657 return r;
2658 }
2659 }
2660 adev->ip_blocks[i].status.late_initialized = true;
2661 }
2662
2663 r = amdgpu_ras_late_init(adev);
2664 if (r) {
2665 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2666 return r;
2667 }
2668
2669 amdgpu_ras_set_error_query_ready(adev, true);
2670
2671 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2672 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2673
2674 amdgpu_device_fill_reset_magic(adev);
2675
2676 r = amdgpu_device_enable_mgpu_fan_boost();
2677 if (r)
2678 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2679
2680 /* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */
2681 if (amdgpu_passthrough(adev) &&
2682 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2683 adev->asic_type == CHIP_ALDEBARAN))
2684 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2685
2686 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2687 mutex_lock(&mgpu_info.mutex);
2688
2689 /*
2690 * Reset device p-state to low as this was booted with high.
2691 *
2692 * This should be performed only after all devices from the same
2693 * hive get initialized.
2694 *
2695 * However, the number of devices in the hive is not known in advance;
2696 * it is counted one by one as the devices are initialized.
2697 *
2698 * So, we wait for all XGMI interlinked devices initialized.
2699 * This may bring some delays as those devices may come from
2700 * different hives. But that should be OK.
2701 */
2702 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2703 for (i = 0; i < mgpu_info.num_gpu; i++) {
2704 gpu_instance = &(mgpu_info.gpu_ins[i]);
2705 if (gpu_instance->adev->flags & AMD_IS_APU)
2706 continue;
2707
2708 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2709 AMDGPU_XGMI_PSTATE_MIN);
2710 if (r) {
2711 DRM_ERROR("pstate setting failed (%d).\n", r);
2712 break;
2713 }
2714 }
2715 }
2716
2717 mutex_unlock(&mgpu_info.mutex);
2718 }
2719
2720 return 0;
2721 }
2722
2723 /**
2724 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2725 *
2726 * @adev: amdgpu_device pointer
2727 *
2728 * For ASICs that need to disable the SMC first.
2729 */
2730 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2731 {
2732 int i, r;
2733
2734 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2735 return;
2736
2737 for (i = 0; i < adev->num_ip_blocks; i++) {
2738 if (!adev->ip_blocks[i].status.hw)
2739 continue;
2740 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2741 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2742 /* XXX handle errors */
2743 if (r) {
2744 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2745 adev->ip_blocks[i].version->funcs->name, r);
2746 }
2747 adev->ip_blocks[i].status.hw = false;
2748 break;
2749 }
2750 }
2751 }
2752
2753 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2754 {
2755 int i, r;
2756
2757 for (i = 0; i < adev->num_ip_blocks; i++) {
2758 if (!adev->ip_blocks[i].version->funcs->early_fini)
2759 continue;
2760
2761 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2762 if (r) {
2763 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2764 adev->ip_blocks[i].version->funcs->name, r);
2765 }
2766 }
2767
2768 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2769 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2770
2771 amdgpu_amdkfd_suspend(adev, false);
2772
2773 /* Workaround for ASICs that need to disable the SMC first */
2774 amdgpu_device_smu_fini_early(adev);
2775
2776 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2777 if (!adev->ip_blocks[i].status.hw)
2778 continue;
2779
2780 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2781 /* XXX handle errors */
2782 if (r) {
2783 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2784 adev->ip_blocks[i].version->funcs->name, r);
2785 }
2786
2787 adev->ip_blocks[i].status.hw = false;
2788 }
2789
2790 if (amdgpu_sriov_vf(adev)) {
2791 if (amdgpu_virt_release_full_gpu(adev, false))
2792 DRM_ERROR("failed to release exclusive mode on fini\n");
2793 }
2794
2795 return 0;
2796 }
2797
2798 /**
2799 * amdgpu_device_ip_fini - run fini for hardware IPs
2800 *
2801 * @adev: amdgpu_device pointer
2802 *
2803 * Main teardown pass for hardware IPs. The list of all the hardware
2804 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2805 * are run. hw_fini tears down the hardware associated with each IP
2806 * and sw_fini tears down any software state associated with each IP.
2807 * Returns 0 on success, negative error code on failure.
2808 */
2809 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2810 {
2811 int i, r;
2812
2813 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2814 amdgpu_virt_release_ras_err_handler_data(adev);
2815
2816 if (adev->gmc.xgmi.num_physical_nodes > 1)
2817 amdgpu_xgmi_remove_device(adev);
2818
2819 amdgpu_amdkfd_device_fini_sw(adev);
2820
2821 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2822 if (!adev->ip_blocks[i].status.sw)
2823 continue;
2824
2825 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2826 amdgpu_ucode_free_bo(adev);
2827 amdgpu_free_static_csa(&adev->virt.csa_obj);
2828 amdgpu_device_wb_fini(adev);
2829 amdgpu_device_mem_scratch_fini(adev);
2830 amdgpu_ib_pool_fini(adev);
2831 }
2832
2833 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2834 /* XXX handle errors */
2835 if (r) {
2836 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2837 adev->ip_blocks[i].version->funcs->name, r);
2838 }
2839 adev->ip_blocks[i].status.sw = false;
2840 adev->ip_blocks[i].status.valid = false;
2841 }
2842
2843 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2844 if (!adev->ip_blocks[i].status.late_initialized)
2845 continue;
2846 if (adev->ip_blocks[i].version->funcs->late_fini)
2847 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2848 adev->ip_blocks[i].status.late_initialized = false;
2849 }
2850
2851 amdgpu_ras_fini(adev);
2852
2853 return 0;
2854 }
2855
2856 /**
2857 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2858 *
2859 * @work: work_struct.
2860 */
2861 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2862 {
2863 struct amdgpu_device *adev =
2864 container_of(work, struct amdgpu_device, delayed_init_work.work);
2865 int r;
2866
2867 r = amdgpu_ib_ring_tests(adev);
2868 if (r)
2869 DRM_ERROR("ib ring test failed (%d).\n", r);
2870 }
2871
2872 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2873 {
2874 struct amdgpu_device *adev =
2875 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2876
2877 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2878 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2879
2880 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2881 adev->gfx.gfx_off_state = true;
2882 }
2883
2884 /**
2885 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2886 *
2887 * @adev: amdgpu_device pointer
2888 *
2889 * Main suspend function for hardware IPs. The list of all the hardware
2890 * IPs that make up the asic is walked, clockgating is disabled and the
2891 * suspend callbacks are run. suspend puts the hardware and software state
2892 * in each IP into a state suitable for suspend.
2893 * Returns 0 on success, negative error code on failure.
2894 */
2895 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2896 {
2897 int i, r;
2898
2899 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2900 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2901
2902 /*
2903 * Per the PMFW team's suggestion, the driver needs to handle disabling
2904 * the gfxoff and df cstate features for gpu reset (e.g. Mode1Reset)
2905 * scenarios. Add the missing df cstate disablement here.
2906 */
2907 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2908 dev_warn(adev->dev, "Failed to disallow df cstate");
2909
2910 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2911 if (!adev->ip_blocks[i].status.valid)
2912 continue;
2913
2914 /* displays are handled separately */
2915 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2916 continue;
2917
2918 /* XXX handle errors */
2919 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2920 /* XXX handle errors */
2921 if (r) {
2922 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2923 adev->ip_blocks[i].version->funcs->name, r);
2924 return r;
2925 }
2926
2927 adev->ip_blocks[i].status.hw = false;
2928 }
2929
2930 return 0;
2931 }
2932
2933 /**
2934 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2935 *
2936 * @adev: amdgpu_device pointer
2937 *
2938 * Main suspend function for hardware IPs. The list of all the hardware
2939 * IPs that make up the asic is walked, clockgating is disabled and the
2940 * suspend callbacks are run. suspend puts the hardware and software state
2941 * in each IP into a state suitable for suspend.
2942 * Returns 0 on success, negative error code on failure.
2943 */
2944 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2945 {
2946 int i, r;
2947
2948 if (adev->in_s0ix)
2949 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2950
2951 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2952 if (!adev->ip_blocks[i].status.valid)
2953 continue;
2954 /* displays are handled in phase1 */
2955 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2956 continue;
2957 /* PSP lost connection when err_event_athub occurs */
2958 if (amdgpu_ras_intr_triggered() &&
2959 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2960 adev->ip_blocks[i].status.hw = false;
2961 continue;
2962 }
2963
2964 /* skip unnecessary suspend if we have not initialized them yet */
2965 if (adev->gmc.xgmi.pending_reset &&
2966 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2968 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2969 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2970 adev->ip_blocks[i].status.hw = false;
2971 continue;
2972 }
2973
2974 /* skip suspend of gfx/mes and psp for S0ix:
2975 * gfx is in gfxoff state, so on resume it will exit gfxoff just
2976 * like at runtime. PSP is also part of the always on hardware
2977 * so no need to suspend it.
2978 */
2979 if (adev->in_s0ix &&
2980 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
2981 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
2983 continue;
2984
2985 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
2986 if (adev->in_s0ix &&
2987 (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
2988 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2989 continue;
2990
2991 /* swPSP provides the IMU and RLC FW binaries to the TOS during cold-boot.
2992 * These are in TMR, hence are expected to be reused by PSP-TOS to reload
2993 * from this location and RLC Autoload automatically also gets loaded
2994 * from here based on PMFW -> PSP message during re-init sequence.
2995 * Therefore, the psp suspend & resume should be skipped to avoid destroying
2996 * the TMR and reloading FWs again for IMU enabled APU ASICs.
2997 */
2998 if (amdgpu_in_reset(adev) &&
2999 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3000 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3001 continue;
3002
3003 /* XXX handle errors */
3004 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3005 /* XXX handle errors */
3006 if (r) {
3007 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3008 adev->ip_blocks[i].version->funcs->name, r);
3009 }
3010 adev->ip_blocks[i].status.hw = false;
3011 /* handle putting the SMC in the appropriate state */
3012 if (!amdgpu_sriov_vf(adev)) {
3013 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3014 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3015 if (r) {
3016 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3017 adev->mp1_state, r);
3018 return r;
3019 }
3020 }
3021 }
3022 }
3023
3024 return 0;
3025 }
3026
3027 /**
3028 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3029 *
3030 * @adev: amdgpu_device pointer
3031 *
3032 * Main suspend function for hardware IPs. The list of all the hardware
3033 * IPs that make up the asic is walked, clockgating is disabled and the
3034 * suspend callbacks are run. suspend puts the hardware and software state
3035 * in each IP into a state suitable for suspend.
3036 * Returns 0 on success, negative error code on failure.
3037 */
3038 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3039 {
3040 int r;
3041
3042 if (amdgpu_sriov_vf(adev)) {
3043 amdgpu_virt_fini_data_exchange(adev);
3044 amdgpu_virt_request_full_gpu(adev, false);
3045 }
3046
3047 r = amdgpu_device_ip_suspend_phase1(adev);
3048 if (r)
3049 return r;
3050 r = amdgpu_device_ip_suspend_phase2(adev);
3051
3052 if (amdgpu_sriov_vf(adev))
3053 amdgpu_virt_release_full_gpu(adev, false);
3054
3055 return r;
3056 }
3057
3058 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3059 {
3060 int i, r;
3061
3062 static enum amd_ip_block_type ip_order[] = {
3063 AMD_IP_BLOCK_TYPE_COMMON,
3064 AMD_IP_BLOCK_TYPE_GMC,
3065 AMD_IP_BLOCK_TYPE_PSP,
3066 AMD_IP_BLOCK_TYPE_IH,
3067 };
3068
3069 for (i = 0; i < adev->num_ip_blocks; i++) {
3070 int j;
3071 struct amdgpu_ip_block *block;
3072
3073 block = &adev->ip_blocks[i];
3074 block->status.hw = false;
3075
3076 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3077
3078 if (block->version->type != ip_order[j] ||
3079 !block->status.valid)
3080 continue;
3081
3082 r = block->version->funcs->hw_init(adev);
3083 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3084 if (r)
3085 return r;
3086 block->status.hw = true;
3087 }
3088 }
3089
3090 return 0;
3091 }
3092
3093 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3094 {
3095 int i, r;
3096
3097 static enum amd_ip_block_type ip_order[] = {
3098 AMD_IP_BLOCK_TYPE_SMC,
3099 AMD_IP_BLOCK_TYPE_DCE,
3100 AMD_IP_BLOCK_TYPE_GFX,
3101 AMD_IP_BLOCK_TYPE_SDMA,
3102 AMD_IP_BLOCK_TYPE_MES,
3103 AMD_IP_BLOCK_TYPE_UVD,
3104 AMD_IP_BLOCK_TYPE_VCE,
3105 AMD_IP_BLOCK_TYPE_VCN,
3106 AMD_IP_BLOCK_TYPE_JPEG
3107 };
3108
3109 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3110 int j;
3111 struct amdgpu_ip_block *block;
3112
3113 for (j = 0; j < adev->num_ip_blocks; j++) {
3114 block = &adev->ip_blocks[j];
3115
3116 if (block->version->type != ip_order[i] ||
3117 !block->status.valid ||
3118 block->status.hw)
3119 continue;
3120
3121 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3122 r = block->version->funcs->resume(adev);
3123 else
3124 r = block->version->funcs->hw_init(adev);
3125
3126 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3127 if (r)
3128 return r;
3129 block->status.hw = true;
3130 }
3131 }
3132
3133 return 0;
3134 }
3135
3136 /**
3137 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3138 *
3139 * @adev: amdgpu_device pointer
3140 *
3141 * First resume function for hardware IPs. The list of all the hardware
3142 * IPs that make up the asic is walked and the resume callbacks are run for
3143 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3144 * after a suspend and updates the software state as necessary. This
3145 * function is also used for restoring the GPU after a GPU reset.
3146 * Returns 0 on success, negative error code on failure.
3147 */
3148 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3149 {
3150 int i, r;
3151
3152 for (i = 0; i < adev->num_ip_blocks; i++) {
3153 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3154 continue;
3155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3156 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3157 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3158 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3159
3160 r = adev->ip_blocks[i].version->funcs->resume(adev);
3161 if (r) {
3162 DRM_ERROR("resume of IP block <%s> failed %d\n",
3163 adev->ip_blocks[i].version->funcs->name, r);
3164 return r;
3165 }
3166 adev->ip_blocks[i].status.hw = true;
3167 }
3168 }
3169
3170 return 0;
3171 }
3172
3173 /**
3174 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3175 *
3176 * @adev: amdgpu_device pointer
3177 *
3178 * Second resume function for hardware IPs. The list of all the hardware
3179 * IPs that make up the asic is walked and the resume callbacks are run for
3180 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3181 * functional state after a suspend and updates the software state as
3182 * necessary. This function is also used for restoring the GPU after a GPU
3183 * reset.
3184 * Returns 0 on success, negative error code on failure.
3185 */
3186 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3187 {
3188 int i, r;
3189
3190 for (i = 0; i < adev->num_ip_blocks; i++) {
3191 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3192 continue;
3193 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3194 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3195 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3196 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3197 continue;
3198 r = adev->ip_blocks[i].version->funcs->resume(adev);
3199 if (r) {
3200 DRM_ERROR("resume of IP block <%s> failed %d\n",
3201 adev->ip_blocks[i].version->funcs->name, r);
3202 return r;
3203 }
3204 adev->ip_blocks[i].status.hw = true;
3205 }
3206
3207 return 0;
3208 }
3209
3210 /**
3211 * amdgpu_device_ip_resume - run resume for hardware IPs
3212 *
3213 * @adev: amdgpu_device pointer
3214 *
3215 * Main resume function for hardware IPs. The hardware IPs
3216 * are split into two resume functions because they are
3217 * also used in recovering from a GPU reset and some additional
3218 * steps need to be taken between them. In this case (S3/S4) they are
3219 * run sequentially.
3220 * Returns 0 on success, negative error code on failure.
3221 */
3222 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3223 {
3224 int r;
3225
3226 r = amdgpu_device_ip_resume_phase1(adev);
3227 if (r)
3228 return r;
3229
3230 r = amdgpu_device_fw_loading(adev);
3231 if (r)
3232 return r;
3233
3234 r = amdgpu_device_ip_resume_phase2(adev);
3235
3236 return r;
3237 }
3238
3239 /**
3240 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3241 *
3242 * @adev: amdgpu_device pointer
3243 *
3244 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3245 */
3246 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3247 {
3248 if (amdgpu_sriov_vf(adev)) {
3249 if (adev->is_atom_fw) {
3250 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3251 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3252 } else {
3253 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3254 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3255 }
3256
3257 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3258 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3259 }
3260 }
3261
3262 /**
3263 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3264 *
3265 * @asic_type: AMD asic type
3266 *
3267 * Check if there is DC (new modesetting infrastructure) support for an asic.
3268 * returns true if DC has support, false if not.
3269 */
3270 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3271 {
3272 switch (asic_type) {
3273 #ifdef CONFIG_DRM_AMDGPU_SI
3274 case CHIP_HAINAN:
3275 #endif
3276 case CHIP_TOPAZ:
3277 /* chips with no display hardware */
3278 return false;
3279 #if defined(CONFIG_DRM_AMD_DC)
3280 case CHIP_TAHITI:
3281 case CHIP_PITCAIRN:
3282 case CHIP_VERDE:
3283 case CHIP_OLAND:
3284 /*
3285 * We have systems in the wild with these ASICs that require
3286 * LVDS and VGA support which is not supported with DC.
3287 *
3288 * Fall back to the non-DC driver here by default so as not to
3289 * cause regressions.
3290 */
3291 #if defined(CONFIG_DRM_AMD_DC_SI)
3292 return amdgpu_dc > 0;
3293 #else
3294 return false;
3295 #endif
3296 case CHIP_BONAIRE:
3297 case CHIP_KAVERI:
3298 case CHIP_KABINI:
3299 case CHIP_MULLINS:
3300 /*
3301 * We have systems in the wild with these ASICs that require
3302 * VGA support which is not supported with DC.
3303 *
3304 * Fall back to the non-DC driver here by default so as not to
3305 * cause regressions.
3306 */
3307 return amdgpu_dc > 0;
3308 default:
3309 return amdgpu_dc != 0;
3310 #else
3311 default:
3312 if (amdgpu_dc > 0)
3313 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3314 return false;
3315 #endif
3316 }
3317 }
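
/*
 * For reference, amdgpu_dc is driven by the "dc" module parameter
 * (1 forces the DC path where possible, 0 forces the legacy path,
 * -1/auto is the default), e.g.:
 *
 *	modprobe amdgpu dc=0
 */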
3318
3319 /**
3320 * amdgpu_device_has_dc_support - check if dc is supported
3321 *
3322 * @adev: amdgpu_device pointer
3323 *
3324 * Returns true for supported, false for not supported
3325 */
3326 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3327 {
3328 if (adev->enable_virtual_display ||
3329 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3330 return false;
3331
3332 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3333 }
3334
3335 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3336 {
3337 struct amdgpu_device *adev =
3338 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3339 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3340
3341 /* It's a bug to not have a hive within this function */
3342 if (WARN_ON(!hive))
3343 return;
3344
3345 /*
3346 * Use task barrier to synchronize all xgmi reset works across the
3347 * hive. task_barrier_enter and task_barrier_exit will block
3348 * until all the threads running the xgmi reset works reach
3349 * those points. task_barrier_full will do both blocks.
3350 */
3351 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3352
3353 task_barrier_enter(&hive->tb);
3354 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3355
3356 if (adev->asic_reset_res)
3357 goto fail;
3358
3359 task_barrier_exit(&hive->tb);
3360 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3361
3362 if (adev->asic_reset_res)
3363 goto fail;
3364
3365 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3366 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3367 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3368 } else {
3369
3370 task_barrier_full(&hive->tb);
3371 adev->asic_reset_res = amdgpu_asic_reset(adev);
3372 }
3373
3374 fail:
3375 if (adev->asic_reset_res)
3376 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3377 adev->asic_reset_res, adev_to_drm(adev)->unique);
3378 amdgpu_put_xgmi_hive(hive);
3379 }
3380
3381 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3382 {
3383 char *input = amdgpu_lockup_timeout;
3384 char *timeout_setting = NULL;
3385 int index = 0;
3386 long timeout;
3387 int ret = 0;
3388
3389 /*
3390 * By default the timeout for non-compute jobs is 10000 ms
3391 * and 60000 ms for compute jobs.
3392 * In SR-IOV or passthrough mode, the timeout for compute
3393 * jobs is 60000 ms by default.
3394 */
3395 adev->gfx_timeout = msecs_to_jiffies(10000);
3396 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3397 if (amdgpu_sriov_vf(adev))
3398 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3399 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3400 else
3401 adev->compute_timeout = msecs_to_jiffies(60000);
3402
3403 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3404 while ((timeout_setting = strsep(&input, ",")) &&
3405 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3406 ret = kstrtol(timeout_setting, 0, &timeout);
3407 if (ret)
3408 return ret;
3409
3410 if (timeout == 0) {
3411 index++;
3412 continue;
3413 } else if (timeout < 0) {
3414 timeout = MAX_SCHEDULE_TIMEOUT;
3415 dev_warn(adev->dev, "lockup timeout disabled");
3416 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3417 } else {
3418 timeout = msecs_to_jiffies(timeout);
3419 }
3420
3421 switch (index++) {
3422 case 0:
3423 adev->gfx_timeout = timeout;
3424 break;
3425 case 1:
3426 adev->compute_timeout = timeout;
3427 break;
3428 case 2:
3429 adev->sdma_timeout = timeout;
3430 break;
3431 case 3:
3432 adev->video_timeout = timeout;
3433 break;
3434 default:
3435 break;
3436 }
3437 }
3438 /*
3439 * If only one value was specified, it
3440 * should apply to all non-compute jobs.
3441 */
3442 if (index == 1) {
3443 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3444 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3445 adev->compute_timeout = adev->gfx_timeout;
3446 }
3447 }
3448
3449 return ret;
3450 }
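
/*
 * Example values for the "lockup_timeout" module parameter that the parser
 * above accepts (numbers are illustrative, in milliseconds; 0 keeps the
 * default and a negative value disables the timeout):
 *
 *	modprobe amdgpu lockup_timeout=10000                    one value for all non-compute jobs
 *	modprobe amdgpu lockup_timeout=10000,60000,10000,10000  gfx,compute,sdma,video
 */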
3451
3452 /**
3453 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3454 *
3455 * @adev: amdgpu_device pointer
3456 *
3457 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode.
3458 */
3459 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3460 {
3461 struct iommu_domain *domain;
3462
3463 domain = iommu_get_domain_for_dev(adev->dev);
3464 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3465 adev->ram_is_direct_mapped = true;
3466 }
3467
3468 static const struct attribute *amdgpu_dev_attributes[] = {
3469 &dev_attr_pcie_replay_count.attr,
3470 NULL
3471 };
3472
3473 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3474 {
3475 if (amdgpu_mcbp == 1)
3476 adev->gfx.mcbp = true;
3477 else if (amdgpu_mcbp == 0)
3478 adev->gfx.mcbp = false;
3479 else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3480 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3481 adev->gfx.num_gfx_rings)
3482 adev->gfx.mcbp = true;
3483
3484 if (amdgpu_sriov_vf(adev))
3485 adev->gfx.mcbp = true;
3486
3487 if (adev->gfx.mcbp)
3488 DRM_INFO("MCBP is enabled\n");
3489 }
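
/*
 * For reference, amdgpu_mcbp comes from the "mcbp" module parameter
 * (1 forces MCBP on, 0 forces it off, -1/auto lets the logic above decide),
 * e.g.:
 *
 *	modprobe amdgpu mcbp=1
 */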
3490
3491 /**
3492 * amdgpu_device_init - initialize the driver
3493 *
3494 * @adev: amdgpu_device pointer
3495 * @flags: driver flags
3496 *
3497 * Initializes the driver info and hw (all asics).
3498 * Returns 0 for success or an error on failure.
3499 * Called at driver startup.
3500 */
3501 int amdgpu_device_init(struct amdgpu_device *adev,
3502 uint32_t flags)
3503 {
3504 struct drm_device *ddev = adev_to_drm(adev);
3505 struct pci_dev *pdev = adev->pdev;
3506 int r, i;
3507 bool px = false;
3508 u32 max_MBps;
3509 int tmp;
3510
3511 adev->shutdown = false;
3512 adev->flags = flags;
3513
3514 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3515 adev->asic_type = amdgpu_force_asic_type;
3516 else
3517 adev->asic_type = flags & AMD_ASIC_MASK;
3518
3519 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3520 if (amdgpu_emu_mode == 1)
3521 adev->usec_timeout *= 10;
3522 adev->gmc.gart_size = 512 * 1024 * 1024;
3523 adev->accel_working = false;
3524 adev->num_rings = 0;
3525 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3526 adev->mman.buffer_funcs = NULL;
3527 adev->mman.buffer_funcs_ring = NULL;
3528 adev->vm_manager.vm_pte_funcs = NULL;
3529 adev->vm_manager.vm_pte_num_scheds = 0;
3530 adev->gmc.gmc_funcs = NULL;
3531 adev->harvest_ip_mask = 0x0;
3532 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3533 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3534
3535 adev->smc_rreg = &amdgpu_invalid_rreg;
3536 adev->smc_wreg = &amdgpu_invalid_wreg;
3537 adev->pcie_rreg = &amdgpu_invalid_rreg;
3538 adev->pcie_wreg = &amdgpu_invalid_wreg;
3539 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3540 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3541 adev->pciep_rreg = &amdgpu_invalid_rreg;
3542 adev->pciep_wreg = &amdgpu_invalid_wreg;
3543 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3544 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3545 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3546 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3547 adev->didt_rreg = &amdgpu_invalid_rreg;
3548 adev->didt_wreg = &amdgpu_invalid_wreg;
3549 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3550 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3551 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3552 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3553
3554 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3555 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3556 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3557
3558 /* mutex initialization is all done here so we
3559 * can recall the function without having locking issues
3560 */
3561 mutex_init(&adev->firmware.mutex);
3562 mutex_init(&adev->pm.mutex);
3563 mutex_init(&adev->gfx.gpu_clock_mutex);
3564 mutex_init(&adev->srbm_mutex);
3565 mutex_init(&adev->gfx.pipe_reserve_mutex);
3566 mutex_init(&adev->gfx.gfx_off_mutex);
3567 mutex_init(&adev->gfx.partition_mutex);
3568 mutex_init(&adev->grbm_idx_mutex);
3569 mutex_init(&adev->mn_lock);
3570 mutex_init(&adev->virt.vf_errors.lock);
3571 hash_init(adev->mn_hash);
3572 mutex_init(&adev->psp.mutex);
3573 mutex_init(&adev->notifier_lock);
3574 mutex_init(&adev->pm.stable_pstate_ctx_lock);
3575 mutex_init(&adev->benchmark_mutex);
3576
3577 amdgpu_device_init_apu_flags(adev);
3578
3579 r = amdgpu_device_check_arguments(adev);
3580 if (r)
3581 return r;
3582
3583 spin_lock_init(&adev->mmio_idx_lock);
3584 spin_lock_init(&adev->smc_idx_lock);
3585 spin_lock_init(&adev->pcie_idx_lock);
3586 spin_lock_init(&adev->uvd_ctx_idx_lock);
3587 spin_lock_init(&adev->didt_idx_lock);
3588 spin_lock_init(&adev->gc_cac_idx_lock);
3589 spin_lock_init(&adev->se_cac_idx_lock);
3590 spin_lock_init(&adev->audio_endpt_idx_lock);
3591 spin_lock_init(&adev->mm_stats.lock);
3592
3593 INIT_LIST_HEAD(&adev->shadow_list);
3594 mutex_init(&adev->shadow_list_lock);
3595
3596 INIT_LIST_HEAD(&adev->reset_list);
3597
3598 INIT_LIST_HEAD(&adev->ras_list);
3599
3600 INIT_DELAYED_WORK(&adev->delayed_init_work,
3601 amdgpu_device_delayed_init_work_handler);
3602 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3603 amdgpu_device_delay_enable_gfx_off);
3604
3605 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3606
3607 adev->gfx.gfx_off_req_count = 1;
3608 adev->gfx.gfx_off_residency = 0;
3609 adev->gfx.gfx_off_entrycount = 0;
3610 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3611
3612 atomic_set(&adev->throttling_logging_enabled, 1);
3613 /*
3614 * If throttling continues, logging will be performed every minute
3615 * to avoid log flooding. "-1" is subtracted since the thermal
3616 * throttling interrupt comes every second. Thus, the total logging
3617 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3618 * for throttling interrupt) = 60 seconds.
3619 */
3620 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3621 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3622
3623 /* Registers mapping */
3624 /* TODO: block userspace mapping of io register */
3625 if (adev->asic_type >= CHIP_BONAIRE) {
3626 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3627 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3628 } else {
3629 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3630 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3631 }
3632
3633 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3634 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3635
3636 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3637 if (!adev->rmmio)
3638 return -ENOMEM;
3639
3640 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3641 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3642
3643 /*
3644 * Reset domain needs to be present early, before the XGMI hive is
3645 * discovered (if any) and initialized, to use the reset sem and in_gpu_reset
3646 * flag early on during init and before calling RREG32.
3647 */
3648 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3649 if (!adev->reset_domain)
3650 return -ENOMEM;
3651
3652 /* detect hw virtualization here */
3653 amdgpu_detect_virtualization(adev);
3654
3655 amdgpu_device_get_pcie_info(adev);
3656
3657 r = amdgpu_device_get_job_timeout_settings(adev);
3658 if (r) {
3659 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3660 return r;
3661 }
3662
3663 /* early init functions */
3664 r = amdgpu_device_ip_early_init(adev);
3665 if (r)
3666 return r;
3667
3668 amdgpu_device_set_mcbp(adev);
3669
3670 /* Get rid of things like offb */
3671 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3672 if (r)
3673 return r;
3674
3675 /* Enable TMZ based on IP_VERSION */
3676 amdgpu_gmc_tmz_set(adev);
3677
3678 amdgpu_gmc_noretry_set(adev);
3679 /* Need to get xgmi info early to decide the reset behavior */
3680 if (adev->gmc.xgmi.supported) {
3681 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3682 if (r)
3683 return r;
3684 }
3685
3686 /* enable PCIE atomic ops */
3687 if (amdgpu_sriov_vf(adev)) {
3688 if (adev->virt.fw_reserve.p_pf2vf)
3689 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3690 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3691 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3692 /* APUs with gfx9 onwards don't rely on PCIe atomics; their
3693 * internal path natively supports atomics, so set have_atomics_support to true.
3694 */
3695 } else if ((adev->flags & AMD_IS_APU) &&
3696 (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3697 adev->have_atomics_support = true;
3698 } else {
3699 adev->have_atomics_support =
3700 !pci_enable_atomic_ops_to_root(adev->pdev,
3701 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3702 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3703 }
3704
3705 if (!adev->have_atomics_support)
3706 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3707
3708 /* doorbell bar mapping and doorbell index init */
3709 amdgpu_doorbell_init(adev);
3710
3711 if (amdgpu_emu_mode == 1) {
3712 /* post the asic on emulation mode */
3713 emu_soc_asic_init(adev);
3714 goto fence_driver_init;
3715 }
3716
3717 amdgpu_reset_init(adev);
3718
3719 /* detect if we are with an SRIOV vbios */
3720 if (adev->bios)
3721 amdgpu_device_detect_sriov_bios(adev);
3722
3723 /* check if we need to reset the asic
3724 * E.g., driver was not cleanly unloaded previously, etc.
3725 */
3726 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3727 if (adev->gmc.xgmi.num_physical_nodes) {
3728 dev_info(adev->dev, "Pending hive reset.\n");
3729 adev->gmc.xgmi.pending_reset = true;
3730 /* Only need to init the blocks necessary for SMU to handle the reset */
3731 for (i = 0; i < adev->num_ip_blocks; i++) {
3732 if (!adev->ip_blocks[i].status.valid)
3733 continue;
3734 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3735 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3736 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3738 DRM_DEBUG("IP %s disabled for hw_init.\n",
3739 adev->ip_blocks[i].version->funcs->name);
3740 adev->ip_blocks[i].status.hw = true;
3741 }
3742 }
3743 } else {
3744 tmp = amdgpu_reset_method;
3745 /* It should do a default reset when loading or reloading the driver,
3746 * regardless of the module parameter reset_method.
3747 */
3748 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3749 r = amdgpu_asic_reset(adev);
3750 amdgpu_reset_method = tmp;
3751 if (r) {
3752 dev_err(adev->dev, "asic reset on init failed\n");
3753 goto failed;
3754 }
3755 }
3756 }
3757
3758 /* Post card if necessary */
3759 if (amdgpu_device_need_post(adev)) {
3760 if (!adev->bios) {
3761 dev_err(adev->dev, "no vBIOS found\n");
3762 r = -EINVAL;
3763 goto failed;
3764 }
3765 DRM_INFO("GPU posting now...\n");
3766 r = amdgpu_device_asic_init(adev);
3767 if (r) {
3768 dev_err(adev->dev, "gpu post error!\n");
3769 goto failed;
3770 }
3771 }
3772
3773 if (adev->bios) {
3774 if (adev->is_atom_fw) {
3775 /* Initialize clocks */
3776 r = amdgpu_atomfirmware_get_clock_info(adev);
3777 if (r) {
3778 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3779 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3780 goto failed;
3781 }
3782 } else {
3783 /* Initialize clocks */
3784 r = amdgpu_atombios_get_clock_info(adev);
3785 if (r) {
3786 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3787 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3788 goto failed;
3789 }
3790 /* init i2c buses */
3791 if (!amdgpu_device_has_dc_support(adev))
3792 amdgpu_atombios_i2c_init(adev);
3793 }
3794 }
3795
3796 fence_driver_init:
3797 /* Fence driver */
3798 r = amdgpu_fence_driver_sw_init(adev);
3799 if (r) {
3800 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3801 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3802 goto failed;
3803 }
3804
3805 /* init the mode config */
3806 drm_mode_config_init(adev_to_drm(adev));
3807
3808 r = amdgpu_device_ip_init(adev);
3809 if (r) {
3810 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3811 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3812 goto release_ras_con;
3813 }
3814
3815 amdgpu_fence_driver_hw_init(adev);
3816
3817 dev_info(adev->dev,
3818 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3819 adev->gfx.config.max_shader_engines,
3820 adev->gfx.config.max_sh_per_se,
3821 adev->gfx.config.max_cu_per_sh,
3822 adev->gfx.cu_info.number);
3823
3824 adev->accel_working = true;
3825
3826 amdgpu_vm_check_compute_bug(adev);
3827
3828 /* Initialize the buffer migration limit. */
3829 if (amdgpu_moverate >= 0)
3830 max_MBps = amdgpu_moverate;
3831 else
3832 max_MBps = 8; /* Allow 8 MB/s. */
3833 /* Get a log2 for easy divisions. */
3834 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3835
3836 r = amdgpu_atombios_sysfs_init(adev);
3837 if (r)
3838 drm_err(&adev->ddev,
3839 "registering atombios sysfs failed (%d).\n", r);
3840
3841 r = amdgpu_pm_sysfs_init(adev);
3842 if (r)
3843 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3844
3845 r = amdgpu_ucode_sysfs_init(adev);
3846 if (r) {
3847 adev->ucode_sysfs_en = false;
3848 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3849 } else
3850 adev->ucode_sysfs_en = true;
3851
3852 /*
3853 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3854 * Otherwise the mgpu fan boost feature will be skipped because the
3855 * gpu instance count is too low.
3856 */
3857 amdgpu_register_gpu_instance(adev);
3858
3859 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3860 * explicit gating rather than handling it automatically.
3861 */
3862 if (!adev->gmc.xgmi.pending_reset) {
3863 r = amdgpu_device_ip_late_init(adev);
3864 if (r) {
3865 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3866 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3867 goto release_ras_con;
3868 }
3869 /* must succeed. */
3870 amdgpu_ras_resume(adev);
3871 queue_delayed_work(system_wq, &adev->delayed_init_work,
3872 msecs_to_jiffies(AMDGPU_RESUME_MS));
3873 }
3874
3875 if (amdgpu_sriov_vf(adev)) {
3876 amdgpu_virt_release_full_gpu(adev, true);
3877 flush_delayed_work(&adev->delayed_init_work);
3878 }
3879
3880 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3881 if (r)
3882 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3883
3884 amdgpu_fru_sysfs_init(adev);
3885
3886 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3887 r = amdgpu_pmu_init(adev);
3888 if (r)
3889 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
3890
3891 /* Have stored pci confspace at hand for restore in sudden PCI error */
3892 if (amdgpu_device_cache_pci_state(adev->pdev))
3893 pci_restore_state(pdev);
3894
3895 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3896 /* this will fail for cards that aren't VGA class devices, just
3897 * ignore it
3898 */
3899 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3900 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3901
3902 px = amdgpu_device_supports_px(ddev);
3903
3904 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3905 apple_gmux_detect(NULL, NULL)))
3906 vga_switcheroo_register_client(adev->pdev,
3907 &amdgpu_switcheroo_ops, px);
3908
3909 if (px)
3910 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3911
3912 if (adev->gmc.xgmi.pending_reset)
3913 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3914 msecs_to_jiffies(AMDGPU_RESUME_MS));
3915
3916 amdgpu_device_check_iommu_direct_map(adev);
3917
3918 return 0;
3919
3920 release_ras_con:
3921 if (amdgpu_sriov_vf(adev))
3922 amdgpu_virt_release_full_gpu(adev, true);
3923
3924 /* failed in exclusive mode due to timeout */
3925 if (amdgpu_sriov_vf(adev) &&
3926 !amdgpu_sriov_runtime(adev) &&
3927 amdgpu_virt_mmio_blocked(adev) &&
3928 !amdgpu_virt_wait_reset(adev)) {
3929 dev_err(adev->dev, "VF exclusive mode timeout\n");
3930 /* Don't send request since VF is inactive. */
3931 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3932 adev->virt.ops = NULL;
3933 r = -EAGAIN;
3934 }
3935 amdgpu_release_ras_context(adev);
3936
3937 failed:
3938 amdgpu_vf_error_trans_all(adev);
3939
3940 return r;
3941 }
3942
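/**
* amdgpu_device_unmap_mmio - unmap the device MMIO resources
*
* @adev: amdgpu_device pointer
*
* Clears all CPU mappings pointing to this device and unmaps the
* doorbell, register and VRAM apertures, releasing the MTRR/WC
* settings where applicable. Called from amdgpu_device_fini_hw()
* when the device has been unplugged.
*/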
3943 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3944 {
3945
3946 /* Clear all CPU mappings pointing to this device */
3947 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3948
3949 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3950 amdgpu_doorbell_fini(adev);
3951
3952 iounmap(adev->rmmio);
3953 adev->rmmio = NULL;
3954 if (adev->mman.aper_base_kaddr)
3955 iounmap(adev->mman.aper_base_kaddr);
3956 adev->mman.aper_base_kaddr = NULL;
3957
3958 /* Memory manager related */
3959 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
3960 arch_phys_wc_del(adev->gmc.vram_mtrr);
3961 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3962 }
3963 }
3964
3965 /**
3966 * amdgpu_device_fini_hw - tear down the driver
3967 *
3968 * @adev: amdgpu_device pointer
3969 *
3970 * Tear down the driver info (all asics).
3971 * Called at driver shutdown.
3972 */
3973 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3974 {
3975 dev_info(adev->dev, "amdgpu: finishing device.\n");
3976 flush_delayed_work(&adev->delayed_init_work);
3977 adev->shutdown = true;
3978
3979 /* make sure the IB tests have finished before entering exclusive mode
3980 * to avoid preemption during the IB tests
3981 */
3982 if (amdgpu_sriov_vf(adev)) {
3983 amdgpu_virt_request_full_gpu(adev, false);
3984 amdgpu_virt_fini_data_exchange(adev);
3985 }
3986
3987 /* disable all interrupts */
3988 amdgpu_irq_disable_all(adev);
3989 if (adev->mode_info.mode_config_initialized) {
3990 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3991 drm_helper_force_disable_all(adev_to_drm(adev));
3992 else
3993 drm_atomic_helper_shutdown(adev_to_drm(adev));
3994 }
3995 amdgpu_fence_driver_hw_fini(adev);
3996
3997 if (adev->mman.initialized)
3998 drain_workqueue(adev->mman.bdev.wq);
3999
4000 if (adev->pm.sysfs_initialized)
4001 amdgpu_pm_sysfs_fini(adev);
4002 if (adev->ucode_sysfs_en)
4003 amdgpu_ucode_sysfs_fini(adev);
4004 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4005 amdgpu_fru_sysfs_fini(adev);
4006
4007 /* disable ras feature must before hw fini */
4008 amdgpu_ras_pre_fini(adev);
4009
4010 amdgpu_device_ip_fini_early(adev);
4011
4012 amdgpu_irq_fini_hw(adev);
4013
4014 if (adev->mman.initialized)
4015 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4016
4017 amdgpu_gart_dummy_page_fini(adev);
4018
4019 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4020 amdgpu_device_unmap_mmio(adev);
4021
4022 }
4023
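/**
* amdgpu_device_fini_sw - tear down the driver software state
*
* @adev: amdgpu_device pointer
*
* Second stage of driver teardown: releases the fence driver, IP blocks,
* firmware, i2c buses, vbios image, vga switcheroo/client registrations
* and the reset domain. Called after amdgpu_device_fini_hw().
*/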
4024 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4025 {
4026 int idx;
4027 bool px;
4028
4029 amdgpu_fence_driver_sw_fini(adev);
4030 amdgpu_device_ip_fini(adev);
4031 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4032 adev->accel_working = false;
4033 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4034
4035 amdgpu_reset_fini(adev);
4036
4037 /* free i2c buses */
4038 if (!amdgpu_device_has_dc_support(adev))
4039 amdgpu_i2c_fini(adev);
4040
4041 if (amdgpu_emu_mode != 1)
4042 amdgpu_atombios_fini(adev);
4043
4044 kfree(adev->bios);
4045 adev->bios = NULL;
4046
4047 px = amdgpu_device_supports_px(adev_to_drm(adev));
4048
4049 if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4050 apple_gmux_detect(NULL, NULL)))
4051 vga_switcheroo_unregister_client(adev->pdev);
4052
4053 if (px)
4054 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4055
4056 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4057 vga_client_unregister(adev->pdev);
4058
4059 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4060
4061 iounmap(adev->rmmio);
4062 adev->rmmio = NULL;
4063 amdgpu_doorbell_fini(adev);
4064 drm_dev_exit(idx);
4065 }
4066
4067 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4068 amdgpu_pmu_fini(adev);
4069 if (adev->mman.discovery_bin)
4070 amdgpu_discovery_fini(adev);
4071
4072 amdgpu_reset_put_reset_domain(adev->reset_domain);
4073 adev->reset_domain = NULL;
4074
4075 kfree(adev->pci_state);
4076
4077 }
4078
4079 /**
4080 * amdgpu_device_evict_resources - evict device resources
4081 * @adev: amdgpu device object
4082 *
4083 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4084 * of the vram memory type. Mainly used for evicting device resources
4085 * at suspend time.
4086 *
4087 */
4088 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4089 {
4090 int ret;
4091
4092 /* No need to evict vram on APUs for suspend to ram or s2idle */
4093 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4094 return 0;
4095
4096 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4097 if (ret)
4098 DRM_WARN("evicting device resources failed\n");
4099 return ret;
4100 }
4101
4102 /*
4103 * Suspend & resume.
4104 */
4105 /**
4106 * amdgpu_device_suspend - initiate device suspend
4107 *
4108 * @dev: drm dev pointer
4109 * @fbcon : notify the fbdev of suspend
4110 *
4111 * Puts the hw in the suspend state (all asics).
4112 * Returns 0 for success or an error on failure.
4113 * Called at driver suspend.
4114 */
4115 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4116 {
4117 struct amdgpu_device *adev = drm_to_adev(dev);
4118 int r = 0;
4119
4120 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4121 return 0;
4122
4123 adev->in_suspend = true;
4124
4125 /* Evict the majority of BOs before grabbing the full access */
4126 r = amdgpu_device_evict_resources(adev);
4127 if (r)
4128 return r;
4129
4130 if (amdgpu_sriov_vf(adev)) {
4131 amdgpu_virt_fini_data_exchange(adev);
4132 r = amdgpu_virt_request_full_gpu(adev, false);
4133 if (r)
4134 return r;
4135 }
4136
4137 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4138 DRM_WARN("smart shift update failed\n");
4139
4140 if (fbcon)
4141 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4142
4143 cancel_delayed_work_sync(&adev->delayed_init_work);
4144 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4145
4146 amdgpu_ras_suspend(adev);
4147
4148 amdgpu_device_ip_suspend_phase1(adev);
4149
4150 if (!adev->in_s0ix)
4151 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4152
4153 r = amdgpu_device_evict_resources(adev);
4154 if (r)
4155 return r;
4156
4157 amdgpu_fence_driver_hw_fini(adev);
4158
4159 amdgpu_device_ip_suspend_phase2(adev);
4160
4161 if (amdgpu_sriov_vf(adev))
4162 amdgpu_virt_release_full_gpu(adev, false);
4163
4164 return 0;
4165 }
4166
4167 /**
4168 * amdgpu_device_resume - initiate device resume
4169 *
4170 * @dev: drm dev pointer
4171 * @fbcon : notify the fbdev of resume
4172 *
4173 * Bring the hw back to operating state (all asics).
4174 * Returns 0 for success or an error on failure.
4175 * Called at driver resume.
4176 */
4177 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4178 {
4179 struct amdgpu_device *adev = drm_to_adev(dev);
4180 int r = 0;
4181
4182 if (amdgpu_sriov_vf(adev)) {
4183 r = amdgpu_virt_request_full_gpu(adev, true);
4184 if (r)
4185 return r;
4186 }
4187
4188 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4189 return 0;
4190
4191 if (adev->in_s0ix)
4192 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4193
4194 /* post card */
4195 if (amdgpu_device_need_post(adev)) {
4196 r = amdgpu_device_asic_init(adev);
4197 if (r)
4198 dev_err(adev->dev, "amdgpu asic init failed\n");
4199 }
4200
4201 r = amdgpu_device_ip_resume(adev);
4202
4203 if (r) {
4204 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4205 goto exit;
4206 }
4207 amdgpu_fence_driver_hw_init(adev);
4208
4209 r = amdgpu_device_ip_late_init(adev);
4210 if (r)
4211 goto exit;
4212
4213 queue_delayed_work(system_wq, &adev->delayed_init_work,
4214 msecs_to_jiffies(AMDGPU_RESUME_MS));
4215
4216 if (!adev->in_s0ix) {
4217 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4218 if (r)
4219 goto exit;
4220 }
4221
4222 exit:
4223 if (amdgpu_sriov_vf(adev)) {
4224 amdgpu_virt_init_data_exchange(adev);
4225 amdgpu_virt_release_full_gpu(adev, true);
4226 }
4227
4228 if (r)
4229 return r;
4230
4231 /* Make sure IB tests flushed */
4232 flush_delayed_work(&adev->delayed_init_work);
4233
4234 if (fbcon)
4235 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4236
4237 amdgpu_ras_resume(adev);
4238
4239 if (adev->mode_info.num_crtc) {
4240 /*
4241 * Most of the connector probing functions try to acquire runtime pm
4242 * refs to ensure that the GPU is powered on when connector polling is
4243 * performed. Since we're calling this from a runtime PM callback,
4244 * trying to acquire rpm refs will cause us to deadlock.
4245 *
4246 * Since we're guaranteed to be holding the rpm lock, it's safe to
4247 * temporarily disable the rpm helpers so this doesn't deadlock us.
4248 */
4249 #ifdef CONFIG_PM
4250 dev->dev->power.disable_depth++;
4251 #endif
4252 if (!adev->dc_enabled)
4253 drm_helper_hpd_irq_event(dev);
4254 else
4255 drm_kms_helper_hotplug_event(dev);
4256 #ifdef CONFIG_PM
4257 dev->dev->power.disable_depth--;
4258 #endif
4259 }
4260 adev->in_suspend = false;
4261
4262 if (adev->enable_mes)
4263 amdgpu_mes_self_test(adev);
4264
4265 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4266 DRM_WARN("smart shift update failed\n");
4267
4268 return 0;
4269 }
4270
4271 /**
4272 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4273 *
4274 * @adev: amdgpu_device pointer
4275 *
4276 * The list of all the hardware IPs that make up the asic is walked and
4277 * the check_soft_reset callbacks are run. check_soft_reset determines
4278 * if the asic is still hung or not.
4279 * Returns true if any of the IPs are still in a hung state, false if not.
4280 */
4281 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4282 {
4283 int i;
4284 bool asic_hang = false;
4285
4286 if (amdgpu_sriov_vf(adev))
4287 return true;
4288
4289 if (amdgpu_asic_need_full_reset(adev))
4290 return true;
4291
4292 for (i = 0; i < adev->num_ip_blocks; i++) {
4293 if (!adev->ip_blocks[i].status.valid)
4294 continue;
4295 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4296 adev->ip_blocks[i].status.hang =
4297 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4298 if (adev->ip_blocks[i].status.hang) {
4299 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4300 asic_hang = true;
4301 }
4302 }
4303 return asic_hang;
4304 }
4305
4306 /**
4307 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4308 *
4309 * @adev: amdgpu_device pointer
4310 *
4311 * The list of all the hardware IPs that make up the asic is walked and the
4312 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4313 * handles any IP specific hardware or software state changes that are
4314 * necessary for a soft reset to succeed.
4315 * Returns 0 on success, negative error code on failure.
4316 */
4317 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4318 {
4319 int i, r = 0;
4320
4321 for (i = 0; i < adev->num_ip_blocks; i++) {
4322 if (!adev->ip_blocks[i].status.valid)
4323 continue;
4324 if (adev->ip_blocks[i].status.hang &&
4325 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4326 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4327 if (r)
4328 return r;
4329 }
4330 }
4331
4332 return 0;
4333 }
4334
4335 /**
4336 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4337 *
4338 * @adev: amdgpu_device pointer
4339 *
4340 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4341 * reset is necessary to recover.
4342 * Returns true if a full asic reset is required, false if not.
4343 */
4344 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4345 {
4346 int i;
4347
4348 if (amdgpu_asic_need_full_reset(adev))
4349 return true;
4350
4351 for (i = 0; i < adev->num_ip_blocks; i++) {
4352 if (!adev->ip_blocks[i].status.valid)
4353 continue;
4354 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4355 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4356 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4357 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4358 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4359 if (adev->ip_blocks[i].status.hang) {
4360 dev_info(adev->dev, "Some block need full reset!\n");
4361 return true;
4362 }
4363 }
4364 }
4365 return false;
4366 }
4367
4368 /**
4369 * amdgpu_device_ip_soft_reset - do a soft reset
4370 *
4371 * @adev: amdgpu_device pointer
4372 *
4373 * The list of all the hardware IPs that make up the asic is walked and the
4374 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4375 * IP specific hardware or software state changes that are necessary to soft
4376 * reset the IP.
4377 * Returns 0 on success, negative error code on failure.
4378 */
4379 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4380 {
4381 int i, r = 0;
4382
4383 for (i = 0; i < adev->num_ip_blocks; i++) {
4384 if (!adev->ip_blocks[i].status.valid)
4385 continue;
4386 if (adev->ip_blocks[i].status.hang &&
4387 adev->ip_blocks[i].version->funcs->soft_reset) {
4388 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4389 if (r)
4390 return r;
4391 }
4392 }
4393
4394 return 0;
4395 }
4396
4397 /**
4398 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4399 *
4400 * @adev: amdgpu_device pointer
4401 *
4402 * The list of all the hardware IPs that make up the asic is walked and the
4403 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4404 * handles any IP specific hardware or software state changes that are
4405 * necessary after the IP has been soft reset.
4406 * Returns 0 on success, negative error code on failure.
4407 */
4408 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4409 {
4410 int i, r = 0;
4411
4412 for (i = 0; i < adev->num_ip_blocks; i++) {
4413 if (!adev->ip_blocks[i].status.valid)
4414 continue;
4415 if (adev->ip_blocks[i].status.hang &&
4416 adev->ip_blocks[i].version->funcs->post_soft_reset)
4417 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4418 if (r)
4419 return r;
4420 }
4421
4422 return 0;
4423 }
4424
4425 /**
4426 * amdgpu_device_recover_vram - Recover some VRAM contents
4427 *
4428 * @adev: amdgpu_device pointer
4429 *
4430 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4431 * restore things like GPUVM page tables after a GPU reset where
4432 * the contents of VRAM might be lost.
4433 *
4434 * Returns:
4435 * 0 on success, negative error code on failure.
4436 */
4437 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4438 {
4439 struct dma_fence *fence = NULL, *next = NULL;
4440 struct amdgpu_bo *shadow;
4441 struct amdgpu_bo_vm *vmbo;
4442 long r = 1, tmo;
4443
4444 if (amdgpu_sriov_runtime(adev))
4445 tmo = msecs_to_jiffies(8000);
4446 else
4447 tmo = msecs_to_jiffies(100);
4448
4449 dev_info(adev->dev, "recover vram bo from shadow start\n");
4450 mutex_lock(&adev->shadow_list_lock);
4451 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4452 /* If vm is compute context or adev is APU, shadow will be NULL */
4453 if (!vmbo->shadow)
4454 continue;
4455 shadow = vmbo->shadow;
4456
4457 /* No need to recover an evicted BO */
4458 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4459 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4460 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4461 continue;
4462
4463 r = amdgpu_bo_restore_shadow(shadow, &next);
4464 if (r)
4465 break;
4466
4467 if (fence) {
4468 tmo = dma_fence_wait_timeout(fence, false, tmo);
4469 dma_fence_put(fence);
4470 fence = next;
4471 if (tmo == 0) {
4472 r = -ETIMEDOUT;
4473 break;
4474 } else if (tmo < 0) {
4475 r = tmo;
4476 break;
4477 }
4478 } else {
4479 fence = next;
4480 }
4481 }
4482 mutex_unlock(&adev->shadow_list_lock);
4483
4484 if (fence)
4485 tmo = dma_fence_wait_timeout(fence, false, tmo);
4486 dma_fence_put(fence);
4487
4488 if (r < 0 || tmo <= 0) {
4489 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4490 return -EIO;
4491 }
4492
4493 dev_info(adev->dev, "recover vram bo from shadow done\n");
4494 return 0;
4495 }
4496
4497
4498 /**
4499 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4500 *
4501 * @adev: amdgpu_device pointer
4502 * @from_hypervisor: request from hypervisor
4503 *
4504 * Do a VF FLR and reinitialize the ASIC.
4505 * Returns 0 on success, negative error code on failure.
4506 */
4507 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4508 bool from_hypervisor)
4509 {
4510 int r;
4511 struct amdgpu_hive_info *hive = NULL;
4512 int retry_limit = 0;
4513
4514 retry:
4515 amdgpu_amdkfd_pre_reset(adev);
4516
4517 if (from_hypervisor)
4518 r = amdgpu_virt_request_full_gpu(adev, true);
4519 else
4520 r = amdgpu_virt_reset_gpu(adev);
4521 if (r)
4522 return r;
4523 amdgpu_irq_gpu_reset_resume_helper(adev);
4524
4525 /* some SW cleanup the VF needs to do before recovery */
4526 amdgpu_virt_post_reset(adev);
4527
4528 /* Resume IP prior to SMC */
4529 r = amdgpu_device_ip_reinit_early_sriov(adev);
4530 if (r)
4531 goto error;
4532
4533 amdgpu_virt_init_data_exchange(adev);
4534
4535 r = amdgpu_device_fw_loading(adev);
4536 if (r)
4537 return r;
4538
4539 /* now we are okay to resume SMC/CP/SDMA */
4540 r = amdgpu_device_ip_reinit_late_sriov(adev);
4541 if (r)
4542 goto error;
4543
4544 hive = amdgpu_get_xgmi_hive(adev);
4545 /* Update PSP FW topology after reset */
4546 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4547 r = amdgpu_xgmi_update_topology(hive, adev);
4548
4549 if (hive)
4550 amdgpu_put_xgmi_hive(hive);
4551
4552 if (!r) {
4553 r = amdgpu_ib_ring_tests(adev);
4554
4555 amdgpu_amdkfd_post_reset(adev);
4556 }
4557
4558 error:
4559 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4560 amdgpu_inc_vram_lost(adev);
4561 r = amdgpu_device_recover_vram(adev);
4562 }
4563 amdgpu_virt_release_full_gpu(adev, true);
4564
4565 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4566 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4567 retry_limit++;
4568 goto retry;
4569 } else
4570 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4571 }
4572
4573 return r;
4574 }
4575
4576 /**
4577 * amdgpu_device_has_job_running - check if there is any job in mirror list
4578 *
4579 * @adev: amdgpu_device pointer
4580 *
4581 * check if there is any job in mirror list
4582 */
4583 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4584 {
4585 int i;
4586 struct drm_sched_job *job;
4587
4588 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4589 struct amdgpu_ring *ring = adev->rings[i];
4590
4591 if (!ring || !ring->sched.thread)
4592 continue;
4593
4594 spin_lock(&ring->sched.job_list_lock);
4595 job = list_first_entry_or_null(&ring->sched.pending_list,
4596 struct drm_sched_job, list);
4597 spin_unlock(&ring->sched.job_list_lock);
4598 if (job)
4599 return true;
4600 }
4601 return false;
4602 }
4603
4604 /**
4605 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4606 *
4607 * @adev: amdgpu_device pointer
4608 *
4609 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4610 * a hung GPU.
4611 */
4612 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4613 {
4614
4615 if (amdgpu_gpu_recovery == 0)
4616 goto disabled;
4617
4618 /* Skip soft reset check in fatal error mode */
4619 if (!amdgpu_ras_is_poison_mode_supported(adev))
4620 return true;
4621
4622 if (amdgpu_sriov_vf(adev))
4623 return true;
4624
4625 if (amdgpu_gpu_recovery == -1) {
4626 switch (adev->asic_type) {
4627 #ifdef CONFIG_DRM_AMDGPU_SI
4628 case CHIP_VERDE:
4629 case CHIP_TAHITI:
4630 case CHIP_PITCAIRN:
4631 case CHIP_OLAND:
4632 case CHIP_HAINAN:
4633 #endif
4634 #ifdef CONFIG_DRM_AMDGPU_CIK
4635 case CHIP_KAVERI:
4636 case CHIP_KABINI:
4637 case CHIP_MULLINS:
4638 #endif
4639 case CHIP_CARRIZO:
4640 case CHIP_STONEY:
4641 case CHIP_CYAN_SKILLFISH:
4642 goto disabled;
4643 default:
4644 break;
4645 }
4646 }
4647
4648 return true;
4649
4650 disabled:
4651 dev_info(adev->dev, "GPU recovery disabled.\n");
4652 return false;
4653 }
4654
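/**
* amdgpu_device_mode1_reset - perform a mode1 (full chip) reset
*
* @adev: amdgpu_device pointer
*
* Disables bus mastering, caches the PCI config space and triggers a
* mode1 reset through the SMU if supported, otherwise through the PSP,
* then waits for the ASIC to come back out of reset.
* Returns 0 on success, negative error code on failure.
*/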
4655 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4656 {
4657 u32 i;
4658 int ret = 0;
4659
4660 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4661
4662 dev_info(adev->dev, "GPU mode1 reset\n");
4663
4664 /* disable BM */
4665 pci_clear_master(adev->pdev);
4666
4667 amdgpu_device_cache_pci_state(adev->pdev);
4668
4669 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4670 dev_info(adev->dev, "GPU smu mode1 reset\n");
4671 ret = amdgpu_dpm_mode1_reset(adev);
4672 } else {
4673 dev_info(adev->dev, "GPU psp mode1 reset\n");
4674 ret = psp_gpu_reset(adev);
4675 }
4676
4677 if (ret)
4678 goto mode1_reset_failed;
4679
4680 amdgpu_device_load_pci_state(adev->pdev);
4681 ret = amdgpu_psp_wait_for_bootloader(adev);
4682 if (ret)
4683 goto mode1_reset_failed;
4684
4685 /* wait for asic to come out of reset */
4686 for (i = 0; i < adev->usec_timeout; i++) {
4687 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4688
4689 if (memsize != 0xffffffff)
4690 break;
4691 udelay(1);
4692 }
4693
4694 if (i >= adev->usec_timeout) {
4695 ret = -ETIMEDOUT;
4696 goto mode1_reset_failed;
4697 }
4698
4699 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4700
4701 return 0;
4702
4703 mode1_reset_failed:
4704 dev_err(adev->dev, "GPU mode1 reset failed\n");
4705 return ret;
4706 }
4707
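/**
* amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
*
* @adev: amdgpu_device pointer
* @reset_context: amdgpu reset context pointer
*
* Stops SR-IOV data exchange, force completes the HW fences of all rings,
* bumps the guilty job's karma and, on bare metal, attempts a soft reset
* where possible, falling back to flagging a full reset if that fails.
* Returns 0 on success, negative error code on failure.
*/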
4708 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4709 struct amdgpu_reset_context *reset_context)
4710 {
4711 int i, r = 0;
4712 struct amdgpu_job *job = NULL;
4713 bool need_full_reset =
4714 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4715
4716 if (reset_context->reset_req_dev == adev)
4717 job = reset_context->job;
4718
4719 if (amdgpu_sriov_vf(adev)) {
4720 /* stop the data exchange thread */
4721 amdgpu_virt_fini_data_exchange(adev);
4722 }
4723
4724 amdgpu_fence_driver_isr_toggle(adev, true);
4725
4726 /* block all schedulers and reset given job's ring */
4727 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4728 struct amdgpu_ring *ring = adev->rings[i];
4729
4730 if (!ring || !ring->sched.thread)
4731 continue;
4732
4733 /* Clear job fences from the fence drv to avoid force_completion
4734 * leaving NULL and vm flush fences in the fence drv
4735 */
4736 amdgpu_fence_driver_clear_job_fences(ring);
4737
4738 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4739 amdgpu_fence_driver_force_completion(ring);
4740 }
4741
4742 amdgpu_fence_driver_isr_toggle(adev, false);
4743
4744 if (job && job->vm)
4745 drm_sched_increase_karma(&job->base);
4746
4747 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4748 /* If reset handler not implemented, continue; otherwise return */
4749 if (r == -EOPNOTSUPP)
4750 r = 0;
4751 else
4752 return r;
4753
4754 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4755 if (!amdgpu_sriov_vf(adev)) {
4756
4757 if (!need_full_reset)
4758 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4759
4760 if (!need_full_reset && amdgpu_gpu_recovery &&
4761 amdgpu_device_ip_check_soft_reset(adev)) {
4762 amdgpu_device_ip_pre_soft_reset(adev);
4763 r = amdgpu_device_ip_soft_reset(adev);
4764 amdgpu_device_ip_post_soft_reset(adev);
4765 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4766 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4767 need_full_reset = true;
4768 }
4769 }
4770
4771 if (need_full_reset)
4772 r = amdgpu_device_ip_suspend(adev);
4773 if (need_full_reset)
4774 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4775 else
4776 clear_bit(AMDGPU_NEED_FULL_RESET,
4777 &reset_context->flags);
4778 }
4779
4780 return r;
4781 }
4782
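/**
* amdgpu_reset_reg_dumps - capture the reset register dump list
*
* @adev: amdgpu_device pointer
*
* Reads back every register in adev->reset_dump_reg_list and stores the
* values so they can be included in the device coredump; each read is
* traced. Must be called with the reset domain semaphore held.
*/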
4783 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4784 {
4785 int i;
4786
4787 lockdep_assert_held(&adev->reset_domain->sem);
4788
4789 for (i = 0; i < adev->num_regs; i++) {
4790 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4791 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4792 adev->reset_dump_reg_value[i]);
4793 }
4794
4795 return 0;
4796 }
4797
4798 #ifdef CONFIG_DEV_COREDUMP
4799 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4800 size_t count, void *data, size_t datalen)
4801 {
4802 struct drm_printer p;
4803 struct amdgpu_device *adev = data;
4804 struct drm_print_iterator iter;
4805 int i;
4806
4807 iter.data = buffer;
4808 iter.offset = 0;
4809 iter.start = offset;
4810 iter.remain = count;
4811
4812 p = drm_coredump_printer(&iter);
4813
4814 drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4815 drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4816 drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4817 drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4818 if (adev->reset_task_info.pid)
4819 drm_printf(&p, "process_name: %s PID: %d\n",
4820 adev->reset_task_info.process_name,
4821 adev->reset_task_info.pid);
4822
4823 if (adev->reset_vram_lost)
4824 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4825 if (adev->num_regs) {
4826 drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
4827
4828 for (i = 0; i < adev->num_regs; i++)
4829 drm_printf(&p, "0x%08x: 0x%08x\n",
4830 adev->reset_dump_reg_list[i],
4831 adev->reset_dump_reg_value[i]);
4832 }
4833
4834 return count - iter.remain;
4835 }
4836
4837 static void amdgpu_devcoredump_free(void *data)
4838 {
4839 }
4840
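/**
* amdgpu_reset_capture_coredumpm - register a device coredump for a reset
*
* @adev: amdgpu_device pointer
*
* Records the reset timestamp and hands the device to the dev_coredump
* framework; amdgpu_devcoredump_read() formats the captured reset state
* when userspace reads the coredump.
*/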
4841 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4842 {
4843 struct drm_device *dev = adev_to_drm(adev);
4844
4845 ktime_get_ts64(&adev->reset_time);
4846 dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
4847 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4848 }
4849 #endif
4850
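/**
* amdgpu_do_asic_reset - reset and re-initialize a list of devices
*
* @device_list_handle: list of devices to reset (an XGMI hive or a single device)
* @reset_context: amdgpu reset context pointer
*
* Performs the actual ASIC reset for every device in the list, trying a
* dedicated reset handler first and falling back to the default full
* reset path. On a full reset the devices are re-posted, their IP blocks
* resumed, VRAM contents recovered if lost and IB ring tests run.
* Returns 0 on success, negative error code on failure.
*/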
4851 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4852 struct amdgpu_reset_context *reset_context)
4853 {
4854 struct amdgpu_device *tmp_adev = NULL;
4855 bool need_full_reset, skip_hw_reset, vram_lost = false;
4856 int r = 0;
4857 bool gpu_reset_for_dev_remove = 0;
4858
4859 /* Try reset handler method first */
4860 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4861 reset_list);
4862 amdgpu_reset_reg_dumps(tmp_adev);
4863
4864 reset_context->reset_device_list = device_list_handle;
4865 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4866 /* If reset handler not implemented, continue; otherwise return */
4867 if (r == -EOPNOTSUPP)
4868 r = 0;
4869 else
4870 return r;
4871
4872 /* Reset handler not implemented, use the default method */
4873 need_full_reset =
4874 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4875 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4876
4877 gpu_reset_for_dev_remove =
4878 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4879 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4880
4881 /*
4882 * ASIC reset has to be done on all XGMI hive nodes ASAP
4883 * to allow proper link negotiation in FW (within 1 sec)
4884 */
4885 if (!skip_hw_reset && need_full_reset) {
4886 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4887 /* For XGMI run all resets in parallel to speed up the process */
4888 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4889 tmp_adev->gmc.xgmi.pending_reset = false;
4890 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4891 r = -EALREADY;
4892 } else
4893 r = amdgpu_asic_reset(tmp_adev);
4894
4895 if (r) {
4896 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4897 r, adev_to_drm(tmp_adev)->unique);
4898 break;
4899 }
4900 }
4901
4902 /* For XGMI wait for all resets to complete before proceed */
4903 if (!r) {
4904 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4905 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4906 flush_work(&tmp_adev->xgmi_reset_work);
4907 r = tmp_adev->asic_reset_res;
4908 if (r)
4909 break;
4910 }
4911 }
4912 }
4913 }
4914
4915 if (!r && amdgpu_ras_intr_triggered()) {
4916 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4917 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4918 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4919 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4920 }
4921
4922 amdgpu_ras_intr_cleared();
4923 }
4924
4925 /* Since the mode1 reset affects base ip blocks, the
4926 * phase1 ip blocks need to be resumed. Otherwise there
4927 * will be a BIOS signature error and the psp bootloader
4928 * can't load kdb on the next amdgpu install.
4929 */
4930 if (gpu_reset_for_dev_remove) {
4931 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4932 amdgpu_device_ip_resume_phase1(tmp_adev);
4933
4934 goto end;
4935 }
4936
4937 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4938 if (need_full_reset) {
4939 /* post card */
4940 r = amdgpu_device_asic_init(tmp_adev);
4941 if (r) {
4942 dev_warn(tmp_adev->dev, "asic atom init failed!");
4943 } else {
4944 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4945
4946 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4947 if (r)
4948 goto out;
4949
4950 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4951 #ifdef CONFIG_DEV_COREDUMP
4952 tmp_adev->reset_vram_lost = vram_lost;
4953 memset(&tmp_adev->reset_task_info, 0,
4954 sizeof(tmp_adev->reset_task_info));
4955 if (reset_context->job && reset_context->job->vm)
4956 tmp_adev->reset_task_info =
4957 reset_context->job->vm->task_info;
4958 amdgpu_reset_capture_coredumpm(tmp_adev);
4959 #endif
4960 if (vram_lost) {
4961 DRM_INFO("VRAM is lost due to GPU reset!\n");
4962 amdgpu_inc_vram_lost(tmp_adev);
4963 }
4964
4965 r = amdgpu_device_fw_loading(tmp_adev);
4966 if (r)
4967 return r;
4968
4969 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4970 if (r)
4971 goto out;
4972
4973 if (vram_lost)
4974 amdgpu_device_fill_reset_magic(tmp_adev);
4975
4976 /*
4977 * Add this ASIC as tracked as reset was already
4978 * complete successfully.
4979 */
4980 amdgpu_register_gpu_instance(tmp_adev);
4981
4982 if (!reset_context->hive &&
4983 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4984 amdgpu_xgmi_add_device(tmp_adev);
4985
4986 r = amdgpu_device_ip_late_init(tmp_adev);
4987 if (r)
4988 goto out;
4989
4990 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4991
4992 /*
4993 * The GPU enters a bad state once the number of faulty
4994 * pages detected by ECC reaches the threshold, and ras
4995 * recovery is scheduled next. So add one check
4996 * here to break recovery if the bad page threshold
4997 * is indeed exceeded, and remind the user to
4998 * retire this GPU or set a bigger
4999 * bad_page_threshold value to fix this the next time
5000 * the driver is probed.
5001 */
5002 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5003 /* must succeed. */
5004 amdgpu_ras_resume(tmp_adev);
5005 } else {
5006 r = -EINVAL;
5007 goto out;
5008 }
5009
5010 /* Update PSP FW topology after reset */
5011 if (reset_context->hive &&
5012 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5013 r = amdgpu_xgmi_update_topology(
5014 reset_context->hive, tmp_adev);
5015 }
5016 }
5017
5018 out:
5019 if (!r) {
5020 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5021 r = amdgpu_ib_ring_tests(tmp_adev);
5022 if (r) {
5023 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5024 need_full_reset = true;
5025 r = -EAGAIN;
5026 goto end;
5027 }
5028 }
5029
5030 if (!r)
5031 r = amdgpu_device_recover_vram(tmp_adev);
5032 else
5033 tmp_adev->asic_reset_res = r;
5034 }
5035
5036 end:
5037 if (need_full_reset)
5038 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5039 else
5040 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5041 return r;
5042 }
5043
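/**
* amdgpu_device_set_mp1_state - record the MP1 state for the pending reset
*
* @adev: amdgpu_device pointer
*
* Sets adev->mp1_state according to the selected reset method: shutdown
* for mode1, reset for mode2, none otherwise.
*/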
5044 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5045 {
5046
5047 switch (amdgpu_asic_reset_method(adev)) {
5048 case AMD_RESET_METHOD_MODE1:
5049 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5050 break;
5051 case AMD_RESET_METHOD_MODE2:
5052 adev->mp1_state = PP_MP1_STATE_RESET;
5053 break;
5054 default:
5055 adev->mp1_state = PP_MP1_STATE_NONE;
5056 break;
5057 }
5058 }
5059
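/**
* amdgpu_device_unset_mp1_state - clear the MP1 state after a reset
*
* @adev: amdgpu_device pointer
*
* Flushes any pending VF error reports and returns adev->mp1_state to
* PP_MP1_STATE_NONE.
*/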
5060 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5061 {
5062 amdgpu_vf_error_trans_all(adev);
5063 adev->mp1_state = PP_MP1_STATE_NONE;
5064 }
5065
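/**
* amdgpu_device_resume_display_audio - resume the display audio function
*
* @adev: amdgpu_device pointer
*
* Looks up the display audio function (devfn 1 on the GPU's bus) and
* re-enables and resumes its runtime PM, undoing
* amdgpu_device_suspend_display_audio().
*/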
5066 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5067 {
5068 struct pci_dev *p = NULL;
5069
5070 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5071 adev->pdev->bus->number, 1);
5072 if (p) {
5073 pm_runtime_enable(&(p->dev));
5074 pm_runtime_resume(&(p->dev));
5075 }
5076
5077 pci_dev_put(p);
5078 }
5079
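/**
* amdgpu_device_suspend_display_audio - suspend the display audio function
*
* @adev: amdgpu_device pointer
*
* For BACO and mode1 resets, puts the display audio function (devfn 1 on
* the GPU's bus) into runtime suspend and disables its runtime PM so the
* reset does not touch the audio hardware behind the audio driver's back.
* Returns 0 on success, negative error code on failure.
*/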
5080 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5081 {
5082 enum amd_reset_method reset_method;
5083 struct pci_dev *p = NULL;
5084 u64 expires;
5085
5086 /*
5087 * For now, only BACO and mode1 reset are confirmed
5088 * to suffer the audio issue if the audio device is not properly suspended.
5089 */
5090 reset_method = amdgpu_asic_reset_method(adev);
5091 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5092 (reset_method != AMD_RESET_METHOD_MODE1))
5093 return -EINVAL;
5094
5095 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5096 adev->pdev->bus->number, 1);
5097 if (!p)
5098 return -ENODEV;
5099
5100 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5101 if (!expires)
5102 /*
5103 * If we cannot get the audio device autosuspend delay,
5104 * a fixed 4S interval will be used. Since 3S is
5105 * the audio controller's default autosuspend delay setting,
5106 * the 4S used here is guaranteed to cover that.
5107 */
5108 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5109
5110 while (!pm_runtime_status_suspended(&(p->dev))) {
5111 if (!pm_runtime_suspend(&(p->dev)))
5112 break;
5113
5114 if (expires < ktime_get_mono_fast_ns()) {
5115 dev_warn(adev->dev, "failed to suspend display audio\n");
5116 pci_dev_put(p);
5117 /* TODO: abort the succeeding gpu reset? */
5118 return -ETIMEDOUT;
5119 }
5120 }
5121
5122 pm_runtime_disable(&(p->dev));
5123
5124 pci_dev_put(p);
5125 return 0;
5126 }
5127
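/**
* amdgpu_device_stop_pending_resets - cancel queued reset work
*
* @adev: amdgpu_device pointer
*
* Cancels reset work that is still queued on the device (the generic
* reset work, KFD reset work, SR-IOV FLR work and RAS recovery work)
* so that only the reset currently being performed is carried out.
*/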
5128 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5129 {
5130 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5131
5132 #if defined(CONFIG_DEBUG_FS)
5133 if (!amdgpu_sriov_vf(adev))
5134 cancel_work(&adev->reset_work);
5135 #endif
5136
5137 if (adev->kfd.dev)
5138 cancel_work(&adev->kfd.reset_work);
5139
5140 if (amdgpu_sriov_vf(adev))
5141 cancel_work(&adev->virt.flr_work);
5142
5143 if (con && adev->ras_enabled)
5144 cancel_work(&con->recovery_work);
5145
5146 }
5147
5148 /**
5149 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5150 *
5151 * @adev: amdgpu_device pointer
5152 * @job: the job that triggered the hang
5153 * @reset_context: amdgpu reset context pointer
5154 *
5155 * Attempt to reset the GPU if it has hung (all asics).
5156 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5157 * Returns 0 for success or an error on failure.
5158 */
5159
5160 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5161 struct amdgpu_job *job,
5162 struct amdgpu_reset_context *reset_context)
5163 {
5164 struct list_head device_list, *device_list_handle = NULL;
5165 bool job_signaled = false;
5166 struct amdgpu_hive_info *hive = NULL;
5167 struct amdgpu_device *tmp_adev = NULL;
5168 int i, r = 0;
5169 bool need_emergency_restart = false;
5170 bool audio_suspended = false;
5171 bool gpu_reset_for_dev_remove = false;
5172
5173 gpu_reset_for_dev_remove =
5174 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5175 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5176
5177 /*
5178 * Special case: RAS triggered and full reset isn't supported
5179 */
5180 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5181
5182 /*
5183 * Flush RAM to disk so that after reboot
5184 * the user can read log and see why the system rebooted.
5185 */
5186 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5187 DRM_WARN("Emergency reboot.");
5188
5189 ksys_sync_helper();
5190 emergency_restart();
5191 }
5192
5193 dev_info(adev->dev, "GPU %s begin!\n",
5194 need_emergency_restart ? "jobs stop":"reset");
5195
5196 if (!amdgpu_sriov_vf(adev))
5197 hive = amdgpu_get_xgmi_hive(adev);
5198 if (hive)
5199 mutex_lock(&hive->hive_lock);
5200
5201 reset_context->job = job;
5202 reset_context->hive = hive;
5203 /*
5204 * Build list of devices to reset.
5205 * In case we are in XGMI hive mode, resort the device list
5206 * to put adev in the 1st position.
5207 */
5208 INIT_LIST_HEAD(&device_list);
5209 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5210 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5211 list_add_tail(&tmp_adev->reset_list, &device_list);
5212 if (gpu_reset_for_dev_remove && adev->shutdown)
5213 tmp_adev->shutdown = true;
5214 }
5215 if (!list_is_first(&adev->reset_list, &device_list))
5216 list_rotate_to_front(&adev->reset_list, &device_list);
5217 device_list_handle = &device_list;
5218 } else {
5219 list_add_tail(&adev->reset_list, &device_list);
5220 device_list_handle = &device_list;
5221 }
5222
5223 /* We need to lock reset domain only once both for XGMI and single device */
5224 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5225 reset_list);
5226 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5227
5228 /* block all schedulers and reset given job's ring */
5229 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5230
5231 amdgpu_device_set_mp1_state(tmp_adev);
5232
5233 /*
5234 * Try to put the audio codec into the suspend state
5235 * before the gpu reset starts.
5236 *
5237 * Because the power domain of the graphics device
5238 * is shared with the AZ power domain, without this
5239 * we may change the audio hardware from behind
5240 * the audio driver's back. That will trigger
5241 * some audio codec errors.
5242 */
5243 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5244 audio_suspended = true;
5245
5246 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5247
5248 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5249
5250 if (!amdgpu_sriov_vf(tmp_adev))
5251 amdgpu_amdkfd_pre_reset(tmp_adev);
5252
5253 /*
5254 * Mark these ASICs to be reset as untracked first
5255 * and add them back after the reset completes
5256 */
5257 amdgpu_unregister_gpu_instance(tmp_adev);
5258
5259 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5260
5261 /* disable ras on ALL IPs */
5262 if (!need_emergency_restart &&
5263 amdgpu_device_ip_need_full_reset(tmp_adev))
5264 amdgpu_ras_suspend(tmp_adev);
5265
5266 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5267 struct amdgpu_ring *ring = tmp_adev->rings[i];
5268
5269 if (!ring || !ring->sched.thread)
5270 continue;
5271
5272 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5273
5274 if (need_emergency_restart)
5275 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5276 }
5277 atomic_inc(&tmp_adev->gpu_reset_counter);
5278 }
5279
5280 if (need_emergency_restart)
5281 goto skip_sched_resume;
5282
5283 /*
5284 * Must check guilty signal here since after this point all old
5285 * HW fences are force signaled.
5286 *
5287 * job->base holds a reference to parent fence
5288 */
5289 if (job && dma_fence_is_signaled(&job->hw_fence)) {
5290 job_signaled = true;
5291 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5292 goto skip_hw_reset;
5293 }
5294
5295 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5296 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5297 if (gpu_reset_for_dev_remove) {
5298 /* Workaround for ASICs that need to disable SMC first */
5299 amdgpu_device_smu_fini_early(tmp_adev);
5300 }
5301 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5302 /* TODO: Should we stop? */
5303 if (r) {
5304 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5305 r, adev_to_drm(tmp_adev)->unique);
5306 tmp_adev->asic_reset_res = r;
5307 }
5308
5309 /*
5310 * Drop all pending non-scheduler resets. Scheduler resets
5311 * were already dropped during drm_sched_stop
5312 */
5313 amdgpu_device_stop_pending_resets(tmp_adev);
5314 }
5315
5316 /* Actual ASIC resets if needed.*/
5317 /* Host driver will handle XGMI hive reset for SRIOV */
5318 if (amdgpu_sriov_vf(adev)) {
5319 r = amdgpu_device_reset_sriov(adev, !job);
5320 if (r)
5321 adev->asic_reset_res = r;
5322
5323 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so we need to resume RAS during reset */
5324 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5325 adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5326 amdgpu_ras_resume(adev);
5327 } else {
5328 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5329 if (r == -EAGAIN)
5330 goto retry;
5331
5332 if (!r && gpu_reset_for_dev_remove)
5333 goto recover_end;
5334 }
5335
5336 skip_hw_reset:
5337
5338 /* Post ASIC reset for all devs. */
5339 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5340
5341 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5342 struct amdgpu_ring *ring = tmp_adev->rings[i];
5343
5344 if (!ring || !ring->sched.thread)
5345 continue;
5346
5347 drm_sched_start(&ring->sched, true);
5348 }
5349
5350 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5351 amdgpu_mes_self_test(tmp_adev);
5352
5353 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5354 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5355
5356 if (tmp_adev->asic_reset_res)
5357 r = tmp_adev->asic_reset_res;
5358
5359 tmp_adev->asic_reset_res = 0;
5360
5361 if (r) {
5362 /* bad news, how do we tell userspace? */
5363 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5364 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5365 } else {
5366 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5367 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5368 DRM_WARN("smart shift update failed\n");
5369 }
5370 }
5371
5372 skip_sched_resume:
5373 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5374 /* unlock kfd: SRIOV would do it separately */
5375 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5376 amdgpu_amdkfd_post_reset(tmp_adev);
5377
5378 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5379 * so we need to bring up kfd here if it was not initialized before
5380 */
5381 if (!adev->kfd.init_complete)
5382 amdgpu_amdkfd_device_init(adev);
5383
5384 if (audio_suspended)
5385 amdgpu_device_resume_display_audio(tmp_adev);
5386
5387 amdgpu_device_unset_mp1_state(tmp_adev);
5388
5389 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5390 }
5391
5392 recover_end:
5393 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5394 reset_list);
5395 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5396
5397 if (hive) {
5398 mutex_unlock(&hive->hive_lock);
5399 amdgpu_put_xgmi_hive(hive);
5400 }
5401
5402 if (r)
5403 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5404
5405 atomic_set(&adev->reset_domain->reset_res, r);
5406 return r;
5407 }
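/*
 * Illustrative sketch (an assumption, not taken from this file): the recovery
 * path above is normally entered from the scheduler timeout handler in
 * amdgpu_job.c, roughly as:
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
 *
 * The same reset_context layout is used by amdgpu_pci_slot_reset() below.
 */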
5408
5409 /**
5410 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5411 *
5412 * @adev: amdgpu_device pointer
5413 *
5414 * Fetches and stores in the driver the PCIE capabilities (gen speed
5415 * and lanes) of the slot the device is in. Handles APUs and
5416 * virtualized environments where PCIE config space may not be available.
5417 */
5418 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5419 {
5420 struct pci_dev *pdev;
5421 enum pci_bus_speed speed_cap, platform_speed_cap;
5422 enum pcie_link_width platform_link_width;
5423
5424 if (amdgpu_pcie_gen_cap)
5425 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5426
5427 if (amdgpu_pcie_lane_cap)
5428 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5429
5430 /* covers APUs as well */
5431 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5432 if (adev->pm.pcie_gen_mask == 0)
5433 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5434 if (adev->pm.pcie_mlw_mask == 0)
5435 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5436 return;
5437 }
5438
5439 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5440 return;
5441
5442 pcie_bandwidth_available(adev->pdev, NULL,
5443 &platform_speed_cap, &platform_link_width);
5444
5445 if (adev->pm.pcie_gen_mask == 0) {
5446 /* asic caps */
5447 pdev = adev->pdev;
5448 speed_cap = pcie_get_speed_cap(pdev);
5449 if (speed_cap == PCI_SPEED_UNKNOWN) {
5450 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5451 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5452 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5453 } else {
5454 if (speed_cap == PCIE_SPEED_32_0GT)
5455 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5456 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5457 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5458 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5459 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5460 else if (speed_cap == PCIE_SPEED_16_0GT)
5461 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5462 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5463 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5464 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5465 else if (speed_cap == PCIE_SPEED_8_0GT)
5466 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5467 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5468 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5469 else if (speed_cap == PCIE_SPEED_5_0GT)
5470 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5471 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5472 else
5473 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5474 }
5475 /* platform caps */
5476 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5477 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5478 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5479 } else {
5480 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5481 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5482 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5483 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5484 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5485 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5486 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5487 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5488 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5489 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5490 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5491 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5492 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5493 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5494 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5495 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5496 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5497 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5498 else
5499 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5500
5501 }
5502 }
5503 if (adev->pm.pcie_mlw_mask == 0) {
5504 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5505 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5506 } else {
5507 switch (platform_link_width) {
5508 case PCIE_LNK_X32:
5509 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5510 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5511 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5512 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5513 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5514 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5515 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5516 break;
5517 case PCIE_LNK_X16:
5518 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5519 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5520 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5521 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5522 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5523 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5524 break;
5525 case PCIE_LNK_X12:
5526 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5527 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5528 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5529 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5530 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5531 break;
5532 case PCIE_LNK_X8:
5533 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5537 break;
5538 case PCIE_LNK_X4:
5539 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5540 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5541 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5542 break;
5543 case PCIE_LNK_X2:
5544 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5546 break;
5547 case PCIE_LNK_X1:
5548 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5549 break;
5550 default:
5551 break;
5552 }
5553 }
5554 }
5555 }
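/*
 * Illustrative sketch (an assumption, not part of this function): consumers
 * of the masks computed above typically test them against the CAIL_* flags:
 *
 *	bool gen4_ok = adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4;
 *
 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap overrides checked at the
 * top of this function allow both masks to be forced manually instead of
 * being derived from the ASIC and platform capabilities.
 */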
5556
5557 /**
5558 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5559 *
5560 * @adev: amdgpu_device pointer
5561 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5562 *
5563 * Return true if @peer_adev can access (DMA) @adev through the PCIe
5564 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5565 * @peer_adev.
5566 */
5567 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5568 struct amdgpu_device *peer_adev)
5569 {
5570 #ifdef CONFIG_HSA_AMD_P2P
5571 uint64_t address_mask = peer_adev->dev->dma_mask ?
5572 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5573 resource_size_t aper_limit =
5574 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5575 bool p2p_access =
5576 !adev->gmc.xgmi.connected_to_cpu &&
5577 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5578
5579 return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5580 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5581 !(adev->gmc.aper_base & address_mask ||
5582 aper_limit & address_mask));
5583 #else
5584 return false;
5585 #endif
5586 }
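/*
 * Illustrative usage sketch (an assumption, not taken from this file): code
 * setting up peer-to-peer DMA would typically require the check in both
 * directions before taking the P2P path:
 *
 *	bool p2p_ok = amdgpu_device_is_peer_accessible(adev, peer_adev) &&
 *		      amdgpu_device_is_peer_accessible(peer_adev, adev);
 *
 * When p2p_ok is false, callers fall back to staging copies through system
 * memory instead of mapping the peer's VRAM BAR directly.
 */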
5587
5588 int amdgpu_device_baco_enter(struct drm_device *dev)
5589 {
5590 struct amdgpu_device *adev = drm_to_adev(dev);
5591 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5592
5593 if (!amdgpu_device_supports_baco(dev))
5594 return -ENOTSUPP;
5595
5596 if (ras && adev->ras_enabled &&
5597 adev->nbio.funcs->enable_doorbell_interrupt)
5598 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5599
5600 return amdgpu_dpm_baco_enter(adev);
5601 }
5602
5603 int amdgpu_device_baco_exit(struct drm_device *dev)
5604 {
5605 struct amdgpu_device *adev = drm_to_adev(dev);
5606 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5607 int ret = 0;
5608
5609 if (!amdgpu_device_supports_baco(dev))
5610 return -ENOTSUPP;
5611
5612 ret = amdgpu_dpm_baco_exit(adev);
5613 if (ret)
5614 return ret;
5615
5616 if (ras && adev->ras_enabled &&
5617 adev->nbio.funcs->enable_doorbell_interrupt)
5618 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5619
5620 if (amdgpu_passthrough(adev) &&
5621 adev->nbio.funcs->clear_doorbell_interrupt)
5622 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5623
5624 return 0;
5625 }
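/*
 * Illustrative sketch (an assumption): BACO entry and exit are meant to be
 * used as a pair around a power-off window, e.g. in a runtime-suspend style
 * path:
 *
 *	r = amdgpu_device_baco_enter(dev);
 *	if (r)
 *		return r;
 *	... device sits in BACO while powered down ...
 *	r = amdgpu_device_baco_exit(dev);
 *
 * Both helpers return -ENOTSUPP when amdgpu_device_supports_baco() is false,
 * so callers are expected to choose a different power-down method then.
 */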
5626
5627 /**
5628 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5629 * @pdev: PCI device struct
5630 * @state: PCI channel state
5631 *
5632 * Description: Called when a PCI error is detected.
5633 *
5634 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5635 */
5636 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5637 {
5638 struct drm_device *dev = pci_get_drvdata(pdev);
5639 struct amdgpu_device *adev = drm_to_adev(dev);
5640 int i;
5641
5642 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5643
5644 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5645 DRM_WARN("No support for XGMI hive yet...");
5646 return PCI_ERS_RESULT_DISCONNECT;
5647 }
5648
5649 adev->pci_channel_state = state;
5650
5651 switch (state) {
5652 case pci_channel_io_normal:
5653 return PCI_ERS_RESULT_CAN_RECOVER;
5654 /* Fatal error, prepare for slot reset */
5655 case pci_channel_io_frozen:
5656 /*
5657 * Locking adev->reset_domain->sem will prevent any external access
5658 * to GPU during PCI error recovery
5659 */
5660 amdgpu_device_lock_reset_domain(adev->reset_domain);
5661 amdgpu_device_set_mp1_state(adev);
5662
5663 /*
5664 * Block any work scheduling as we do for regular GPU reset
5665 * for the duration of the recovery
5666 */
5667 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5668 struct amdgpu_ring *ring = adev->rings[i];
5669
5670 if (!ring || !ring->sched.thread)
5671 continue;
5672
5673 drm_sched_stop(&ring->sched, NULL);
5674 }
5675 atomic_inc(&adev->gpu_reset_counter);
5676 return PCI_ERS_RESULT_NEED_RESET;
5677 case pci_channel_io_perm_failure:
5678 /* Permanent error, prepare for device removal */
5679 return PCI_ERS_RESULT_DISCONNECT;
5680 }
5681
5682 return PCI_ERS_RESULT_NEED_RESET;
5683 }
5684
5685 /**
5686 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5687 * @pdev: pointer to PCI device
5688 */
5689 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5690 {
5691
5692 DRM_INFO("PCI error: mmio enabled callback!!\n");
5693
5694 /* TODO - dump whatever for debugging purposes */
5695
5696 /* This is called only if amdgpu_pci_error_detected returns
5697 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5698 * works, so there is no need to reset the slot.
5699 */
5700
5701 return PCI_ERS_RESULT_RECOVERED;
5702 }
5703
5704 /**
5705 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5706 * @pdev: PCI device struct
5707 *
5708 * Description: This routine is called by the pci error recovery
5709 * code after the PCI slot has been reset, just before we
5710 * should resume normal operations.
5711 */
5712 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5713 {
5714 struct drm_device *dev = pci_get_drvdata(pdev);
5715 struct amdgpu_device *adev = drm_to_adev(dev);
5716 int r, i;
5717 struct amdgpu_reset_context reset_context;
5718 u32 memsize;
5719 struct list_head device_list;
5720
5721 DRM_INFO("PCI error: slot reset callback!!\n");
5722
5723 memset(&reset_context, 0, sizeof(reset_context));
5724
5725 INIT_LIST_HEAD(&device_list);
5726 list_add_tail(&adev->reset_list, &device_list);
5727
5728 /* wait for asic to come out of reset */
5729 msleep(500);
5730
5731 /* Restore PCI config space */
5732 amdgpu_device_load_pci_state(pdev);
5733
5734 /* confirm ASIC came out of reset */
5735 for (i = 0; i < adev->usec_timeout; i++) {
5736 memsize = amdgpu_asic_get_config_memsize(adev);
5737
5738 if (memsize != 0xffffffff)
5739 break;
5740 udelay(1);
5741 }
5742 if (memsize == 0xffffffff) {
5743 r = -ETIME;
5744 goto out;
5745 }
5746
5747 reset_context.method = AMD_RESET_METHOD_NONE;
5748 reset_context.reset_req_dev = adev;
5749 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5750 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5751
5752 adev->no_hw_access = true;
5753 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5754 adev->no_hw_access = false;
5755 if (r)
5756 goto out;
5757
5758 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5759
5760 out:
5761 if (!r) {
5762 if (amdgpu_device_cache_pci_state(adev->pdev))
5763 pci_restore_state(adev->pdev);
5764
5765 DRM_INFO("PCIe error recovery succeeded\n");
5766 } else {
5767 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5768 amdgpu_device_unset_mp1_state(adev);
5769 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5770 }
5771
5772 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5773 }
5774
5775 /**
5776 * amdgpu_pci_resume() - resume normal ops after PCI reset
5777 * @pdev: pointer to PCI device
5778 *
5779 * Called when the error recovery driver tells us that it's
5780 * OK to resume normal operation.
5781 */
5782 void amdgpu_pci_resume(struct pci_dev *pdev)
5783 {
5784 struct drm_device *dev = pci_get_drvdata(pdev);
5785 struct amdgpu_device *adev = drm_to_adev(dev);
5786 int i;
5787
5788
5789 DRM_INFO("PCI error: resume callback!!\n");
5790
5791 /* Only continue execution for the case of pci_channel_io_frozen */
5792 if (adev->pci_channel_state != pci_channel_io_frozen)
5793 return;
5794
5795 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5796 struct amdgpu_ring *ring = adev->rings[i];
5797
5798 if (!ring || !ring->sched.thread)
5799 continue;
5800
5801 drm_sched_start(&ring->sched, true);
5802 }
5803
5804 amdgpu_device_unset_mp1_state(adev);
5805 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5806 }
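/*
 * The four PCI error callbacks above are handed to the PCI core through a
 * struct pci_error_handlers table in the PCI driver (see amdgpu_drv.c); a
 * sketch of that wiring looks roughly like:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */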
5807
5808 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5809 {
5810 struct drm_device *dev = pci_get_drvdata(pdev);
5811 struct amdgpu_device *adev = drm_to_adev(dev);
5812 int r;
5813
5814 r = pci_save_state(pdev);
5815 if (!r) {
5816 kfree(adev->pci_state);
5817
5818 adev->pci_state = pci_store_saved_state(pdev);
5819
5820 if (!adev->pci_state) {
5821 DRM_ERROR("Failed to store PCI saved state");
5822 return false;
5823 }
5824 } else {
5825 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5826 return false;
5827 }
5828
5829 return true;
5830 }
5831
5832 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5833 {
5834 struct drm_device *dev = pci_get_drvdata(pdev);
5835 struct amdgpu_device *adev = drm_to_adev(dev);
5836 int r;
5837
5838 if (!adev->pci_state)
5839 return false;
5840
5841 r = pci_load_saved_state(pdev, adev->pci_state);
5842
5843 if (!r) {
5844 pci_restore_state(pdev);
5845 } else {
5846 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5847 return false;
5848 }
5849
5850 return true;
5851 }
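/*
 * Illustrative sketch (an assumption): the cache/load helpers are used as a
 * pair around events that clobber PCI config space:
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);
 *	... ASIC reset or PCI slot reset clobbers config space ...
 *	amdgpu_device_load_pci_state(adev->pdev);
 *
 * amdgpu_pci_slot_reset() above shows the restore half of this pattern.
 */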
5852
5853 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5854 struct amdgpu_ring *ring)
5855 {
5856 #ifdef CONFIG_X86_64
5857 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5858 return;
5859 #endif
5860 if (adev->gmc.xgmi.connected_to_cpu)
5861 return;
5862
5863 if (ring && ring->funcs->emit_hdp_flush)
5864 amdgpu_ring_emit_hdp_flush(ring);
5865 else
5866 amdgpu_asic_flush_hdp(adev, ring);
5867 }
5868
5869 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5870 struct amdgpu_ring *ring)
5871 {
5872 #ifdef CONFIG_X86_64
5873 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5874 return;
5875 #endif
5876 if (adev->gmc.xgmi.connected_to_cpu)
5877 return;
5878
5879 amdgpu_asic_invalidate_hdp(adev, ring);
5880 }
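/*
 * Illustrative sketch (an assumption): the two HDP helpers bracket CPU access
 * to VRAM through the BAR:
 *
 *	... CPU writes to a buffer in visible VRAM ...
 *	amdgpu_device_flush_hdp(adev, ring);
 *
 *	amdgpu_device_invalidate_hdp(adev, ring);
 *	... CPU reads back data the GPU just wrote ...
 *
 * flush_hdp makes the CPU writes visible to the GPU; invalidate_hdp drops
 * stale HDP cache lines before the CPU reads data the GPU produced. Both are
 * no-ops on bare-metal x86-64 APUs and on GPUs connected to the CPU via XGMI,
 * where the HDP cache is not in the access path.
 */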
5881
5882 int amdgpu_in_reset(struct amdgpu_device *adev)
5883 {
5884 return atomic_read(&adev->reset_domain->in_gpu_reset);
5885 }
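/*
 * Illustrative sketch (an assumption): amdgpu_in_reset() is the cheap check
 * used by paths that must not touch the hardware while a reset is in flight:
 *
 *	if (amdgpu_in_reset(adev))
 *		return;
 *
 * For actual exclusion, rather than a momentary snapshot, callers take
 * adev->reset_domain->sem for reading instead (see the locking comment in
 * amdgpu_pci_error_detected() above).
 */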
5886
5887 /**
5888 * amdgpu_device_halt() - bring hardware to some kind of halt state
5889 *
5890 * @adev: amdgpu_device pointer
5891 *
5892 * Bring hardware to some kind of halt state so that no one can touch it
5893 * any more. This helps maintain the error context when an error occurs.
5894 * Compared to a simple hang, the system stays stable at least for SSH
5895 * access. It should then be trivial to inspect the hardware state and
5896 * see what's going on. Implemented as follows:
5897 *
5898 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
5899 * clears all CPU mappings to device, disallows remappings through page faults
5900 * 2. amdgpu_irq_disable_all() disables all interrupts
5901 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5902 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5903 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5904 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5905 * flush any in-flight DMA operations
5906 */
5907 void amdgpu_device_halt(struct amdgpu_device *adev)
5908 {
5909 struct pci_dev *pdev = adev->pdev;
5910 struct drm_device *ddev = adev_to_drm(adev);
5911
5912 amdgpu_xcp_dev_unplug(adev);
5913 drm_dev_unplug(ddev);
5914
5915 amdgpu_irq_disable_all(adev);
5916
5917 amdgpu_fence_driver_hw_fini(adev);
5918
5919 adev->no_hw_access = true;
5920
5921 amdgpu_device_unmap_mmio(adev);
5922
5923 pci_disable_device(pdev);
5924 pci_wait_for_pending_transaction(pdev);
5925 }
5926
5927 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5928 u32 reg)
5929 {
5930 unsigned long flags, address, data;
5931 u32 r;
5932
5933 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5934 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5935
5936 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5937 WREG32(address, reg * 4);
5938 (void)RREG32(address);
5939 r = RREG32(data);
5940 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5941 return r;
5942 }
5943
5944 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5945 u32 reg, u32 v)
5946 {
5947 unsigned long flags, address, data;
5948
5949 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5950 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5951
5952 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5953 WREG32(address, reg * 4);
5954 (void)RREG32(address);
5955 WREG32(data, v);
5956 (void)RREG32(data);
5957 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5958 }
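/*
 * Illustrative sketch (an assumption, with hypothetical register and bit
 * names): the two indirect accessors above combine into the usual
 * read-modify-write pattern:
 *
 *	u32 tmp = amdgpu_device_pcie_port_rreg(adev, some_port_reg);
 *	tmp |= SOME_ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, some_port_reg, tmp);
 *
 * Note that the index written is reg * 4, i.e. callers pass a dword register
 * number rather than a byte offset.
 */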
5959
5960 /**
5961 * amdgpu_device_switch_gang - switch to a new gang
5962 * @adev: amdgpu_device pointer
5963 * @gang: the gang to switch to
5964 *
5965 * Try to switch to a new gang.
5966 * Returns: NULL if we switched to the new gang or a reference to the current
5967 * gang leader.
5968 */
5969 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
5970 struct dma_fence *gang)
5971 {
5972 struct dma_fence *old = NULL;
5973
5974 do {
5975 dma_fence_put(old);
5976 rcu_read_lock();
5977 old = dma_fence_get_rcu_safe(&adev->gang_submit);
5978 rcu_read_unlock();
5979
5980 if (old == gang)
5981 break;
5982
5983 if (!dma_fence_is_signaled(old))
5984 return old;
5985
5986 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
5987 old, gang) != old);
5988
5989 dma_fence_put(old);
5990 return NULL;
5991 }
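/*
 * Illustrative usage sketch (an assumption, not the only possible caller
 * pattern): a submitter can retry the switch until the previous gang leader
 * has signaled, waiting on the fence returned here:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang_fence))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 *
 * A NULL return means adev->gang_submit now points at the new gang.
 */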
5992
5993 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
5994 {
5995 switch (adev->asic_type) {
5996 #ifdef CONFIG_DRM_AMDGPU_SI
5997 case CHIP_HAINAN:
5998 #endif
5999 case CHIP_TOPAZ:
6000 /* chips with no display hardware */
6001 return false;
6002 #ifdef CONFIG_DRM_AMDGPU_SI
6003 case CHIP_TAHITI:
6004 case CHIP_PITCAIRN:
6005 case CHIP_VERDE:
6006 case CHIP_OLAND:
6007 #endif
6008 #ifdef CONFIG_DRM_AMDGPU_CIK
6009 case CHIP_BONAIRE:
6010 case CHIP_HAWAII:
6011 case CHIP_KAVERI:
6012 case CHIP_KABINI:
6013 case CHIP_MULLINS:
6014 #endif
6015 case CHIP_TONGA:
6016 case CHIP_FIJI:
6017 case CHIP_POLARIS10:
6018 case CHIP_POLARIS11:
6019 case CHIP_POLARIS12:
6020 case CHIP_VEGAM:
6021 case CHIP_CARRIZO:
6022 case CHIP_STONEY:
6023 /* chips with display hardware */
6024 return true;
6025 default:
6026 /* IP discovery */
6027 if (!adev->ip_versions[DCE_HWIP][0] ||
6028 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6029 return false;
6030 return true;
6031 }
6032 }
6033
6034 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6035 uint32_t inst, uint32_t reg_addr, char reg_name[],
6036 uint32_t expected_value, uint32_t mask)
6037 {
6038 uint32_t ret = 0;
6039 uint32_t old_ = 0;
6040 uint32_t tmp_ = RREG32(reg_addr);
6041 uint32_t loop = adev->usec_timeout;
6042
6043 while ((tmp_ & (mask)) != (expected_value)) {
6044 if (old_ != tmp_) {
6045 loop = adev->usec_timeout;
6046 old_ = tmp_;
6047 } else
6048 udelay(1);
6049 tmp_ = RREG32(reg_addr);
6050 loop--;
6051 if (!loop) {
6052 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6053 inst, reg_name, (uint32_t)expected_value,
6054 (uint32_t)(tmp_ & (mask)));
6055 ret = -ETIMEDOUT;
6056 break;
6057 }
6058 }
6059 return ret;
6060 }
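/*
 * Illustrative call sketch (an assumption, with hypothetical register and
 * mask names):
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, SOME_STATUS_REG_OFFSET,
 *				       "SOME_STATUS_REG",
 *				       SOME_READY_BIT, SOME_READY_BIT);
 *	if (r)
 *		return r;
 *
 * A non-zero return (-ETIMEDOUT) means the masked value never reached
 * expected_value. The timeout window restarts whenever the register value
 * changes, so the wait only gives up after adev->usec_timeout consecutive
 * reads without progress.
 */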
6061