1  /*
2   * SPDX-License-Identifier: MIT
3   *
4   * Copyright © 2008-2018 Intel Corporation
5   */
6  
7  #ifndef _I915_GPU_ERROR_H_
8  #define _I915_GPU_ERROR_H_
9  
10  #include <linux/atomic.h>
11  #include <linux/kref.h>
12  #include <linux/ktime.h>
13  #include <linux/sched.h>
14  
15  #include <drm/drm_mm.h>
16  
17  #include "display/intel_display_device.h"
18  #include "gt/intel_engine.h"
19  #include "gt/intel_gt_types.h"
20  #include "gt/uc/intel_uc_fw.h"
21  
22  #include "intel_device_info.h"
23  
24  #include "i915_gem.h"
25  #include "i915_gem_gtt.h"
26  #include "i915_params.h"
27  #include "i915_scheduler.h"
28  
29  struct drm_i915_private;
30  struct i915_vma_compress;
31  struct intel_engine_capture_vma;
32  struct intel_overlay_error_state;
33  
/*
 * Snapshot of a single GPU buffer (vma) taken at error-capture time.
 * Coredumps are chained into a singly-linked list via @next.
 */
struct i915_vma_coredump {
	struct i915_vma_coredump *next;

	/* Human-readable identifier for this buffer in the error dump */
	char name[20];

	/* Placement of the vma within the GTT */
	u64 gtt_offset;
	u64 gtt_size;
	u32 gtt_page_sizes;

	int unused;
	/* Captured copies of the buffer's pages */
	struct list_head page_list;
};
46  
/*
 * Snapshot of a single request (head/tail position in the ring, fence
 * context/seqno and scheduling attributes) at error-capture time.
 */
struct i915_request_coredump {
	unsigned long flags;
	pid_t pid;	/* pid associated with the request's context */
	u32 context;
	u32 seqno;
	u32 head;
	u32 tail;
	struct i915_sched_attr sched_attr;
};
56  
57  struct __guc_capture_parsed_output;
58  
/*
 * Per-engine error state: a snapshot of the engine's registers, the
 * context that was running, the requests found in the execlist ports and
 * any captured buffers. Engine dumps are chained via @next underneath an
 * intel_gt_coredump.
 */
struct intel_engine_coredump {
	const struct intel_engine_cs *engine;

	bool hung;	/* was this the engine that triggered the capture? */
	bool simulated;
	u32 reset_count;

	/* position of active request inside the ring */
	u32 rq_head, rq_post, rq_tail;

	/* Register state */
	u32 ccid;
	u32 start;
	u32 tail;
	u32 head;
	u32 ctl;
	u32 mode;
	u32 hws;
	u32 ipeir;
	u32 ipehr;
	u32 esr;
	u32 bbstate;
	u32 instpm;
	u32 instps;
	u64 bbaddr;
	u64 acthd;
	u32 fault_reg;
	u64 faddr;
	u32 rc_psmi; /* sleep state */
	u32 nopid;
	u32 excc;
	u32 cmd_cctl;
	u32 cscmdop;
	u32 ctx_sr_ctl;
	u32 dma_faddr_hi;
	u32 dma_faddr_lo;
	struct intel_instdone instdone;

	/* GuC matched capture-lists info */
	struct intel_guc_state_capture *guc_capture;
	struct __guc_capture_parsed_output *guc_capture_node;

	/* Snapshot of the GEM context active on this engine */
	struct i915_gem_context_coredump {
		char comm[TASK_COMM_LEN];	/* name of the owning task */

		u64 total_runtime;
		u64 avg_runtime;

		pid_t pid;
		int active;
		int guilty;
		struct i915_sched_attr sched_attr;
		u32 hwsp_seqno;
	} context;

	/* List of buffers captured for this engine */
	struct i915_vma_coredump *vma;

	/* Requests found in the execlist ports at capture time */
	struct i915_request_coredump execlist[EXECLIST_MAX_PORTS];
	unsigned int num_ports;

	/* Page-table state of the active VM */
	struct {
		u32 gfx_mode;
		union {
			u64 pdp[4];
			u32 pp_dir_base;
		};
	} vm_info;

	struct intel_engine_coredump *next;
};
129  
/*
 * Snapshot of one GuC CTB (head/tail/status plus the offsets of its
 * descriptor and command buffer within the captured CTB vma).
 */
struct intel_ctb_coredump {
	u32 raw_head, head;
	u32 raw_tail, tail;
	u32 raw_status;
	u32 desc_offset;
	u32 cmds_offset;
	u32 size;
};
138  
/*
 * Per-GT error state: global/interrupt/fault registers, fence registers,
 * the list of per-engine dumps and (optionally) GuC/HuC firmware state.
 * GT dumps are chained via @next underneath an i915_gpu_coredump.
 */
struct intel_gt_coredump {
	const struct intel_gt *_gt;
	bool awake;
	bool simulated;

	struct intel_gt_info info;

	/* Generic register state */
	u32 eir;
	u32 pgtbl_er;
	u32 ier;
	u32 gtier[6], ngtier;	/* ngtier = number of valid gtier[] entries */
	u32 forcewake;
	u32 error; /* gen6+ */
	u32 err_int; /* gen7 */
	u32 fault_data0; /* gen8, gen9 */
	u32 fault_data1; /* gen8, gen9 */
	u32 done_reg;
	u32 gac_eco;
	u32 gam_ecochk;
	u32 gab_ctl;
	u32 gfx_mode;
	u32 gtt_cache;
	u32 aux_err; /* gen12 */
	u32 gam_done; /* gen12 */
	u32 clock_frequency;
	u32 clock_period_ns;

	/* Display related */
	u32 derrmr;
	u32 sfc_done[I915_MAX_SFC]; /* gen12 */

	/* Fence registers: nfence = number of valid fence[] entries */
	u32 nfence;
	u64 fence[I915_MAX_NUM_FENCES];

	/* Head of the per-engine dump list */
	struct intel_engine_coredump *engine;

	/* Microcontroller state; NULL if not captured */
	struct intel_uc_coredump {
		struct intel_uc_fw guc_fw;
		struct intel_uc_fw huc_fw;
		struct guc_info {
			struct intel_ctb_coredump ctb[2];
			struct i915_vma_coredump *vma_ctb;
			struct i915_vma_coredump *vma_log;
			u32 timestamp;
			u16 last_fence;
			bool is_guc_capture;
		} guc;
	} *uc;

	struct intel_gt_coredump *next;
};
191  
/*
 * Top-level GPU error-state snapshot. Reference counted via @ref
 * (i915_gpu_coredump_get/put); the formatted dump is kept in the @sgl
 * scatterlist and read out with i915_gpu_coredump_copy_to_buffer().
 */
struct i915_gpu_coredump {
	struct kref ref;
	/* Timestamps of the capture on several clock bases */
	ktime_t time;
	ktime_t boottime;
	ktime_t uptime;
	unsigned long capture;

	struct drm_i915_private *i915;

	/* Head of the per-GT dump list */
	struct intel_gt_coredump *gt;

	char error_msg[128];
	bool simulated;
	bool wakelock;
	bool suspended;
	int iommu;
	u32 reset_count;
	u32 suspend_count;

	/* Device/driver identification captured alongside the error */
	struct intel_device_info device_info;
	struct intel_runtime_info runtime_info;
	struct intel_display_device_info display_device_info;
	struct intel_display_runtime_info display_runtime_info;
	struct intel_driver_caps driver_caps;
	struct i915_params params;

	struct intel_overlay_error_state *overlay;

	/* Formatted dump; @fit caches the last chunk found by readout */
	struct scatterlist *sgl, *fit;
};
222  
/*
 * Per-device error bookkeeping: the first captured error state and
 * global/per-engine reset counters.
 */
struct i915_gpu_error {
	/* For reset and error_state handling. */
	spinlock_t lock;
	/* Protected by the above dev->gpu_error.lock. */
	struct i915_gpu_coredump *first_error;

	atomic_t pending_fb_pin;

	/** Number of times the device has been reset (global) */
	atomic_t reset_count;

	/** Number of times an engine has been reset */
	atomic_t reset_engine_count[I915_NUM_ENGINES];
};
237  
/*
 * Output buffer used while formatting an error state into scatterlist
 * pages; @err latches the first failure encountered while printing.
 */
struct drm_i915_error_state_buf {
	struct drm_i915_private *i915;
	struct scatterlist *sgl, *cur, *end;

	char *buf;	/* current page being filled */
	size_t bytes;	/* bytes used within @buf */
	size_t size;	/* capacity of @buf */
	loff_t iter;

	int err;
};
249  
/* Return the global (full device) reset count. */
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
	return atomic_read(&error->reset_count);
}
254  
/*
 * Return the reset count for @engine; counters are tracked per uabi
 * engine class, not per engine instance.
 */
static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
					  const struct intel_engine_cs *engine)
{
	return atomic_read(&error->reset_engine_count[engine->uabi_class]);
}
260  
261  #define CORE_DUMP_FLAG_NONE           0x0
262  #define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)
263  
/*
 * Write an error capture to the kernel log; compiled to a no-op unless
 * both error capture and GEM debugging are enabled.
 */
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) && IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
void intel_klog_error_capture(struct intel_gt *gt,
			      intel_engine_mask_t engine_mask);
#else
static inline void intel_klog_error_capture(struct intel_gt *gt,
					    intel_engine_mask_t engine_mask)
{
}
#endif
273  
274  #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
275  
276  __printf(2, 3)
277  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
278  void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m,
279  			       const struct intel_engine_cs *engine,
280  			       const struct i915_vma_coredump *vma);
281  struct i915_vma_coredump *
282  intel_gpu_error_find_batch(const struct intel_engine_coredump *ee);
283  
284  struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
285  					    intel_engine_mask_t engine_mask, u32 dump_flags);
286  void i915_capture_error_state(struct intel_gt *gt,
287  			      intel_engine_mask_t engine_mask, u32 dump_flags);
288  
289  struct i915_gpu_coredump *
290  i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
291  
292  struct intel_gt_coredump *
293  intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags);
294  
295  struct intel_engine_coredump *
296  intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags);
297  
298  struct intel_engine_capture_vma *
299  intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
300  				  struct i915_request *rq,
301  				  gfp_t gfp);
302  
303  void intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
304  				   struct intel_engine_capture_vma *capture,
305  				   struct i915_vma_compress *compress);
306  
307  struct i915_vma_compress *
308  i915_vma_capture_prepare(struct intel_gt_coredump *gt);
309  
310  void i915_vma_capture_finish(struct intel_gt_coredump *gt,
311  			     struct i915_vma_compress *compress);
312  
313  void i915_error_state_store(struct i915_gpu_coredump *error);
314  
/* Acquire an extra reference on @gpu; release with i915_gpu_coredump_put(). */
static inline struct i915_gpu_coredump *
i915_gpu_coredump_get(struct i915_gpu_coredump *gpu)
{
	kref_get(&gpu->ref);
	return gpu;
}
321  
322  ssize_t
323  i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
324  				 char *buf, loff_t offset, size_t count);
325  
void __i915_gpu_coredump_free(struct kref *kref);
/* Drop a reference on @gpu, freeing it on the last put; NULL is a no-op. */
static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
	if (gpu)
		kref_put(&gpu->ref, __i915_gpu_coredump_free);
}
332  
333  struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);
334  void i915_reset_error_state(struct drm_i915_private *i915);
335  void i915_disable_error_state(struct drm_i915_private *i915, int err);
336  
337  #else
338  
/* No-op stub when error capture is compiled out. */
__printf(2, 3)
static inline void
i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
}
344  
/* No-op stub when error capture is compiled out. */
static inline void
i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags)
{
}
349  
/* Stub: no coredump is ever allocated when error capture is compiled out. */
static inline struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
{
	return NULL;
}
355  
/* Stub: no GT dump is ever allocated when error capture is compiled out. */
static inline struct intel_gt_coredump *
intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags)
{
	return NULL;
}
361  
/* Stub: no engine dump is ever allocated when error capture is compiled out. */
static inline struct intel_engine_coredump *
intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags)
{
	return NULL;
}
367  
/* Stub: nothing to capture when error capture is compiled out. */
static inline struct intel_engine_capture_vma *
intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
				  struct i915_request *rq,
				  gfp_t gfp)
{
	return NULL;
}
375  
/* No-op stub when error capture is compiled out. */
static inline void
intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
{
}
382  
/* Stub: no compressor is ever created when error capture is compiled out. */
static inline struct i915_vma_compress *
i915_vma_capture_prepare(struct intel_gt_coredump *gt)
{
	return NULL;
}
388  
/* No-op stub when error capture is compiled out. */
static inline void
i915_vma_capture_finish(struct intel_gt_coredump *gt,
			struct i915_vma_compress *compress)
{
}
394  
/* No-op stub when error capture is compiled out. */
static inline void
i915_error_state_store(struct i915_gpu_coredump *error)
{
}
399  
/* No-op stub: there is never a reference to drop in this configuration. */
static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)
{
}
403  
/*
 * Stub: report that error capture is unavailable (-ENODEV) rather than
 * returning NULL, so callers can distinguish "no error" from "not built".
 */
static inline struct i915_gpu_coredump *
i915_first_error_state(struct drm_i915_private *i915)
{
	return ERR_PTR(-ENODEV);
}
409  
/* No-op stub when error capture is compiled out. */
static inline void i915_reset_error_state(struct drm_i915_private *i915)
{
}
413  
/* No-op stub when error capture is compiled out. */
static inline void i915_disable_error_state(struct drm_i915_private *i915,
					    int err)
{
}
418  
419  #endif /* IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) */
420  
421  #endif /* _I915_GPU_ERROR_H_ */
422