/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"
#include "igt_wedge_me.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

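/*
 * Shared test fixture: hws is a page of per-context seqno slots used as a
 * makeshift hardware status page, and obj holds a small batch that writes
 * its request's seqno into that slot and then loops back to its own start,
 * spinning until the first dword is overwritten with MI_BATCH_BUFFER_END.
 */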
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->gem_context->ppgtt ?
		&rq->gem_context->ppgtt->vm :
		&i915->ggtt.vm;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = i915_vma_move_to_active(vma, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	err = i915_vma_move_to_active(hws, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

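	/*
	 * Each gen variant below emits the same program: store the
	 * request's seqno into its HWS slot, allow arbitration, pad, then
	 * branch back to the start of the batch so that it spins forever.
	 * The spin is broken by overwriting the first dword of the batch
	 * with MI_BATCH_BUFFER_END (see hang_fini() and the tests).
	 */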
	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

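	/*
	 * If the previous spinner is still executing, replace the batch
	 * object with a fresh one rather than rewrite a buffer the GPU
	 * may still be reading.
	 */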
	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		i915_request_add(rq);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

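/*
 * Poll the HWS slot for the spinner's seqno write: a short busy-wait
 * followed by a sleeping wait of up to a second, returning true once the
 * request has actually started executing on the GPU.
 */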
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

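/*
 * Claim exclusive ownership of the reset machinery: take the global
 * I915_RESET_BACKOFF bit and every per-engine reset bit, sleeping until
 * any reset already in flight has released them.
 */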
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			if (!wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

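/*
 * Flags for __igt_reset_engines(): TEST_ACTIVE keeps the engine under
 * reset busy with a hanging spinner, TEST_OTHERS spawns background
 * request threads on the other engines, TEST_SELF also runs a background
 * thread on the engine being reset, and TEST_PRIORITY randomises the
 * priority of the background requests.
 */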
#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno,
			  i915_request_global_seqno(rq));
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

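/*
 * Background kthread: keep its engine busy by cycling through a small
 * ring of contexts and requests until told to stop, flagging -EIO if any
 * request fails to complete within a few seconds.
 */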
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

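/*
 * Pretend that hangcheck has fired: record which engines are stalled and
 * raise I915_RESET_HANDOFF so that a waiter performs the actual reset.
 */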
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

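/*
 * Run from a kthread: evicting the node has to wait upon the active vma,
 * and so only returns once the hanging request has been reset and retired.
 */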
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER);
	if (err)
		goto out_obj;

	err = i915_vma_move_to_active(arg.vma, rq, EXEC_OBJECT_WRITE);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(evict_vma, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}

	wait_for_completion(&arg.completion);

	if (wait_for(waitqueue_active(&rq->execute), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	fake_hangcheck(rq, intel_engine_flag(rq->engine));

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
			err = kthread_stop(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	int err;

	mutex_lock(&i915->drm.struct_mutex);
	ctx = kernel_context(i915);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	err = 0;
	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);

	kernel_context_close(ctx);
	return err;
}

static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO; /* we're long past hope of a successful reset */

	intel_runtime_pm_get(i915);
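	/* Disable periodic hangcheck; the tests trigger resets explicitly. */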
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}