1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/kthread.h>
26 
27 #include "gem/i915_gem_context.h"
28 
29 #include "intel_gt.h"
30 #include "intel_engine_heartbeat.h"
31 #include "intel_engine_pm.h"
32 #include "selftest_engine_heartbeat.h"
33 
34 #include "i915_selftest.h"
35 #include "selftests/i915_random.h"
36 #include "selftests/igt_flush_test.h"
37 #include "selftests/igt_reset.h"
38 #include "selftests/igt_atomic.h"
39 
40 #include "selftests/mock_drm.h"
41 
42 #include "gem/selftests/mock_context.h"
43 #include "gem/selftests/igt_gem_utils.h"
44 
/* Timeout handed to wait_for() when wait_for_idle() polls an engine for idleness. */
#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
46 
/*
 * State shared by the tests that submit a deliberately spinning batch.
 * Filled in by hang_init() and torn down by hang_fini().
 */
struct hang {
	struct intel_gt *gt; /* GT the hang is run against */
	struct drm_i915_gem_object *hws; /* page the batch writes its seqno into */
	struct drm_i915_gem_object *obj; /* current batch; replaced by hang_create_request() */
	struct i915_gem_context *ctx; /* kernel context the hang requests are built on */
	u32 *seqno; /* CPU mapping of @hws */
	u32 *batch; /* CPU mapping of @obj */
};
55 
hang_init(struct hang * h,struct intel_gt * gt)56 static int hang_init(struct hang *h, struct intel_gt *gt)
57 {
58 	void *vaddr;
59 	int err;
60 
61 	memset(h, 0, sizeof(*h));
62 	h->gt = gt;
63 
64 	h->ctx = kernel_context(gt->i915);
65 	if (IS_ERR(h->ctx))
66 		return PTR_ERR(h->ctx);
67 
68 	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
69 
70 	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
71 	if (IS_ERR(h->hws)) {
72 		err = PTR_ERR(h->hws);
73 		goto err_ctx;
74 	}
75 
76 	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
77 	if (IS_ERR(h->obj)) {
78 		err = PTR_ERR(h->obj);
79 		goto err_hws;
80 	}
81 
82 	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
83 	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
84 	if (IS_ERR(vaddr)) {
85 		err = PTR_ERR(vaddr);
86 		goto err_obj;
87 	}
88 	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
89 
90 	vaddr = i915_gem_object_pin_map(h->obj,
91 					i915_coherent_map_type(gt->i915));
92 	if (IS_ERR(vaddr)) {
93 		err = PTR_ERR(vaddr);
94 		goto err_unpin_hws;
95 	}
96 	h->batch = vaddr;
97 
98 	return 0;
99 
100 err_unpin_hws:
101 	i915_gem_object_unpin_map(h->hws);
102 err_obj:
103 	i915_gem_object_put(h->obj);
104 err_hws:
105 	i915_gem_object_put(h->hws);
106 err_ctx:
107 	kernel_context_close(h->ctx);
108 	return err;
109 }
110 
hws_address(const struct i915_vma * hws,const struct i915_request * rq)111 static u64 hws_address(const struct i915_vma *hws,
112 		       const struct i915_request *rq)
113 {
114 	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
115 }
116 
move_to_active(struct i915_vma * vma,struct i915_request * rq,unsigned int flags)117 static int move_to_active(struct i915_vma *vma,
118 			  struct i915_request *rq,
119 			  unsigned int flags)
120 {
121 	int err;
122 
123 	i915_vma_lock(vma);
124 	err = i915_request_await_object(rq, vma->obj,
125 					flags & EXEC_OBJECT_WRITE);
126 	if (err == 0)
127 		err = i915_vma_move_to_active(vma, rq, flags);
128 	i915_vma_unlock(vma);
129 
130 	return err;
131 }
132 
/*
 * Build a request on @engine whose batch never terminates by itself.
 *
 * A fresh batch object is allocated and swapped into @h (the previous batch
 * is unmapped and released). The batch writes the request's seqno into the
 * request's slot of the seqno page, is followed by 1024 zeroed bytes, and then
 * issues MI_BATCH_BUFFER_START back to the start of the same batch.
 *
 * On success the request is returned without having been added; the callers
 * in this file call i915_request_add() themselves. On failure an ERR_PTR is
 * returned; if the request had already been allocated it is flagged with the
 * error and added before returning.
 */
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	/* Only drop the old batch once its replacement is allocated and mapped */
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * Each generation branch emits the same shape: store the seqno into
	 * the hws slot, 1024 bytes of zero padding, then a batch-buffer-start
	 * that jumps back to vma->node.start, i.e. the top of this batch.
	 */
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		/* 64-bit addresses: low and high dwords are emitted separately */
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	/* The init breadcrumb hook is optional per engine */
	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	/* Also reached on success, in which case err == 0 and nothing is cancelled */
cancel_rq:
	if (err) {
		/* Record the error on the request and add it so it is not left dangling */
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}
282 
hws_seqno(const struct hang * h,const struct i915_request * rq)283 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
284 {
285 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
286 }
287 
hang_fini(struct hang * h)288 static void hang_fini(struct hang *h)
289 {
290 	*h->batch = MI_BATCH_BUFFER_END;
291 	intel_gt_chipset_flush(h->gt);
292 
293 	i915_gem_object_unpin_map(h->obj);
294 	i915_gem_object_put(h->obj);
295 
296 	i915_gem_object_unpin_map(h->hws);
297 	i915_gem_object_put(h->hws);
298 
299 	kernel_context_close(h->ctx);
300 
301 	igt_flush_test(h->gt->i915);
302 }
303 
wait_until_running(struct hang * h,struct i915_request * rq)304 static bool wait_until_running(struct hang *h, struct i915_request *rq)
305 {
306 	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
307 					       rq->fence.seqno),
308 			     10) &&
309 		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
310 					    rq->fence.seqno),
311 			  1000));
312 }
313 
igt_hang_sanitycheck(void * arg)314 static int igt_hang_sanitycheck(void *arg)
315 {
316 	struct intel_gt *gt = arg;
317 	struct i915_request *rq;
318 	struct intel_engine_cs *engine;
319 	enum intel_engine_id id;
320 	struct hang h;
321 	int err;
322 
323 	/* Basic check that we can execute our hanging batch */
324 
325 	err = hang_init(&h, gt);
326 	if (err)
327 		return err;
328 
329 	for_each_engine(engine, gt, id) {
330 		struct intel_wedge_me w;
331 		long timeout;
332 
333 		if (!intel_engine_can_store_dword(engine))
334 			continue;
335 
336 		rq = hang_create_request(&h, engine);
337 		if (IS_ERR(rq)) {
338 			err = PTR_ERR(rq);
339 			pr_err("Failed to create request for %s, err=%d\n",
340 			       engine->name, err);
341 			goto fini;
342 		}
343 
344 		i915_request_get(rq);
345 
346 		*h.batch = MI_BATCH_BUFFER_END;
347 		intel_gt_chipset_flush(engine->gt);
348 
349 		i915_request_add(rq);
350 
351 		timeout = 0;
352 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
353 			timeout = i915_request_wait(rq, 0,
354 						    MAX_SCHEDULE_TIMEOUT);
355 		if (intel_gt_is_wedged(gt))
356 			timeout = -EIO;
357 
358 		i915_request_put(rq);
359 
360 		if (timeout < 0) {
361 			err = timeout;
362 			pr_err("Wait for request failed on %s, err=%d\n",
363 			       engine->name, err);
364 			goto fini;
365 		}
366 	}
367 
368 fini:
369 	hang_fini(&h);
370 	return err;
371 }
372 
wait_for_idle(struct intel_engine_cs * engine)373 static bool wait_for_idle(struct intel_engine_cs *engine)
374 {
375 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
376 }
377 
igt_reset_nop(void * arg)378 static int igt_reset_nop(void *arg)
379 {
380 	struct intel_gt *gt = arg;
381 	struct i915_gpu_error *global = &gt->i915->gpu_error;
382 	struct intel_engine_cs *engine;
383 	unsigned int reset_count, count;
384 	enum intel_engine_id id;
385 	IGT_TIMEOUT(end_time);
386 	int err = 0;
387 
388 	/* Check that we can reset during non-user portions of requests */
389 
390 	reset_count = i915_reset_count(global);
391 	count = 0;
392 	do {
393 		for_each_engine(engine, gt, id) {
394 			struct intel_context *ce;
395 			int i;
396 
397 			ce = intel_context_create(engine);
398 			if (IS_ERR(ce)) {
399 				err = PTR_ERR(ce);
400 				break;
401 			}
402 
403 			for (i = 0; i < 16; i++) {
404 				struct i915_request *rq;
405 
406 				rq = intel_context_create_request(ce);
407 				if (IS_ERR(rq)) {
408 					err = PTR_ERR(rq);
409 					break;
410 				}
411 
412 				i915_request_add(rq);
413 			}
414 
415 			intel_context_put(ce);
416 		}
417 
418 		igt_global_reset_lock(gt);
419 		intel_gt_reset(gt, ALL_ENGINES, NULL);
420 		igt_global_reset_unlock(gt);
421 
422 		if (intel_gt_is_wedged(gt)) {
423 			err = -EIO;
424 			break;
425 		}
426 
427 		if (i915_reset_count(global) != reset_count + ++count) {
428 			pr_err("Full GPU reset not recorded!\n");
429 			err = -EINVAL;
430 			break;
431 		}
432 
433 		err = igt_flush_test(gt->i915);
434 		if (err)
435 			break;
436 	} while (time_before(jiffies, end_time));
437 	pr_info("%s: %d resets\n", __func__, count);
438 
439 	if (igt_flush_test(gt->i915))
440 		err = -EIO;
441 	return err;
442 }
443 
igt_reset_nop_engine(void * arg)444 static int igt_reset_nop_engine(void *arg)
445 {
446 	struct intel_gt *gt = arg;
447 	struct i915_gpu_error *global = &gt->i915->gpu_error;
448 	struct intel_engine_cs *engine;
449 	enum intel_engine_id id;
450 
451 	/* Check that we can engine-reset during non-user portions */
452 
453 	if (!intel_has_reset_engine(gt))
454 		return 0;
455 
456 	for_each_engine(engine, gt, id) {
457 		unsigned int reset_count, reset_engine_count, count;
458 		struct intel_context *ce;
459 		IGT_TIMEOUT(end_time);
460 		int err;
461 
462 		ce = intel_context_create(engine);
463 		if (IS_ERR(ce))
464 			return PTR_ERR(ce);
465 
466 		reset_count = i915_reset_count(global);
467 		reset_engine_count = i915_reset_engine_count(global, engine);
468 		count = 0;
469 
470 		st_engine_heartbeat_disable(engine);
471 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
472 		do {
473 			int i;
474 
475 			if (!wait_for_idle(engine)) {
476 				pr_err("%s failed to idle before reset\n",
477 				       engine->name);
478 				err = -EIO;
479 				break;
480 			}
481 
482 			for (i = 0; i < 16; i++) {
483 				struct i915_request *rq;
484 
485 				rq = intel_context_create_request(ce);
486 				if (IS_ERR(rq)) {
487 					struct drm_printer p =
488 						drm_info_printer(gt->i915->drm.dev);
489 					intel_engine_dump(engine, &p,
490 							  "%s(%s): failed to submit request\n",
491 							  __func__,
492 							  engine->name);
493 
494 					GEM_TRACE("%s(%s): failed to submit request\n",
495 						  __func__,
496 						  engine->name);
497 					GEM_TRACE_DUMP();
498 
499 					intel_gt_set_wedged(gt);
500 
501 					err = PTR_ERR(rq);
502 					break;
503 				}
504 
505 				i915_request_add(rq);
506 			}
507 			err = intel_engine_reset(engine, NULL);
508 			if (err) {
509 				pr_err("i915_reset_engine failed\n");
510 				break;
511 			}
512 
513 			if (i915_reset_count(global) != reset_count) {
514 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
515 				err = -EINVAL;
516 				break;
517 			}
518 
519 			if (i915_reset_engine_count(global, engine) !=
520 			    reset_engine_count + ++count) {
521 				pr_err("%s engine reset not recorded!\n",
522 				       engine->name);
523 				err = -EINVAL;
524 				break;
525 			}
526 		} while (time_before(jiffies, end_time));
527 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
528 		st_engine_heartbeat_enable(engine);
529 
530 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
531 
532 		intel_context_put(ce);
533 		if (igt_flush_test(gt->i915))
534 			err = -EIO;
535 		if (err)
536 			return err;
537 	}
538 
539 	return 0;
540 }
541 
/*
 * Common body of the idle/active engine-reset tests.
 *
 * For each engine (when @active, only those able to store a dword), repeat
 * until the selftest timeout: optionally start a hanging request and wait for
 * it to run, reset the engine, then check that no full reset was recorded and
 * that the per-engine reset counter advanced by one.
 *
 * Returns 0 on success (or when engine reset is unsupported), -EIO if the GT
 * ends up wedged, otherwise the first error hit.
 */
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int full_before, engine_expected;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		full_before = i915_reset_count(global);
		engine_expected = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;
				bool running;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				running = wait_until_running(&h, rq);
				if (!running) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
				}

				/* The reference is not needed beyond this point */
				i915_request_put(rq);

				if (!running) {
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != full_before) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			/* One more engine reset should now have been counted */
			engine_expected++;
			if (i915_reset_engine_count(global, engine) !=
			    engine_expected) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}
648 
igt_reset_idle_engine(void * arg)649 static int igt_reset_idle_engine(void *arg)
650 {
651 	return __igt_reset_engine(arg, false);
652 }
653 
igt_reset_active_engine(void * arg)654 static int igt_reset_active_engine(void *arg)
655 {
656 	return __igt_reset_engine(arg, true);
657 }
658 
/* Per-engine bookkeeping for the background threads used by __igt_reset_engines(). */
struct active_engine {
	struct task_struct *task; /* kthread running active_engine(); NULL if none was started */
	struct intel_engine_cs *engine; /* engine the thread submits requests to */
	unsigned long resets; /* engine reset count sampled before the threads start */
	unsigned int flags; /* TEST_* flags of the current phase */
};
665 
/* Phase flags accepted by __igt_reset_engines() and active_engine(). */
#define TEST_ACTIVE	BIT(0) /* run a hanging request on the engine being reset */
#define TEST_OTHERS	BIT(1) /* start submission threads on the other engines */
#define TEST_SELF	BIT(2) /* start a submission thread on the engine being reset */
#define TEST_PRIORITY	BIT(3) /* threads randomise request priority; hang context gets priority 1024 */
670 
active_request_put(struct i915_request * rq)671 static int active_request_put(struct i915_request *rq)
672 {
673 	int err = 0;
674 
675 	if (!rq)
676 		return 0;
677 
678 	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
679 		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
680 			  rq->engine->name,
681 			  rq->fence.context,
682 			  rq->fence.seqno);
683 		GEM_TRACE_DUMP();
684 
685 		intel_gt_set_wedged(rq->engine->gt);
686 		err = -EIO;
687 	}
688 
689 	i915_request_put(rq);
690 
691 	return err;
692 }
693 
active_engine(void * data)694 static int active_engine(void *data)
695 {
696 	I915_RND_STATE(prng);
697 	struct active_engine *arg = data;
698 	struct intel_engine_cs *engine = arg->engine;
699 	struct i915_request *rq[8] = {};
700 	struct intel_context *ce[ARRAY_SIZE(rq)];
701 	unsigned long count;
702 	int err = 0;
703 
704 	for (count = 0; count < ARRAY_SIZE(ce); count++) {
705 		ce[count] = intel_context_create(engine);
706 		if (IS_ERR(ce[count])) {
707 			err = PTR_ERR(ce[count]);
708 			while (--count)
709 				intel_context_put(ce[count]);
710 			return err;
711 		}
712 	}
713 
714 	count = 0;
715 	while (!kthread_should_stop()) {
716 		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
717 		struct i915_request *old = rq[idx];
718 		struct i915_request *new;
719 
720 		new = intel_context_create_request(ce[idx]);
721 		if (IS_ERR(new)) {
722 			err = PTR_ERR(new);
723 			break;
724 		}
725 
726 		rq[idx] = i915_request_get(new);
727 		i915_request_add(new);
728 
729 		if (engine->schedule && arg->flags & TEST_PRIORITY) {
730 			struct i915_sched_attr attr = {
731 				.priority =
732 					i915_prandom_u32_max_state(512, &prng),
733 			};
734 			engine->schedule(rq[idx], &attr);
735 		}
736 
737 		err = active_request_put(old);
738 		if (err)
739 			break;
740 
741 		cond_resched();
742 	}
743 
744 	for (count = 0; count < ARRAY_SIZE(rq); count++) {
745 		int err__ = active_request_put(rq[count]);
746 
747 		/* Keep the first error */
748 		if (!err)
749 			err = err__;
750 
751 		intel_context_put(ce[count]);
752 	}
753 
754 	return err;
755 }
756 
__igt_reset_engines(struct intel_gt * gt,const char * test_name,unsigned int flags)757 static int __igt_reset_engines(struct intel_gt *gt,
758 			       const char *test_name,
759 			       unsigned int flags)
760 {
761 	struct i915_gpu_error *global = &gt->i915->gpu_error;
762 	struct intel_engine_cs *engine, *other;
763 	enum intel_engine_id id, tmp;
764 	struct hang h;
765 	int err = 0;
766 
767 	/* Check that issuing a reset on one engine does not interfere
768 	 * with any other engine.
769 	 */
770 
771 	if (!intel_has_reset_engine(gt))
772 		return 0;
773 
774 	if (flags & TEST_ACTIVE) {
775 		err = hang_init(&h, gt);
776 		if (err)
777 			return err;
778 
779 		if (flags & TEST_PRIORITY)
780 			h.ctx->sched.priority = 1024;
781 	}
782 
783 	for_each_engine(engine, gt, id) {
784 		struct active_engine threads[I915_NUM_ENGINES] = {};
785 		unsigned long device = i915_reset_count(global);
786 		unsigned long count = 0, reported;
787 		IGT_TIMEOUT(end_time);
788 
789 		if (flags & TEST_ACTIVE &&
790 		    !intel_engine_can_store_dword(engine))
791 			continue;
792 
793 		if (!wait_for_idle(engine)) {
794 			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
795 			       engine->name, test_name);
796 			err = -EIO;
797 			break;
798 		}
799 
800 		memset(threads, 0, sizeof(threads));
801 		for_each_engine(other, gt, tmp) {
802 			struct task_struct *tsk;
803 
804 			threads[tmp].resets =
805 				i915_reset_engine_count(global, other);
806 
807 			if (other == engine && !(flags & TEST_SELF))
808 				continue;
809 
810 			if (other != engine && !(flags & TEST_OTHERS))
811 				continue;
812 
813 			threads[tmp].engine = other;
814 			threads[tmp].flags = flags;
815 
816 			tsk = kthread_run(active_engine, &threads[tmp],
817 					  "igt/%s", other->name);
818 			if (IS_ERR(tsk)) {
819 				err = PTR_ERR(tsk);
820 				goto unwind;
821 			}
822 
823 			threads[tmp].task = tsk;
824 			get_task_struct(tsk);
825 		}
826 
827 		yield(); /* start all threads before we begin */
828 
829 		st_engine_heartbeat_disable(engine);
830 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
831 		do {
832 			struct i915_request *rq = NULL;
833 
834 			if (flags & TEST_ACTIVE) {
835 				rq = hang_create_request(&h, engine);
836 				if (IS_ERR(rq)) {
837 					err = PTR_ERR(rq);
838 					break;
839 				}
840 
841 				i915_request_get(rq);
842 				i915_request_add(rq);
843 
844 				if (!wait_until_running(&h, rq)) {
845 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
846 
847 					pr_err("%s: Failed to start request %llx, at %x\n",
848 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
849 					intel_engine_dump(engine, &p,
850 							  "%s\n", engine->name);
851 
852 					i915_request_put(rq);
853 					err = -EIO;
854 					break;
855 				}
856 			}
857 
858 			err = intel_engine_reset(engine, NULL);
859 			if (err) {
860 				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
861 				       engine->name, test_name, err);
862 				break;
863 			}
864 
865 			count++;
866 
867 			if (rq) {
868 				if (rq->fence.error != -EIO) {
869 					pr_err("i915_reset_engine(%s:%s):"
870 					       " failed to reset request %llx:%lld\n",
871 					       engine->name, test_name,
872 					       rq->fence.context,
873 					       rq->fence.seqno);
874 					i915_request_put(rq);
875 
876 					GEM_TRACE_DUMP();
877 					intel_gt_set_wedged(gt);
878 					err = -EIO;
879 					break;
880 				}
881 
882 				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
883 					struct drm_printer p =
884 						drm_info_printer(gt->i915->drm.dev);
885 
886 					pr_err("i915_reset_engine(%s:%s):"
887 					       " failed to complete request %llx:%lld after reset\n",
888 					       engine->name, test_name,
889 					       rq->fence.context,
890 					       rq->fence.seqno);
891 					intel_engine_dump(engine, &p,
892 							  "%s\n", engine->name);
893 					i915_request_put(rq);
894 
895 					GEM_TRACE_DUMP();
896 					intel_gt_set_wedged(gt);
897 					err = -EIO;
898 					break;
899 				}
900 
901 				i915_request_put(rq);
902 			}
903 
904 			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
905 				struct drm_printer p =
906 					drm_info_printer(gt->i915->drm.dev);
907 
908 				pr_err("i915_reset_engine(%s:%s):"
909 				       " failed to idle after reset\n",
910 				       engine->name, test_name);
911 				intel_engine_dump(engine, &p,
912 						  "%s\n", engine->name);
913 
914 				err = -EIO;
915 				break;
916 			}
917 		} while (time_before(jiffies, end_time));
918 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
919 		st_engine_heartbeat_enable(engine);
920 
921 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
922 			engine->name, test_name, count);
923 
924 		reported = i915_reset_engine_count(global, engine);
925 		reported -= threads[engine->id].resets;
926 		if (reported != count) {
927 			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
928 			       engine->name, test_name, count, reported);
929 			if (!err)
930 				err = -EINVAL;
931 		}
932 
933 unwind:
934 		for_each_engine(other, gt, tmp) {
935 			int ret;
936 
937 			if (!threads[tmp].task)
938 				continue;
939 
940 			ret = kthread_stop(threads[tmp].task);
941 			if (ret) {
942 				pr_err("kthread for other engine %s failed, err=%d\n",
943 				       other->name, ret);
944 				if (!err)
945 					err = ret;
946 			}
947 			put_task_struct(threads[tmp].task);
948 
949 			if (other->uabi_class != engine->uabi_class &&
950 			    threads[tmp].resets !=
951 			    i915_reset_engine_count(global, other)) {
952 				pr_err("Innocent engine %s was reset (count=%ld)\n",
953 				       other->name,
954 				       i915_reset_engine_count(global, other) -
955 				       threads[tmp].resets);
956 				if (!err)
957 					err = -EINVAL;
958 			}
959 		}
960 
961 		if (device != i915_reset_count(global)) {
962 			pr_err("Global reset (count=%ld)!\n",
963 			       i915_reset_count(global) - device);
964 			if (!err)
965 				err = -EINVAL;
966 		}
967 
968 		if (err)
969 			break;
970 
971 		err = igt_flush_test(gt->i915);
972 		if (err)
973 			break;
974 	}
975 
976 	if (intel_gt_is_wedged(gt))
977 		err = -EIO;
978 
979 	if (flags & TEST_ACTIVE)
980 		hang_fini(&h);
981 
982 	return err;
983 }
984 
igt_reset_engines(void * arg)985 static int igt_reset_engines(void *arg)
986 {
987 	static const struct {
988 		const char *name;
989 		unsigned int flags;
990 	} phases[] = {
991 		{ "idle", 0 },
992 		{ "active", TEST_ACTIVE },
993 		{ "others-idle", TEST_OTHERS },
994 		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
995 		{
996 			"others-priority",
997 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
998 		},
999 		{
1000 			"self-priority",
1001 			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1002 		},
1003 		{ }
1004 	};
1005 	struct intel_gt *gt = arg;
1006 	typeof(*phases) *p;
1007 	int err;
1008 
1009 	for (p = phases; p->name; p++) {
1010 		if (p->flags & TEST_PRIORITY) {
1011 			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1012 				continue;
1013 		}
1014 
1015 		err = __igt_reset_engines(arg, p->name, p->flags);
1016 		if (err)
1017 			return err;
1018 	}
1019 
1020 	return 0;
1021 }
1022 
fake_hangcheck(struct intel_gt * gt,intel_engine_mask_t mask)1023 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1024 {
1025 	u32 count = i915_reset_count(&gt->i915->gpu_error);
1026 
1027 	intel_gt_reset(gt, mask, NULL);
1028 
1029 	return count;
1030 }
1031 
igt_reset_wait(void * arg)1032 static int igt_reset_wait(void *arg)
1033 {
1034 	struct intel_gt *gt = arg;
1035 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1036 	struct intel_engine_cs *engine = gt->engine[RCS0];
1037 	struct i915_request *rq;
1038 	unsigned int reset_count;
1039 	struct hang h;
1040 	long timeout;
1041 	int err;
1042 
1043 	if (!engine || !intel_engine_can_store_dword(engine))
1044 		return 0;
1045 
1046 	/* Check that we detect a stuck waiter and issue a reset */
1047 
1048 	igt_global_reset_lock(gt);
1049 
1050 	err = hang_init(&h, gt);
1051 	if (err)
1052 		goto unlock;
1053 
1054 	rq = hang_create_request(&h, engine);
1055 	if (IS_ERR(rq)) {
1056 		err = PTR_ERR(rq);
1057 		goto fini;
1058 	}
1059 
1060 	i915_request_get(rq);
1061 	i915_request_add(rq);
1062 
1063 	if (!wait_until_running(&h, rq)) {
1064 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1065 
1066 		pr_err("%s: Failed to start request %llx, at %x\n",
1067 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1068 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1069 
1070 		intel_gt_set_wedged(gt);
1071 
1072 		err = -EIO;
1073 		goto out_rq;
1074 	}
1075 
1076 	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1077 
1078 	timeout = i915_request_wait(rq, 0, 10);
1079 	if (timeout < 0) {
1080 		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1081 		       timeout);
1082 		err = timeout;
1083 		goto out_rq;
1084 	}
1085 
1086 	if (i915_reset_count(global) == reset_count) {
1087 		pr_err("No GPU reset recorded!\n");
1088 		err = -EINVAL;
1089 		goto out_rq;
1090 	}
1091 
1092 out_rq:
1093 	i915_request_put(rq);
1094 fini:
1095 	hang_fini(&h);
1096 unlock:
1097 	igt_global_reset_unlock(gt);
1098 
1099 	if (intel_gt_is_wedged(gt))
1100 		return -EIO;
1101 
1102 	return err;
1103 }
1104 
/* Argument block handed to the evict_vma() and evict_fence() thread functions. */
struct evict_vma {
	struct completion completion; /* completed by the thread function as its first action */
	struct i915_vma *vma; /* vma the thread function operates on */
};
1109 
evict_vma(void * data)1110 static int evict_vma(void *data)
1111 {
1112 	struct evict_vma *arg = data;
1113 	struct i915_address_space *vm = arg->vma->vm;
1114 	struct drm_mm_node evict = arg->vma->node;
1115 	int err;
1116 
1117 	complete(&arg->completion);
1118 
1119 	mutex_lock(&vm->mutex);
1120 	err = i915_gem_evict_for_node(vm, &evict, 0);
1121 	mutex_unlock(&vm->mutex);
1122 
1123 	return err;
1124 }
1125 
evict_fence(void * data)1126 static int evict_fence(void *data)
1127 {
1128 	struct evict_vma *arg = data;
1129 	int err;
1130 
1131 	complete(&arg->completion);
1132 
1133 	/* Mark the fence register as dirty to force the mmio update. */
1134 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1135 	if (err) {
1136 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1137 		return err;
1138 	}
1139 
1140 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1141 	if (err) {
1142 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1143 		return err;
1144 	}
1145 
1146 	err = i915_vma_pin_fence(arg->vma);
1147 	i915_vma_unpin(arg->vma);
1148 	if (err) {
1149 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1150 		return err;
1151 	}
1152 
1153 	i915_vma_unpin_fence(arg->vma);
1154 
1155 	return 0;
1156 }
1157 
/*
 * Keep a vma busy behind a hanging request on RCS0, run @fn against that vma
 * from a kthread, then trigger a reset with fake_hangcheck() and reap the
 * thread.
 *
 * @gt:    GT under test
 * @vm:    address space in which the test vma is instantiated
 * @fn:    kthread function; it receives a pointer to a struct evict_vma
 * @flags: EXEC_OBJECT_* flags. EXEC_OBJECT_NEEDS_FENCE selects the X-tiled,
 *         fenced variant; EXEC_OBJECT_WRITE is forwarded to
 *         i915_request_await_object(); the whole value is passed to
 *         i915_vma_move_to_active().
 *
 * Returns 0 if the test is skipped, -EIO if the GT is wedged on exit,
 * otherwise the last error recorded (which, once the kthread has been
 * started, is the value returned by kthread_stop()).
 */
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	/* Skip the fenced variant when there are no fence registers. */
	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	/* Skip when RCS0 is absent or cannot store a dword. */
	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		/* The request already exists; submit it before bailing out. */
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	/* Tie the vma to the hanging request while holding the vma lock. */
	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	/* Drop our pins, in reverse order of acquisition. */
	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	/* Take our own reference before submitting; it is dropped at out_rq. */
	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		/* err is left untouched; the wedged check on exit yields -EIO. */
		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		/* Clear tsk so the reaping step below is skipped. */
		tsk = NULL;
		goto out_reset;
	}
	/* Hold a task reference until put_task_struct() below. */
	get_task_struct(tsk);

	/* Wait for the kthread to signal that it has started running fn. */
	wait_for_completion(&arg.completion);

	/*
	 * A callback appearing on the request's fence is taken as evidence
	 * that the kthread is now waiting on the request (timeout value 10,
	 * presumably in ms -- confirm against wait_for()).
	 */
	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	/* The success path falls through to the reset as well. */
out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	/* A wedged GT overrides whatever error was recorded above. */
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
1306 
igt_reset_evict_ggtt(void * arg)1307 static int igt_reset_evict_ggtt(void *arg)
1308 {
1309 	struct intel_gt *gt = arg;
1310 
1311 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1312 				     evict_vma, EXEC_OBJECT_WRITE);
1313 }
1314 
igt_reset_evict_ppgtt(void * arg)1315 static int igt_reset_evict_ppgtt(void *arg)
1316 {
1317 	struct intel_gt *gt = arg;
1318 	struct i915_ppgtt *ppgtt;
1319 	int err;
1320 
1321 	/* aliasing == global gtt locking, covered above */
1322 	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1323 		return 0;
1324 
1325 	ppgtt = i915_ppgtt_create(gt);
1326 	if (IS_ERR(ppgtt))
1327 		return PTR_ERR(ppgtt);
1328 
1329 	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1330 				    evict_vma, EXEC_OBJECT_WRITE);
1331 	i915_vm_put(&ppgtt->vm);
1332 
1333 	return err;
1334 }
1335 
igt_reset_evict_fence(void * arg)1336 static int igt_reset_evict_fence(void *arg)
1337 {
1338 	struct intel_gt *gt = arg;
1339 
1340 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1341 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1342 }
1343 
wait_for_others(struct intel_gt * gt,struct intel_engine_cs * exclude)1344 static int wait_for_others(struct intel_gt *gt,
1345 			   struct intel_engine_cs *exclude)
1346 {
1347 	struct intel_engine_cs *engine;
1348 	enum intel_engine_id id;
1349 
1350 	for_each_engine(engine, gt, id) {
1351 		if (engine == exclude)
1352 			continue;
1353 
1354 		if (!wait_for_idle(engine))
1355 			return -EIO;
1356 	}
1357 
1358 	return 0;
1359 }
1360 
igt_reset_queue(void * arg)1361 static int igt_reset_queue(void *arg)
1362 {
1363 	struct intel_gt *gt = arg;
1364 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1365 	struct intel_engine_cs *engine;
1366 	enum intel_engine_id id;
1367 	struct hang h;
1368 	int err;
1369 
1370 	/* Check that we replay pending requests following a hang */
1371 
1372 	igt_global_reset_lock(gt);
1373 
1374 	err = hang_init(&h, gt);
1375 	if (err)
1376 		goto unlock;
1377 
1378 	for_each_engine(engine, gt, id) {
1379 		struct i915_request *prev;
1380 		IGT_TIMEOUT(end_time);
1381 		unsigned int count;
1382 
1383 		if (!intel_engine_can_store_dword(engine))
1384 			continue;
1385 
1386 		prev = hang_create_request(&h, engine);
1387 		if (IS_ERR(prev)) {
1388 			err = PTR_ERR(prev);
1389 			goto fini;
1390 		}
1391 
1392 		i915_request_get(prev);
1393 		i915_request_add(prev);
1394 
1395 		count = 0;
1396 		do {
1397 			struct i915_request *rq;
1398 			unsigned int reset_count;
1399 
1400 			rq = hang_create_request(&h, engine);
1401 			if (IS_ERR(rq)) {
1402 				err = PTR_ERR(rq);
1403 				goto fini;
1404 			}
1405 
1406 			i915_request_get(rq);
1407 			i915_request_add(rq);
1408 
1409 			/*
1410 			 * XXX We don't handle resetting the kernel context
1411 			 * very well. If we trigger a device reset twice in
1412 			 * quick succession while the kernel context is
1413 			 * executing, we may end up skipping the breadcrumb.
1414 			 * This is really only a problem for the selftest as
1415 			 * normally there is a large interlude between resets
1416 			 * (hangcheck), or we focus on resetting just one
1417 			 * engine and so avoid repeatedly resetting innocents.
1418 			 */
1419 			err = wait_for_others(gt, engine);
1420 			if (err) {
1421 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1422 				       __func__, engine->name);
1423 				i915_request_put(rq);
1424 				i915_request_put(prev);
1425 
1426 				GEM_TRACE_DUMP();
1427 				intel_gt_set_wedged(gt);
1428 				goto fini;
1429 			}
1430 
1431 			if (!wait_until_running(&h, prev)) {
1432 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1433 
1434 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1435 				       __func__, engine->name,
1436 				       prev->fence.seqno, hws_seqno(&h, prev));
1437 				intel_engine_dump(engine, &p,
1438 						  "%s\n", engine->name);
1439 
1440 				i915_request_put(rq);
1441 				i915_request_put(prev);
1442 
1443 				intel_gt_set_wedged(gt);
1444 
1445 				err = -EIO;
1446 				goto fini;
1447 			}
1448 
1449 			reset_count = fake_hangcheck(gt, BIT(id));
1450 
1451 			if (prev->fence.error != -EIO) {
1452 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1453 				       prev->fence.error);
1454 				i915_request_put(rq);
1455 				i915_request_put(prev);
1456 				err = -EINVAL;
1457 				goto fini;
1458 			}
1459 
1460 			if (rq->fence.error) {
1461 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1462 				       rq->fence.error);
1463 				i915_request_put(rq);
1464 				i915_request_put(prev);
1465 				err = -EINVAL;
1466 				goto fini;
1467 			}
1468 
1469 			if (i915_reset_count(global) == reset_count) {
1470 				pr_err("No GPU reset recorded!\n");
1471 				i915_request_put(rq);
1472 				i915_request_put(prev);
1473 				err = -EINVAL;
1474 				goto fini;
1475 			}
1476 
1477 			i915_request_put(prev);
1478 			prev = rq;
1479 			count++;
1480 		} while (time_before(jiffies, end_time));
1481 		pr_info("%s: Completed %d resets\n", engine->name, count);
1482 
1483 		*h.batch = MI_BATCH_BUFFER_END;
1484 		intel_gt_chipset_flush(engine->gt);
1485 
1486 		i915_request_put(prev);
1487 
1488 		err = igt_flush_test(gt->i915);
1489 		if (err)
1490 			break;
1491 	}
1492 
1493 fini:
1494 	hang_fini(&h);
1495 unlock:
1496 	igt_global_reset_unlock(gt);
1497 
1498 	if (intel_gt_is_wedged(gt))
1499 		return -EIO;
1500 
1501 	return err;
1502 }
1503 
igt_handle_error(void * arg)1504 static int igt_handle_error(void *arg)
1505 {
1506 	struct intel_gt *gt = arg;
1507 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1508 	struct intel_engine_cs *engine = gt->engine[RCS0];
1509 	struct hang h;
1510 	struct i915_request *rq;
1511 	struct i915_gpu_coredump *error;
1512 	int err;
1513 
1514 	/* Check that we can issue a global GPU and engine reset */
1515 
1516 	if (!intel_has_reset_engine(gt))
1517 		return 0;
1518 
1519 	if (!engine || !intel_engine_can_store_dword(engine))
1520 		return 0;
1521 
1522 	err = hang_init(&h, gt);
1523 	if (err)
1524 		return err;
1525 
1526 	rq = hang_create_request(&h, engine);
1527 	if (IS_ERR(rq)) {
1528 		err = PTR_ERR(rq);
1529 		goto err_fini;
1530 	}
1531 
1532 	i915_request_get(rq);
1533 	i915_request_add(rq);
1534 
1535 	if (!wait_until_running(&h, rq)) {
1536 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1537 
1538 		pr_err("%s: Failed to start request %llx, at %x\n",
1539 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1540 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1541 
1542 		intel_gt_set_wedged(gt);
1543 
1544 		err = -EIO;
1545 		goto err_request;
1546 	}
1547 
1548 	/* Temporarily disable error capture */
1549 	error = xchg(&global->first_error, (void *)-1);
1550 
1551 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1552 
1553 	xchg(&global->first_error, error);
1554 
1555 	if (rq->fence.error != -EIO) {
1556 		pr_err("Guilty request not identified!\n");
1557 		err = -EINVAL;
1558 		goto err_request;
1559 	}
1560 
1561 err_request:
1562 	i915_request_put(rq);
1563 err_fini:
1564 	hang_fini(&h);
1565 	return err;
1566 }
1567 
__igt_atomic_reset_engine(struct intel_engine_cs * engine,const struct igt_atomic_section * p,const char * mode)1568 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1569 				     const struct igt_atomic_section *p,
1570 				     const char *mode)
1571 {
1572 	struct tasklet_struct * const t = &engine->execlists.tasklet;
1573 	int err;
1574 
1575 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1576 		  engine->name, mode, p->name);
1577 
1578 	tasklet_disable(t);
1579 	p->critical_section_begin();
1580 
1581 	err = intel_engine_reset(engine, NULL);
1582 
1583 	p->critical_section_end();
1584 	tasklet_enable(t);
1585 
1586 	if (err)
1587 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1588 		       engine->name, mode, p->name);
1589 
1590 	return err;
1591 }
1592 
igt_atomic_reset_engine(struct intel_engine_cs * engine,const struct igt_atomic_section * p)1593 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1594 				   const struct igt_atomic_section *p)
1595 {
1596 	struct i915_request *rq;
1597 	struct hang h;
1598 	int err;
1599 
1600 	err = __igt_atomic_reset_engine(engine, p, "idle");
1601 	if (err)
1602 		return err;
1603 
1604 	err = hang_init(&h, engine->gt);
1605 	if (err)
1606 		return err;
1607 
1608 	rq = hang_create_request(&h, engine);
1609 	if (IS_ERR(rq)) {
1610 		err = PTR_ERR(rq);
1611 		goto out;
1612 	}
1613 
1614 	i915_request_get(rq);
1615 	i915_request_add(rq);
1616 
1617 	if (wait_until_running(&h, rq)) {
1618 		err = __igt_atomic_reset_engine(engine, p, "active");
1619 	} else {
1620 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1621 		       __func__, engine->name,
1622 		       rq->fence.seqno, hws_seqno(&h, rq));
1623 		intel_gt_set_wedged(engine->gt);
1624 		err = -EIO;
1625 	}
1626 
1627 	if (err == 0) {
1628 		struct intel_wedge_me w;
1629 
1630 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1631 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1632 		if (intel_gt_is_wedged(engine->gt))
1633 			err = -EIO;
1634 	}
1635 
1636 	i915_request_put(rq);
1637 out:
1638 	hang_fini(&h);
1639 	return err;
1640 }
1641 
igt_reset_engines_atomic(void * arg)1642 static int igt_reset_engines_atomic(void *arg)
1643 {
1644 	struct intel_gt *gt = arg;
1645 	const typeof(*igt_atomic_phases) *p;
1646 	int err = 0;
1647 
1648 	/* Check that the engines resets are usable from atomic context */
1649 
1650 	if (!intel_has_reset_engine(gt))
1651 		return 0;
1652 
1653 	if (intel_uc_uses_guc_submission(&gt->uc))
1654 		return 0;
1655 
1656 	igt_global_reset_lock(gt);
1657 
1658 	/* Flush any requests before we get started and check basics */
1659 	if (!igt_force_reset(gt))
1660 		goto unlock;
1661 
1662 	for (p = igt_atomic_phases; p->name; p++) {
1663 		struct intel_engine_cs *engine;
1664 		enum intel_engine_id id;
1665 
1666 		for_each_engine(engine, gt, id) {
1667 			err = igt_atomic_reset_engine(engine, p);
1668 			if (err)
1669 				goto out;
1670 		}
1671 	}
1672 
1673 out:
1674 	/* As we poke around the guts, do a full reset before continuing. */
1675 	igt_force_reset(gt);
1676 unlock:
1677 	igt_global_reset_unlock(gt);
1678 
1679 	return err;
1680 }
1681 
intel_hangcheck_live_selftests(struct drm_i915_private * i915)1682 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1683 {
1684 	static const struct i915_subtest tests[] = {
1685 		SUBTEST(igt_hang_sanitycheck),
1686 		SUBTEST(igt_reset_nop),
1687 		SUBTEST(igt_reset_nop_engine),
1688 		SUBTEST(igt_reset_idle_engine),
1689 		SUBTEST(igt_reset_active_engine),
1690 		SUBTEST(igt_reset_engines),
1691 		SUBTEST(igt_reset_engines_atomic),
1692 		SUBTEST(igt_reset_queue),
1693 		SUBTEST(igt_reset_wait),
1694 		SUBTEST(igt_reset_evict_ggtt),
1695 		SUBTEST(igt_reset_evict_ppgtt),
1696 		SUBTEST(igt_reset_evict_fence),
1697 		SUBTEST(igt_handle_error),
1698 	};
1699 	struct intel_gt *gt = &i915->gt;
1700 	intel_wakeref_t wakeref;
1701 	int err;
1702 
1703 	if (!intel_has_gpu_reset(gt))
1704 		return 0;
1705 
1706 	if (intel_gt_is_wedged(gt))
1707 		return -EIO; /* we're long past hope of a successful reset */
1708 
1709 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1710 
1711 	err = intel_gt_live_subtests(tests, gt);
1712 
1713 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1714 
1715 	return err;
1716 }
1717