/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"
#include "igt_wedge_me.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

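/*
 * Fixture shared by the hang/reset selftests: @hws is a scratch page used as
 * a stand-in hardware status page into which each hanging batch writes its
 * seqno (via the @seqno CPU map), @obj is the batch object itself (mapped at
 * @batch so tests can later terminate it), and @ctx is the kernel context
 * used to submit the hanging requests.
 */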
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

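/*
 * Build a batch that stores the request's seqno into the scratch "HWS" page
 * and then branches back to its own start, spinning forever. MI_ARB_CHECK is
 * sprinkled in so the loop remains preemptible. The trailing
 * MI_BATCH_BUFFER_END is only reached once a test overwrites the start of
 * the batch, i.e. the request genuinely hangs until it is reset or released.
 */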
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->gem_context->ppgtt ?
		&rq->gem_context->ppgtt->vm :
		&i915->ggtt.vm;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = i915_vma_move_to_active(vma, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	err = i915_vma_move_to_active(hws, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		i915_request_add(rq);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

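/*
 * Poll the scratch page until the spinner has reported its seqno, i.e. the
 * hanging batch has actually started executing on the GPU: first a short
 * 10us busy-wait, then fall back to sleeping for up to a second.
 */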
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

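/*
 * Take exclusive ownership of the reset machinery: claim I915_RESET_BACKOFF
 * and every per-engine reset bit, waiting out any reset already in flight,
 * so that no background reset can race with the selftest.
 */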
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			if (!wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

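/*
 * Phase flags for __igt_reset_engines(): TEST_ACTIVE submits hanging
 * requests on the engine being reset, TEST_OTHERS keeps the other engines
 * busy from background kthreads, TEST_SELF also runs such a kthread on the
 * engine under reset, and TEST_PRIORITY randomises the priority of the
 * background requests.
 */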
#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno,
			  i915_request_global_seqno(rq));
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

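/*
 * Kthread body used to keep an engine busy in the background: maintain a
 * rolling window of up to eight outstanding requests, each on its own
 * context, submitting a new request into the oldest slot and then waiting
 * for the request it displaced, until asked to stop.
 */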
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

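/*
 * Pretend that hangcheck fired: mark which engines are considered stalled,
 * hand the reset off to the waiter via I915_RESET_HANDOFF and return the
 * prior reset count so the caller can verify a reset really happened.
 */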
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

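/*
 * Kthread body that tries to evict the target vma's node. As the vma is
 * busy on the hanging request, the eviction is expected to block until the
 * reset breaks the hang and the request completes.
 */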
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER);
	if (err)
		goto out_obj;

	err = i915_vma_move_to_active(arg.vma, rq, EXEC_OBJECT_WRITE);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(evict_vma, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}

	wait_for_completion(&arg.completion);

	if (wait_for(waitqueue_active(&rq->execute), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	fake_hangcheck(rq, intel_engine_flag(rq->engine));

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
			err = kthread_stop(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	int err;

	mutex_lock(&i915->drm.struct_mutex);
	ctx = kernel_context(i915);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	err = 0;
	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);

	kernel_context_close(ctx);
	return err;
}

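/*
 * Wait for every engine other than @exclude to idle; used by igt_reset_queue
 * to avoid resetting the device while the kernel context is still active on
 * another engine (see the XXX comment below).
 */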
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO; /* we're long past hope of a successful reset */

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}