1 /*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
31
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_requests.h"
37 #include "gt/selftest_engine_heartbeat.h"
38
39 #include "i915_random.h"
40 #include "i915_selftest.h"
41 #include "igt_flush_test.h"
42 #include "igt_live_test.h"
43 #include "igt_spinner.h"
44 #include "lib_sw_fence.h"
45
46 #include "mock_drm.h"
47 #include "mock_gem_device.h"
48
49 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
50 {
51 struct intel_engine_cs *engine;
52 unsigned int count;
53
54 count = 0;
55 for_each_uabi_engine(engine, i915)
56 count++;
57
58 return count;
59 }
60
61 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
62 {
63 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
64 }
65
66 static int igt_add_request(void *arg)
67 {
68 struct drm_i915_private *i915 = arg;
69 struct i915_request *request;
70
71 /* Basic preliminary test to create a request and let it loose! */
72
73 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
74 if (!request)
75 return -ENOMEM;
76
77 i915_request_add(request);
78
79 return 0;
80 }
81
82 static int igt_wait_request(void *arg)
83 {
84 const long T = HZ / 4;
85 struct drm_i915_private *i915 = arg;
86 struct i915_request *request;
87 int err = -EINVAL;
88
89 /* Submit a request, then wait upon it */
90
91 request = mock_request(rcs0(i915)->kernel_context, T);
92 if (!request)
93 return -ENOMEM;
94
95 i915_request_get(request);
96
97 if (i915_request_wait(request, 0, 0) != -ETIME) {
98 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
99 goto out_request;
100 }
101
102 if (i915_request_wait(request, 0, T) != -ETIME) {
103 pr_err("request wait succeeded (expected timeout before submit!)\n");
104 goto out_request;
105 }
106
107 if (i915_request_completed(request)) {
108 pr_err("request completed before submit!!\n");
109 goto out_request;
110 }
111
112 i915_request_add(request);
113
114 if (i915_request_wait(request, 0, 0) != -ETIME) {
115 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
116 goto out_request;
117 }
118
119 if (i915_request_completed(request)) {
120 pr_err("request completed immediately!\n");
121 goto out_request;
122 }
123
124 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
125 pr_err("request wait succeeded (expected timeout!)\n");
126 goto out_request;
127 }
128
129 if (i915_request_wait(request, 0, T) == -ETIME) {
130 pr_err("request wait timed out!\n");
131 goto out_request;
132 }
133
134 if (!i915_request_completed(request)) {
135 pr_err("request not complete after waiting!\n");
136 goto out_request;
137 }
138
139 if (i915_request_wait(request, 0, T) == -ETIME) {
140 pr_err("request wait timed out when already complete!\n");
141 goto out_request;
142 }
143
144 err = 0;
145 out_request:
146 i915_request_put(request);
147 mock_device_flush(i915);
148 return err;
149 }
150
151 static int igt_fence_wait(void *arg)
152 {
153 const long T = HZ / 4;
154 struct drm_i915_private *i915 = arg;
155 struct i915_request *request;
156 int err = -EINVAL;
157
158 /* Submit a request, treat it as a fence and wait upon it */
159
160 request = mock_request(rcs0(i915)->kernel_context, T);
161 if (!request)
162 return -ENOMEM;
163
164 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
165 pr_err("fence wait success before submit (expected timeout)!\n");
166 goto out;
167 }
168
169 i915_request_add(request);
170
171 if (dma_fence_is_signaled(&request->fence)) {
172 pr_err("fence signaled immediately!\n");
173 goto out;
174 }
175
176 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
177 pr_err("fence wait success after submit (expected timeout)!\n");
178 goto out;
179 }
180
181 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
182 pr_err("fence wait timed out (expected success)!\n");
183 goto out;
184 }
185
186 if (!dma_fence_is_signaled(&request->fence)) {
187 pr_err("fence unsignaled after waiting!\n");
188 goto out;
189 }
190
191 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
192 pr_err("fence wait timed out when complete (expected success)!\n");
193 goto out;
194 }
195
196 err = 0;
197 out:
198 mock_device_flush(i915);
199 return err;
200 }
201
202 static int igt_request_rewind(void *arg)
203 {
204 struct drm_i915_private *i915 = arg;
205 struct i915_request *request, *vip;
206 struct i915_gem_context *ctx[2];
207 struct intel_context *ce;
208 int err = -EINVAL;
209
210 ctx[0] = mock_context(i915, "A");
211
212 ce = i915_gem_context_get_engine(ctx[0], RCS0);
213 GEM_BUG_ON(IS_ERR(ce));
214 request = mock_request(ce, 2 * HZ);
215 intel_context_put(ce);
216 if (!request) {
217 err = -ENOMEM;
218 goto err_context_0;
219 }
220
221 i915_request_get(request);
222 i915_request_add(request);
223
224 ctx[1] = mock_context(i915, "B");
225
226 ce = i915_gem_context_get_engine(ctx[1], RCS0);
227 GEM_BUG_ON(IS_ERR(ce));
228 vip = mock_request(ce, 0);
229 intel_context_put(ce);
230 if (!vip) {
231 err = -ENOMEM;
232 goto err_context_1;
233 }
234
235 /* Simulate preemption by manual reordering */
236 if (!mock_cancel_request(request)) {
237 pr_err("failed to cancel request (already executed)!\n");
238 i915_request_add(vip);
239 goto err_context_1;
240 }
241 i915_request_get(vip);
242 i915_request_add(vip);
243 rcu_read_lock();
244 request->engine->submit_request(request);
245 rcu_read_unlock();
246
247
248 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
249 pr_err("timed out waiting for high priority request\n");
250 goto err;
251 }
252
253 if (i915_request_completed(request)) {
254 pr_err("low priority request already completed\n");
255 goto err;
256 }
257
258 err = 0;
259 err:
260 i915_request_put(vip);
261 err_context_1:
262 mock_context_close(ctx[1]);
263 i915_request_put(request);
264 err_context_0:
265 mock_context_close(ctx[0]);
266 mock_device_flush(i915);
267 return err;
268 }
269
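/*
 * Parameters shared by each breadcrumbs smoketest worker: the engine under
 * test, a pool of contexts to allocate requests from, the per-iteration
 * batch limit, a request allocator (mock or live), and counters of how many
 * waits and fences each worker managed to exercise.
 */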
270 struct smoketest {
271 struct intel_engine_cs *engine;
272 struct i915_gem_context **contexts;
273 atomic_long_t num_waits, num_fences;
274 int ncontexts, max_batch;
275 struct i915_request *(*request_alloc)(struct intel_context *ce);
276 };
277
278 static struct i915_request *
279 __mock_request_alloc(struct intel_context *ce)
280 {
281 return mock_request(ce, 0);
282 }
283
284 static struct i915_request *
285 __live_request_alloc(struct intel_context *ce)
286 {
287 return intel_context_create_request(ce);
288 }
289
290 static int __igt_breadcrumbs_smoketest(void *arg)
291 {
292 struct smoketest *t = arg;
293 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
294 const unsigned int total = 4 * t->ncontexts + 1;
295 unsigned int num_waits = 0, num_fences = 0;
296 struct i915_request **requests;
297 I915_RND_STATE(prng);
298 unsigned int *order;
299 int err = 0;
300
301 /*
302 * A very simple test to catch the most egregious of list handling bugs.
303 *
304 * At its heart, we simply create oodles of requests running across
305 * multiple kthreads and enable signaling on them, for the sole purpose
306 * of stressing our breadcrumb handling. The only inspection we do is
307 * that the fences were marked as signaled.
308 */
309
310 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
311 if (!requests)
312 return -ENOMEM;
313
314 order = i915_random_order(total, &prng);
315 if (!order) {
316 err = -ENOMEM;
317 goto out_requests;
318 }
319
320 while (!kthread_should_stop()) {
321 struct i915_sw_fence *submit, *wait;
322 unsigned int n, count;
323
324 submit = heap_fence_create(GFP_KERNEL);
325 if (!submit) {
326 err = -ENOMEM;
327 break;
328 }
329
330 wait = heap_fence_create(GFP_KERNEL);
331 if (!wait) {
332 i915_sw_fence_commit(submit);
333 heap_fence_put(submit);
334 err = -ENOMEM;
335 break;
336 }
337
338 i915_random_reorder(order, total, &prng);
339 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
340
341 for (n = 0; n < count; n++) {
342 struct i915_gem_context *ctx =
343 t->contexts[order[n] % t->ncontexts];
344 struct i915_request *rq;
345 struct intel_context *ce;
346
347 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
348 GEM_BUG_ON(IS_ERR(ce));
349 rq = t->request_alloc(ce);
350 intel_context_put(ce);
351 if (IS_ERR(rq)) {
352 err = PTR_ERR(rq);
353 count = n;
354 break;
355 }
356
357 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
358 submit,
359 GFP_KERNEL);
360
361 requests[n] = i915_request_get(rq);
362 i915_request_add(rq);
363
364 if (err >= 0)
365 err = i915_sw_fence_await_dma_fence(wait,
366 &rq->fence,
367 0,
368 GFP_KERNEL);
369
370 if (err < 0) {
371 i915_request_put(rq);
372 count = n;
373 break;
374 }
375 }
376
377 i915_sw_fence_commit(submit);
378 i915_sw_fence_commit(wait);
379
380 if (!wait_event_timeout(wait->wait,
381 i915_sw_fence_done(wait),
382 5 * HZ)) {
383 struct i915_request *rq = requests[count - 1];
384
385 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
386 atomic_read(&wait->pending), count,
387 rq->fence.context, rq->fence.seqno,
388 t->engine->name);
389 GEM_TRACE_DUMP();
390
391 intel_gt_set_wedged(t->engine->gt);
392 GEM_BUG_ON(!i915_request_completed(rq));
393 i915_sw_fence_wait(wait);
394 err = -EIO;
395 }
396
397 for (n = 0; n < count; n++) {
398 struct i915_request *rq = requests[n];
399
400 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
401 &rq->fence.flags)) {
402 pr_err("%llu:%llu was not signaled!\n",
403 rq->fence.context, rq->fence.seqno);
404 err = -EINVAL;
405 }
406
407 i915_request_put(rq);
408 }
409
410 heap_fence_put(wait);
411 heap_fence_put(submit);
412
413 if (err < 0)
414 break;
415
416 num_fences += count;
417 num_waits++;
418
419 cond_resched();
420 }
421
422 atomic_long_add(num_fences, &t->num_fences);
423 atomic_long_add(num_waits, &t->num_waits);
424
425 kfree(order);
426 out_requests:
427 kfree(requests);
428 return err;
429 }
430
431 static int mock_breadcrumbs_smoketest(void *arg)
432 {
433 struct drm_i915_private *i915 = arg;
434 struct smoketest t = {
435 .engine = rcs0(i915),
436 .ncontexts = 1024,
437 .max_batch = 1024,
438 .request_alloc = __mock_request_alloc
439 };
440 unsigned int ncpus = num_online_cpus();
441 struct task_struct **threads;
442 unsigned int n;
443 int ret = 0;
444
445 /*
446 * Smoketest our breadcrumb/signal handling for requests across multiple
447 * threads. A very simple test to only catch the most egregious of bugs.
448 * See __igt_breadcrumbs_smoketest();
449 */
450
451 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
452 if (!threads)
453 return -ENOMEM;
454
455 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
456 if (!t.contexts) {
457 ret = -ENOMEM;
458 goto out_threads;
459 }
460
461 for (n = 0; n < t.ncontexts; n++) {
462 t.contexts[n] = mock_context(t.engine->i915, "mock");
463 if (!t.contexts[n]) {
464 ret = -ENOMEM;
465 goto out_contexts;
466 }
467 }
468
469 for (n = 0; n < ncpus; n++) {
470 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
471 &t, "igt/%d", n);
472 if (IS_ERR(threads[n])) {
473 ret = PTR_ERR(threads[n]);
474 ncpus = n;
475 break;
476 }
477
478 get_task_struct(threads[n]);
479 }
480
481 yield(); /* start all threads before we begin */
482 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
483
484 for (n = 0; n < ncpus; n++) {
485 int err;
486
487 err = kthread_stop(threads[n]);
488 if (err < 0 && !ret)
489 ret = err;
490
491 put_task_struct(threads[n]);
492 }
493 pr_info("Completed %lu waits for %lu fence across %d cpus\n",
494 atomic_long_read(&t.num_waits),
495 atomic_long_read(&t.num_fences),
496 ncpus);
497
498 out_contexts:
499 for (n = 0; n < t.ncontexts; n++) {
500 if (!t.contexts[n])
501 break;
502 mock_context_close(t.contexts[n]);
503 }
504 kfree(t.contexts);
505 out_threads:
506 kfree(threads);
507 return ret;
508 }
509
510 int i915_request_mock_selftests(void)
511 {
512 static const struct i915_subtest tests[] = {
513 SUBTEST(igt_add_request),
514 SUBTEST(igt_wait_request),
515 SUBTEST(igt_fence_wait),
516 SUBTEST(igt_request_rewind),
517 SUBTEST(mock_breadcrumbs_smoketest),
518 };
519 struct drm_i915_private *i915;
520 intel_wakeref_t wakeref;
521 int err = 0;
522
523 i915 = mock_gem_device();
524 if (!i915)
525 return -ENOMEM;
526
527 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
528 err = i915_subtests(tests, i915);
529
530 mock_destroy_device(i915);
531
532 return err;
533 }
534
535 static int live_nop_request(void *arg)
536 {
537 struct drm_i915_private *i915 = arg;
538 struct intel_engine_cs *engine;
539 struct igt_live_test t;
540 int err = -ENODEV;
541
542 /*
543 * Submit various sized batches of empty requests, to each engine
544 * (individually), and wait for the batch to complete. We can check
545 * the overhead of submitting requests to the hardware.
546 */
547
548 for_each_uabi_engine(engine, i915) {
549 unsigned long n, prime;
550 IGT_TIMEOUT(end_time);
551 ktime_t times[2] = {};
552
553 err = igt_live_test_begin(&t, i915, __func__, engine->name);
554 if (err)
555 return err;
556
557 intel_engine_pm_get(engine);
558 for_each_prime_number_from(prime, 1, 8192) {
559 struct i915_request *request = NULL;
560
561 times[1] = ktime_get_raw();
562
563 for (n = 0; n < prime; n++) {
564 i915_request_put(request);
565 request = i915_request_create(engine->kernel_context);
566 if (IS_ERR(request))
567 return PTR_ERR(request);
568
569 /*
570 * This space is left intentionally blank.
571 *
572 * We do not actually want to perform any
573 * action with this request, we just want
574 * to measure the latency in allocation
575 * and submission of our breadcrumbs -
576 * ensuring that the bare request is sufficient
577 * for the system to work (i.e. proper HEAD
578 * tracking of the rings, interrupt handling,
579 * etc). It also gives us the lowest bounds
580 * for latency.
581 */
582
583 i915_request_get(request);
584 i915_request_add(request);
585 }
586 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
587 i915_request_put(request);
588
589 times[1] = ktime_sub(ktime_get_raw(), times[1]);
590 if (prime == 1)
591 times[0] = times[1];
592
593 if (__igt_timeout(end_time, NULL))
594 break;
595 }
596 intel_engine_pm_put(engine);
597
598 err = igt_live_test_end(&t);
599 if (err)
600 return err;
601
602 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
603 engine->name,
604 ktime_to_ns(times[0]),
605 prime, div64_u64(ktime_to_ns(times[1]), prime));
606 }
607
608 return err;
609 }
610
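/*
 * Build a single-page batch containing only MI_BATCH_BUFFER_END, pinned into
 * the global GTT, so that executing it measures submission overhead rather
 * than any real GPU work.
 */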
611 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
612 {
613 struct drm_i915_gem_object *obj;
614 struct i915_vma *vma;
615 u32 *cmd;
616 int err;
617
618 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
619 if (IS_ERR(obj))
620 return ERR_CAST(obj);
621
622 cmd = i915_gem_object_pin_map(obj, I915_MAP_WB);
623 if (IS_ERR(cmd)) {
624 err = PTR_ERR(cmd);
625 goto err;
626 }
627
628 *cmd = MI_BATCH_BUFFER_END;
629
630 __i915_gem_object_flush_map(obj, 0, 64);
631 i915_gem_object_unpin_map(obj);
632
633 intel_gt_chipset_flush(&i915->gt);
634
635 vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
636 if (IS_ERR(vma)) {
637 err = PTR_ERR(vma);
638 goto err;
639 }
640
641 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
642 if (err)
643 goto err;
644
645 /* Force the wait now to avoid including it in the benchmark */
646 err = i915_vma_sync(vma);
647 if (err)
648 goto err_pin;
649
650 return vma;
651
652 err_pin:
653 i915_vma_unpin(vma);
654 err:
655 i915_gem_object_put(obj);
656 return ERR_PTR(err);
657 }
658
659 static struct i915_request *
660 empty_request(struct intel_engine_cs *engine,
661 struct i915_vma *batch)
662 {
663 struct i915_request *request;
664 int err;
665
666 request = i915_request_create(engine->kernel_context);
667 if (IS_ERR(request))
668 return request;
669
670 err = engine->emit_bb_start(request,
671 batch->node.start,
672 batch->node.size,
673 I915_DISPATCH_SECURE);
674 if (err)
675 goto out_request;
676
677 i915_request_get(request);
678 out_request:
679 i915_request_add(request);
680 return err ? ERR_PTR(err) : request;
681 }
682
683 static int live_empty_request(void *arg)
684 {
685 struct drm_i915_private *i915 = arg;
686 struct intel_engine_cs *engine;
687 struct igt_live_test t;
688 struct i915_vma *batch;
689 int err = 0;
690
691 /*
692 * Submit various sized batches of empty requests, to each engine
693 * (individually), and wait for the batch to complete. We can check
694 * the overhead of submitting requests to the hardware.
695 */
696
697 batch = empty_batch(i915);
698 if (IS_ERR(batch))
699 return PTR_ERR(batch);
700
701 for_each_uabi_engine(engine, i915) {
702 IGT_TIMEOUT(end_time);
703 struct i915_request *request;
704 unsigned long n, prime;
705 ktime_t times[2] = {};
706
707 err = igt_live_test_begin(&t, i915, __func__, engine->name);
708 if (err)
709 goto out_batch;
710
711 intel_engine_pm_get(engine);
712
713 /* Warmup / preload */
714 request = empty_request(engine, batch);
715 if (IS_ERR(request)) {
716 err = PTR_ERR(request);
717 intel_engine_pm_put(engine);
718 goto out_batch;
719 }
720 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
721
722 for_each_prime_number_from(prime, 1, 8192) {
723 times[1] = ktime_get_raw();
724
725 for (n = 0; n < prime; n++) {
726 i915_request_put(request);
727 request = empty_request(engine, batch);
728 if (IS_ERR(request)) {
729 err = PTR_ERR(request);
730 intel_engine_pm_put(engine);
731 goto out_batch;
732 }
733 }
734 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
735
736 times[1] = ktime_sub(ktime_get_raw(), times[1]);
737 if (prime == 1)
738 times[0] = times[1];
739
740 if (__igt_timeout(end_time, NULL))
741 break;
742 }
743 i915_request_put(request);
744 intel_engine_pm_put(engine);
745
746 err = igt_live_test_end(&t);
747 if (err)
748 goto out_batch;
749
750 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
751 engine->name,
752 ktime_to_ns(times[0]),
753 prime, div64_u64(ktime_to_ns(times[1]), prime));
754 }
755
756 out_batch:
757 i915_vma_unpin(batch);
758 i915_vma_put(batch);
759 return err;
760 }
761
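/*
 * Build a batch that jumps back to its own start and so spins forever, until
 * recursive_batch_resolve() overwrites the first dword with
 * MI_BATCH_BUFFER_END. This keeps a request busy on each engine without any
 * CPU involvement.
 */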
762 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
763 {
764 struct drm_i915_gem_object *obj;
765 const int gen = INTEL_GEN(i915);
766 struct i915_vma *vma;
767 u32 *cmd;
768 int err;
769
770 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
771 if (IS_ERR(obj))
772 return ERR_CAST(obj);
773
774 vma = i915_vma_instance(obj, i915->gt.vm, NULL);
775 if (IS_ERR(vma)) {
776 err = PTR_ERR(vma);
777 goto err;
778 }
779
780 err = i915_vma_pin(vma, 0, 0, PIN_USER);
781 if (err)
782 goto err;
783
784 cmd = i915_gem_object_pin_map(obj, I915_MAP_WC);
785 if (IS_ERR(cmd)) {
786 err = PTR_ERR(cmd);
787 goto err;
788 }
789
790 if (gen >= 8) {
791 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
792 *cmd++ = lower_32_bits(vma->node.start);
793 *cmd++ = upper_32_bits(vma->node.start);
794 } else if (gen >= 6) {
795 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
796 *cmd++ = lower_32_bits(vma->node.start);
797 } else {
798 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
799 *cmd++ = lower_32_bits(vma->node.start);
800 }
801 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
802
803 __i915_gem_object_flush_map(obj, 0, 64);
804 i915_gem_object_unpin_map(obj);
805
806 intel_gt_chipset_flush(&i915->gt);
807
808 return vma;
809
810 err:
811 i915_gem_object_put(obj);
812 return ERR_PTR(err);
813 }
814
815 static int recursive_batch_resolve(struct i915_vma *batch)
816 {
817 u32 *cmd;
818
819 cmd = i915_gem_object_pin_map(batch->obj, I915_MAP_WC);
820 if (IS_ERR(cmd))
821 return PTR_ERR(cmd);
822
823 *cmd = MI_BATCH_BUFFER_END;
824
825 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
826 i915_gem_object_unpin_map(batch->obj);
827
828 intel_gt_chipset_flush(batch->vm->gt);
829
830 return 0;
831 }
832
833 static int live_all_engines(void *arg)
834 {
835 struct drm_i915_private *i915 = arg;
836 const unsigned int nengines = num_uabi_engines(i915);
837 struct intel_engine_cs *engine;
838 struct i915_request **request;
839 struct igt_live_test t;
840 struct i915_vma *batch;
841 unsigned int idx;
842 int err;
843
844 /*
845 * Check we can submit requests to all engines simultaneously. We
846 * send a recursive batch to each engine - checking that we don't
847 * block doing so, and that they don't complete too soon.
848 */
849
850 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
851 if (!request)
852 return -ENOMEM;
853
854 err = igt_live_test_begin(&t, i915, __func__, "");
855 if (err)
856 goto out_free;
857
858 batch = recursive_batch(i915);
859 if (IS_ERR(batch)) {
860 err = PTR_ERR(batch);
861 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
862 goto out_free;
863 }
864
865 i915_vma_lock(batch);
866
867 idx = 0;
868 for_each_uabi_engine(engine, i915) {
869 request[idx] = intel_engine_create_kernel_request(engine);
870 if (IS_ERR(request[idx])) {
871 err = PTR_ERR(request[idx]);
872 pr_err("%s: Request allocation failed with err=%d\n",
873 __func__, err);
874 goto out_request;
875 }
876
877 err = i915_request_await_object(request[idx], batch->obj, 0);
878 if (err == 0)
879 err = i915_vma_move_to_active(batch, request[idx], 0);
880 GEM_BUG_ON(err);
881
882 err = engine->emit_bb_start(request[idx],
883 batch->node.start,
884 batch->node.size,
885 0);
886 GEM_BUG_ON(err);
887 request[idx]->batch = batch;
888
889 i915_request_get(request[idx]);
890 i915_request_add(request[idx]);
891 idx++;
892 }
893
894 i915_vma_unlock(batch);
895
896 idx = 0;
897 for_each_uabi_engine(engine, i915) {
898 if (i915_request_completed(request[idx])) {
899 pr_err("%s(%s): request completed too early!\n",
900 __func__, engine->name);
901 err = -EINVAL;
902 goto out_request;
903 }
904 idx++;
905 }
906
907 err = recursive_batch_resolve(batch);
908 if (err) {
909 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
910 goto out_request;
911 }
912
913 idx = 0;
914 for_each_uabi_engine(engine, i915) {
915 long timeout;
916
917 timeout = i915_request_wait(request[idx], 0,
918 MAX_SCHEDULE_TIMEOUT);
919 if (timeout < 0) {
920 err = timeout;
921 pr_err("%s: error waiting for request on %s, err=%d\n",
922 __func__, engine->name, err);
923 goto out_request;
924 }
925
926 GEM_BUG_ON(!i915_request_completed(request[idx]));
927 i915_request_put(request[idx]);
928 request[idx] = NULL;
929 idx++;
930 }
931
932 err = igt_live_test_end(&t);
933
934 out_request:
935 idx = 0;
936 for_each_uabi_engine(engine, i915) {
937 if (request[idx])
938 i915_request_put(request[idx]);
939 idx++;
940 }
941 i915_vma_unpin(batch);
942 i915_vma_put(batch);
943 out_free:
944 kfree(request);
945 return err;
946 }
947
948 static int live_sequential_engines(void *arg)
949 {
950 struct drm_i915_private *i915 = arg;
951 const unsigned int nengines = num_uabi_engines(i915);
952 struct i915_request **request;
953 struct i915_request *prev = NULL;
954 struct intel_engine_cs *engine;
955 struct igt_live_test t;
956 unsigned int idx;
957 int err;
958
959 /*
960 * Check we can submit requests to all engines sequentially, such
961 * that each successive request waits for the earlier ones. This
962 * tests that we don't execute requests out of order, even though
963 * they are running on independent engines.
964 */
965
966 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
967 if (!request)
968 return -ENOMEM;
969
970 err = igt_live_test_begin(&t, i915, __func__, "");
971 if (err)
972 goto out_free;
973
974 idx = 0;
975 for_each_uabi_engine(engine, i915) {
976 struct i915_vma *batch;
977
978 batch = recursive_batch(i915);
979 if (IS_ERR(batch)) {
980 err = PTR_ERR(batch);
981 pr_err("%s: Unable to create batch for %s, err=%d\n",
982 __func__, engine->name, err);
983 goto out_free;
984 }
985
986 i915_vma_lock(batch);
987 request[idx] = intel_engine_create_kernel_request(engine);
988 if (IS_ERR(request[idx])) {
989 err = PTR_ERR(request[idx]);
990 pr_err("%s: Request allocation failed for %s with err=%d\n",
991 __func__, engine->name, err);
992 goto out_unlock;
993 }
994
995 if (prev) {
996 err = i915_request_await_dma_fence(request[idx],
997 &prev->fence);
998 if (err) {
999 i915_request_add(request[idx]);
1000 pr_err("%s: Request await failed for %s with err=%d\n",
1001 __func__, engine->name, err);
1002 goto out_unlock;
1003 }
1004 }
1005
1006 err = i915_request_await_object(request[idx],
1007 batch->obj, false);
1008 if (err == 0)
1009 err = i915_vma_move_to_active(batch, request[idx], 0);
1010 GEM_BUG_ON(err);
1011
1012 err = engine->emit_bb_start(request[idx],
1013 batch->node.start,
1014 batch->node.size,
1015 0);
1016 GEM_BUG_ON(err);
1017 request[idx]->batch = batch;
1018
1019 i915_request_get(request[idx]);
1020 i915_request_add(request[idx]);
1021
1022 prev = request[idx];
1023 idx++;
1024
1025 out_unlock:
1026 i915_vma_unlock(batch);
1027 if (err)
1028 goto out_request;
1029 }
1030
1031 idx = 0;
1032 for_each_uabi_engine(engine, i915) {
1033 long timeout;
1034
1035 if (i915_request_completed(request[idx])) {
1036 pr_err("%s(%s): request completed too early!\n",
1037 __func__, engine->name);
1038 err = -EINVAL;
1039 goto out_request;
1040 }
1041
1042 err = recursive_batch_resolve(request[idx]->batch);
1043 if (err) {
1044 pr_err("%s: failed to resolve batch, err=%d\n",
1045 __func__, err);
1046 goto out_request;
1047 }
1048
1049 timeout = i915_request_wait(request[idx], 0,
1050 MAX_SCHEDULE_TIMEOUT);
1051 if (timeout < 0) {
1052 err = timeout;
1053 pr_err("%s: error waiting for request on %s, err=%d\n",
1054 __func__, engine->name, err);
1055 goto out_request;
1056 }
1057
1058 GEM_BUG_ON(!i915_request_completed(request[idx]));
1059 idx++;
1060 }
1061
1062 err = igt_live_test_end(&t);
1063
1064 out_request:
1065 idx = 0;
1066 for_each_uabi_engine(engine, i915) {
1067 u32 *cmd;
1068
1069 if (!request[idx])
1070 break;
1071
1072 cmd = i915_gem_object_pin_map(request[idx]->batch->obj,
1073 I915_MAP_WC);
1074 if (!IS_ERR(cmd)) {
1075 *cmd = MI_BATCH_BUFFER_END;
1076
1077 __i915_gem_object_flush_map(request[idx]->batch->obj,
1078 0, sizeof(*cmd));
1079 i915_gem_object_unpin_map(request[idx]->batch->obj);
1080
1081 intel_gt_chipset_flush(engine->gt);
1082 }
1083
1084 i915_vma_put(request[idx]->batch);
1085 i915_request_put(request[idx]);
1086 idx++;
1087 }
1088 out_free:
1089 kfree(request);
1090 return err;
1091 }
1092
1093 static int __live_parallel_engine1(void *arg)
1094 {
1095 struct intel_engine_cs *engine = arg;
1096 IGT_TIMEOUT(end_time);
1097 unsigned long count;
1098 int err = 0;
1099
1100 count = 0;
1101 intel_engine_pm_get(engine);
1102 do {
1103 struct i915_request *rq;
1104
1105 rq = i915_request_create(engine->kernel_context);
1106 if (IS_ERR(rq)) {
1107 err = PTR_ERR(rq);
1108 break;
1109 }
1110
1111 i915_request_get(rq);
1112 i915_request_add(rq);
1113
1114 err = 0;
1115 if (i915_request_wait(rq, 0, HZ / 5) < 0)
1116 err = -ETIME;
1117 i915_request_put(rq);
1118 if (err)
1119 break;
1120
1121 count++;
1122 } while (!__igt_timeout(end_time, NULL));
1123 intel_engine_pm_put(engine);
1124
1125 pr_info("%s: %lu request + sync\n", engine->name, count);
1126 return err;
1127 }
1128
1129 static int __live_parallel_engineN(void *arg)
1130 {
1131 struct intel_engine_cs *engine = arg;
1132 IGT_TIMEOUT(end_time);
1133 unsigned long count;
1134 int err = 0;
1135
1136 count = 0;
1137 intel_engine_pm_get(engine);
1138 do {
1139 struct i915_request *rq;
1140
1141 rq = i915_request_create(engine->kernel_context);
1142 if (IS_ERR(rq)) {
1143 err = PTR_ERR(rq);
1144 break;
1145 }
1146
1147 i915_request_add(rq);
1148 count++;
1149 } while (!__igt_timeout(end_time, NULL));
1150 intel_engine_pm_put(engine);
1151
1152 pr_info("%s: %lu requests\n", engine->name, count);
1153 return err;
1154 }
1155
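/*
 * wake_all()/wait_for_all() form a simple barrier on i915->selftest.counter:
 * each parallel worker decrements the counter when it is ready, and the
 * spinner workers block until every engine has checked in (or the selftest
 * timeout expires).
 */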
1156 static bool wake_all(struct drm_i915_private *i915)
1157 {
1158 if (atomic_dec_and_test(&i915->selftest.counter)) {
1159 wake_up_var(&i915->selftest.counter);
1160 return true;
1161 }
1162
1163 return false;
1164 }
1165
1166 static int wait_for_all(struct drm_i915_private *i915)
1167 {
1168 if (wake_all(i915))
1169 return 0;
1170
1171 if (wait_var_event_timeout(&i915->selftest.counter,
1172 !atomic_read(&i915->selftest.counter),
1173 i915_selftest.timeout_jiffies))
1174 return 0;
1175
1176 return -ETIME;
1177 }
1178
1179 static int __live_parallel_spin(void *arg)
1180 {
1181 struct intel_engine_cs *engine = arg;
1182 struct igt_spinner spin;
1183 struct i915_request *rq;
1184 int err = 0;
1185
1186 /*
1187 * Create a spinner running for eternity on each engine. If a second
1188 * spinner is incorrectly placed on the same engine, it will not be
1189 * able to start in time.
1190 */
1191
1192 if (igt_spinner_init(&spin, engine->gt)) {
1193 wake_all(engine->i915);
1194 return -ENOMEM;
1195 }
1196
1197 intel_engine_pm_get(engine);
1198 rq = igt_spinner_create_request(&spin,
1199 engine->kernel_context,
1200 MI_NOOP); /* no preemption */
1201 intel_engine_pm_put(engine);
1202 if (IS_ERR(rq)) {
1203 err = PTR_ERR(rq);
1204 if (err == -ENODEV)
1205 err = 0;
1206 wake_all(engine->i915);
1207 goto out_spin;
1208 }
1209
1210 i915_request_get(rq);
1211 i915_request_add(rq);
1212 if (igt_wait_for_spinner(&spin, rq)) {
1213 /* Occupy this engine for the whole test */
1214 err = wait_for_all(engine->i915);
1215 } else {
1216 pr_err("Failed to start spinner on %s\n", engine->name);
1217 err = -EINVAL;
1218 }
1219 igt_spinner_end(&spin);
1220
1221 if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1222 err = -EIO;
1223 i915_request_put(rq);
1224
1225 out_spin:
1226 igt_spinner_fini(&spin);
1227 return err;
1228 }
1229
1230 static int live_parallel_engines(void *arg)
1231 {
1232 struct drm_i915_private *i915 = arg;
1233 static int (* const func[])(void *arg) = {
1234 __live_parallel_engine1,
1235 __live_parallel_engineN,
1236 __live_parallel_spin,
1237 NULL,
1238 };
1239 const unsigned int nengines = num_uabi_engines(i915);
1240 struct intel_engine_cs *engine;
1241 int (* const *fn)(void *arg);
1242 struct task_struct **tsk;
1243 int err = 0;
1244
1245 /*
1246 * Check we can submit requests to all engines concurrently. This
1247 * tests that we load up the system maximally.
1248 */
1249
1250 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1251 if (!tsk)
1252 return -ENOMEM;
1253
1254 for (fn = func; !err && *fn; fn++) {
1255 char name[KSYM_NAME_LEN];
1256 struct igt_live_test t;
1257 unsigned int idx;
1258
1259 snprintf(name, sizeof(name), "%ps", *fn);
1260 err = igt_live_test_begin(&t, i915, __func__, name);
1261 if (err)
1262 break;
1263
1264 atomic_set(&i915->selftest.counter, nengines);
1265
1266 idx = 0;
1267 for_each_uabi_engine(engine, i915) {
1268 tsk[idx] = kthread_run(*fn, engine,
1269 "igt/parallel:%s",
1270 engine->name);
1271 if (IS_ERR(tsk[idx])) {
1272 err = PTR_ERR(tsk[idx]);
1273 break;
1274 }
1275 get_task_struct(tsk[idx++]);
1276 }
1277
1278 yield(); /* start all threads before we kthread_stop() */
1279
1280 idx = 0;
1281 for_each_uabi_engine(engine, i915) {
1282 int status;
1283
1284 if (IS_ERR(tsk[idx]))
1285 break;
1286
1287 status = kthread_stop(tsk[idx]);
1288 if (status && !err)
1289 err = status;
1290
1291 put_task_struct(tsk[idx++]);
1292 }
1293
1294 if (igt_live_test_end(&t))
1295 err = -EIO;
1296 }
1297
1298 kfree(tsk);
1299 return err;
1300 }
1301
1302 static int
1303 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1304 {
1305 struct i915_request *rq;
1306 int ret;
1307
1308 /*
1309 * Before execlists, all contexts share the same ringbuffer. With
1310 * execlists, each context/engine has a separate ringbuffer and
1311 * for the purposes of this test, inexhaustible.
1312 *
1313 * For the global ringbuffer though, we have to be very careful
1314 * that we do not wrap while preventing the execution of requests
1315 * with an unsignaled fence.
1316 */
1317 if (HAS_EXECLISTS(ctx->i915))
1318 return INT_MAX;
1319
1320 rq = igt_request_alloc(ctx, engine);
1321 if (IS_ERR(rq)) {
1322 ret = PTR_ERR(rq);
1323 } else {
1324 int sz;
1325
1326 ret = rq->ring->size - rq->reserved_space;
1327 i915_request_add(rq);
1328
1329 sz = rq->ring->emit - rq->head;
1330 if (sz < 0)
1331 sz += rq->ring->size;
1332 ret /= sz;
1333 ret /= 2; /* leave half spare, in case of emergency! */
1334 }
1335
1336 return ret;
1337 }
1338
1339 static int live_breadcrumbs_smoketest(void *arg)
1340 {
1341 struct drm_i915_private *i915 = arg;
1342 const unsigned int nengines = num_uabi_engines(i915);
1343 const unsigned int ncpus = num_online_cpus();
1344 unsigned long num_waits, num_fences;
1345 struct intel_engine_cs *engine;
1346 struct task_struct **threads;
1347 struct igt_live_test live;
1348 intel_wakeref_t wakeref;
1349 struct smoketest *smoke;
1350 unsigned int n, idx;
1351 struct file *file;
1352 int ret = 0;
1353
1354 /*
1355 * Smoketest our breadcrumb/signal handling for requests across multiple
1356 * threads. A very simple test to only catch the most egregious of bugs.
1357 * See __igt_breadcrumbs_smoketest();
1358 *
1359 * On real hardware this time.
1360 */
1361
1362 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1363
1364 file = mock_file(i915);
1365 if (IS_ERR(file)) {
1366 ret = PTR_ERR(file);
1367 goto out_rpm;
1368 }
1369
1370 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1371 if (!smoke) {
1372 ret = -ENOMEM;
1373 goto out_file;
1374 }
1375
1376 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1377 if (!threads) {
1378 ret = -ENOMEM;
1379 goto out_smoke;
1380 }
1381
1382 smoke[0].request_alloc = __live_request_alloc;
1383 smoke[0].ncontexts = 64;
1384 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1385 sizeof(*smoke[0].contexts),
1386 GFP_KERNEL);
1387 if (!smoke[0].contexts) {
1388 ret = -ENOMEM;
1389 goto out_threads;
1390 }
1391
1392 for (n = 0; n < smoke[0].ncontexts; n++) {
1393 smoke[0].contexts[n] = live_context(i915, file);
1394 if (!smoke[0].contexts[n]) {
1395 ret = -ENOMEM;
1396 goto out_contexts;
1397 }
1398 }
1399
1400 ret = igt_live_test_begin(&live, i915, __func__, "");
1401 if (ret)
1402 goto out_contexts;
1403
1404 idx = 0;
1405 for_each_uabi_engine(engine, i915) {
1406 smoke[idx] = smoke[0];
1407 smoke[idx].engine = engine;
1408 smoke[idx].max_batch =
1409 max_batches(smoke[0].contexts[0], engine);
1410 if (smoke[idx].max_batch < 0) {
1411 ret = smoke[idx].max_batch;
1412 goto out_flush;
1413 }
1414 /* One ring interleaved between requests from all cpus */
1415 smoke[idx].max_batch /= num_online_cpus() + 1;
1416 pr_debug("Limiting batches to %d requests on %s\n",
1417 smoke[idx].max_batch, engine->name);
1418
1419 for (n = 0; n < ncpus; n++) {
1420 struct task_struct *tsk;
1421
1422 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1423 &smoke[idx], "igt/%d.%d", idx, n);
1424 if (IS_ERR(tsk)) {
1425 ret = PTR_ERR(tsk);
1426 goto out_flush;
1427 }
1428
1429 get_task_struct(tsk);
1430 threads[idx * ncpus + n] = tsk;
1431 }
1432
1433 idx++;
1434 }
1435
1436 yield(); /* start all threads before we begin */
1437 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1438
1439 out_flush:
1440 idx = 0;
1441 num_waits = 0;
1442 num_fences = 0;
1443 for_each_uabi_engine(engine, i915) {
1444 for (n = 0; n < ncpus; n++) {
1445 struct task_struct *tsk = threads[idx * ncpus + n];
1446 int err;
1447
1448 if (!tsk)
1449 continue;
1450
1451 err = kthread_stop(tsk);
1452 if (err < 0 && !ret)
1453 ret = err;
1454
1455 put_task_struct(tsk);
1456 }
1457
1458 num_waits += atomic_long_read(&smoke[idx].num_waits);
1459 num_fences += atomic_long_read(&smoke[idx].num_fences);
1460 idx++;
1461 }
1462 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1463 num_waits, num_fences, idx, ncpus);
1464
1465 ret = igt_live_test_end(&live) ?: ret;
1466 out_contexts:
1467 kfree(smoke[0].contexts);
1468 out_threads:
1469 kfree(threads);
1470 out_smoke:
1471 kfree(smoke);
1472 out_file:
1473 fput(file);
1474 out_rpm:
1475 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1476
1477 return ret;
1478 }
1479
1480 int i915_request_live_selftests(struct drm_i915_private *i915)
1481 {
1482 static const struct i915_subtest tests[] = {
1483 SUBTEST(live_nop_request),
1484 SUBTEST(live_all_engines),
1485 SUBTEST(live_sequential_engines),
1486 SUBTEST(live_parallel_engines),
1487 SUBTEST(live_empty_request),
1488 SUBTEST(live_breadcrumbs_smoketest),
1489 };
1490
1491 if (intel_gt_is_wedged(&i915->gt))
1492 return 0;
1493
1494 return i915_subtests(tests, i915);
1495 }
1496
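/*
 * Queue a kernel-context request behind the last request on @ce's timeline
 * and wait for it, so that the engine has switched back to the kernel
 * context and idled before the next measurement begins.
 */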
1497 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1498 {
1499 struct i915_request *rq;
1500 struct dma_fence *fence;
1501
1502 rq = intel_engine_create_kernel_request(ce->engine);
1503 if (IS_ERR(rq))
1504 return PTR_ERR(rq);
1505
1506 fence = i915_active_fence_get(&ce->timeline->last_request);
1507 if (fence) {
1508 i915_request_await_dma_fence(rq, fence);
1509 dma_fence_put(fence);
1510 }
1511
1512 rq = i915_request_get(rq);
1513 i915_request_add(rq);
1514 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1515 err = -ETIME;
1516 i915_request_put(rq);
1517
1518 while (!err && !intel_engine_is_idle(ce->engine))
1519 intel_engine_flush_submission(ce->engine);
1520
1521 return err;
1522 }
1523
1524 struct perf_stats {
1525 struct intel_engine_cs *engine;
1526 unsigned long count;
1527 ktime_t time;
1528 ktime_t busy;
1529 u64 runtime;
1530 };
1531
1532 struct perf_series {
1533 struct drm_i915_private *i915;
1534 unsigned int nengines;
1535 struct intel_context *ce[];
1536 };
1537
1538 static int cmp_u32(const void *A, const void *B)
1539 {
1540 const u32 *a = A, *b = B;
1541
1542 return *a - *b;
1543 }
1544
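/*
 * Reduce TF_COUNT (5) samples to one value: sort them, then sum the middle
 * three with the median counted twice. The result is therefore 4x the
 * filtered latency; cycles_to_ns() removes that bias by dividing by
 * 1 << TF_BIAS after converting to nanoseconds, and the pr_info() callers
 * shift the cycle count right by TF_BIAS for the same reason.
 */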
1545 static u32 trifilter(u32 *a)
1546 {
1547 u64 sum;
1548
1549 #define TF_COUNT 5
1550 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1551
1552 sum = mul_u32_u32(a[2], 2);
1553 sum += a[1];
1554 sum += a[3];
1555
1556 GEM_BUG_ON(sum > U32_MAX);
1557 return sum;
1558 #define TF_BIAS 2
1559 }
1560
1561 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1562 {
1563 u64 ns = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles);
1564
1565 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1566 }
1567
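/*
 * Helpers to emit the measurement commands into a request's ring: store the
 * engine's RING_TIMESTAMP register to a GGTT offset, store an immediate
 * dword, and busy-wait on a memory semaphore until it holds the expected
 * value.
 */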
1568 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1569 {
1570 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1571 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1572 *cs++ = offset;
1573 *cs++ = 0;
1574
1575 return cs;
1576 }
1577
1578 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1579 {
1580 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1581 *cs++ = offset;
1582 *cs++ = 0;
1583 *cs++ = value;
1584
1585 return cs;
1586 }
1587
1588 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1589 {
1590 *cs++ = MI_SEMAPHORE_WAIT |
1591 MI_SEMAPHORE_GLOBAL_GTT |
1592 MI_SEMAPHORE_POLL |
1593 mode;
1594 *cs++ = value;
1595 *cs++ = offset;
1596 *cs++ = 0;
1597
1598 return cs;
1599 }
1600
1601 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1602 {
1603 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1604 }
1605
1606 static void semaphore_set(u32 *sema, u32 value)
1607 {
1608 WRITE_ONCE(*sema, value);
1609 wmb(); /* flush the update to the cache, and beyond */
1610 }
1611
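/*
 * Use a spare run of dwords in the engine's hardware status page as scratch
 * space for the semaphores and timestamps used by the measure_*() probes
 * below; hwsp_offset() translates a CPU pointer into that page back into the
 * GGTT address the GPU must use.
 */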
1612 static u32 *hwsp_scratch(const struct intel_context *ce)
1613 {
1614 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1615 }
1616
1617 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1618 {
1619 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1620 offset_in_page(dw));
1621 }
1622
1623 static int measure_semaphore_response(struct intel_context *ce)
1624 {
1625 u32 *sema = hwsp_scratch(ce);
1626 const u32 offset = hwsp_offset(ce, sema);
1627 u32 elapsed[TF_COUNT], cycles;
1628 struct i915_request *rq;
1629 u32 *cs;
1630 int err;
1631 int i;
1632
1633 /*
1634 * Measure how many cycles it takes for the HW to detect the change
1635 * in a semaphore value.
1636 *
1637 * A: read CS_TIMESTAMP from CPU
1638 * poke semaphore
1639 * B: read CS_TIMESTAMP on GPU
1640 *
1641 * Semaphore latency: B - A
1642 */
1643
1644 semaphore_set(sema, -1);
1645
1646 rq = i915_request_create(ce);
1647 if (IS_ERR(rq))
1648 return PTR_ERR(rq);
1649
1650 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1651 if (IS_ERR(cs)) {
1652 i915_request_add(rq);
1653 err = PTR_ERR(cs);
1654 goto err;
1655 }
1656
1657 cs = emit_store_dw(cs, offset, 0);
1658 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1659 cs = emit_semaphore_poll_until(cs, offset, i);
1660 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1661 cs = emit_store_dw(cs, offset, 0);
1662 }
1663
1664 intel_ring_advance(rq, cs);
1665 i915_request_add(rq);
1666
1667 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1668 err = -EIO;
1669 goto err;
1670 }
1671
1672 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1673 preempt_disable();
1674 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1675 semaphore_set(sema, i);
1676 preempt_enable();
1677
1678 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1679 err = -EIO;
1680 goto err;
1681 }
1682
1683 elapsed[i - 1] = sema[i] - cycles;
1684 }
1685
1686 cycles = trifilter(elapsed);
1687 pr_info("%s: semaphore response %d cycles, %lluns\n",
1688 ce->engine->name, cycles >> TF_BIAS,
1689 cycles_to_ns(ce->engine, cycles));
1690
1691 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1692
1693 err:
1694 intel_gt_set_wedged(ce->engine->gt);
1695 return err;
1696 }
1697
1698 static int measure_idle_dispatch(struct intel_context *ce)
1699 {
1700 u32 *sema = hwsp_scratch(ce);
1701 const u32 offset = hwsp_offset(ce, sema);
1702 u32 elapsed[TF_COUNT], cycles;
1703 u32 *cs;
1704 int err;
1705 int i;
1706
1707 /*
1708 * Measure how long it takes for us to submit a request while the
1709 * engine is idle, but is resting in our context.
1710 *
1711 * A: read CS_TIMESTAMP from CPU
1712 * submit request
1713 * B: read CS_TIMESTAMP on GPU
1714 *
1715 * Submission latency: B - A
1716 */
1717
1718 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1719 struct i915_request *rq;
1720
1721 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1722 if (err)
1723 return err;
1724
1725 rq = i915_request_create(ce);
1726 if (IS_ERR(rq)) {
1727 err = PTR_ERR(rq);
1728 goto err;
1729 }
1730
1731 cs = intel_ring_begin(rq, 4);
1732 if (IS_ERR(cs)) {
1733 i915_request_add(rq);
1734 err = PTR_ERR(cs);
1735 goto err;
1736 }
1737
1738 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1739
1740 intel_ring_advance(rq, cs);
1741
1742 preempt_disable();
1743 local_bh_disable();
1744 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1745 i915_request_add(rq);
1746 local_bh_enable();
1747 preempt_enable();
1748 }
1749
1750 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1751 if (err)
1752 goto err;
1753
1754 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1755 elapsed[i] = sema[i] - elapsed[i];
1756
1757 cycles = trifilter(elapsed);
1758 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1759 ce->engine->name, cycles >> TF_BIAS,
1760 cycles_to_ns(ce->engine, cycles));
1761
1762 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1763
1764 err:
1765 intel_gt_set_wedged(ce->engine->gt);
1766 return err;
1767 }
1768
1769 static int measure_busy_dispatch(struct intel_context *ce)
1770 {
1771 u32 *sema = hwsp_scratch(ce);
1772 const u32 offset = hwsp_offset(ce, sema);
1773 u32 elapsed[TF_COUNT + 1], cycles;
1774 u32 *cs;
1775 int err;
1776 int i;
1777
1778 /*
1779 * Measure how long it takes for us to submit a request while the
1780 * engine is busy, polling on a semaphore in our context. With
1781 * direct submission, this will include the cost of a lite restore.
1782 *
1783 * A: read CS_TIMESTAMP from CPU
1784 * submit request
1785 * B: read CS_TIMESTAMP on GPU
1786 *
1787 * Submission latency: B - A
1788 */
1789
1790 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1791 struct i915_request *rq;
1792
1793 rq = i915_request_create(ce);
1794 if (IS_ERR(rq)) {
1795 err = PTR_ERR(rq);
1796 goto err;
1797 }
1798
1799 cs = intel_ring_begin(rq, 12);
1800 if (IS_ERR(cs)) {
1801 i915_request_add(rq);
1802 err = PTR_ERR(cs);
1803 goto err;
1804 }
1805
1806 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
1807 cs = emit_semaphore_poll_until(cs, offset, i);
1808 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1809
1810 intel_ring_advance(rq, cs);
1811
1812 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
1813 err = -EIO;
1814 goto err;
1815 }
1816
1817 preempt_disable();
1818 local_bh_disable();
1819 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1820 i915_request_add(rq);
1821 local_bh_enable();
1822 semaphore_set(sema, i - 1);
1823 preempt_enable();
1824 }
1825
1826 wait_for(READ_ONCE(sema[i - 1]), 500);
1827 semaphore_set(sema, i - 1);
1828
1829 for (i = 1; i <= TF_COUNT; i++) {
1830 GEM_BUG_ON(sema[i] == -1);
1831 elapsed[i - 1] = sema[i] - elapsed[i];
1832 }
1833
1834 cycles = trifilter(elapsed);
1835 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
1836 ce->engine->name, cycles >> TF_BIAS,
1837 cycles_to_ns(ce->engine, cycles));
1838
1839 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1840
1841 err:
1842 intel_gt_set_wedged(ce->engine->gt);
1843 return err;
1844 }
1845
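/*
 * Block the engine by submitting a kernel-context request that polls a
 * status-page semaphore, so that subsequent requests queue up behind it
 * until the CPU releases them with semaphore_set().
 */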
1846 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
1847 {
1848 const u32 offset =
1849 i915_ggtt_offset(engine->status_page.vma) +
1850 offset_in_page(sema);
1851 struct i915_request *rq;
1852 u32 *cs;
1853
1854 rq = i915_request_create(engine->kernel_context);
1855 if (IS_ERR(rq))
1856 return PTR_ERR(rq);
1857
1858 cs = intel_ring_begin(rq, 4);
1859 if (IS_ERR(cs)) {
1860 i915_request_add(rq);
1861 return PTR_ERR(cs);
1862 }
1863
1864 cs = emit_semaphore_poll(cs, mode, value, offset);
1865
1866 intel_ring_advance(rq, cs);
1867 i915_request_add(rq);
1868
1869 return 0;
1870 }
1871
1872 static int measure_inter_request(struct intel_context *ce)
1873 {
1874 u32 *sema = hwsp_scratch(ce);
1875 const u32 offset = hwsp_offset(ce, sema);
1876 u32 elapsed[TF_COUNT + 1], cycles;
1877 struct i915_sw_fence *submit;
1878 int i, err;
1879
1880 /*
1881 * Measure how long it takes to advance from one request into the
1882 * next. Between each request we flush the GPU caches to memory,
1883 * update the breadcrumbs, and then invalidate those caches.
1884 * We queue up all the requests to be submitted in one batch so
1885 * it should be one set of contiguous measurements.
1886 *
1887 * A: read CS_TIMESTAMP on GPU
1888 * advance request
1889 * B: read CS_TIMESTAMP on GPU
1890 *
1891 * Request latency: B - A
1892 */
1893
1894 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1895 if (err)
1896 return err;
1897
1898 submit = heap_fence_create(GFP_KERNEL);
1899 if (!submit) {
1900 semaphore_set(sema, 1);
1901 return -ENOMEM;
1902 }
1903
1904 intel_engine_flush_submission(ce->engine);
1905 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1906 struct i915_request *rq;
1907 u32 *cs;
1908
1909 rq = i915_request_create(ce);
1910 if (IS_ERR(rq)) {
1911 err = PTR_ERR(rq);
1912 goto err_submit;
1913 }
1914
1915 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
1916 submit,
1917 GFP_KERNEL);
1918 if (err < 0) {
1919 i915_request_add(rq);
1920 goto err_submit;
1921 }
1922
1923 cs = intel_ring_begin(rq, 4);
1924 if (IS_ERR(cs)) {
1925 i915_request_add(rq);
1926 err = PTR_ERR(cs);
1927 goto err_submit;
1928 }
1929
1930 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1931
1932 intel_ring_advance(rq, cs);
1933 i915_request_add(rq);
1934 }
1935 local_bh_disable();
1936 i915_sw_fence_commit(submit);
1937 local_bh_enable();
1938 intel_engine_flush_submission(ce->engine);
1939 heap_fence_put(submit);
1940
1941 semaphore_set(sema, 1);
1942 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1943 if (err)
1944 goto err;
1945
1946 for (i = 1; i <= TF_COUNT; i++)
1947 elapsed[i - 1] = sema[i + 1] - sema[i];
1948
1949 cycles = trifilter(elapsed);
1950 pr_info("%s: inter-request latency %d cycles, %lluns\n",
1951 ce->engine->name, cycles >> TF_BIAS,
1952 cycles_to_ns(ce->engine, cycles));
1953
1954 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1955
1956 err_submit:
1957 i915_sw_fence_commit(submit);
1958 heap_fence_put(submit);
1959 semaphore_set(sema, 1);
1960 err:
1961 intel_gt_set_wedged(ce->engine->gt);
1962 return err;
1963 }
1964
1965 static int measure_context_switch(struct intel_context *ce)
1966 {
1967 u32 *sema = hwsp_scratch(ce);
1968 const u32 offset = hwsp_offset(ce, sema);
1969 struct i915_request *fence = NULL;
1970 u32 elapsed[TF_COUNT + 1], cycles;
1971 int i, j, err;
1972 u32 *cs;
1973
1974 /*
1975 * Measure how long it takes to advance from one request in one
1976 * context to a request in another context. This allows us to
1977 * measure how long the context save/restore take, along with all
1978 * the inter-context setup we require.
1979 *
1980 * A: read CS_TIMESTAMP on GPU
1981 * switch context
1982 * B: read CS_TIMESTAMP on GPU
1983 *
1984 * Context switch latency: B - A
1985 */
1986
1987 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
1988 if (err)
1989 return err;
1990
1991 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1992 struct intel_context *arr[] = {
1993 ce, ce->engine->kernel_context
1994 };
1995 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
1996
1997 for (j = 0; j < ARRAY_SIZE(arr); j++) {
1998 struct i915_request *rq;
1999
2000 rq = i915_request_create(arr[j]);
2001 if (IS_ERR(rq)) {
2002 err = PTR_ERR(rq);
2003 goto err_fence;
2004 }
2005
2006 if (fence) {
2007 err = i915_request_await_dma_fence(rq,
2008 &fence->fence);
2009 if (err) {
2010 i915_request_add(rq);
2011 goto err_fence;
2012 }
2013 }
2014
2015 cs = intel_ring_begin(rq, 4);
2016 if (IS_ERR(cs)) {
2017 i915_request_add(rq);
2018 err = PTR_ERR(cs);
2019 goto err_fence;
2020 }
2021
2022 cs = emit_timestamp_store(cs, ce, addr);
2023 addr += sizeof(u32);
2024
2025 intel_ring_advance(rq, cs);
2026
2027 i915_request_put(fence);
2028 fence = i915_request_get(rq);
2029
2030 i915_request_add(rq);
2031 }
2032 }
2033 i915_request_put(fence);
2034 intel_engine_flush_submission(ce->engine);
2035
2036 semaphore_set(sema, 1);
2037 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2038 if (err)
2039 goto err;
2040
2041 for (i = 1; i <= TF_COUNT; i++)
2042 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2043
2044 cycles = trifilter(elapsed);
2045 pr_info("%s: context switch latency %d cycles, %lluns\n",
2046 ce->engine->name, cycles >> TF_BIAS,
2047 cycles_to_ns(ce->engine, cycles));
2048
2049 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2050
2051 err_fence:
2052 i915_request_put(fence);
2053 semaphore_set(sema, 1);
2054 err:
2055 intel_gt_set_wedged(ce->engine->gt);
2056 return err;
2057 }
2058
2059 static int measure_preemption(struct intel_context *ce)
2060 {
2061 u32 *sema = hwsp_scratch(ce);
2062 const u32 offset = hwsp_offset(ce, sema);
2063 u32 elapsed[TF_COUNT], cycles;
2064 u32 *cs;
2065 int err;
2066 int i;
2067
2068 /*
2069 * We measure two latencies while triggering preemption. The first
2070 * latency is how long it takes for us to submit a preempting request.
2071 * The second latency is how it takes for us to return from the
2072 * preemption back to the original context.
2073 *
2074 * A: read CS_TIMESTAMP from CPU
2075 * submit preemption
2076 * B: read CS_TIMESTAMP on GPU (in preempting context)
2077 * context switch
2078 * C: read CS_TIMESTAMP on GPU (in original context)
2079 *
2080 * Preemption dispatch latency: B - A
2081 * Preemption switch latency: C - B
2082 */
2083
2084 if (!intel_engine_has_preemption(ce->engine))
2085 return 0;
2086
2087 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2088 u32 addr = offset + 2 * i * sizeof(u32);
2089 struct i915_request *rq;
2090
2091 rq = i915_request_create(ce);
2092 if (IS_ERR(rq)) {
2093 err = PTR_ERR(rq);
2094 goto err;
2095 }
2096
2097 cs = intel_ring_begin(rq, 12);
2098 if (IS_ERR(cs)) {
2099 i915_request_add(rq);
2100 err = PTR_ERR(cs);
2101 goto err;
2102 }
2103
2104 cs = emit_store_dw(cs, addr, -1);
2105 cs = emit_semaphore_poll_until(cs, offset, i);
2106 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2107
2108 intel_ring_advance(rq, cs);
2109 i915_request_add(rq);
2110
2111 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2112 err = -EIO;
2113 goto err;
2114 }
2115
2116 rq = i915_request_create(ce->engine->kernel_context);
2117 if (IS_ERR(rq)) {
2118 err = PTR_ERR(rq);
2119 goto err;
2120 }
2121
2122 cs = intel_ring_begin(rq, 8);
2123 if (IS_ERR(cs)) {
2124 i915_request_add(rq);
2125 err = PTR_ERR(cs);
2126 goto err;
2127 }
2128
2129 cs = emit_timestamp_store(cs, ce, addr);
2130 cs = emit_store_dw(cs, offset, i);
2131
2132 intel_ring_advance(rq, cs);
2133 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2134
2135 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2136 i915_request_add(rq);
2137 }
2138
2139 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2140 err = -EIO;
2141 goto err;
2142 }
2143
2144 for (i = 1; i <= TF_COUNT; i++)
2145 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2146
2147 cycles = trifilter(elapsed);
2148 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2149 ce->engine->name, cycles >> TF_BIAS,
2150 cycles_to_ns(ce->engine, cycles));
2151
2152 for (i = 1; i <= TF_COUNT; i++)
2153 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2154
2155 cycles = trifilter(elapsed);
2156 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2157 ce->engine->name, cycles >> TF_BIAS,
2158 cycles_to_ns(ce->engine, cycles));
2159
2160 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2161
2162 err:
2163 intel_gt_set_wedged(ce->engine->gt);
2164 return err;
2165 }
2166
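/*
 * Fence callback used to observe, from the CPU, the moment the request's
 * completion signal has been processed.
 */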
2167 struct signal_cb {
2168 struct dma_fence_cb base;
2169 bool seen;
2170 };
2171
2172 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2173 {
2174 struct signal_cb *s = container_of(cb, typeof(*s), base);
2175
2176 smp_store_mb(s->seen, true); /* be safe, be strong */
2177 }
2178
2179 static int measure_completion(struct intel_context *ce)
2180 {
2181 u32 *sema = hwsp_scratch(ce);
2182 const u32 offset = hwsp_offset(ce, sema);
2183 u32 elapsed[TF_COUNT], cycles;
2184 u32 *cs;
2185 int err;
2186 int i;
2187
2188 /*
2189 	 * Measure how long it takes for the signal (interrupt) sent
2190 	 * from the GPU to be processed by the CPU.
2191 *
2192 * A: read CS_TIMESTAMP on GPU
2193 * signal
2194 * B: read CS_TIMESTAMP from CPU
2195 *
2196 * Completion latency: B - A
2197 */
2198
2199 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2200 struct signal_cb cb = { .seen = false };
2201 struct i915_request *rq;
2202
2203 rq = i915_request_create(ce);
2204 if (IS_ERR(rq)) {
2205 err = PTR_ERR(rq);
2206 goto err;
2207 }
2208
2209 cs = intel_ring_begin(rq, 12);
2210 if (IS_ERR(cs)) {
2211 i915_request_add(rq);
2212 err = PTR_ERR(cs);
2213 goto err;
2214 }
2215
2216 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2217 cs = emit_semaphore_poll_until(cs, offset, i);
2218 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2219
2220 intel_ring_advance(rq, cs);
2221
2222 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2223
2224 local_bh_disable();
2225 i915_request_add(rq);
2226 local_bh_enable();
2227
2228 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2229 err = -EIO;
2230 goto err;
2231 }
2232
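		/*
		 * Release the GPU semaphore, busy-wait for the fence callback
		 * (raised once the completion interrupt is processed), then
		 * sample the engine timestamp (B) from the CPU.
		 */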
2233 preempt_disable();
2234 semaphore_set(sema, i);
2235 while (!READ_ONCE(cb.seen))
2236 cpu_relax();
2237
2238 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2239 preempt_enable();
2240 }
2241
2242 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2243 if (err)
2244 goto err;
2245
2246 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2247 GEM_BUG_ON(sema[i + 1] == -1);
2248 elapsed[i] = elapsed[i] - sema[i + 1];
2249 }
2250
2251 cycles = trifilter(elapsed);
2252 pr_info("%s: completion latency %d cycles, %lluns\n",
2253 ce->engine->name, cycles >> TF_BIAS,
2254 cycles_to_ns(ce->engine, cycles));
2255
2256 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2257
2258 err:
2259 intel_gt_set_wedged(ce->engine->gt);
2260 return err;
2261 }
2262
2263 static void rps_pin(struct intel_gt *gt)
2264 {
2265 	/* Pin the frequency to max and hold forcewake while we measure */
2266 	atomic_inc(&gt->rps.num_waiters);
2267 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2268
2269 	mutex_lock(&gt->rps.lock);
2270 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2271 	mutex_unlock(&gt->rps.lock);
2272 }
2273
2274 static void rps_unpin(struct intel_gt *gt)
2275 {
2276 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2277 	atomic_dec(&gt->rps.num_waiters);
2278 }
2279
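/*
 * Run each of the latency probes above on every uabi engine, with the
 * heartbeat disabled and the frequency pinned so that background activity
 * does not disturb the measurements.
 */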
2280 static int perf_request_latency(void *arg)
2281 {
2282 struct drm_i915_private *i915 = arg;
2283 struct intel_engine_cs *engine;
2284 struct pm_qos_request qos;
2285 int err = 0;
2286
2287 if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2288 return 0;
2289
2290 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2291
2292 for_each_uabi_engine(engine, i915) {
2293 struct intel_context *ce;
2294
2295 ce = intel_context_create(engine);
2296 if (IS_ERR(ce)) {
2297 err = PTR_ERR(ce);
2298 goto out;
2299 }
2300
2301 err = intel_context_pin(ce);
2302 if (err) {
2303 intel_context_put(ce);
2304 goto out;
2305 }
2306
2307 st_engine_heartbeat_disable(engine);
2308 rps_pin(engine->gt);
2309
2310 if (err == 0)
2311 err = measure_semaphore_response(ce);
2312 if (err == 0)
2313 err = measure_idle_dispatch(ce);
2314 if (err == 0)
2315 err = measure_busy_dispatch(ce);
2316 if (err == 0)
2317 err = measure_inter_request(ce);
2318 if (err == 0)
2319 err = measure_context_switch(ce);
2320 if (err == 0)
2321 err = measure_preemption(ce);
2322 if (err == 0)
2323 err = measure_completion(ce);
2324
2325 rps_unpin(engine->gt);
2326 st_engine_heartbeat_enable(engine);
2327
2328 intel_context_unpin(ce);
2329 intel_context_put(ce);
2330 if (err)
2331 goto out;
2332 }
2333
2334 out:
2335 if (igt_flush_test(i915))
2336 err = -EIO;
2337
2338 cpu_latency_qos_remove_request(&qos);
2339 return err;
2340 }
2341
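/* Cycle through the engines, waiting for each request before submitting the next */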
2342 static int s_sync0(void *arg)
2343 {
2344 struct perf_series *ps = arg;
2345 IGT_TIMEOUT(end_time);
2346 unsigned int idx = 0;
2347 int err = 0;
2348
2349 GEM_BUG_ON(!ps->nengines);
2350 do {
2351 struct i915_request *rq;
2352
2353 rq = i915_request_create(ps->ce[idx]);
2354 if (IS_ERR(rq)) {
2355 err = PTR_ERR(rq);
2356 break;
2357 }
2358
2359 i915_request_get(rq);
2360 i915_request_add(rq);
2361
2362 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2363 err = -ETIME;
2364 i915_request_put(rq);
2365 if (err)
2366 break;
2367
2368 if (++idx == ps->nengines)
2369 idx = 0;
2370 } while (!__igt_timeout(end_time, NULL));
2371
2372 return err;
2373 }
2374
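/* As s_sync0, but only wait upon the previous request, keeping one in flight */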
2375 static int s_sync1(void *arg)
2376 {
2377 struct perf_series *ps = arg;
2378 struct i915_request *prev = NULL;
2379 IGT_TIMEOUT(end_time);
2380 unsigned int idx = 0;
2381 int err = 0;
2382
2383 GEM_BUG_ON(!ps->nengines);
2384 do {
2385 struct i915_request *rq;
2386
2387 rq = i915_request_create(ps->ce[idx]);
2388 if (IS_ERR(rq)) {
2389 err = PTR_ERR(rq);
2390 break;
2391 }
2392
2393 i915_request_get(rq);
2394 i915_request_add(rq);
2395
2396 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2397 err = -ETIME;
2398 i915_request_put(prev);
2399 prev = rq;
2400 if (err)
2401 break;
2402
2403 if (++idx == ps->nengines)
2404 idx = 0;
2405 } while (!__igt_timeout(end_time, NULL));
2406 i915_request_put(prev);
2407
2408 return err;
2409 }
2410
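/* Submit to each engine in turn without ever waiting: unthrottled submission */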
2411 static int s_many(void *arg)
2412 {
2413 struct perf_series *ps = arg;
2414 IGT_TIMEOUT(end_time);
2415 unsigned int idx = 0;
2416
2417 GEM_BUG_ON(!ps->nengines);
2418 do {
2419 struct i915_request *rq;
2420
2421 rq = i915_request_create(ps->ce[idx]);
2422 if (IS_ERR(rq))
2423 return PTR_ERR(rq);
2424
2425 i915_request_add(rq);
2426
2427 if (++idx == ps->nengines)
2428 idx = 0;
2429 } while (!__igt_timeout(end_time, NULL));
2430
2431 return 0;
2432 }
2433
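/*
 * Feed the engines in series from a single thread using each of the
 * patterns above, and report per-engine busyness, runtime and walltime.
 */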
2434 static int perf_series_engines(void *arg)
2435 {
2436 struct drm_i915_private *i915 = arg;
2437 static int (* const func[])(void *arg) = {
2438 s_sync0,
2439 s_sync1,
2440 s_many,
2441 NULL,
2442 };
2443 const unsigned int nengines = num_uabi_engines(i915);
2444 struct intel_engine_cs *engine;
2445 int (* const *fn)(void *arg);
2446 struct pm_qos_request qos;
2447 struct perf_stats *stats;
2448 struct perf_series *ps;
2449 unsigned int idx;
2450 int err = 0;
2451
2452 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2453 if (!stats)
2454 return -ENOMEM;
2455
2456 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2457 if (!ps) {
2458 kfree(stats);
2459 return -ENOMEM;
2460 }
2461
2462 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2463
2464 ps->i915 = i915;
2465 ps->nengines = nengines;
2466
2467 idx = 0;
2468 for_each_uabi_engine(engine, i915) {
2469 struct intel_context *ce;
2470
2471 ce = intel_context_create(engine);
2472 if (IS_ERR(ce)) {
2473 err = PTR_ERR(ce);
2474 goto out;
2475 }
2476
2477 err = intel_context_pin(ce);
2478 if (err) {
2479 intel_context_put(ce);
2480 goto out;
2481 }
2482
2483 ps->ce[idx++] = ce;
2484 }
2485 GEM_BUG_ON(idx != ps->nengines);
2486
2487 for (fn = func; *fn && !err; fn++) {
2488 char name[KSYM_NAME_LEN];
2489 struct igt_live_test t;
2490
2491 snprintf(name, sizeof(name), "%ps", *fn);
2492 err = igt_live_test_begin(&t, i915, __func__, name);
2493 if (err)
2494 break;
2495
2496 for (idx = 0; idx < nengines; idx++) {
2497 struct perf_stats *p =
2498 memset(&stats[idx], 0, sizeof(stats[idx]));
2499 struct intel_context *ce = ps->ce[idx];
2500
2501 p->engine = ps->ce[idx]->engine;
2502 intel_engine_pm_get(p->engine);
2503
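			/*
			 * Bias the sampled busy-time by +1 so that a genuine
			 * zero still marks the engine as sampled (see the
			 * check before the second sample below).
			 */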
2504 if (intel_engine_supports_stats(p->engine))
2505 p->busy = intel_engine_get_busy_time(p->engine,
2506 &p->time) + 1;
2507 else
2508 p->time = ktime_get();
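			/* Negative seed so the later += leaves just the delta */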
2509 p->runtime = -intel_context_get_total_runtime_ns(ce);
2510 }
2511
2512 err = (*fn)(ps);
2513 if (igt_live_test_end(&t))
2514 err = -EIO;
2515
2516 for (idx = 0; idx < nengines; idx++) {
2517 struct perf_stats *p = &stats[idx];
2518 struct intel_context *ce = ps->ce[idx];
2519 int integer, decimal;
2520 u64 busy, dt, now;
2521
2522 if (p->busy)
2523 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2524 &now),
2525 p->busy - 1);
2526 else
2527 now = ktime_get();
2528 p->time = ktime_sub(now, p->time);
2529
2530 err = switch_to_kernel_sync(ce, err);
2531 p->runtime += intel_context_get_total_runtime_ns(ce);
2532 intel_engine_pm_put(p->engine);
2533
2534 busy = 100 * ktime_to_ns(p->busy);
2535 dt = ktime_to_ns(p->time);
2536 if (dt) {
2537 integer = div64_u64(busy, dt);
2538 busy -= integer * dt;
2539 decimal = div64_u64(100 * busy, dt);
2540 } else {
2541 integer = 0;
2542 decimal = 0;
2543 }
2544
2545 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2546 name, p->engine->name, ce->timeline->seqno,
2547 integer, decimal,
2548 div_u64(p->runtime, 1000 * 1000),
2549 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2550 }
2551 }
2552
2553 out:
2554 for (idx = 0; idx < nengines; idx++) {
2555 if (IS_ERR_OR_NULL(ps->ce[idx]))
2556 break;
2557
2558 intel_context_unpin(ps->ce[idx]);
2559 intel_context_put(ps->ce[idx]);
2560 }
2561 kfree(ps);
2562
2563 cpu_latency_qos_remove_request(&qos);
2564 kfree(stats);
2565 return err;
2566 }
2567
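/* Per-engine thread: submit and synchronously wait upon each request in turn */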
2568 static int p_sync0(void *arg)
2569 {
2570 struct perf_stats *p = arg;
2571 struct intel_engine_cs *engine = p->engine;
2572 struct intel_context *ce;
2573 IGT_TIMEOUT(end_time);
2574 unsigned long count;
2575 bool busy;
2576 int err = 0;
2577
2578 ce = intel_context_create(engine);
2579 if (IS_ERR(ce))
2580 return PTR_ERR(ce);
2581
2582 err = intel_context_pin(ce);
2583 if (err) {
2584 intel_context_put(ce);
2585 return err;
2586 }
2587
2588 if (intel_engine_supports_stats(engine)) {
2589 p->busy = intel_engine_get_busy_time(engine, &p->time);
2590 busy = true;
2591 } else {
2592 p->time = ktime_get();
2593 busy = false;
2594 }
2595
2596 count = 0;
2597 do {
2598 struct i915_request *rq;
2599
2600 rq = i915_request_create(ce);
2601 if (IS_ERR(rq)) {
2602 err = PTR_ERR(rq);
2603 break;
2604 }
2605
2606 i915_request_get(rq);
2607 i915_request_add(rq);
2608
2609 err = 0;
2610 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2611 err = -ETIME;
2612 i915_request_put(rq);
2613 if (err)
2614 break;
2615
2616 count++;
2617 } while (!__igt_timeout(end_time, NULL));
2618
2619 if (busy) {
2620 ktime_t now;
2621
2622 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2623 p->busy);
2624 p->time = ktime_sub(now, p->time);
2625 } else {
2626 p->time = ktime_sub(ktime_get(), p->time);
2627 }
2628
2629 err = switch_to_kernel_sync(ce, err);
2630 p->runtime = intel_context_get_total_runtime_ns(ce);
2631 p->count = count;
2632
2633 intel_context_unpin(ce);
2634 intel_context_put(ce);
2635 return err;
2636 }
2637
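/* Per-engine thread: submit a request, waiting only upon the previous one */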
2638 static int p_sync1(void *arg)
2639 {
2640 struct perf_stats *p = arg;
2641 struct intel_engine_cs *engine = p->engine;
2642 struct i915_request *prev = NULL;
2643 struct intel_context *ce;
2644 IGT_TIMEOUT(end_time);
2645 unsigned long count;
2646 bool busy;
2647 int err = 0;
2648
2649 ce = intel_context_create(engine);
2650 if (IS_ERR(ce))
2651 return PTR_ERR(ce);
2652
2653 err = intel_context_pin(ce);
2654 if (err) {
2655 intel_context_put(ce);
2656 return err;
2657 }
2658
2659 if (intel_engine_supports_stats(engine)) {
2660 p->busy = intel_engine_get_busy_time(engine, &p->time);
2661 busy = true;
2662 } else {
2663 p->time = ktime_get();
2664 busy = false;
2665 }
2666
2667 count = 0;
2668 do {
2669 struct i915_request *rq;
2670
2671 rq = i915_request_create(ce);
2672 if (IS_ERR(rq)) {
2673 err = PTR_ERR(rq);
2674 break;
2675 }
2676
2677 i915_request_get(rq);
2678 i915_request_add(rq);
2679
2680 err = 0;
2681 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2682 err = -ETIME;
2683 i915_request_put(prev);
2684 prev = rq;
2685 if (err)
2686 break;
2687
2688 count++;
2689 } while (!__igt_timeout(end_time, NULL));
2690 i915_request_put(prev);
2691
2692 if (busy) {
2693 ktime_t now;
2694
2695 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2696 p->busy);
2697 p->time = ktime_sub(now, p->time);
2698 } else {
2699 p->time = ktime_sub(ktime_get(), p->time);
2700 }
2701
2702 err = switch_to_kernel_sync(ce, err);
2703 p->runtime = intel_context_get_total_runtime_ns(ce);
2704 p->count = count;
2705
2706 intel_context_unpin(ce);
2707 intel_context_put(ce);
2708 return err;
2709 }
2710
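/* Per-engine thread: submit requests as fast as possible, never waiting */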
2711 static int p_many(void *arg)
2712 {
2713 struct perf_stats *p = arg;
2714 struct intel_engine_cs *engine = p->engine;
2715 struct intel_context *ce;
2716 IGT_TIMEOUT(end_time);
2717 unsigned long count;
2718 int err = 0;
2719 bool busy;
2720
2721 ce = intel_context_create(engine);
2722 if (IS_ERR(ce))
2723 return PTR_ERR(ce);
2724
2725 err = intel_context_pin(ce);
2726 if (err) {
2727 intel_context_put(ce);
2728 return err;
2729 }
2730
2731 if (intel_engine_supports_stats(engine)) {
2732 p->busy = intel_engine_get_busy_time(engine, &p->time);
2733 busy = true;
2734 } else {
2735 p->time = ktime_get();
2736 busy = false;
2737 }
2738
2739 count = 0;
2740 do {
2741 struct i915_request *rq;
2742
2743 rq = i915_request_create(ce);
2744 if (IS_ERR(rq)) {
2745 err = PTR_ERR(rq);
2746 break;
2747 }
2748
2749 i915_request_add(rq);
2750 count++;
2751 } while (!__igt_timeout(end_time, NULL));
2752
2753 if (busy) {
2754 ktime_t now;
2755
2756 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2757 p->busy);
2758 p->time = ktime_sub(now, p->time);
2759 } else {
2760 p->time = ktime_sub(ktime_get(), p->time);
2761 }
2762
2763 err = switch_to_kernel_sync(ce, err);
2764 p->runtime = intel_context_get_total_runtime_ns(ce);
2765 p->count = count;
2766
2767 intel_context_unpin(ce);
2768 intel_context_put(ce);
2769 return err;
2770 }
2771
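/*
 * Drive all engines in parallel, one kthread per engine, using each of the
 * patterns above, and compare per-engine throughput and busyness.
 */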
2772 static int perf_parallel_engines(void *arg)
2773 {
2774 struct drm_i915_private *i915 = arg;
2775 static int (* const func[])(void *arg) = {
2776 p_sync0,
2777 p_sync1,
2778 p_many,
2779 NULL,
2780 };
2781 const unsigned int nengines = num_uabi_engines(i915);
2782 struct intel_engine_cs *engine;
2783 int (* const *fn)(void *arg);
2784 struct pm_qos_request qos;
2785 struct {
2786 struct perf_stats p;
2787 struct task_struct *tsk;
2788 } *engines;
2789 int err = 0;
2790
2791 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
2792 if (!engines)
2793 return -ENOMEM;
2794
2795 cpu_latency_qos_add_request(&qos, 0);
2796
2797 for (fn = func; *fn; fn++) {
2798 char name[KSYM_NAME_LEN];
2799 struct igt_live_test t;
2800 unsigned int idx;
2801
2802 snprintf(name, sizeof(name), "%ps", *fn);
2803 err = igt_live_test_begin(&t, i915, __func__, name);
2804 if (err)
2805 break;
2806
2807 atomic_set(&i915->selftest.counter, nengines);
2808
2809 idx = 0;
2810 for_each_uabi_engine(engine, i915) {
2811 intel_engine_pm_get(engine);
2812
2813 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
2814 engines[idx].p.engine = engine;
2815
2816 engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
2817 "igt:%s", engine->name);
2818 if (IS_ERR(engines[idx].tsk)) {
2819 err = PTR_ERR(engines[idx].tsk);
2820 intel_engine_pm_put(engine);
2821 break;
2822 }
2823 get_task_struct(engines[idx++].tsk);
2824 }
2825
2826 yield(); /* start all threads before we kthread_stop() */
2827
2828 idx = 0;
2829 for_each_uabi_engine(engine, i915) {
2830 int status;
2831
2832 if (IS_ERR(engines[idx].tsk))
2833 break;
2834
2835 status = kthread_stop(engines[idx].tsk);
2836 if (status && !err)
2837 err = status;
2838
2839 intel_engine_pm_put(engine);
2840 put_task_struct(engines[idx++].tsk);
2841 }
2842
2843 if (igt_live_test_end(&t))
2844 err = -EIO;
2845 if (err)
2846 break;
2847
2848 idx = 0;
2849 for_each_uabi_engine(engine, i915) {
2850 struct perf_stats *p = &engines[idx].p;
2851 u64 busy = 100 * ktime_to_ns(p->busy);
2852 u64 dt = ktime_to_ns(p->time);
2853 int integer, decimal;
2854
2855 if (dt) {
2856 integer = div64_u64(busy, dt);
2857 busy -= integer * dt;
2858 decimal = div64_u64(100 * busy, dt);
2859 } else {
2860 integer = 0;
2861 decimal = 0;
2862 }
2863
2864 GEM_BUG_ON(engine != p->engine);
2865 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2866 name, engine->name, p->count, integer, decimal,
2867 div_u64(p->runtime, 1000 * 1000),
2868 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2869 idx++;
2870 }
2871 }
2872
2873 cpu_latency_qos_remove_request(&qos);
2874 kfree(engines);
2875 return err;
2876 }
2877
2878 int i915_request_perf_selftests(struct drm_i915_private *i915)
2879 {
2880 static const struct i915_subtest tests[] = {
2881 SUBTEST(perf_request_latency),
2882 SUBTEST(perf_series_engines),
2883 SUBTEST(perf_parallel_engines),
2884 };
2885
2886 if (intel_gt_is_wedged(&i915->gt))
2887 return 0;
2888
2889 return i915_subtests(tests, i915);
2890 }
2891