/* drivers/gpu/drm/i915/selftests/intel_hangcheck.c */
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	u32 *seqno;
	u32 *batch;
};

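/*
 * hang_init() prepares the state shared by the subtests below: a batch
 * page (h->obj) that will hold a self-referencing batch buffer, and a
 * page used as a makeshift hardware status page (h->hws) into which the
 * batch reports the seqno it is currently executing.
 */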
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

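/*
 * The "recursive" batch stores the request's seqno into the HWS page and
 * then jumps back to its own start with MI_BATCH_BUFFER_START, spinning
 * indefinitely. The hang is only released when the first dword of the
 * batch is rewritten to MI_BATCH_BUFFER_END (see hang_fini()).
 */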
static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = rq->ctx->ppgtt ?
		&rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
	if (err)
		goto unpin_hws;

	err = i915_switch_context(rq);
	if (err)
		goto unpin_hws;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

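/*
 * If the previous batch object is still active on the GPU, rewriting it
 * would release the hang it carries, so a fresh internal object is
 * allocated and swapped in before building the next hanging request.
 */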
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

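/*
 * Terminate any still-spinning batch by rewriting its first dword to
 * MI_BATCH_BUFFER_END before releasing the objects and idling the GPU.
 */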
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	wmb();

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

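/*
 * global_reset_lock()/global_reset_unlock() take I915_RESET_BACKOFF plus
 * every per-engine reset bit, so that no concurrent path can trigger a
 * reset while a test claims ownership of the GPU.
 */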
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int igt_reset_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned int reset_count, reset_engine_count;
	int err = 0;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	for_each_engine(engine, i915, id) {
		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		err = i915_reset_engine(engine, I915_RESET_QUIET);
		if (err) {
			pr_err("i915_reset_engine failed\n");
			break;
		}

		if (i915_reset_count(&i915->gpu_error) != reset_count) {
			pr_err("Full GPU reset recorded! (engine reset expected)\n");
			err = -EINVAL;
			break;
		}

		if (i915_reset_engine_count(&i915->gpu_error, engine) ==
		    reset_engine_count) {
			pr_err("No %s engine reset recorded!\n", engine->name);
			err = -EINVAL;
			break;
		}

		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

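/*
 * active_engine() runs as a kthread, ping-ponging requests between two
 * contexts to keep its engine busy while a different engine is being
 * reset; it exits when the parent test calls kthread_stop().
 */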
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct drm_i915_gem_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct drm_i915_gem_request *old = rq[idx];
		struct drm_i915_gem_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_gem_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_gem_request_get(new);
		i915_add_request(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_gem_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_gem_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int igt_reset_active_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine, *active;
	enum intel_engine_id id, tmp;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES];
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		IGT_TIMEOUT(end_time);

		memset(threads, 0, sizeof(threads));
		for_each_engine(active, i915, tmp) {
			struct task_struct *tsk;

			if (active == engine)
				continue;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      active);

			tsk = kthread_run(active_engine, active,
					  "igt/%s", active->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
		do {
			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s) failed, err=%d\n",
				       engine->name, err);
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + engine->id,
			  &i915->gpu_error.flags);

unwind:
		for_each_engine(active, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for active engine %s failed, err=%d\n",
				       active->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   active)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       active->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       active) - resets[tmp]);
				err = -EIO;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			err = -EIO;
		}

		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

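/*
 * fake_hangcheck() marks the request's engine as stalled and raises
 * I915_RESET_HANDOFF, emulating the hangcheck worker declaring a hang so
 * that a waiter performs the reset on the test's behalf.
 */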
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}

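/*
 * Poll the HWS page until the hanging batch reports the request's seqno,
 * i.e. until the spinner is confirmed to be running on the GPU.
 */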
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				pr_err("Failed to start request %x, at %x\n",
				       prev->fence.seqno, hws_seqno(&h, prev));
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		wmb();

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct drm_i915_gem_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine, i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		pr_err("Failed to start request %x, at %x\n",
		       rq->fence.seqno, hws_seqno(&h, rq));

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_gem_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_engine),
		SUBTEST(igt_reset_active_engines),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};

	if (!intel_has_gpu_reset(i915))
		return 0;

	return i915_subtests(tests, i915);
}