/* drivers/gpu/drm/i915/selftests/intel_hangcheck.c */
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"

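/*
 * Test fixture shared by the hangcheck selftests: a scratch page (hws) that
 * the hanging batches write their seqno into, and the batch buffer object
 * itself, both kept CPU-mapped for the duration of the test.
 */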
struct hang {
        struct drm_i915_private *i915;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        u32 *seqno;
        u32 *batch;
};

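/*
 * Allocate the scratch page and the batch object and map both for CPU
 * access (the scratch page cache-coherent, the batch WB or WC depending
 * on whether the platform has an LLC).
 */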
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->i915 = i915;

        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->hws))
                return PTR_ERR(h->hws);

        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map(h->obj,
                                        HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
        return err;
}

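/* Each fence context gets its own dword slot within the scratch page. */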
static u64 hws_address(const struct i915_vma *hws,
                       const struct drm_i915_gem_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

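/*
 * Write a self-referencing batch into h->obj: it stores the request's seqno
 * to its slot in the scratch page and then branches back to the start of the
 * batch, keeping the engine busy until the first dword is overwritten with
 * MI_BATCH_BUFFER_END (see hang_fini()).
 */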
static int emit_recurse_batch(struct hang *h,
                              struct drm_i915_gem_request *rq)
{
        struct drm_i915_private *i915 = h->i915;
        struct i915_address_space *vm =
                rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        u32 *batch;
        int err;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws))
                return PTR_ERR(hws);

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err)
                return err;

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
        if (err)
                goto unpin_hws;

        err = i915_switch_context(rq);
        if (err)
                goto unpin_hws;

        i915_vma_move_to_active(vma, rq, 0);
        if (!i915_gem_object_has_active_reference(vma->obj)) {
                i915_gem_object_get(vma->obj);
                i915_gem_object_set_active_reference(vma->obj);
        }

        i915_vma_move_to_active(hws, rq, 0);
        if (!i915_gem_object_has_active_reference(hws->obj)) {
                i915_gem_object_get(hws->obj);
                i915_gem_object_set_active_reference(hws->obj);
        }

        batch = h->batch;
        if (INTEL_GEN(i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */

        flags = 0;
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        return err;
}

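/*
 * Allocate a request and attach the recursive batch to it. If the current
 * batch object is still busy on the GPU, switch to a freshly allocated one
 * so that its contents can be rewritten safely.
 */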
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
                    struct intel_engine_cs *engine,
                    struct i915_gem_context *ctx)
{
        struct drm_i915_gem_request *rq;
        int err;

        if (i915_gem_object_is_active(h->obj)) {
                struct drm_i915_gem_object *obj;
                void *vaddr;

                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
                if (IS_ERR(obj))
                        return ERR_CAST(obj);

                vaddr = i915_gem_object_pin_map(obj,
                                                HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
                if (IS_ERR(vaddr)) {
                        i915_gem_object_put(obj);
                        return ERR_CAST(vaddr);
                }

                i915_gem_object_unpin_map(h->obj);
                i915_gem_object_put(h->obj);

                h->obj = obj;
                h->batch = vaddr;
        }

        rq = i915_gem_request_alloc(engine, ctx);
        if (IS_ERR(rq))
                return rq;

        err = emit_recurse_batch(h, rq);
        if (err) {
                __i915_add_request(rq, false);
                return ERR_PTR(err);
        }

        return rq;
}

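/* Read back the seqno the batch wrote for this request's fence context. */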
static u32 hws_seqno(const struct hang *h,
                     const struct drm_i915_gem_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

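/*
 * Stop any still-spinning batch by overwriting its first dword with
 * MI_BATCH_BUFFER_END, then release the objects and wait for the GPU
 * to idle.
 */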
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        wmb();

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static int igt_hang_sanitycheck(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct drm_i915_gem_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine, i915->kernel_context);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_gem_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                __i915_add_request(rq, true);

                timeout = i915_wait_request(rq,
                                            I915_WAIT_LOCKED,
                                            MAX_SCHEDULE_TIMEOUT);
                i915_gem_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

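/*
 * Take both the global reset backoff bit and every per-engine reset bit so
 * that the test has exclusive ownership of the reset machinery;
 * global_reset_unlock() releases them and wakes any waiters.
 */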
static void global_reset_lock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
                wait_event(i915->gpu_error.reset_queue,
                           !test_bit(I915_RESET_BACKOFF,
                                     &i915->gpu_error.flags));

        for_each_engine(engine, i915, id) {
                while (test_and_set_bit(I915_RESET_ENGINE + id,
                                        &i915->gpu_error.flags))
                        wait_on_bit(&i915->gpu_error.flags,
                                    I915_RESET_ENGINE + id,
                                    TASK_UNINTERRUPTIBLE);
        }
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, i915, id)
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

        clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
        wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        unsigned int reset_count;
        int err = 0;

        /* Check that we can issue a global GPU reset */

        global_reset_lock(i915);
        set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

        mutex_lock(&i915->drm.struct_mutex);
        reset_count = i915_reset_count(&i915->gpu_error);

        i915_reset(i915, I915_RESET_QUIET);

        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }
        mutex_unlock(&i915->drm.struct_mutex);

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

static int igt_reset_engine(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        unsigned int reset_count, reset_engine_count;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine,
         * and that it is not recorded as a full GPU reset.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        for_each_engine(engine, i915, id) {
                set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
                reset_count = i915_reset_count(&i915->gpu_error);
                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
                                                             engine);

                err = i915_reset_engine(engine, I915_RESET_QUIET);
                if (err) {
                        pr_err("i915_reset_engine failed\n");
                        break;
                }

                if (i915_reset_count(&i915->gpu_error) != reset_count) {
                        pr_err("Full GPU reset recorded! (engine reset expected)\n");
                        err = -EINVAL;
                        break;
                }

                if (i915_reset_engine_count(&i915->gpu_error, engine) ==
                    reset_engine_count) {
                        pr_err("No %s engine reset recorded!\n", engine->name);
                        err = -EINVAL;
                        break;
                }

                clear_bit(I915_RESET_ENGINE + engine->id,
                          &i915->gpu_error.flags);
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

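/*
 * kthread body used as background load: submit a stream of empty requests
 * on the given engine, alternating between two contexts and waiting on the
 * older of the last two requests each time around the loop, until asked
 * to stop.
 */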
static int active_engine(void *data)
{
        struct intel_engine_cs *engine = data;
        struct drm_i915_gem_request *rq[2] = {};
        struct i915_gem_context *ctx[2];
        struct drm_file *file;
        unsigned long count = 0;
        int err = 0;

        file = mock_file(engine->i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        mutex_lock(&engine->i915->drm.struct_mutex);
        ctx[0] = live_context(engine->i915, file);
        mutex_unlock(&engine->i915->drm.struct_mutex);
        if (IS_ERR(ctx[0])) {
                err = PTR_ERR(ctx[0]);
                goto err_file;
        }

        mutex_lock(&engine->i915->drm.struct_mutex);
        ctx[1] = live_context(engine->i915, file);
        mutex_unlock(&engine->i915->drm.struct_mutex);
        if (IS_ERR(ctx[1])) {
                err = PTR_ERR(ctx[1]);
                i915_gem_context_put(ctx[0]);
                goto err_file;
        }

        while (!kthread_should_stop()) {
                unsigned int idx = count++ & 1;
                struct drm_i915_gem_request *old = rq[idx];
                struct drm_i915_gem_request *new;

                mutex_lock(&engine->i915->drm.struct_mutex);
                new = i915_gem_request_alloc(engine, ctx[idx]);
                if (IS_ERR(new)) {
                        mutex_unlock(&engine->i915->drm.struct_mutex);
                        err = PTR_ERR(new);
                        break;
                }

                rq[idx] = i915_gem_request_get(new);
                i915_add_request(new);
                mutex_unlock(&engine->i915->drm.struct_mutex);

                if (old) {
                        i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
                        i915_gem_request_put(old);
                }
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++)
                i915_gem_request_put(rq[count]);

err_file:
        mock_file_free(engine->i915, file);
        return err;
}

static int igt_reset_active_engines(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine, *active;
        enum intel_engine_id id, tmp;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        for_each_engine(engine, i915, id) {
                struct task_struct *threads[I915_NUM_ENGINES];
                unsigned long resets[I915_NUM_ENGINES];
                unsigned long global = i915_reset_count(&i915->gpu_error);
                IGT_TIMEOUT(end_time);

                memset(threads, 0, sizeof(threads));
                for_each_engine(active, i915, tmp) {
                        struct task_struct *tsk;

                        if (active == engine)
                                continue;

                        resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
                                                              active);

                        tsk = kthread_run(active_engine, active,
                                          "igt/%s", active->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp] = tsk;
                        get_task_struct(tsk);
                }

                set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
                do {
                        err = i915_reset_engine(engine, I915_RESET_QUIET);
                        if (err) {
                                pr_err("i915_reset_engine(%s) failed, err=%d\n",
                                       engine->name, err);
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + engine->id,
                          &i915->gpu_error.flags);

unwind:
                for_each_engine(active, i915, tmp) {
                        int ret;

                        if (!threads[tmp])
                                continue;

                        ret = kthread_stop(threads[tmp]);
                        if (ret) {
                                pr_err("kthread for active engine %s failed, err=%d\n",
                                       active->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp]);

                        if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
                                                                   active)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       active->name,
                                       i915_reset_engine_count(&i915->gpu_error,
                                                               active) - resets[tmp]);
                                err = -EIO;
                        }
                }

                if (global != i915_reset_count(&i915->gpu_error)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(&i915->gpu_error) - global);
                        err = -EIO;
                }

                if (err)
                        break;

                cond_resched();
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

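/*
 * Pretend hangcheck has declared the request's engine hung: mark it as
 * stalled at its current seqno and hand off to the reset handler, returning
 * the global reset count sampled beforehand.
 */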
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
        u32 reset_count;

        rq->engine->hangcheck.stalled = true;
        rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

        reset_count = i915_reset_count(&rq->i915->gpu_error);

        set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
        wake_up_all(&rq->i915->gpu_error.wait_queue);

        return reset_count;
}

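/*
 * Poll the scratch page until the hanging batch reports its seqno, i.e.
 * until it has actually started executing on the GPU.
 */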
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_wait_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct drm_i915_gem_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_gem_request_get(rq);
        __i915_add_request(rq, true);

        if (!wait_for_hang(&h, rq)) {
                pr_err("Failed to start request %x, at %x\n",
                       rq->fence.seqno, hws_seqno(&h, rq));

                i915_reset(i915, 0);
                i915_gem_set_wedged(i915);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(rq);

        timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
        if (timeout < 0) {
                pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_gem_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}

static int igt_reset_queue(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Check that we replay pending requests following a hang */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                struct drm_i915_gem_request *prev;
                IGT_TIMEOUT(end_time);
                unsigned int count;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                prev = hang_create_request(&h, engine, i915->kernel_context);
                if (IS_ERR(prev)) {
                        err = PTR_ERR(prev);
                        goto fini;
                }

                i915_gem_request_get(prev);
                __i915_add_request(prev, true);

                count = 0;
                do {
                        struct drm_i915_gem_request *rq;
                        unsigned int reset_count;

                        rq = hang_create_request(&h,
                                                 engine,
                                                 i915->kernel_context);
                        if (IS_ERR(rq)) {
                                err = PTR_ERR(rq);
                                goto fini;
                        }

                        i915_gem_request_get(rq);
                        __i915_add_request(rq, true);

                        if (!wait_for_hang(&h, prev)) {
                                pr_err("Failed to start request %x, at %x\n",
                                       prev->fence.seqno, hws_seqno(&h, prev));
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);

                                i915_reset(i915, 0);
                                i915_gem_set_wedged(i915);

                                err = -EIO;
                                goto fini;
                        }

                        reset_count = fake_hangcheck(prev);

                        i915_reset(i915, I915_RESET_QUIET);

                        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
                                            &i915->gpu_error.flags));

                        if (prev->fence.error != -EIO) {
                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
                                       prev->fence.error);
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (rq->fence.error) {
                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
                                       rq->fence.error);
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                                pr_err("No GPU reset recorded!\n");
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        i915_gem_request_put(prev);
                        prev = rq;
                        count++;
                } while (time_before(jiffies, end_time));
                pr_info("%s: Completed %d resets\n", engine->name, count);

                *h.batch = MI_BATCH_BUFFER_END;
                wmb();

                i915_gem_request_put(prev);
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}

static int igt_handle_error(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine = i915->engine[RCS];
        struct hang h;
        struct drm_i915_gem_request *rq;
        struct i915_gpu_state *error;
        int err;

        /* Check that i915_handle_error() resets the engine and marks the
         * hanging request as guilty.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        mutex_lock(&i915->drm.struct_mutex);

        err = hang_init(&h, i915);
        if (err)
                goto err_unlock;

        rq = hang_create_request(&h, engine, i915->kernel_context);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto err_fini;
        }

        i915_gem_request_get(rq);
        __i915_add_request(rq, true);

        if (!wait_for_hang(&h, rq)) {
                pr_err("Failed to start request %x, at %x\n",
                       rq->fence.seqno, hws_seqno(&h, rq));

                i915_reset(i915, 0);
                i915_gem_set_wedged(i915);

                err = -EIO;
                goto err_request;
        }

        mutex_unlock(&i915->drm.struct_mutex);

        /* Temporarily disable error capture */
        error = xchg(&i915->gpu_error.first_error, (void *)-1);

        engine->hangcheck.stalled = true;
        engine->hangcheck.seqno = intel_engine_get_seqno(engine);

        i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

        xchg(&i915->gpu_error.first_error, error);

        mutex_lock(&i915->drm.struct_mutex);

        if (rq->fence.error != -EIO) {
                pr_err("Guilty request not identified!\n");
                err = -EINVAL;
                goto err_request;
        }

err_request:
        i915_gem_request_put(rq);
err_fini:
        hang_fini(&h);
err_unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

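/* Entry point: run the subtests only on hardware that supports GPU reset. */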
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
        static const struct i915_subtest tests[] = {
                SUBTEST(igt_global_reset), /* attempt to recover GPU first */
                SUBTEST(igt_hang_sanitycheck),
                SUBTEST(igt_reset_engine),
                SUBTEST(igt_reset_active_engines),
                SUBTEST(igt_wait_reset),
                SUBTEST(igt_reset_queue),
                SUBTEST(igt_handle_error),
        };

        if (!intel_has_gpu_reset(i915))
                return 0;

        return i915_subtests(tests, i915);
}