/* drivers/gpu/drm/i915/selftests/intel_hangcheck.c */
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"

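/*
 * Test fixture shared by the hangcheck selftests: a scratch page (hws) that
 * the hanging batches write their seqno into, and the batch buffer object
 * itself, both kept CPU-mapped for the duration of the test.
 */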
struct hang {
        struct drm_i915_private *i915;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        u32 *seqno;
        u32 *batch;
};

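/*
 * Allocate the scratch page and the batch object and map both for CPU
 * access (the scratch page cache-coherent, the batch WB or WC depending
 * on whether the platform has an LLC).
 */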
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->i915 = i915;

        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->hws))
                return PTR_ERR(h->hws);

        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map(h->obj,
                                        HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
        return err;
}

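/* Each fence context gets its own dword slot within the scratch page. */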
static u64 hws_address(const struct i915_vma *hws,
                       const struct drm_i915_gem_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

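/*
 * Write a self-referencing batch into h->obj: it stores the request's seqno
 * to its slot in the scratch page and then branches back to the start of the
 * batch, keeping the engine busy until the first dword is overwritten with
 * MI_BATCH_BUFFER_END (see hang_fini()).
 */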
static int emit_recurse_batch(struct hang *h,
                              struct drm_i915_gem_request *rq)
{
        struct drm_i915_private *i915 = h->i915;
        struct i915_address_space *vm =
                rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        u32 *batch;
        int err;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws))
                return PTR_ERR(hws);

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err)
                return err;

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
        if (err)
                goto unpin_hws;

        err = i915_switch_context(rq);
        if (err)
                goto unpin_hws;

        i915_vma_move_to_active(vma, rq, 0);
        if (!i915_gem_object_has_active_reference(vma->obj)) {
                i915_gem_object_get(vma->obj);
                i915_gem_object_set_active_reference(vma->obj);
        }

        i915_vma_move_to_active(hws, rq, 0);
        if (!i915_gem_object_has_active_reference(hws->obj)) {
                i915_gem_object_get(hws->obj);
                i915_gem_object_set_active_reference(hws->obj);
        }

        batch = h->batch;
        if (INTEL_GEN(i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */

        flags = 0;
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        return err;
}

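/*
 * Allocate a request and attach the recursive batch to it. If the current
 * batch object is still busy on the GPU, switch to a freshly allocated one
 * so that its contents can be rewritten safely.
 */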
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
                    struct intel_engine_cs *engine,
                    struct i915_gem_context *ctx)
{
        struct drm_i915_gem_request *rq;
        int err;

        if (i915_gem_object_is_active(h->obj)) {
                struct drm_i915_gem_object *obj;
                void *vaddr;

                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
                if (IS_ERR(obj))
                        return ERR_CAST(obj);

                vaddr = i915_gem_object_pin_map(obj,
                                                HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
                if (IS_ERR(vaddr)) {
                        i915_gem_object_put(obj);
                        return ERR_CAST(vaddr);
                }

                i915_gem_object_unpin_map(h->obj);
                i915_gem_object_put(h->obj);

                h->obj = obj;
                h->batch = vaddr;
        }

        rq = i915_gem_request_alloc(engine, ctx);
        if (IS_ERR(rq))
                return rq;

        err = emit_recurse_batch(h, rq);
        if (err) {
                __i915_add_request(rq, false);
                return ERR_PTR(err);
        }

        return rq;
}

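/* Read back the seqno the batch wrote for this request's fence context. */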
static u32 hws_seqno(const struct hang *h,
                     const struct drm_i915_gem_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

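/*
 * Stop any still-spinning batch by overwriting its first dword with
 * MI_BATCH_BUFFER_END, then release the objects and wait for the GPU
 * to idle.
 */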
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        wmb();

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static int igt_hang_sanitycheck(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct drm_i915_gem_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine, i915->kernel_context);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_gem_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                __i915_add_request(rq, true);

                timeout = i915_wait_request(rq,
                                            I915_WAIT_LOCKED,
                                            MAX_SCHEDULE_TIMEOUT);
                i915_gem_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

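/*
 * Take both the global reset backoff bit and every per-engine reset bit so
 * that the test has exclusive ownership of the reset machinery;
 * global_reset_unlock() releases them and wakes any waiters.
 */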
static void global_reset_lock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
                wait_event(i915->gpu_error.reset_queue,
                           !test_bit(I915_RESET_BACKOFF,
                                     &i915->gpu_error.flags));

        for_each_engine(engine, i915, id) {
                while (test_and_set_bit(I915_RESET_ENGINE + id,
                                        &i915->gpu_error.flags))
                        wait_on_bit(&i915->gpu_error.flags,
                                    I915_RESET_ENGINE + id,
                                    TASK_UNINTERRUPTIBLE);
        }
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, i915, id)
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

        clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
        wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        unsigned int reset_count;
        int err = 0;

        /* Check that we can issue a global GPU reset */

        global_reset_lock(i915);
        set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

        mutex_lock(&i915->drm.struct_mutex);
        reset_count = i915_reset_count(&i915->gpu_error);

        i915_reset(i915, I915_RESET_QUIET);

        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }
        mutex_unlock(&i915->drm.struct_mutex);

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

static int igt_reset_engine(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        unsigned int reset_count, reset_engine_count;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine,
         * and that it is not recorded as a full GPU reset.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        for_each_engine(engine, i915, id) {
                set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
                reset_count = i915_reset_count(&i915->gpu_error);
                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
                                                             engine);

                err = i915_reset_engine(engine, I915_RESET_QUIET);
                if (err) {
                        pr_err("i915_reset_engine failed\n");
                        break;
                }

                if (i915_reset_count(&i915->gpu_error) != reset_count) {
                        pr_err("Full GPU reset recorded! (engine reset expected)\n");
                        err = -EINVAL;
                        break;
                }

                if (i915_reset_engine_count(&i915->gpu_error, engine) ==
                    reset_engine_count) {
                        pr_err("No %s engine reset recorded!\n", engine->name);
                        err = -EINVAL;
                        break;
                }

                clear_bit(I915_RESET_ENGINE + engine->id,
                          &i915->gpu_error.flags);
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

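/*
 * kthread body used as background load: submit a stream of empty requests
 * on the given engine, alternating between two contexts and waiting on the
 * older of the last two requests each time around the loop, until asked
 * to stop.
 */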
static int active_engine(void *data)
{
        struct intel_engine_cs *engine = data;
        struct drm_i915_gem_request *rq[2] = {};
        struct i915_gem_context *ctx[2];
        struct drm_file *file;
        unsigned long count = 0;
        int err = 0;

        file = mock_file(engine->i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        mutex_lock(&engine->i915->drm.struct_mutex);
        ctx[0] = live_context(engine->i915, file);
        mutex_unlock(&engine->i915->drm.struct_mutex);
        if (IS_ERR(ctx[0])) {
                err = PTR_ERR(ctx[0]);
                goto err_file;
        }

        mutex_lock(&engine->i915->drm.struct_mutex);
        ctx[1] = live_context(engine->i915, file);
        mutex_unlock(&engine->i915->drm.struct_mutex);
        if (IS_ERR(ctx[1])) {
                err = PTR_ERR(ctx[1]);
                i915_gem_context_put(ctx[0]);
                goto err_file;
        }

        while (!kthread_should_stop()) {
                unsigned int idx = count++ & 1;
                struct drm_i915_gem_request *old = rq[idx];
                struct drm_i915_gem_request *new;

                mutex_lock(&engine->i915->drm.struct_mutex);
                new = i915_gem_request_alloc(engine, ctx[idx]);
                if (IS_ERR(new)) {
                        mutex_unlock(&engine->i915->drm.struct_mutex);
                        err = PTR_ERR(new);
                        break;
                }

                rq[idx] = i915_gem_request_get(new);
                i915_add_request(new);
                mutex_unlock(&engine->i915->drm.struct_mutex);

                if (old) {
                        i915_wait_request(old, 0, MAX_SCHEDULE_TIMEOUT);
                        i915_gem_request_put(old);
                }
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++)
                i915_gem_request_put(rq[count]);

err_file:
        mock_file_free(engine->i915, file);
        return err;
}

static int igt_reset_active_engines(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine, *active;
        enum intel_engine_id id, tmp;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        for_each_engine(engine, i915, id) {
                struct task_struct *threads[I915_NUM_ENGINES];
                unsigned long resets[I915_NUM_ENGINES];
                unsigned long global = i915_reset_count(&i915->gpu_error);
                IGT_TIMEOUT(end_time);

                memset(threads, 0, sizeof(threads));
                for_each_engine(active, i915, tmp) {
                        struct task_struct *tsk;

                        if (active == engine)
                                continue;

                        resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
                                                              active);

                        tsk = kthread_run(active_engine, active,
                                          "igt/%s", active->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp] = tsk;
                        get_task_struct(tsk);
                }

                set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags);
                do {
                        err = i915_reset_engine(engine, I915_RESET_QUIET);
                        if (err) {
                                pr_err("i915_reset_engine(%s) failed, err=%d\n",
                                       engine->name, err);
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + engine->id,
                          &i915->gpu_error.flags);

unwind:
                for_each_engine(active, i915, tmp) {
                        int ret;

                        if (!threads[tmp])
                                continue;

                        ret = kthread_stop(threads[tmp]);
                        if (ret) {
                                pr_err("kthread for active engine %s failed, err=%d\n",
                                       active->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp]);

                        if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
                                                                   active)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       active->name,
                                       i915_reset_engine_count(&i915->gpu_error,
                                                               active) - resets[tmp]);
                                err = -EIO;
                        }
                }

                if (global != i915_reset_count(&i915->gpu_error)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(&i915->gpu_error) - global);
                        err = -EIO;
                }

                if (err)
                        break;

                cond_resched();
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

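/*
 * Pretend hangcheck has declared the request's engine hung: mark it as
 * stalled at its current seqno and hand off to the reset handler, returning
 * the global reset count sampled beforehand.
 */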
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
        u32 reset_count;

        rq->engine->hangcheck.stalled = true;
        rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

        reset_count = i915_reset_count(&rq->i915->gpu_error);

        set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
        wake_up_all(&rq->i915->gpu_error.wait_queue);

        return reset_count;
}

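/*
 * Poll the scratch page until the hanging batch reports its seqno, i.e.
 * until it has actually started executing on the GPU.
 */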
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_wait_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct drm_i915_gem_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_gem_request_get(rq);
        __i915_add_request(rq, true);

        if (!wait_for_hang(&h, rq)) {
                pr_err("Failed to start request %x, at %x\n",
                       rq->fence.seqno, hws_seqno(&h, rq));

                i915_reset(i915, 0);
                i915_gem_set_wedged(i915);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(rq);

        timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
        if (timeout < 0) {
                pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_gem_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}

static int igt_reset_queue(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Check that we replay pending requests following a hang */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                struct drm_i915_gem_request *prev;
                IGT_TIMEOUT(end_time);
                unsigned int count;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                prev = hang_create_request(&h, engine, i915->kernel_context);
                if (IS_ERR(prev)) {
                        err = PTR_ERR(prev);
                        goto fini;
                }

                i915_gem_request_get(prev);
                __i915_add_request(prev, true);

                count = 0;
                do {
                        struct drm_i915_gem_request *rq;
                        unsigned int reset_count;

                        rq = hang_create_request(&h,
                                                 engine,
                                                 i915->kernel_context);
                        if (IS_ERR(rq)) {
                                err = PTR_ERR(rq);
                                goto fini;
                        }

                        i915_gem_request_get(rq);
                        __i915_add_request(rq, true);

                        if (!wait_for_hang(&h, prev)) {
                                pr_err("Failed to start request %x, at %x\n",
                                       prev->fence.seqno, hws_seqno(&h, prev));
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);

                                i915_reset(i915, 0);
                                i915_gem_set_wedged(i915);

                                err = -EIO;
                                goto fini;
                        }

                        reset_count = fake_hangcheck(prev);

                        i915_reset(i915, I915_RESET_QUIET);

                        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
                                            &i915->gpu_error.flags));

                        if (prev->fence.error != -EIO) {
                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
                                       prev->fence.error);
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (rq->fence.error) {
                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
                                       rq->fence.error);
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                                pr_err("No GPU reset recorded!\n");
                                i915_gem_request_put(rq);
                                i915_gem_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        i915_gem_request_put(prev);
                        prev = rq;
                        count++;
                } while (time_before(jiffies, end_time));
                pr_info("%s: Completed %d resets\n", engine->name, count);

                *h.batch = MI_BATCH_BUFFER_END;
                wmb();

                i915_gem_request_put(prev);
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}

static int igt_handle_error(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine = i915->engine[RCS];
        struct hang h;
        struct drm_i915_gem_request *rq;
        struct i915_gpu_state *error;
        int err;

        /* Check that i915_handle_error() resets the engine and marks the
         * hanging request as guilty.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        mutex_lock(&i915->drm.struct_mutex);

        err = hang_init(&h, i915);
        if (err)
                goto err_unlock;

        rq = hang_create_request(&h, engine, i915->kernel_context);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto err_fini;
        }

        i915_gem_request_get(rq);
        __i915_add_request(rq, true);

        if (!wait_for_hang(&h, rq)) {
                pr_err("Failed to start request %x, at %x\n",
                       rq->fence.seqno, hws_seqno(&h, rq));

                i915_reset(i915, 0);
                i915_gem_set_wedged(i915);

                err = -EIO;
                goto err_request;
        }

        mutex_unlock(&i915->drm.struct_mutex);

        /* Temporarily disable error capture */
        error = xchg(&i915->gpu_error.first_error, (void *)-1);

        engine->hangcheck.stalled = true;
        engine->hangcheck.seqno = intel_engine_get_seqno(engine);

        i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

        xchg(&i915->gpu_error.first_error, error);

        mutex_lock(&i915->drm.struct_mutex);

        if (rq->fence.error != -EIO) {
                pr_err("Guilty request not identified!\n");
                err = -EINVAL;
                goto err_request;
        }

err_request:
        i915_gem_request_put(rq);
err_fini:
        hang_fini(&h);
err_unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

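/* Entry point: run the subtests only on hardware that supports GPU reset. */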
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
        static const struct i915_subtest tests[] = {
                SUBTEST(igt_global_reset), /* attempt to recover GPU first */
                SUBTEST(igt_hang_sanitycheck),
                SUBTEST(igt_reset_engine),
                SUBTEST(igt_reset_active_engines),
                SUBTEST(igt_wait_reset),
                SUBTEST(igt_reset_queue),
                SUBTEST(igt_handle_error),
        };

        if (!intel_has_gpu_reset(i915))
                return 0;

        return i915_subtests(tests, i915);
}