/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "../i915_selftest.h"

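/*
 * Simulate a GPU hang with a batch that writes the request's seqno to a
 * scratch page and then spins by branching back to its own start. The
 * spinner runs until its first instruction is overwritten with
 * MI_BATCH_BUFFER_END.
 */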
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws; /* scratch page for seqno writes */
	struct drm_i915_gem_object *obj; /* the spinning batch */
	u32 *seqno; /* CPU mapping of hws */
	u32 *batch; /* CPU mapping of obj */
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws))
		return PTR_ERR(h->hws);

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
	return err;
}

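/* Each fence context gets its own seqno dword within the scratch page. */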
static u64 hws_address(const struct i915_vma *hws,
		       const struct drm_i915_gem_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

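/*
 * Emit a batch that records the request's seqno in the scratch page and
 * then branches back to its own start, spinning until terminated.
 */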
static int emit_recurse_batch(struct hang *h,
			      struct drm_i915_gem_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
	if (err)
		goto unpin_hws;

	err = i915_switch_context(rq);
	if (err)
		goto unpin_hws;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	/*
	 * Store the seqno into this context's slot of the scratch page,
	 * then branch back to the start of the batch. The instruction
	 * encodings vary with the MI command width of each generation.
	 */
	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */

	/* MI_STORE_DWORD_IMM is a privileged command on older gens. */
	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

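/*
 * Allocate a request carrying the spinning batch. If the previous batch
 * object is still busy on the GPU, replace it with a fresh one rather
 * than rewriting instructions the hardware may still be executing.
 */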
static struct drm_i915_gem_request *
hang_create_request(struct hang *h,
		    struct intel_engine_cs *engine,
		    struct i915_gem_context *ctx)
{
	struct drm_i915_gem_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_gem_request_alloc(engine, ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_add_request(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

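/* Read back the seqno the spinner wrote for this request's context. */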
static u32 hws_seqno(const struct hang *h,
		     const struct drm_i915_gem_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

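/*
 * Terminate any spinner by overwriting its first instruction with
 * MI_BATCH_BUFFER_END, then release the objects and wait for idle.
 */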
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	wmb();

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	if (!igt_can_mi_store_dword_imm(i915))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		rq = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_gem_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		__i915_add_request(rq, true);

		timeout = i915_wait_request(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_gem_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

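/*
 * The BACKOFF and HANDOFF bits mimic the state normally set up by the
 * hangcheck worker before it hands the reset over to i915_reset().
 */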
static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

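/*
 * Declare the engine hung without waiting for the real hangcheck timer:
 * mark it as stalled at its current seqno, then raise HANDOFF and wake
 * any waiters so that one of them performs the reset. Returns the reset
 * count prior to the reset for later comparison.
 */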
static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}

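/*
 * Poll (briefly busy-wait, then sleep for up to a second) until the
 * spinner reports its seqno in the scratch page, i.e. until the batch
 * has actually started executing on the GPU.
 */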
static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct drm_i915_gem_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	/* Check that we detect a stuck waiter and issue a reset */

	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_gem_request_get(rq);
	__i915_add_request(rq, true);

	if (!wait_for_hang(&h, rq)) {
		pr_err("Failed to start request %x\n", rq->fence.seqno);
		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_wait_request(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_wait_request failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_gem_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	if (!igt_can_mi_store_dword_imm(i915))
		return 0;

	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct drm_i915_gem_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		prev = hang_create_request(&h, engine, i915->kernel_context);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_gem_request_get(prev);
		__i915_add_request(prev, true);

		count = 0;
		do {
			struct drm_i915_gem_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h,
						 engine,
						 i915->kernel_context);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_gem_request_get(rq);
			__i915_add_request(rq, true);

			if (!wait_for_hang(&h, prev)) {
				pr_err("Failed to start request %x\n",
				       prev->fence.seqno);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_gem_request_put(rq);
				i915_gem_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_gem_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		wmb();

		i915_gem_request_put(prev);
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_global_reset),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
	};

	if (!intel_has_gpu_reset(i915))
		return 0;

	return i915_subtests(tests, i915);
}