]>
git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - drivers/gpu/drm/i915/i915_gpu_error.c
2 * Copyright (c) 2008 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Eric Anholt <eric@anholt.net>
25 * Keith Packard <keithp@keithp.com>
26 * Mika Kuoppala <mika.kuoppala@intel.com>
30 #include <generated/utsrelease.h>
31 #include <linux/stop_machine.h>
32 #include <linux/zlib.h>
35 static const char *engine_str(int engine
)
38 case RCS
: return "render";
39 case VCS
: return "bsd";
40 case BCS
: return "blt";
41 case VECS
: return "vebox";
42 case VCS2
: return "bsd2";
47 static const char *tiling_flag(int tiling
)
51 case I915_TILING_NONE
: return "";
52 case I915_TILING_X
: return " X";
53 case I915_TILING_Y
: return " Y";
57 static const char *dirty_flag(int dirty
)
59 return dirty
? " dirty" : "";
62 static const char *purgeable_flag(int purgeable
)
64 return purgeable
? " purgeable" : "";
67 static bool __i915_error_ok(struct drm_i915_error_state_buf
*e
)
70 if (!e
->err
&& WARN(e
->bytes
> (e
->size
- 1), "overflow")) {
75 if (e
->bytes
== e
->size
- 1 || e
->err
)
81 static bool __i915_error_seek(struct drm_i915_error_state_buf
*e
,
84 if (e
->pos
+ len
<= e
->start
) {
89 /* First vsnprintf needs to fit in its entirety for memmove */
98 static void __i915_error_advance(struct drm_i915_error_state_buf
*e
,
101 /* If this is the first printf in this window, adjust it so that
102 * the start position matches the start of the buffer
105 if (e
->pos
< e
->start
) {
106 const size_t off
= e
->start
- e
->pos
;
108 /* Should not happen but be paranoid */
109 if (off
> len
|| e
->bytes
) {
114 memmove(e
->buf
, e
->buf
+ off
, len
- off
);
115 e
->bytes
= len
- off
;
125 static void i915_error_vprintf(struct drm_i915_error_state_buf
*e
,
126 const char *f
, va_list args
)
130 if (!__i915_error_ok(e
))
133 /* Seek the first printf which hits the start position */
134 if (e
->pos
< e
->start
) {
138 len
= vsnprintf(NULL
, 0, f
, tmp
);
141 if (!__i915_error_seek(e
, len
))
145 len
= vsnprintf(e
->buf
+ e
->bytes
, e
->size
- e
->bytes
, f
, args
);
146 if (len
>= e
->size
- e
->bytes
)
147 len
= e
->size
- e
->bytes
- 1;
149 __i915_error_advance(e
, len
);
152 static void i915_error_puts(struct drm_i915_error_state_buf
*e
,
157 if (!__i915_error_ok(e
))
162 /* Seek the first printf which hits the start position */
163 if (e
->pos
< e
->start
) {
164 if (!__i915_error_seek(e
, len
))
168 if (len
>= e
->size
- e
->bytes
)
169 len
= e
->size
- e
->bytes
- 1;
170 memcpy(e
->buf
+ e
->bytes
, str
, len
);
172 __i915_error_advance(e
, len
);
175 #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
176 #define err_puts(e, s) i915_error_puts(e, s)
178 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
181 struct z_stream_s zstream
;
185 static bool compress_init(struct compress
*c
)
187 struct z_stream_s
*zstream
= memset(&c
->zstream
, 0, sizeof(c
->zstream
));
190 kmalloc(zlib_deflate_workspacesize(MAX_WBITS
, MAX_MEM_LEVEL
),
191 GFP_ATOMIC
| __GFP_NOWARN
);
192 if (!zstream
->workspace
)
195 if (zlib_deflateInit(zstream
, Z_DEFAULT_COMPRESSION
) != Z_OK
) {
196 kfree(zstream
->workspace
);
201 if (i915_has_memcpy_from_wc())
202 c
->tmp
= (void *)__get_free_page(GFP_ATOMIC
| __GFP_NOWARN
);
207 static int compress_page(struct compress
*c
,
209 struct drm_i915_error_object
*dst
)
211 struct z_stream_s
*zstream
= &c
->zstream
;
213 zstream
->next_in
= src
;
214 if (c
->tmp
&& i915_memcpy_from_wc(c
->tmp
, src
, PAGE_SIZE
))
215 zstream
->next_in
= c
->tmp
;
216 zstream
->avail_in
= PAGE_SIZE
;
219 if (zstream
->avail_out
== 0) {
222 page
= __get_free_page(GFP_ATOMIC
| __GFP_NOWARN
);
226 dst
->pages
[dst
->page_count
++] = (void *)page
;
228 zstream
->next_out
= (void *)page
;
229 zstream
->avail_out
= PAGE_SIZE
;
232 if (zlib_deflate(zstream
, Z_SYNC_FLUSH
) != Z_OK
)
234 } while (zstream
->avail_in
);
236 /* Fallback to uncompressed if we increase size? */
237 if (0 && zstream
->total_out
> zstream
->total_in
)
243 static void compress_fini(struct compress
*c
,
244 struct drm_i915_error_object
*dst
)
246 struct z_stream_s
*zstream
= &c
->zstream
;
249 zlib_deflate(zstream
, Z_FINISH
);
250 dst
->unused
= zstream
->avail_out
;
253 zlib_deflateEnd(zstream
);
254 kfree(zstream
->workspace
);
257 free_page((unsigned long)c
->tmp
);
260 static void err_compression_marker(struct drm_i915_error_state_buf
*m
)
270 static bool compress_init(struct compress
*c
)
275 static int compress_page(struct compress
*c
,
277 struct drm_i915_error_object
*dst
)
282 page
= __get_free_page(GFP_ATOMIC
| __GFP_NOWARN
);
287 if (!i915_memcpy_from_wc(ptr
, src
, PAGE_SIZE
))
288 memcpy(ptr
, src
, PAGE_SIZE
);
289 dst
->pages
[dst
->page_count
++] = ptr
;
294 static void compress_fini(struct compress
*c
,
295 struct drm_i915_error_object
*dst
)
299 static void err_compression_marker(struct drm_i915_error_state_buf
*m
)
306 static void print_error_buffers(struct drm_i915_error_state_buf
*m
,
308 struct drm_i915_error_buffer
*err
,
313 err_printf(m
, "%s [%d]:\n", name
, count
);
316 err_printf(m
, " %08x_%08x %8u %02x %02x [ ",
317 upper_32_bits(err
->gtt_offset
),
318 lower_32_bits(err
->gtt_offset
),
322 for (i
= 0; i
< I915_NUM_ENGINES
; i
++)
323 err_printf(m
, "%02x ", err
->rseqno
[i
]);
325 err_printf(m
, "] %02x", err
->wseqno
);
326 err_puts(m
, tiling_flag(err
->tiling
));
327 err_puts(m
, dirty_flag(err
->dirty
));
328 err_puts(m
, purgeable_flag(err
->purgeable
));
329 err_puts(m
, err
->userptr
? " userptr" : "");
330 err_puts(m
, err
->engine
!= -1 ? " " : "");
331 err_puts(m
, engine_str(err
->engine
));
332 err_puts(m
, i915_cache_level_str(m
->i915
, err
->cache_level
));
335 err_printf(m
, " (name: %d)", err
->name
);
336 if (err
->fence_reg
!= I915_FENCE_REG_NONE
)
337 err_printf(m
, " (fence: %d)", err
->fence_reg
);
344 static void error_print_instdone(struct drm_i915_error_state_buf
*m
,
345 const struct drm_i915_error_engine
*ee
)
350 err_printf(m
, " INSTDONE: 0x%08x\n",
351 ee
->instdone
.instdone
);
353 if (ee
->engine_id
!= RCS
|| INTEL_GEN(m
->i915
) <= 3)
356 err_printf(m
, " SC_INSTDONE: 0x%08x\n",
357 ee
->instdone
.slice_common
);
359 if (INTEL_GEN(m
->i915
) <= 6)
362 for_each_instdone_slice_subslice(m
->i915
, slice
, subslice
)
363 err_printf(m
, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
365 ee
->instdone
.sampler
[slice
][subslice
]);
367 for_each_instdone_slice_subslice(m
->i915
, slice
, subslice
)
368 err_printf(m
, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
370 ee
->instdone
.row
[slice
][subslice
]);
373 static void error_print_request(struct drm_i915_error_state_buf
*m
,
375 const struct drm_i915_error_request
*erq
)
380 err_printf(m
, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
381 prefix
, erq
->pid
, erq
->ban_score
,
382 erq
->context
, erq
->seqno
,
383 jiffies_to_msecs(jiffies
- erq
->jiffies
),
384 erq
->head
, erq
->tail
);
387 static void error_print_context(struct drm_i915_error_state_buf
*m
,
389 const struct drm_i915_error_context
*ctx
)
391 err_printf(m
, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
392 header
, ctx
->comm
, ctx
->pid
, ctx
->handle
, ctx
->hw_id
,
393 ctx
->ban_score
, ctx
->guilty
, ctx
->active
);
396 static void error_print_engine(struct drm_i915_error_state_buf
*m
,
397 const struct drm_i915_error_engine
*ee
)
401 err_printf(m
, "%s command stream:\n", engine_str(ee
->engine_id
));
402 err_printf(m
, " START: 0x%08x\n", ee
->start
);
403 err_printf(m
, " HEAD: 0x%08x [0x%08x]\n", ee
->head
, ee
->rq_head
);
404 err_printf(m
, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
405 ee
->tail
, ee
->rq_post
, ee
->rq_tail
);
406 err_printf(m
, " CTL: 0x%08x\n", ee
->ctl
);
407 err_printf(m
, " MODE: 0x%08x\n", ee
->mode
);
408 err_printf(m
, " HWS: 0x%08x\n", ee
->hws
);
409 err_printf(m
, " ACTHD: 0x%08x %08x\n",
410 (u32
)(ee
->acthd
>>32), (u32
)ee
->acthd
);
411 err_printf(m
, " IPEIR: 0x%08x\n", ee
->ipeir
);
412 err_printf(m
, " IPEHR: 0x%08x\n", ee
->ipehr
);
414 error_print_instdone(m
, ee
);
416 if (ee
->batchbuffer
) {
417 u64 start
= ee
->batchbuffer
->gtt_offset
;
418 u64 end
= start
+ ee
->batchbuffer
->gtt_size
;
420 err_printf(m
, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
421 upper_32_bits(start
), lower_32_bits(start
),
422 upper_32_bits(end
), lower_32_bits(end
));
424 if (INTEL_GEN(m
->i915
) >= 4) {
425 err_printf(m
, " BBADDR: 0x%08x_%08x\n",
426 (u32
)(ee
->bbaddr
>>32), (u32
)ee
->bbaddr
);
427 err_printf(m
, " BB_STATE: 0x%08x\n", ee
->bbstate
);
428 err_printf(m
, " INSTPS: 0x%08x\n", ee
->instps
);
430 err_printf(m
, " INSTPM: 0x%08x\n", ee
->instpm
);
431 err_printf(m
, " FADDR: 0x%08x %08x\n", upper_32_bits(ee
->faddr
),
432 lower_32_bits(ee
->faddr
));
433 if (INTEL_GEN(m
->i915
) >= 6) {
434 err_printf(m
, " RC PSMI: 0x%08x\n", ee
->rc_psmi
);
435 err_printf(m
, " FAULT_REG: 0x%08x\n", ee
->fault_reg
);
436 err_printf(m
, " SYNC_0: 0x%08x\n",
437 ee
->semaphore_mboxes
[0]);
438 err_printf(m
, " SYNC_1: 0x%08x\n",
439 ee
->semaphore_mboxes
[1]);
440 if (HAS_VEBOX(m
->i915
))
441 err_printf(m
, " SYNC_2: 0x%08x\n",
442 ee
->semaphore_mboxes
[2]);
444 if (USES_PPGTT(m
->i915
)) {
445 err_printf(m
, " GFX_MODE: 0x%08x\n", ee
->vm_info
.gfx_mode
);
447 if (INTEL_GEN(m
->i915
) >= 8) {
449 for (i
= 0; i
< 4; i
++)
450 err_printf(m
, " PDP%d: 0x%016llx\n",
451 i
, ee
->vm_info
.pdp
[i
]);
453 err_printf(m
, " PP_DIR_BASE: 0x%08x\n",
454 ee
->vm_info
.pp_dir_base
);
457 err_printf(m
, " seqno: 0x%08x\n", ee
->seqno
);
458 err_printf(m
, " last_seqno: 0x%08x\n", ee
->last_seqno
);
459 err_printf(m
, " waiting: %s\n", yesno(ee
->waiting
));
460 err_printf(m
, " ring->head: 0x%08x\n", ee
->cpu_ring_head
);
461 err_printf(m
, " ring->tail: 0x%08x\n", ee
->cpu_ring_tail
);
462 err_printf(m
, " hangcheck stall: %s\n", yesno(ee
->hangcheck_stalled
));
463 err_printf(m
, " hangcheck action: %s\n",
464 hangcheck_action_to_str(ee
->hangcheck_action
));
465 err_printf(m
, " hangcheck action timestamp: %lu, %u ms ago\n",
466 ee
->hangcheck_timestamp
,
467 jiffies_to_msecs(jiffies
- ee
->hangcheck_timestamp
));
468 err_printf(m
, " engine reset count: %u\n", ee
->reset_count
);
470 for (n
= 0; n
< ee
->num_ports
; n
++) {
471 err_printf(m
, " ELSP[%d]:", n
);
472 error_print_request(m
, " ", &ee
->execlist
[n
]);
475 error_print_context(m
, " Active context: ", &ee
->context
);
478 void i915_error_printf(struct drm_i915_error_state_buf
*e
, const char *f
, ...)
483 i915_error_vprintf(e
, f
, args
);
488 ascii85_encode_len(int len
)
490 return DIV_ROUND_UP(len
, 4);
494 ascii85_encode(u32 in
, char *out
)
503 out
[i
] = '!' + in
% 85;
510 static void print_error_obj(struct drm_i915_error_state_buf
*m
,
511 struct intel_engine_cs
*engine
,
513 struct drm_i915_error_object
*obj
)
522 err_printf(m
, "%s --- %s = 0x%08x %08x\n",
523 engine
? engine
->name
: "global", name
,
524 upper_32_bits(obj
->gtt_offset
),
525 lower_32_bits(obj
->gtt_offset
));
528 err_compression_marker(m
);
529 for (page
= 0; page
< obj
->page_count
; page
++) {
533 if (page
== obj
->page_count
- 1)
535 len
= ascii85_encode_len(len
);
537 for (i
= 0; i
< len
; i
++) {
538 if (ascii85_encode(obj
->pages
[page
][i
], out
))
547 static void err_print_capabilities(struct drm_i915_error_state_buf
*m
,
548 const struct intel_device_info
*info
)
550 #define PRINT_FLAG(x) err_printf(m, #x ": %s\n", yesno(info->x))
551 DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG
);
555 static __always_inline
void err_print_param(struct drm_i915_error_state_buf
*m
,
560 if (!__builtin_strcmp(type
, "bool"))
561 err_printf(m
, "i915.%s=%s\n", name
, yesno(*(const bool *)x
));
562 else if (!__builtin_strcmp(type
, "int"))
563 err_printf(m
, "i915.%s=%d\n", name
, *(const int *)x
);
564 else if (!__builtin_strcmp(type
, "unsigned int"))
565 err_printf(m
, "i915.%s=%u\n", name
, *(const unsigned int *)x
);
566 else if (!__builtin_strcmp(type
, "char *"))
567 err_printf(m
, "i915.%s=%s\n", name
, *(const char **)x
);
572 static void err_print_params(struct drm_i915_error_state_buf
*m
,
573 const struct i915_params
*p
)
575 #define PRINT(T, x, ...) err_print_param(m, #x, #T, &p->x);
576 I915_PARAMS_FOR_EACH(PRINT
);
580 static void err_print_pciid(struct drm_i915_error_state_buf
*m
,
581 struct drm_i915_private
*i915
)
583 struct pci_dev
*pdev
= i915
->drm
.pdev
;
585 err_printf(m
, "PCI ID: 0x%04x\n", pdev
->device
);
586 err_printf(m
, "PCI Revision: 0x%02x\n", pdev
->revision
);
587 err_printf(m
, "PCI Subsystem: %04x:%04x\n",
588 pdev
->subsystem_vendor
,
589 pdev
->subsystem_device
);
592 int i915_error_state_to_str(struct drm_i915_error_state_buf
*m
,
593 const struct i915_gpu_state
*error
)
595 struct drm_i915_private
*dev_priv
= m
->i915
;
596 struct drm_i915_error_object
*obj
;
600 err_printf(m
, "No error state collected\n");
604 if (*error
->error_msg
)
605 err_printf(m
, "%s\n", error
->error_msg
);
606 err_printf(m
, "Kernel: " UTS_RELEASE
"\n");
607 err_printf(m
, "Time: %ld s %ld us\n",
608 error
->time
.tv_sec
, error
->time
.tv_usec
);
609 err_printf(m
, "Boottime: %ld s %ld us\n",
610 error
->boottime
.tv_sec
, error
->boottime
.tv_usec
);
611 err_printf(m
, "Uptime: %ld s %ld us\n",
612 error
->uptime
.tv_sec
, error
->uptime
.tv_usec
);
614 for (i
= 0; i
< ARRAY_SIZE(error
->engine
); i
++) {
615 if (error
->engine
[i
].hangcheck_stalled
&&
616 error
->engine
[i
].context
.pid
) {
617 err_printf(m
, "Active process (on ring %s): %s [%d], score %d\n",
619 error
->engine
[i
].context
.comm
,
620 error
->engine
[i
].context
.pid
,
621 error
->engine
[i
].context
.ban_score
);
624 err_printf(m
, "Reset count: %u\n", error
->reset_count
);
625 err_printf(m
, "Suspend count: %u\n", error
->suspend_count
);
626 err_printf(m
, "Platform: %s\n", intel_platform_name(error
->device_info
.platform
));
627 err_print_pciid(m
, error
->i915
);
629 err_printf(m
, "IOMMU enabled?: %d\n", error
->iommu
);
631 if (HAS_CSR(dev_priv
)) {
632 struct intel_csr
*csr
= &dev_priv
->csr
;
634 err_printf(m
, "DMC loaded: %s\n",
635 yesno(csr
->dmc_payload
!= NULL
));
636 err_printf(m
, "DMC fw version: %d.%d\n",
637 CSR_VERSION_MAJOR(csr
->version
),
638 CSR_VERSION_MINOR(csr
->version
));
641 err_printf(m
, "GT awake: %s\n", yesno(error
->awake
));
642 err_printf(m
, "RPM wakelock: %s\n", yesno(error
->wakelock
));
643 err_printf(m
, "PM suspended: %s\n", yesno(error
->suspended
));
644 err_printf(m
, "EIR: 0x%08x\n", error
->eir
);
645 err_printf(m
, "IER: 0x%08x\n", error
->ier
);
646 for (i
= 0; i
< error
->ngtier
; i
++)
647 err_printf(m
, "GTIER[%d]: 0x%08x\n", i
, error
->gtier
[i
]);
648 err_printf(m
, "PGTBL_ER: 0x%08x\n", error
->pgtbl_er
);
649 err_printf(m
, "FORCEWAKE: 0x%08x\n", error
->forcewake
);
650 err_printf(m
, "DERRMR: 0x%08x\n", error
->derrmr
);
651 err_printf(m
, "CCID: 0x%08x\n", error
->ccid
);
652 err_printf(m
, "Missed interrupts: 0x%08lx\n", dev_priv
->gpu_error
.missed_irq_rings
);
654 for (i
= 0; i
< error
->nfence
; i
++)
655 err_printf(m
, " fence[%d] = %08llx\n", i
, error
->fence
[i
]);
657 if (INTEL_GEN(dev_priv
) >= 6) {
658 err_printf(m
, "ERROR: 0x%08x\n", error
->error
);
660 if (INTEL_GEN(dev_priv
) >= 8)
661 err_printf(m
, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
662 error
->fault_data1
, error
->fault_data0
);
664 err_printf(m
, "DONE_REG: 0x%08x\n", error
->done_reg
);
667 if (IS_GEN7(dev_priv
))
668 err_printf(m
, "ERR_INT: 0x%08x\n", error
->err_int
);
670 for (i
= 0; i
< ARRAY_SIZE(error
->engine
); i
++) {
671 if (error
->engine
[i
].engine_id
!= -1)
672 error_print_engine(m
, &error
->engine
[i
]);
675 for (i
= 0; i
< ARRAY_SIZE(error
->active_vm
); i
++) {
679 if (!error
->active_vm
[i
])
682 len
= scnprintf(buf
, sizeof(buf
), "Active (");
683 for (j
= 0; j
< ARRAY_SIZE(error
->engine
); j
++) {
684 if (error
->engine
[j
].vm
!= error
->active_vm
[i
])
687 len
+= scnprintf(buf
+ len
, sizeof(buf
), "%s%s",
689 dev_priv
->engine
[j
]->name
);
692 scnprintf(buf
+ len
, sizeof(buf
), ")");
693 print_error_buffers(m
, buf
,
695 error
->active_bo_count
[i
]);
698 print_error_buffers(m
, "Pinned (global)",
700 error
->pinned_bo_count
);
702 for (i
= 0; i
< ARRAY_SIZE(error
->engine
); i
++) {
703 const struct drm_i915_error_engine
*ee
= &error
->engine
[i
];
705 obj
= ee
->batchbuffer
;
707 err_puts(m
, dev_priv
->engine
[i
]->name
);
709 err_printf(m
, " (submitted by %s [%d], ctx %d [%d], score %d)",
714 ee
->context
.ban_score
);
715 err_printf(m
, " --- gtt_offset = 0x%08x %08x\n",
716 upper_32_bits(obj
->gtt_offset
),
717 lower_32_bits(obj
->gtt_offset
));
718 print_error_obj(m
, dev_priv
->engine
[i
], NULL
, obj
);
721 for (j
= 0; j
< ee
->user_bo_count
; j
++)
722 print_error_obj(m
, dev_priv
->engine
[i
],
723 "user", ee
->user_bo
[j
]);
725 if (ee
->num_requests
) {
726 err_printf(m
, "%s --- %d requests\n",
727 dev_priv
->engine
[i
]->name
,
729 for (j
= 0; j
< ee
->num_requests
; j
++)
730 error_print_request(m
, " ", &ee
->requests
[j
]);
733 if (IS_ERR(ee
->waiters
)) {
734 err_printf(m
, "%s --- ? waiters [unable to acquire spinlock]\n",
735 dev_priv
->engine
[i
]->name
);
736 } else if (ee
->num_waiters
) {
737 err_printf(m
, "%s --- %d waiters\n",
738 dev_priv
->engine
[i
]->name
,
740 for (j
= 0; j
< ee
->num_waiters
; j
++) {
741 err_printf(m
, " seqno 0x%08x for %s [%d]\n",
742 ee
->waiters
[j
].seqno
,
748 print_error_obj(m
, dev_priv
->engine
[i
],
749 "ringbuffer", ee
->ringbuffer
);
751 print_error_obj(m
, dev_priv
->engine
[i
],
752 "HW Status", ee
->hws_page
);
754 print_error_obj(m
, dev_priv
->engine
[i
],
755 "HW context", ee
->ctx
);
757 print_error_obj(m
, dev_priv
->engine
[i
],
758 "WA context", ee
->wa_ctx
);
760 print_error_obj(m
, dev_priv
->engine
[i
],
761 "WA batchbuffer", ee
->wa_batchbuffer
);
764 print_error_obj(m
, NULL
, "Semaphores", error
->semaphore
);
766 print_error_obj(m
, NULL
, "GuC log buffer", error
->guc_log
);
769 intel_overlay_print_error_state(m
, error
->overlay
);
772 intel_display_print_error_state(m
, error
->display
);
774 err_print_capabilities(m
, &error
->device_info
);
775 err_print_params(m
, &error
->params
);
777 if (m
->bytes
== 0 && m
->err
)
783 int i915_error_state_buf_init(struct drm_i915_error_state_buf
*ebuf
,
784 struct drm_i915_private
*i915
,
785 size_t count
, loff_t pos
)
787 memset(ebuf
, 0, sizeof(*ebuf
));
790 /* We need to have enough room to store any i915_error_state printf
791 * so that we can move it to start position.
793 ebuf
->size
= count
+ 1 > PAGE_SIZE
? count
+ 1 : PAGE_SIZE
;
794 ebuf
->buf
= kmalloc(ebuf
->size
,
795 GFP_KERNEL
| __GFP_NORETRY
| __GFP_NOWARN
);
797 if (ebuf
->buf
== NULL
) {
798 ebuf
->size
= PAGE_SIZE
;
799 ebuf
->buf
= kmalloc(ebuf
->size
, GFP_KERNEL
);
802 if (ebuf
->buf
== NULL
) {
804 ebuf
->buf
= kmalloc(ebuf
->size
, GFP_KERNEL
);
807 if (ebuf
->buf
== NULL
)
815 static void i915_error_object_free(struct drm_i915_error_object
*obj
)
822 for (page
= 0; page
< obj
->page_count
; page
++)
823 free_page((unsigned long)obj
->pages
[page
]);
828 static __always_inline
void free_param(const char *type
, void *x
)
830 if (!__builtin_strcmp(type
, "char *"))
834 void __i915_gpu_state_free(struct kref
*error_ref
)
836 struct i915_gpu_state
*error
=
837 container_of(error_ref
, typeof(*error
), ref
);
840 for (i
= 0; i
< ARRAY_SIZE(error
->engine
); i
++) {
841 struct drm_i915_error_engine
*ee
= &error
->engine
[i
];
843 for (j
= 0; j
< ee
->user_bo_count
; j
++)
844 i915_error_object_free(ee
->user_bo
[j
]);
847 i915_error_object_free(ee
->batchbuffer
);
848 i915_error_object_free(ee
->wa_batchbuffer
);
849 i915_error_object_free(ee
->ringbuffer
);
850 i915_error_object_free(ee
->hws_page
);
851 i915_error_object_free(ee
->ctx
);
852 i915_error_object_free(ee
->wa_ctx
);
855 if (!IS_ERR_OR_NULL(ee
->waiters
))
859 i915_error_object_free(error
->semaphore
);
860 i915_error_object_free(error
->guc_log
);
862 for (i
= 0; i
< ARRAY_SIZE(error
->active_bo
); i
++)
863 kfree(error
->active_bo
[i
]);
864 kfree(error
->pinned_bo
);
866 kfree(error
->overlay
);
867 kfree(error
->display
);
869 #define FREE(T, x, ...) free_param(#T, &error->params.x);
870 I915_PARAMS_FOR_EACH(FREE
);
876 static struct drm_i915_error_object
*
877 i915_error_object_create(struct drm_i915_private
*i915
,
878 struct i915_vma
*vma
)
880 struct i915_ggtt
*ggtt
= &i915
->ggtt
;
881 const u64 slot
= ggtt
->error_capture
.start
;
882 struct drm_i915_error_object
*dst
;
883 struct compress compress
;
884 unsigned long num_pages
;
885 struct sgt_iter iter
;
891 num_pages
= min_t(u64
, vma
->size
, vma
->obj
->base
.size
) >> PAGE_SHIFT
;
892 num_pages
= DIV_ROUND_UP(10 * num_pages
, 8); /* worstcase zlib growth */
893 dst
= kmalloc(sizeof(*dst
) + num_pages
* sizeof(u32
*),
894 GFP_ATOMIC
| __GFP_NOWARN
);
898 dst
->gtt_offset
= vma
->node
.start
;
899 dst
->gtt_size
= vma
->node
.size
;
903 if (!compress_init(&compress
)) {
908 for_each_sgt_dma(dma
, iter
, vma
->pages
) {
912 ggtt
->base
.insert_page(&ggtt
->base
, dma
, slot
,
915 s
= io_mapping_map_atomic_wc(&ggtt
->mappable
, slot
);
916 ret
= compress_page(&compress
, (void __force
*)s
, dst
);
917 io_mapping_unmap_atomic(s
);
925 while (dst
->page_count
--)
926 free_page((unsigned long)dst
->pages
[dst
->page_count
]);
931 compress_fini(&compress
, dst
);
932 ggtt
->base
.clear_range(&ggtt
->base
, slot
, PAGE_SIZE
);
936 /* The error capture is special as it tries to run underneath the normal
937 * locking rules - so we use the raw version of the i915_gem_active lookup.
939 static inline uint32_t
940 __active_get_seqno(struct i915_gem_active
*active
)
942 struct drm_i915_gem_request
*request
;
944 request
= __i915_gem_active_peek(active
);
945 return request
? request
->global_seqno
: 0;
949 __active_get_engine_id(struct i915_gem_active
*active
)
951 struct drm_i915_gem_request
*request
;
953 request
= __i915_gem_active_peek(active
);
954 return request
? request
->engine
->id
: -1;
957 static void capture_bo(struct drm_i915_error_buffer
*err
,
958 struct i915_vma
*vma
)
960 struct drm_i915_gem_object
*obj
= vma
->obj
;
963 err
->size
= obj
->base
.size
;
964 err
->name
= obj
->base
.name
;
966 for (i
= 0; i
< I915_NUM_ENGINES
; i
++)
967 err
->rseqno
[i
] = __active_get_seqno(&vma
->last_read
[i
]);
968 err
->wseqno
= __active_get_seqno(&obj
->frontbuffer_write
);
969 err
->engine
= __active_get_engine_id(&obj
->frontbuffer_write
);
971 err
->gtt_offset
= vma
->node
.start
;
972 err
->read_domains
= obj
->base
.read_domains
;
973 err
->write_domain
= obj
->base
.write_domain
;
974 err
->fence_reg
= vma
->fence
? vma
->fence
->id
: -1;
975 err
->tiling
= i915_gem_object_get_tiling(obj
);
976 err
->dirty
= obj
->mm
.dirty
;
977 err
->purgeable
= obj
->mm
.madv
!= I915_MADV_WILLNEED
;
978 err
->userptr
= obj
->userptr
.mm
!= NULL
;
979 err
->cache_level
= obj
->cache_level
;
982 static u32
capture_error_bo(struct drm_i915_error_buffer
*err
,
983 int count
, struct list_head
*head
,
986 struct i915_vma
*vma
;
989 list_for_each_entry(vma
, head
, vm_link
) {
990 if (pinned_only
&& !i915_vma_is_pinned(vma
))
993 capture_bo(err
++, vma
);
1001 /* Generate a semi-unique error code. The code is not meant to have meaning. The
1002 * code's only purpose is to try to prevent false duplicated bug reports by
1003 * grossly estimating a GPU error state.
1005 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
1006 * the hang if we could strip the GTT offset information from it.
1008 * It's only a small step better than a random number in its current form.
1010 static uint32_t i915_error_generate_code(struct drm_i915_private
*dev_priv
,
1011 struct i915_gpu_state
*error
,
1014 uint32_t error_code
= 0;
1017 /* IPEHR would be an ideal way to detect errors, as it's the gross
1018 * measure of "the command that hung." However, it has some very common
1019 * synchronization commands which almost always appear in cases that are
1020 * strictly a client bug. Use instdone to help differentiate those.
1022 for (i
= 0; i
< I915_NUM_ENGINES
; i
++) {
1023 if (error
->engine
[i
].hangcheck_stalled
) {
1027 return error
->engine
[i
].ipehr
^
1028 error
->engine
[i
].instdone
.instdone
;
1035 static void i915_gem_record_fences(struct drm_i915_private
*dev_priv
,
1036 struct i915_gpu_state
*error
)
1040 if (INTEL_GEN(dev_priv
) >= 6) {
1041 for (i
= 0; i
< dev_priv
->num_fence_regs
; i
++)
1042 error
->fence
[i
] = I915_READ64(FENCE_REG_GEN6_LO(i
));
1043 } else if (INTEL_GEN(dev_priv
) >= 4) {
1044 for (i
= 0; i
< dev_priv
->num_fence_regs
; i
++)
1045 error
->fence
[i
] = I915_READ64(FENCE_REG_965_LO(i
));
1047 for (i
= 0; i
< dev_priv
->num_fence_regs
; i
++)
1048 error
->fence
[i
] = I915_READ(FENCE_REG(i
));
1054 gen8_engine_sync_index(struct intel_engine_cs
*engine
,
1055 struct intel_engine_cs
*other
)
1060 * rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
1061 * vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
1062 * bcs -> 0 = vecs, 1 = vcs2, 2 = rcs, 3 = vcs;
1063 * vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
1064 * vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
1067 idx
= (other
- engine
) - 1;
1069 idx
+= I915_NUM_ENGINES
;
1074 static void gen8_record_semaphore_state(struct i915_gpu_state
*error
,
1075 struct intel_engine_cs
*engine
,
1076 struct drm_i915_error_engine
*ee
)
1078 struct drm_i915_private
*dev_priv
= engine
->i915
;
1079 struct intel_engine_cs
*to
;
1080 enum intel_engine_id id
;
1082 if (!error
->semaphore
)
1085 for_each_engine(to
, dev_priv
, id
) {
1094 (GEN8_SIGNAL_OFFSET(engine
, id
) & (PAGE_SIZE
- 1)) / 4;
1095 tmp
= error
->semaphore
->pages
[0];
1096 idx
= gen8_engine_sync_index(engine
, to
);
1098 ee
->semaphore_mboxes
[idx
] = tmp
[signal_offset
];
1102 static void gen6_record_semaphore_state(struct intel_engine_cs
*engine
,
1103 struct drm_i915_error_engine
*ee
)
1105 struct drm_i915_private
*dev_priv
= engine
->i915
;
1107 ee
->semaphore_mboxes
[0] = I915_READ(RING_SYNC_0(engine
->mmio_base
));
1108 ee
->semaphore_mboxes
[1] = I915_READ(RING_SYNC_1(engine
->mmio_base
));
1109 if (HAS_VEBOX(dev_priv
))
1110 ee
->semaphore_mboxes
[2] =
1111 I915_READ(RING_SYNC_2(engine
->mmio_base
));
1114 static void error_record_engine_waiters(struct intel_engine_cs
*engine
,
1115 struct drm_i915_error_engine
*ee
)
1117 struct intel_breadcrumbs
*b
= &engine
->breadcrumbs
;
1118 struct drm_i915_error_waiter
*waiter
;
1122 ee
->num_waiters
= 0;
1125 if (RB_EMPTY_ROOT(&b
->waiters
))
1128 if (!spin_trylock_irq(&b
->rb_lock
)) {
1129 ee
->waiters
= ERR_PTR(-EDEADLK
);
1134 for (rb
= rb_first(&b
->waiters
); rb
!= NULL
; rb
= rb_next(rb
))
1136 spin_unlock_irq(&b
->rb_lock
);
1140 waiter
= kmalloc_array(count
,
1141 sizeof(struct drm_i915_error_waiter
),
1146 if (!spin_trylock_irq(&b
->rb_lock
)) {
1148 ee
->waiters
= ERR_PTR(-EDEADLK
);
1152 ee
->waiters
= waiter
;
1153 for (rb
= rb_first(&b
->waiters
); rb
; rb
= rb_next(rb
)) {
1154 struct intel_wait
*w
= rb_entry(rb
, typeof(*w
), node
);
1156 strcpy(waiter
->comm
, w
->tsk
->comm
);
1157 waiter
->pid
= w
->tsk
->pid
;
1158 waiter
->seqno
= w
->seqno
;
1161 if (++ee
->num_waiters
== count
)
1164 spin_unlock_irq(&b
->rb_lock
);
1167 static void error_record_engine_registers(struct i915_gpu_state
*error
,
1168 struct intel_engine_cs
*engine
,
1169 struct drm_i915_error_engine
*ee
)
1171 struct drm_i915_private
*dev_priv
= engine
->i915
;
1173 if (INTEL_GEN(dev_priv
) >= 6) {
1174 ee
->rc_psmi
= I915_READ(RING_PSMI_CTL(engine
->mmio_base
));
1175 ee
->fault_reg
= I915_READ(RING_FAULT_REG(engine
));
1176 if (INTEL_GEN(dev_priv
) >= 8)
1177 gen8_record_semaphore_state(error
, engine
, ee
);
1179 gen6_record_semaphore_state(engine
, ee
);
1182 if (INTEL_GEN(dev_priv
) >= 4) {
1183 ee
->faddr
= I915_READ(RING_DMA_FADD(engine
->mmio_base
));
1184 ee
->ipeir
= I915_READ(RING_IPEIR(engine
->mmio_base
));
1185 ee
->ipehr
= I915_READ(RING_IPEHR(engine
->mmio_base
));
1186 ee
->instps
= I915_READ(RING_INSTPS(engine
->mmio_base
));
1187 ee
->bbaddr
= I915_READ(RING_BBADDR(engine
->mmio_base
));
1188 if (INTEL_GEN(dev_priv
) >= 8) {
1189 ee
->faddr
|= (u64
) I915_READ(RING_DMA_FADD_UDW(engine
->mmio_base
)) << 32;
1190 ee
->bbaddr
|= (u64
) I915_READ(RING_BBADDR_UDW(engine
->mmio_base
)) << 32;
1192 ee
->bbstate
= I915_READ(RING_BBSTATE(engine
->mmio_base
));
1194 ee
->faddr
= I915_READ(DMA_FADD_I8XX
);
1195 ee
->ipeir
= I915_READ(IPEIR
);
1196 ee
->ipehr
= I915_READ(IPEHR
);
1199 intel_engine_get_instdone(engine
, &ee
->instdone
);
1201 ee
->waiting
= intel_engine_has_waiter(engine
);
1202 ee
->instpm
= I915_READ(RING_INSTPM(engine
->mmio_base
));
1203 ee
->acthd
= intel_engine_get_active_head(engine
);
1204 ee
->seqno
= intel_engine_get_seqno(engine
);
1205 ee
->last_seqno
= intel_engine_last_submit(engine
);
1206 ee
->start
= I915_READ_START(engine
);
1207 ee
->head
= I915_READ_HEAD(engine
);
1208 ee
->tail
= I915_READ_TAIL(engine
);
1209 ee
->ctl
= I915_READ_CTL(engine
);
1210 if (INTEL_GEN(dev_priv
) > 2)
1211 ee
->mode
= I915_READ_MODE(engine
);
1213 if (!HWS_NEEDS_PHYSICAL(dev_priv
)) {
1216 if (IS_GEN7(dev_priv
)) {
1217 switch (engine
->id
) {
1220 mmio
= RENDER_HWS_PGA_GEN7
;
1223 mmio
= BLT_HWS_PGA_GEN7
;
1226 mmio
= BSD_HWS_PGA_GEN7
;
1229 mmio
= VEBOX_HWS_PGA_GEN7
;
1232 } else if (IS_GEN6(engine
->i915
)) {
1233 mmio
= RING_HWS_PGA_GEN6(engine
->mmio_base
);
1235 /* XXX: gen8 returns to sanity */
1236 mmio
= RING_HWS_PGA(engine
->mmio_base
);
1239 ee
->hws
= I915_READ(mmio
);
1242 ee
->hangcheck_timestamp
= engine
->hangcheck
.action_timestamp
;
1243 ee
->hangcheck_action
= engine
->hangcheck
.action
;
1244 ee
->hangcheck_stalled
= engine
->hangcheck
.stalled
;
1245 ee
->reset_count
= i915_reset_engine_count(&dev_priv
->gpu_error
,
1248 if (USES_PPGTT(dev_priv
)) {
1251 ee
->vm_info
.gfx_mode
= I915_READ(RING_MODE_GEN7(engine
));
1253 if (IS_GEN6(dev_priv
))
1254 ee
->vm_info
.pp_dir_base
=
1255 I915_READ(RING_PP_DIR_BASE_READ(engine
));
1256 else if (IS_GEN7(dev_priv
))
1257 ee
->vm_info
.pp_dir_base
=
1258 I915_READ(RING_PP_DIR_BASE(engine
));
1259 else if (INTEL_GEN(dev_priv
) >= 8)
1260 for (i
= 0; i
< 4; i
++) {
1261 ee
->vm_info
.pdp
[i
] =
1262 I915_READ(GEN8_RING_PDP_UDW(engine
, i
));
1263 ee
->vm_info
.pdp
[i
] <<= 32;
1264 ee
->vm_info
.pdp
[i
] |=
1265 I915_READ(GEN8_RING_PDP_LDW(engine
, i
));
1270 static void record_request(struct drm_i915_gem_request
*request
,
1271 struct drm_i915_error_request
*erq
)
1273 erq
->context
= request
->ctx
->hw_id
;
1274 erq
->ban_score
= atomic_read(&request
->ctx
->ban_score
);
1275 erq
->seqno
= request
->global_seqno
;
1276 erq
->jiffies
= request
->emitted_jiffies
;
1277 erq
->head
= request
->head
;
1278 erq
->tail
= request
->tail
;
1281 erq
->pid
= request
->ctx
->pid
? pid_nr(request
->ctx
->pid
) : 0;
/*
 * engine_record_requests - snapshot the requests on an engine's timeline.
 * @engine: engine whose timeline->requests list is walked
 * @first: starting request supplied by the caller
 * @ee: per-engine error record that receives the request array
 *
 * The timeline list is traversed twice with list_for_each_entry_from().
 * After the first traversal ee->requests is allocated with
 * kcalloc(count, ..., GFP_ATOMIC) and ee->num_requests is set to count.
 * The second traversal fills the array through record_request() and stops
 * consuming entries once count reaches ee->num_requests, which guards
 * against the list having grown in between. Finally ee->num_requests is
 * overwritten with the number of entries actually recorded.
 *
 * NOTE(review): the declaration and (re)initialisation of 'count', the
 * seeding of 'request' (presumably from @first), the body of the first loop
 * and the allocation-failure check fall on lines that are absent from this
 * listing - confirm against the full file.
 */
1285 static void engine_record_requests(struct intel_engine_cs
*engine
,
1286 struct drm_i915_gem_request
*first
,
1287 struct drm_i915_error_engine
*ee
)
1289 struct drm_i915_gem_request
*request
;
1294 list_for_each_entry_from(request
, &engine
->timeline
->requests
, link
)
1299 ee
->requests
= kcalloc(count
, sizeof(*ee
->requests
), GFP_ATOMIC
);
1303 ee
->num_requests
= count
;
1307 list_for_each_entry_from(request
, &engine
->timeline
->requests
, link
) {
1308 if (count
>= ee
->num_requests
) {
1310 * If the ring request list was changed in
1311 * between the point where the error request
1312 * list was created and dimensioned and this
1313 * point then just exit early to avoid crashes.
1315 * We don't need to communicate that the
1316 * request list changed state during error
1317 * state capture and that the error state is
1318 * slightly incorrect as a consequence since we
1319 * are typically only interested in the request
1320 * list state at the point of error state
1321 * capture, not in any changes happening during
1327 record_request(request
, &ee
->requests
[count
++]);
1329 ee
->num_requests
= count
;
/*
 * error_record_engine_execlists - record the requests held in execlists ports.
 * @engine: engine whose execlists state is read
 * @ee: per-engine error record; ee->execlist[] receives one entry per port
 *
 * Iterates n from 0 up to execlists_num_ports(), fetches the request of each
 * port with port_request() and stores a snapshot in ee->execlist[n] via
 * record_request().
 *
 * NOTE(review): the declaration of 'n' and lines 1340-1343 / 1345-1349 of the
 * embedded numbering are not shown; a check for an empty port may live
 * there - confirm.
 */
1332 static void error_record_engine_execlists(struct intel_engine_cs
*engine
,
1333 struct drm_i915_error_engine
*ee
)
1335 const struct intel_engine_execlists
* const execlists
= &engine
->execlists
;
1338 for (n
= 0; n
< execlists_num_ports(execlists
); n
++) {
1339 struct drm_i915_gem_request
*rq
= port_request(&execlists
->port
[n
]);
1344 record_request(rq
, &ee
->execlist
[n
]);
/*
 * record_context - capture identifying details of a GEM context.
 * @e: error-state context record to fill in
 * @ctx: context being described
 *
 * Resolves ctx->pid to a task with pid_task(..., PIDTYPE_PID) and copies the
 * task's comm into e->comm, then stores the context's user_handle and hw_id
 * and atomically reads its ban_score, guilty_count and active_count.
 *
 * NOTE(review): strcpy() is unbounded; this assumes e->comm is at least as
 * large as task->comm - confirm against the struct definition. Any guard on
 * ctx->pid or on a NULL task sits on lines missing from this listing
 * (1355-1356, 1358, 1360-1364).
 */
1350 static void record_context(struct drm_i915_error_context
*e
,
1351 struct i915_gem_context
*ctx
)
1354 struct task_struct
*task
;
1357 task
= pid_task(ctx
->pid
, PIDTYPE_PID
);
1359 strcpy(e
->comm
, task
->comm
);
1365 e
->handle
= ctx
->user_handle
;
1366 e
->hw_id
= ctx
->hw_id
;
1367 e
->ban_score
= atomic_read(&ctx
->ban_score
);
1368 e
->guilty
= atomic_read(&ctx
->guilty_count
);
1369 e
->active
= atomic_read(&ctx
->active_count
);
/*
 * request_record_user_bo - snapshot the objects on a request's capture list.
 * @request: request whose singly linked capture_list is walked
 * @ee: per-engine error record that receives the result
 *
 * Walks request->capture_list (following ->next) twice. Between the two
 * walks an array of error-object pointers is allocated with
 * kcalloc(count, ..., GFP_ATOMIC). The second walk creates one error object
 * per entry from c->vma using i915_error_object_create(). The final count is
 * stored in ee->user_bo_count.
 *
 * NOTE(review): the declaration and resets of 'count', the first loop body,
 * the allocation-failure handling, the handling of a failed object creation
 * and the store of 'bo' into @ee are on lines absent from this listing
 * (1377-1379, 1381-1384, 1386-1389, 1392-1397) - confirm.
 */
1372 static void request_record_user_bo(struct drm_i915_gem_request
*request
,
1373 struct drm_i915_error_engine
*ee
)
1375 struct i915_gem_capture_list
*c
;
1376 struct drm_i915_error_object
**bo
;
1380 for (c
= request
->capture_list
; c
; c
= c
->next
)
1385 bo
= kcalloc(count
, sizeof(*bo
), GFP_ATOMIC
);
1390 for (c
= request
->capture_list
; c
; c
= c
->next
) {
1391 bo
[count
] = i915_error_object_create(request
->i915
, c
->vma
);
1398 ee
->user_bo_count
= count
;
/*
 * i915_gem_record_rings - capture per-engine state into the error record.
 * @dev_priv: device being inspected
 * @error: GPU error state being filled in
 *
 * Creates an error object from dev_priv->semaphore, then loops over all
 * I915_NUM_ENGINES slots. For each engine it calls
 * error_record_engine_registers(), error_record_engine_waiters() and
 * error_record_engine_execlists(), and looks up the active request with
 * i915_gem_find_active_request(). Using that request it:
 *   - selects ee->vm: the context's ppgtt address space if the context has
 *     one, otherwise the global GTT;
 *   - records the context via record_context();
 *   - creates error objects, among them ee->wa_batchbuffer when
 *     HAS_BROKEN_CS_TLB() is true, the context image
 *     request->ctx->engine[i].state and the ring's vma;
 *   - records user-requested buffers via request_record_user_bo();
 *   - copies the request's head/postfix/tail and the ring's head/tail;
 *   - records the remaining requests via engine_record_requests().
 * Error objects are also created from the engine's status page vma and
 * wa_ctx vma.
 *
 * NOTE(review): most assignment targets of the i915_error_object_create()
 * calls, the conditions guarding the per-request section and the
 * declaration of 'i' are on lines missing from this listing, so which ee
 * or error field each object lands in cannot be confirmed from here.
 */
1401 static void i915_gem_record_rings(struct drm_i915_private
*dev_priv
,
1402 struct i915_gpu_state
*error
)
1404 struct i915_ggtt
*ggtt
= &dev_priv
->ggtt
;
1408 i915_error_object_create(dev_priv
, dev_priv
->semaphore
);
1410 for (i
= 0; i
< I915_NUM_ENGINES
; i
++) {
1411 struct intel_engine_cs
*engine
= dev_priv
->engine
[i
];
1412 struct drm_i915_error_engine
*ee
= &error
->engine
[i
];
1413 struct drm_i915_gem_request
*request
;
1422 error_record_engine_registers(error
, engine
, ee
);
1423 error_record_engine_waiters(engine
, ee
);
1424 error_record_engine_execlists(engine
, ee
);
1426 request
= i915_gem_find_active_request(engine
);
1428 struct intel_ring
*ring
;
1430 ee
->vm
= request
->ctx
->ppgtt
?
1431 &request
->ctx
->ppgtt
->base
: &ggtt
->base
;
1433 record_context(&ee
->context
, request
->ctx
);
1435 /* We need to copy these to an anonymous buffer
1436 * as the simplest method to avoid being overwritten
1440 i915_error_object_create(dev_priv
,
1443 if (HAS_BROKEN_CS_TLB(dev_priv
))
1444 ee
->wa_batchbuffer
=
1445 i915_error_object_create(dev_priv
,
1447 request_record_user_bo(request
, ee
);
1450 i915_error_object_create(dev_priv
,
1451 request
->ctx
->engine
[i
].state
);
1454 i915_gem_context_no_error_capture(request
->ctx
);
1456 ee
->rq_head
= request
->head
;
1457 ee
->rq_post
= request
->postfix
;
1458 ee
->rq_tail
= request
->tail
;
1460 ring
= request
->ring
;
1461 ee
->cpu_ring_head
= ring
->head
;
1462 ee
->cpu_ring_tail
= ring
->tail
;
1464 i915_error_object_create(dev_priv
, ring
->vma
);
1466 engine_record_requests(engine
, request
, ee
);
1470 i915_error_object_create(dev_priv
,
1471 engine
->status_page
.vma
);
1474 i915_error_object_create(dev_priv
, engine
->wa_ctx
.vma
);
/*
 * i915_gem_capture_vm - record the active buffers of one address space.
 * @dev_priv: device being inspected
 * @error: GPU error state being filled in
 * @vm: address space whose active_list is captured
 *
 * Walks vm->active_list, allocates an array of drm_i915_error_buffer with
 * kcalloc(count, ..., GFP_ATOMIC), fills it through capture_error_bo() (the
 * last argument is false here) and publishes the vm, the array and the
 * number of captured entries in slot 'idx' of error->active_vm[],
 * error->active_bo[] and error->active_bo_count[].
 *
 * NOTE(review): the final parameter (the one named 'idx' in the body), the
 * declaration of 'count', the first loop body and the handling of a failed
 * allocation are on lines missing from this listing - confirm.
 */
1478 static void i915_gem_capture_vm(struct drm_i915_private
*dev_priv
,
1479 struct i915_gpu_state
*error
,
1480 struct i915_address_space
*vm
,
1483 struct drm_i915_error_buffer
*active_bo
;
1484 struct i915_vma
*vma
;
1488 list_for_each_entry(vma
, &vm
->active_list
, vm_link
)
1493 active_bo
= kcalloc(count
, sizeof(*active_bo
), GFP_ATOMIC
);
1495 count
= capture_error_bo(active_bo
, count
, &vm
->active_list
, false);
1499 error
->active_vm
[idx
] = vm
;
1500 error
->active_bo
[idx
] = active_bo
;
1501 error
->active_bo_count
[idx
] = count
;
/*
 * i915_capture_active_buffers - capture active buffers once per distinct vm.
 * @dev_priv: device being inspected
 * @error: GPU error state; error->engine[].vm must already be populated
 *
 * The three BUILD_BUG_ON()s assert at compile time that error->active_bo[]
 * has at least as many slots as error->engine[] and that active_bo[],
 * active_vm[] and active_bo_count[] all have the same length.
 *
 * For every engine slot i, the earlier slots j < i are scanned for the same
 * vm pointer; i915_gem_capture_vm() is invoked with the next free index
 * (cnt++) for the engine's vm.
 *
 * NOTE(review): the declarations of cnt/i/j/found and the conditions that
 * skip an engine (e.g. when the vm was already found or is unset) are on
 * lines missing from this listing (1516-1521, 1524) - confirm.
 */
1504 static void i915_capture_active_buffers(struct drm_i915_private
*dev_priv
,
1505 struct i915_gpu_state
*error
)
1509 BUILD_BUG_ON(ARRAY_SIZE(error
->engine
) > ARRAY_SIZE(error
->active_bo
));
1510 BUILD_BUG_ON(ARRAY_SIZE(error
->active_bo
) != ARRAY_SIZE(error
->active_vm
));
1511 BUILD_BUG_ON(ARRAY_SIZE(error
->active_bo
) != ARRAY_SIZE(error
->active_bo_count
));
1513 /* Scan each engine looking for unique active contexts/vm */
1514 for (i
= 0; i
< ARRAY_SIZE(error
->engine
); i
++) {
1515 struct drm_i915_error_engine
*ee
= &error
->engine
[i
];
1522 for (j
= 0; j
< i
&& !found
; j
++)
1523 found
= error
->engine
[j
].vm
== ee
->vm
;
1525 i915_gem_capture_vm(dev_priv
, error
, ee
->vm
, cnt
++);
/*
 * i915_capture_pinned_buffers - record buffers from the global GTT lists.
 * @dev_priv: device being inspected
 * @error: GPU error state being filled in
 *
 * Operates on dev_priv->ggtt.base. Walks its active_list and inactive_list,
 * allocates one array large enough for both
 * (kcalloc(count_inactive + count_active, ..., GFP_ATOMIC), only when the
 * sum is non-zero) and fills it with two capture_error_bo() calls whose last
 * argument is true: first from active_list at the start of the array, then
 * from inactive_list directly after the entries produced by the first call.
 * The combined count and the array are stored in error->pinned_bo_count and
 * error->pinned_bo.
 *
 * NOTE(review): in the capture calls 'count_inactive' is paired with
 * active_list and 'count_active' with inactive_list, so the local names read
 * as swapped relative to the lists they describe; only their sum is
 * published. The counting loop bodies and the NULL-allocation check are on
 * lines missing from this listing - confirm.
 */
1529 static void i915_capture_pinned_buffers(struct drm_i915_private
*dev_priv
,
1530 struct i915_gpu_state
*error
)
1532 struct i915_address_space
*vm
= &dev_priv
->ggtt
.base
;
1533 struct drm_i915_error_buffer
*bo
;
1534 struct i915_vma
*vma
;
1535 int count_inactive
, count_active
;
1538 list_for_each_entry(vma
, &vm
->active_list
, vm_link
)
1542 list_for_each_entry(vma
, &vm
->inactive_list
, vm_link
)
1546 if (count_inactive
+ count_active
)
1547 bo
= kcalloc(count_inactive
+ count_active
,
1548 sizeof(*bo
), GFP_ATOMIC
);
1552 count_inactive
= capture_error_bo(bo
, count_inactive
,
1553 &vm
->active_list
, true);
1554 count_active
= capture_error_bo(bo
+ count_inactive
, count_active
,
1555 &vm
->inactive_list
, true);
1556 error
->pinned_bo_count
= count_inactive
+ count_active
;
1557 error
->pinned_bo
= bo
;
/*
 * i915_gem_capture_guc_log_buffer - snapshot the GuC log buffer.
 * @dev_priv: device being inspected
 * @error: GPU error state; error->guc_log receives the snapshot
 *
 * Tests whether there is no GuC log vma or the guc_log_level module
 * parameter is negative; otherwise error->guc_log is set to an error object
 * created from dev_priv->guc.log.vma.
 *
 * NOTE(review): the statement executed when the test is true (line 1565,
 * presumably an early return) is missing from this listing - confirm.
 */
1560 static void i915_gem_capture_guc_log_buffer(struct drm_i915_private
*dev_priv
,
1561 struct i915_gpu_state
*error
)
1563 /* Capturing log buf contents won't be useful if logging was disabled */
1564 if (!dev_priv
->guc
.log
.vma
|| (i915_modparams
.guc_log_level
< 0))
1567 error
->guc_log
= i915_error_object_create(dev_priv
,
1568 dev_priv
->guc
.log
.vma
);
/*
 * i915_capture_reg_state - read miscellaneous device registers into @error.
 * @dev_priv: device whose registers are read
 * @error: GPU error state being filled in
 *
 * Register reads, grouped as the in-body comments describe:
 *   1. single-generation: Valleyview (GTIER, VLV_IER, FORCEWAKE_VLV),
 *      gen7 (GEN7_ERR_INT), gen8+ (GEN8_FAULT_TLB_DATA0/1),
 *      gen6 (FORCEWAKE, GAB_CTL, GFX_MODE);
 *   2. multi-generation: gen7+ (FORCEWAKE_MT), gen6+ (DERRMR, ERROR_GEN6,
 *      DONE_REG), gen5+ (CCID);
 *   3. feature specific: gen6/gen7 (GAM_ECOCHK, GAC_ECO_BITS);
 *   4. interrupt enables: gen8+ reads GEN8_DE_MISC_IER and four
 *      GEN8_GT_IER(i); PCH-split platforms read DEIER and GTIER; gen2 uses a
 *      16-bit read of IER; anything else that is not Valleyview reads IER.
 * EIR and PGTBL_ER are read last.
 *
 * Forcewake registers are read with I915_READ_FW(), everything else with
 * I915_READ()/I915_READ16().
 *
 * NOTE(review): error->forcewake is assigned in three separate branches; if
 * a platform matches more than one (e.g. Valleyview also satisfying
 * INTEL_GEN >= 7) the later read wins - confirm that is intended.
 */
1571 /* Capture all registers which don't fit into another category. */
1572 static void i915_capture_reg_state(struct drm_i915_private
*dev_priv
,
1573 struct i915_gpu_state
*error
)
1577 /* General organization
1578 * 1. Registers specific to a single generation
1579 * 2. Registers which belong to multiple generations
1580 * 3. Feature specific registers.
1581 * 4. Everything else
1582 * Please try to follow the order.
1585 /* 1: Registers specific to a single generation */
1586 if (IS_VALLEYVIEW(dev_priv
)) {
1587 error
->gtier
[0] = I915_READ(GTIER
);
1588 error
->ier
= I915_READ(VLV_IER
);
1589 error
->forcewake
= I915_READ_FW(FORCEWAKE_VLV
);
1592 if (IS_GEN7(dev_priv
))
1593 error
->err_int
= I915_READ(GEN7_ERR_INT
);
1595 if (INTEL_GEN(dev_priv
) >= 8) {
1596 error
->fault_data0
= I915_READ(GEN8_FAULT_TLB_DATA0
);
1597 error
->fault_data1
= I915_READ(GEN8_FAULT_TLB_DATA1
);
1600 if (IS_GEN6(dev_priv
)) {
1601 error
->forcewake
= I915_READ_FW(FORCEWAKE
);
1602 error
->gab_ctl
= I915_READ(GAB_CTL
);
1603 error
->gfx_mode
= I915_READ(GFX_MODE
);
1606 /* 2: Registers which belong to multiple generations */
1607 if (INTEL_GEN(dev_priv
) >= 7)
1608 error
->forcewake
= I915_READ_FW(FORCEWAKE_MT
);
1610 if (INTEL_GEN(dev_priv
) >= 6) {
1611 error
->derrmr
= I915_READ(DERRMR
);
1612 error
->error
= I915_READ(ERROR_GEN6
);
1613 error
->done_reg
= I915_READ(DONE_REG
);
1616 if (INTEL_GEN(dev_priv
) >= 5)
1617 error
->ccid
= I915_READ(CCID
);
1619 /* 3: Feature specific registers */
1620 if (IS_GEN6(dev_priv
) || IS_GEN7(dev_priv
)) {
1621 error
->gam_ecochk
= I915_READ(GAM_ECOCHK
);
1622 error
->gac_eco
= I915_READ(GAC_ECO_BITS
);
1625 /* 4: Everything else */
1626 if (INTEL_GEN(dev_priv
) >= 8) {
1627 error
->ier
= I915_READ(GEN8_DE_MISC_IER
);
1628 for (i
= 0; i
< 4; i
++)
1629 error
->gtier
[i
] = I915_READ(GEN8_GT_IER(i
));
1631 } else if (HAS_PCH_SPLIT(dev_priv
)) {
1632 error
->ier
= I915_READ(DEIER
);
1633 error
->gtier
[0] = I915_READ(GTIER
);
1635 } else if (IS_GEN2(dev_priv
)) {
1636 error
->ier
= I915_READ16(IER
);
1637 } else if (!IS_VALLEYVIEW(dev_priv
)) {
1638 error
->ier
= I915_READ(IER
);
1640 error
->eir
= I915_READ(EIR
);
1641 error
->pgtbl_er
= I915_READ(PGTBL_ER
);
/*
 * i915_error_capture_msg - compose the one-line summary in error->error_msg.
 * @dev_priv: device the error belongs to
 * @error: captured GPU state; error->error_msg is written
 * @error_msg: caller-supplied description
 *
 * Obtains an error code and the associated engine id from
 * i915_error_generate_code(), then builds the message in up to three
 * scnprintf() steps, each bounded by the space remaining in
 * error->error_msg:
 *   1. "GPU HANG: ecode <gen>:<engine_id>:0x<ecode>";
 *   2. when an engine was identified (engine_id != -1) and its recorded
 *      context has a non-zero pid, that context's comm and pid;
 *   3. ", reason: ..., action: ..." where the action is "reset" if
 *      engine_mask is non-zero and "continue" otherwise.
 *
 * NOTE(review): the engine_mask parameter, the declaration of 'ecode', the
 * format string of step 2 and the argument supplying the reason (presumably
 * @error_msg) are on lines missing from this listing - confirm.
 */
1644 static void i915_error_capture_msg(struct drm_i915_private
*dev_priv
,
1645 struct i915_gpu_state
*error
,
1647 const char *error_msg
)
1650 int engine_id
= -1, len
;
1652 ecode
= i915_error_generate_code(dev_priv
, error
, &engine_id
);
1654 len
= scnprintf(error
->error_msg
, sizeof(error
->error_msg
),
1655 "GPU HANG: ecode %d:%d:0x%08x",
1656 INTEL_GEN(dev_priv
), engine_id
, ecode
);
1658 if (engine_id
!= -1 && error
->engine
[engine_id
].context
.pid
)
1659 len
+= scnprintf(error
->error_msg
+ len
,
1660 sizeof(error
->error_msg
) - len
,
1662 error
->engine
[engine_id
].context
.comm
,
1663 error
->engine
[engine_id
].context
.pid
);
1665 scnprintf(error
->error_msg
+ len
, sizeof(error
->error_msg
) - len
,
1666 ", reason: %s, action: %s",
1668 engine_mask
? "reset" : "continue");
/*
 * i915_capture_gen_state - record general (non-register) device state.
 * @dev_priv: device being inspected
 * @error: GPU error state being filled in
 *
 * Copies the GT awake flag, the runtime-pm wakeref count (read atomically)
 * and the suspended flag. When CONFIG_INTEL_IOMMU is defined, it also stores
 * intel_iommu_gfx_mapped in error->iommu. Then records the global reset
 * count, the suspend count and a memcpy() of the device info block, sized by
 * the destination field.
 *
 * NOTE(review): the value error->iommu holds when CONFIG_INTEL_IOMMU is not
 * defined, and the matching #endif, are on lines missing from this
 * listing - confirm.
 */
1671 static void i915_capture_gen_state(struct drm_i915_private
*dev_priv
,
1672 struct i915_gpu_state
*error
)
1674 error
->awake
= dev_priv
->gt
.awake
;
1675 error
->wakelock
= atomic_read(&dev_priv
->pm
.wakeref_count
);
1676 error
->suspended
= dev_priv
->pm
.suspended
;
1679 #ifdef CONFIG_INTEL_IOMMU
1680 error
->iommu
= intel_iommu_gfx_mapped
;
1682 error
->reset_count
= i915_reset_count(&dev_priv
->gpu_error
);
1683 error
->suspend_count
= dev_priv
->suspend_count
;
1685 memcpy(&error
->device_info
,
1686 INTEL_INFO(dev_priv
),
1687 sizeof(error
->device_info
));
/*
 * dup_param - duplicate a string-typed module parameter in place.
 * @type: the parameter's type spelled as a string
 * @x: address of the parameter's storage
 *
 * When @type is exactly "char *", the pointer stored at @x is replaced by a
 * kstrdup() copy allocated with GFP_ATOMIC. Other types are left untouched.
 * capture() applies this to every parameter through its DUP() macro.
 *
 * NOTE(review): the kstrdup() result is stored unchecked, so the slot may
 * end up NULL on allocation failure - confirm consumers tolerate that.
 */
1690 static __always_inline
void dup_param(const char *type
, void *x
)
1692 if (!__builtin_strcmp(type
, "char *"))
1693 *(void **)x
= kstrdup(*(void **)x
, GFP_ATOMIC
);
/*
 * capture - gather the complete GPU error state.
 * @data: the struct i915_gpu_state to fill, passed as void *
 *
 * i915_capture_gpu_state() hands this function to stop_machine(). It
 * records the wall-clock time and boot time, computes a timeval from the
 * time elapsed since gt.last_init_time, snapshots the module parameters
 * (duplicating string parameters through DUP()/dup_param()), and then runs
 * the individual capture stages in order: general state, registers, fences,
 * rings, active buffers, pinned buffers and the GuC log. Overlay and display
 * error state are captured last.
 *
 * NOTE(review): the assignment target of the elapsed-time computation, the
 * #undef of DUP (if any) and the return statement are on lines missing from
 * this listing - confirm.
 */
1696 static int capture(void *data
)
1698 struct i915_gpu_state
*error
= data
;
1700 do_gettimeofday(&error
->time
);
1701 error
->boottime
= ktime_to_timeval(ktime_get_boottime());
1703 ktime_to_timeval(ktime_sub(ktime_get(),
1704 error
->i915
->gt
.last_init_time
));
1706 error
->params
= i915_modparams
;
1707 #define DUP(T, x, ...) dup_param(#T, &error->params.x);
1708 I915_PARAMS_FOR_EACH(DUP
);
1711 i915_capture_gen_state(error
->i915
, error
);
1712 i915_capture_reg_state(error
->i915
, error
);
1713 i915_gem_record_fences(error
->i915
, error
);
1714 i915_gem_record_rings(error
->i915
, error
);
1715 i915_capture_active_buffers(error
->i915
, error
);
1716 i915_capture_pinned_buffers(error
->i915
, error
);
1717 i915_gem_capture_guc_log_buffer(error
->i915
, error
);
1719 error
->overlay
= intel_overlay_capture_error_state(error
->i915
);
1720 error
->display
= intel_display_capture_error_state(error
->i915
);
/* Convert a number of days @x into seconds (24 h * 60 min * 60 s per day). */
1725 #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
/*
 * i915_capture_gpu_state - allocate and populate a GPU state snapshot.
 * @i915: device to capture
 *
 * Allocates a zeroed struct i915_gpu_state with GFP_ATOMIC, initialises its
 * reference count with kref_init() and runs capture() on it through
 * stop_machine() (third argument NULL).
 *
 * NOTE(review): the allocation-failure check, the store of @i915 into the
 * new state and the return statement are on lines missing from this
 * listing - confirm.
 */
1727 struct i915_gpu_state
*
1728 i915_capture_gpu_state(struct drm_i915_private
*i915
)
1730 struct i915_gpu_state
*error
;
1732 error
= kzalloc(sizeof(*error
), GFP_ATOMIC
);
1736 kref_init(&error
->ref
);
1739 stop_machine(capture
, error
, NULL
);
/*
 * Summary of the visible flow of i915_capture_error_state():
 *   - tests the error_capture module parameter and whether
 *     gpu_error.first_error is already set (READ_ONCE);
 *   - captures a new state with i915_capture_gpu_state(); the debug message
 *     "out of memory, not capturing error state" belongs to the failure path;
 *   - composes and logs the summary line via i915_error_capture_msg() and
 *     DRM_INFO();
 *   - for a state that is not marked simulated, takes gpu_error.lock with
 *     interrupts saved and installs it as first_error if that slot is still
 *     empty;
 *   - releases the state with __i915_gpu_state_free();
 *   - prints the bug-reporting hints and the sysfs location of the dump,
 *     subject to a condition that includes the driver build timestamp being
 *     less than 180 days old.
 *
 * NOTE(review): the engine_mask parameter, the early returns, the condition
 * guarding __i915_gpu_state_free() and the first half of the final
 * condition are on lines missing from this listing - confirm.
 */
1745 * i915_capture_error_state - capture an error record for later analysis
1748 * Should be called when an error is detected (either a hang or an error
1749 * interrupt) to capture error state from the time of the error. Fills
1750 * out a structure which becomes available in debugfs for user level tools
1753 void i915_capture_error_state(struct drm_i915_private
*dev_priv
,
1755 const char *error_msg
)
1758 struct i915_gpu_state
*error
;
1759 unsigned long flags
;
1761 if (!i915_modparams
.error_capture
)
1764 if (READ_ONCE(dev_priv
->gpu_error
.first_error
))
1767 error
= i915_capture_gpu_state(dev_priv
);
1769 DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
1773 i915_error_capture_msg(dev_priv
, error
, engine_mask
, error_msg
);
1774 DRM_INFO("%s\n", error
->error_msg
);
1776 if (!error
->simulated
) {
1777 spin_lock_irqsave(&dev_priv
->gpu_error
.lock
, flags
);
1778 if (!dev_priv
->gpu_error
.first_error
) {
1779 dev_priv
->gpu_error
.first_error
= error
;
1782 spin_unlock_irqrestore(&dev_priv
->gpu_error
.lock
, flags
);
1786 __i915_gpu_state_free(&error
->ref
);
1791 ktime_get_real_seconds() - DRIVER_TIMESTAMP
< DAY_AS_SECONDS(180)) {
1792 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
1793 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
1794 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
1795 DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
1796 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
1797 dev_priv
->drm
.primary
->index
);
/*
 * i915_first_error_state - fetch the stored first error state.
 * @i915: device to query
 *
 * Reads gpu_error.first_error and takes a reference on it with
 * i915_gpu_state_get(), both under gpu_error.lock (spin_lock_irq), so the
 * state cannot be dropped between the read and the reference grab.
 *
 * NOTE(review): any NULL check before i915_gpu_state_get() and the return
 * statement are on lines missing from this listing (1809, 1812-1815) -
 * confirm.
 */
1802 struct i915_gpu_state
*
1803 i915_first_error_state(struct drm_i915_private
*i915
)
1805 struct i915_gpu_state
*error
;
1807 spin_lock_irq(&i915
->gpu_error
.lock
);
1808 error
= i915
->gpu_error
.first_error
;
1810 i915_gpu_state_get(error
);
1811 spin_unlock_irq(&i915
->gpu_error
.lock
);
/*
 * i915_reset_error_state - discard the stored first error state.
 * @i915: device whose error state is cleared
 *
 * Under gpu_error.lock (spin_lock_irq) the current first_error pointer is
 * taken and the slot is cleared to NULL. The reference that the slot held
 * is dropped with i915_gpu_state_put() only after the lock has been
 * released.
 */
1816 void i915_reset_error_state(struct drm_i915_private
*i915
)
1818 struct i915_gpu_state
*error
;
1820 spin_lock_irq(&i915
->gpu_error
.lock
);
1821 error
= i915
->gpu_error
.first_error
;
1822 i915
->gpu_error
.first_error
= NULL
;
1823 spin_unlock_irq(&i915
->gpu_error
.lock
);
1825 i915_gpu_state_put(error
);