drm/amdkfd: add RAS ECC event support (v3)
The RAS ECC event is combined with the GPU reset event, because ECC interrupts are caused by an uncorrectable error that triggers a GPU reset. v2: Fix misleading-indentation warning. v3: Fix build with CONFIG_HSA_AMD disabled. Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
|
||||
#define KFD_HW_EXCEPTION_GPU_HANG 0
|
||||
#define KFD_HW_EXCEPTION_ECC 1
|
||||
|
||||
/* For kfd_hsa_memory_exception_data.ErrorType */
|
||||
#define KFD_MEM_ERR_NO_RAS 0
|
||||
#define KFD_MEM_ERR_SRAM_ECC 1
|
||||
#define KFD_MEM_ERR_POISON_CONSUMED 2
|
||||
#define KFD_MEM_ERR_GPU_HANG 3
|
||||
|
||||
struct kfd_ioctl_create_event_args {
|
||||
__u64 event_page_offset; /* from KFD */
|
||||
@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
|
||||
struct kfd_memory_exception_failure failure;
|
||||
__u64 va;
|
||||
__u32 gpu_id;
|
||||
__u32 pad;
|
||||
__u32 ErrorType; /* 0 = no RAS error,
|
||||
* 1 = ECC_SRAM,
|
||||
* 2 = Link_SYNFLOOD (poison),
|
||||
* 3 = GPU hang (not attributable to a specific cause),
|
||||
* other values reserved
|
||||
*/
|
||||
};
|
||||
|
||||
/* hw exception data */
|
||||
|
Reference in New Issue
Block a user