drm/amdgpu: create mmhub ras framework
Enable the mmhub RAS feature and create the sysfs/debugfs nodes for mmhub.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -177,7 +177,8 @@ struct amdgpu_gmc {
|
|||||||
|
|
||||||
struct amdgpu_xgmi xgmi;
|
struct amdgpu_xgmi xgmi;
|
||||||
struct amdgpu_irq_src ecc_irq;
|
struct amdgpu_irq_src ecc_irq;
|
||||||
struct ras_common_if *ras_if;
|
struct ras_common_if *umc_ras_if;
|
||||||
|
struct ras_common_if *mmhub_ras_if;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define amdgpu_gmc_flush_gpu_tlb(adev, vmid, type) (adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (type))
|
#define amdgpu_gmc_flush_gpu_tlb(adev, vmid, type) (adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (type))
|
||||||
|
@@ -266,7 +266,7 @@ static int gmc_v9_0_process_ecc_irq(struct amdgpu_device *adev,
|
|||||||
struct amdgpu_irq_src *source,
|
struct amdgpu_irq_src *source,
|
||||||
struct amdgpu_iv_entry *entry)
|
struct amdgpu_iv_entry *entry)
|
||||||
{
|
{
|
||||||
struct ras_common_if *ras_if = adev->gmc.ras_if;
|
struct ras_common_if *ras_if = adev->gmc.umc_ras_if;
|
||||||
struct ras_dispatch_if ih_data = {
|
struct ras_dispatch_if ih_data = {
|
||||||
.entry = entry,
|
.entry = entry,
|
||||||
};
|
};
|
||||||
@@ -740,27 +740,25 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int gmc_v9_0_ecc_late_init(void *handle)
|
static int gmc_v9_0_ecc_ras_block_late_init(void *handle,
|
||||||
|
struct ras_fs_if *fs_info, struct ras_common_if *ras_block)
|
||||||
{
|
{
|
||||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||||
struct ras_common_if **ras_if = &adev->gmc.ras_if;
|
struct ras_common_if **ras_if = NULL;
|
||||||
struct ras_ih_if ih_info = {
|
struct ras_ih_if ih_info = {
|
||||||
.cb = gmc_v9_0_process_ras_data_cb,
|
.cb = gmc_v9_0_process_ras_data_cb,
|
||||||
};
|
};
|
||||||
struct ras_fs_if fs_info = {
|
|
||||||
.sysfs_name = "umc_err_count",
|
|
||||||
.debugfs_name = "umc_err_inject",
|
|
||||||
};
|
|
||||||
struct ras_common_if ras_block = {
|
|
||||||
.block = AMDGPU_RAS_BLOCK__UMC,
|
|
||||||
.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
|
|
||||||
.sub_block_index = 0,
|
|
||||||
.name = "umc",
|
|
||||||
};
|
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) {
|
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
|
||||||
amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0);
|
ras_if = &adev->gmc.umc_ras_if;
|
||||||
|
else if (ras_block->block == AMDGPU_RAS_BLOCK__MMHUB)
|
||||||
|
ras_if = &adev->gmc.mmhub_ras_if;
|
||||||
|
else
|
||||||
|
BUG();
|
||||||
|
|
||||||
|
if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
|
||||||
|
amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -775,7 +773,7 @@ static int gmc_v9_0_ecc_late_init(void *handle)
|
|||||||
if (r == -EAGAIN) {
|
if (r == -EAGAIN) {
|
||||||
/* request a gpu reset. will run again. */
|
/* request a gpu reset. will run again. */
|
||||||
amdgpu_ras_request_reset_on_boot(adev,
|
amdgpu_ras_request_reset_on_boot(adev,
|
||||||
AMDGPU_RAS_BLOCK__UMC);
|
ras_block->block);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
/* fail to enable ras, cleanup all. */
|
/* fail to enable ras, cleanup all. */
|
||||||
@@ -789,40 +787,45 @@ static int gmc_v9_0_ecc_late_init(void *handle)
|
|||||||
if (!*ras_if)
|
if (!*ras_if)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
**ras_if = ras_block;
|
**ras_if = *ras_block;
|
||||||
|
|
||||||
r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
|
r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
|
||||||
if (r) {
|
if (r) {
|
||||||
if (r == -EAGAIN) {
|
if (r == -EAGAIN) {
|
||||||
amdgpu_ras_request_reset_on_boot(adev,
|
amdgpu_ras_request_reset_on_boot(adev,
|
||||||
AMDGPU_RAS_BLOCK__UMC);
|
ras_block->block);
|
||||||
r = 0;
|
r = 0;
|
||||||
}
|
}
|
||||||
goto feature;
|
goto feature;
|
||||||
}
|
}
|
||||||
|
|
||||||
ih_info.head = **ras_if;
|
ih_info.head = **ras_if;
|
||||||
fs_info.head = **ras_if;
|
fs_info->head = **ras_if;
|
||||||
|
|
||||||
|
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
|
||||||
r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
|
r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
|
||||||
if (r)
|
if (r)
|
||||||
goto interrupt;
|
goto interrupt;
|
||||||
|
}
|
||||||
|
|
||||||
amdgpu_ras_debugfs_create(adev, &fs_info);
|
amdgpu_ras_debugfs_create(adev, fs_info);
|
||||||
|
|
||||||
r = amdgpu_ras_sysfs_create(adev, &fs_info);
|
r = amdgpu_ras_sysfs_create(adev, fs_info);
|
||||||
if (r)
|
if (r)
|
||||||
goto sysfs;
|
goto sysfs;
|
||||||
resume:
|
resume:
|
||||||
|
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
|
||||||
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
|
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
|
||||||
if (r)
|
if (r)
|
||||||
goto irq;
|
goto irq;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
irq:
|
irq:
|
||||||
amdgpu_ras_sysfs_remove(adev, *ras_if);
|
amdgpu_ras_sysfs_remove(adev, *ras_if);
|
||||||
sysfs:
|
sysfs:
|
||||||
amdgpu_ras_debugfs_remove(adev, *ras_if);
|
amdgpu_ras_debugfs_remove(adev, *ras_if);
|
||||||
|
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
|
||||||
amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
|
amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
|
||||||
interrupt:
|
interrupt:
|
||||||
amdgpu_ras_feature_enable(adev, *ras_if, 0);
|
amdgpu_ras_feature_enable(adev, *ras_if, 0);
|
||||||
@@ -832,6 +835,40 @@ feature:
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int gmc_v9_0_ecc_late_init(void *handle)
|
||||||
|
{
|
||||||
|
int r;
|
||||||
|
|
||||||
|
struct ras_fs_if umc_fs_info = {
|
||||||
|
.sysfs_name = "umc_err_count",
|
||||||
|
.debugfs_name = "umc_err_inject",
|
||||||
|
};
|
||||||
|
struct ras_common_if umc_ras_block = {
|
||||||
|
.block = AMDGPU_RAS_BLOCK__UMC,
|
||||||
|
.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
|
||||||
|
.sub_block_index = 0,
|
||||||
|
.name = "umc",
|
||||||
|
};
|
||||||
|
struct ras_fs_if mmhub_fs_info = {
|
||||||
|
.sysfs_name = "mmhub_err_count",
|
||||||
|
.debugfs_name = "mmhub_err_inject",
|
||||||
|
};
|
||||||
|
struct ras_common_if mmhub_ras_block = {
|
||||||
|
.block = AMDGPU_RAS_BLOCK__MMHUB,
|
||||||
|
.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
|
||||||
|
.sub_block_index = 0,
|
||||||
|
.name = "mmhub",
|
||||||
|
};
|
||||||
|
|
||||||
|
r = gmc_v9_0_ecc_ras_block_late_init(handle,
|
||||||
|
&umc_fs_info, &umc_ras_block);
|
||||||
|
if (r)
|
||||||
|
return r;
|
||||||
|
|
||||||
|
r = gmc_v9_0_ecc_ras_block_late_init(handle,
|
||||||
|
&mmhub_fs_info, &mmhub_ras_block);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
static int gmc_v9_0_late_init(void *handle)
|
static int gmc_v9_0_late_init(void *handle)
|
||||||
{
|
{
|
||||||
@@ -1192,21 +1229,32 @@ static int gmc_v9_0_sw_fini(void *handle)
|
|||||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||||
|
|
||||||
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
|
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
|
||||||
adev->gmc.ras_if) {
|
adev->gmc.umc_ras_if) {
|
||||||
struct ras_common_if *ras_if = adev->gmc.ras_if;
|
struct ras_common_if *ras_if = adev->gmc.umc_ras_if;
|
||||||
struct ras_ih_if ih_info = {
|
struct ras_ih_if ih_info = {
|
||||||
.head = *ras_if,
|
.head = *ras_if,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*remove fs first*/
|
/* remove fs first */
|
||||||
amdgpu_ras_debugfs_remove(adev, ras_if);
|
amdgpu_ras_debugfs_remove(adev, ras_if);
|
||||||
amdgpu_ras_sysfs_remove(adev, ras_if);
|
amdgpu_ras_sysfs_remove(adev, ras_if);
|
||||||
/*remove the IH*/
|
/* remove the IH */
|
||||||
amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
|
amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
|
||||||
amdgpu_ras_feature_enable(adev, ras_if, 0);
|
amdgpu_ras_feature_enable(adev, ras_if, 0);
|
||||||
kfree(ras_if);
|
kfree(ras_if);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB) &&
|
||||||
|
adev->gmc.mmhub_ras_if) {
|
||||||
|
struct ras_common_if *ras_if = adev->gmc.mmhub_ras_if;
|
||||||
|
|
||||||
|
/* remove fs and disable ras feature */
|
||||||
|
amdgpu_ras_debugfs_remove(adev, ras_if);
|
||||||
|
amdgpu_ras_sysfs_remove(adev, ras_if);
|
||||||
|
amdgpu_ras_feature_enable(adev, ras_if, 0);
|
||||||
|
kfree(ras_if);
|
||||||
|
}
|
||||||
|
|
||||||
amdgpu_gem_force_release(adev);
|
amdgpu_gem_force_release(adev);
|
||||||
amdgpu_vm_manager_fini(adev);
|
amdgpu_vm_manager_fini(adev);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user