drm/amdgpu:implement new GPU recover(v3)
1,new imple names amdgpu_gpu_recover which gives more hint on what it does compared with gpu_reset 2,gpu_recover unify bare-metal and SR-IOV, only the asic reset part is implemented differently 3,gpu_recover will increase hang job karma and mark its entity/context as guilty if exceeds limit V2: 4,in scheduler main routine the job from guilty context will be immedialy fake signaled after it poped from queue and its fence be set with "-ECANCELED" error 5,in scheduler recovery routine all jobs from the guilty entity would be dropped 6,in run_job() routine the real IB submission would be skipped if @skip parameter equales true or there was VRAM lost occured. V3: 7,replace deprecated gpu reset, use new gpu recover Signed-off-by: Monk Liu <Monk.Liu@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -178,6 +178,10 @@ extern int amdgpu_cik_support;
|
||||
#define CIK_CURSOR_WIDTH 128
|
||||
#define CIK_CURSOR_HEIGHT 128
|
||||
|
||||
/* GPU RESET flags */
|
||||
#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0)
|
||||
#define AMDGPU_RESET_INFO_FULLRESET (1 << 1)
|
||||
|
||||
struct amdgpu_device;
|
||||
struct amdgpu_ib;
|
||||
struct amdgpu_cs_parser;
|
||||
@@ -1833,7 +1837,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
|
||||
#define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
|
||||
|
||||
/* Common functions */
|
||||
int amdgpu_gpu_reset(struct amdgpu_device *adev);
|
||||
int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
|
||||
bool amdgpu_need_backup(struct amdgpu_device *adev);
|
||||
void amdgpu_pci_config_reset(struct amdgpu_device *adev);
|
||||
bool amdgpu_need_post(struct amdgpu_device *adev);
|
||||
|
Reference in New Issue
Block a user