drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
When the radeon driver resets a gpu, it attempts to test whether all the rings can successfully handle an IB. If these rings fail to respond, the process will wait forever. Another gpu reset can't happen at this point, as the current reset holds a lock required to do so. Instead, make all the IB tests run with a timeout, so the system can attempt to recover in this case. While this doesn't fix the underlying issue with card resets failing, it gives the system a higher chance of recovering. These timeouts have been confirmed to help both a Tathi and Hawaii card recover after a gpu reset. This also adds a new function, radeon_fence_wait_timeout, that behaves like fence_wait_timeout. It is used instead of fence_wait_timeout as it continues to work during a reset. radeon_fence_wait is changed to be implemented using this function. V2: - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was too long. A timeout of 100ms was tested and found to be too short. - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout. Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:

committed by
Alex Deucher

parent
6e9821b26d
commit
04db4caf5c
@@ -526,6 +526,46 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* radeon_fence_wait_timeout - wait for a fence to signal with timeout
|
||||
*
|
||||
* @fence: radeon fence object
|
||||
* @intr: use interruptible sleep
|
||||
*
|
||||
* Wait for the requested fence to signal (all asics).
|
||||
* @intr selects whether to use interruptable (true) or non-interruptable
|
||||
* (false) sleep when waiting for the fence.
|
||||
* @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
|
||||
* Returns remaining time if the sequence number has passed, 0 when
|
||||
* the wait timeout, or an error for all other cases.
|
||||
*/
|
||||
long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
|
||||
{
|
||||
uint64_t seq[RADEON_NUM_RINGS] = {};
|
||||
long r;
|
||||
int r_sig;
|
||||
|
||||
/*
|
||||
* This function should not be called on !radeon fences.
|
||||
* If this is the case, it would mean this function can
|
||||
* also be called on radeon fences belonging to another card.
|
||||
* exclusive_lock is not held in that case.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!to_radeon_fence(&fence->base)))
|
||||
return fence_wait(&fence->base, intr);
|
||||
|
||||
seq[fence->ring] = fence->seq;
|
||||
r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
|
||||
if (r <= 0) {
|
||||
return r;
|
||||
}
|
||||
|
||||
r_sig = fence_signal(&fence->base);
|
||||
if (!r_sig)
|
||||
FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* radeon_fence_wait - wait for a fence to signal
|
||||
*
|
||||
@@ -539,28 +579,12 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
|
||||
*/
|
||||
int radeon_fence_wait(struct radeon_fence *fence, bool intr)
|
||||
{
|
||||
uint64_t seq[RADEON_NUM_RINGS] = {};
|
||||
long r;
|
||||
|
||||
/*
|
||||
* This function should not be called on !radeon fences.
|
||||
* If this is the case, it would mean this function can
|
||||
* also be called on radeon fences belonging to another card.
|
||||
* exclusive_lock is not held in that case.
|
||||
*/
|
||||
if (WARN_ON_ONCE(!to_radeon_fence(&fence->base)))
|
||||
return fence_wait(&fence->base, intr);
|
||||
|
||||
seq[fence->ring] = fence->seq;
|
||||
r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
|
||||
if (r < 0) {
|
||||
long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
|
||||
if (r > 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return r;
|
||||
}
|
||||
|
||||
r = fence_signal(&fence->base);
|
||||
if (!r)
|
||||
FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
Reference in New Issue
Block a user