drm/i915: Beware temporary wedging when determining -EIO

At a few points in our uABI, we check to see if the driver is wedged and
report -EIO back to the user in that case. However, as we perform the
check and reset asynchronously (where once before they were both
serialised by the struct_mutex), we may instead see the temporary wedging
used to cancel inflight rendering to avoid a deadlock during reset
(caused either by us timing out in our reset handler,
i915_wedge_on_timeout, or with malice aforethought in intel_reset_prepare
for a stuck modeset). If we suspect this is the case, that is, we see a
wedged driver *and* reset in progress, then wait until the reset is
resolved before reporting upon the wedged status.

v2: might_sleep() (Mika)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190220145637.23503-1-chris@chris-wilson.co.uk
This commit is contained in:
Chris Wilson
2019-02-20 14:56:37 +00:00
parent 47ed55a9bb
commit c41166f9a1
19 changed files with 91 additions and 52 deletions

View File

@@ -1032,7 +1032,7 @@ void i915_reset(struct drm_i915_private *i915,
finish:
reset_finish(i915);
if (!i915_terminally_wedged(error))
if (!__i915_wedged(error))
reset_restart(i915);
return;
@@ -1253,7 +1253,7 @@ void i915_handle_error(struct drm_i915_private *i915,
* Try engine reset when available. We fall back to full reset if
* single reset fails.
*/
if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
for_each_engine_masked(engine, i915, engine_mask, tmp) {
BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1339,6 +1339,31 @@ __releases(&i915->gpu_error.reset_backoff_srcu)
srcu_read_unlock(&error->reset_backoff_srcu, tag);
}
/*
 * i915_terminally_wedged - report whether the device is wedged
 * @i915: device whose gpu_error state is inspected
 *
 * A wedged state observed while I915_RESET_BACKOFF is set may only be the
 * temporary wedging applied during a reset, so instead of reporting it
 * straight away this sleeps until the backoff bit clears and then samples
 * the wedged state a second time. May sleep.
 *
 * Returns:
 *   0       - not wedged, either initially or once the reset has resolved;
 *   -EIO    - wedged with no reset backing off, or still wedged after the
 *             wait completed;
 *   -EAGAIN - a reset is backing off but struct_mutex is currently locked;
 *   -EINTR  - the interruptible wait returned non-zero.
 */
int i915_terminally_wedged(struct drm_i915_private *i915)
{
	struct i915_gpu_error *gpu_error = &i915->gpu_error;
	int wait_err;

	might_sleep();

	/* Nothing is wedged, so there is nothing to report. */
	if (!__i915_wedged(gpu_error))
		return 0;

	/* Wedged and no reset is backing off: report the error now. */
	if (!test_bit(I915_RESET_BACKOFF, &gpu_error->flags))
		return -EIO;

	/*
	 * XXX intel_reset_finish() still takes struct_mutex, so do not sleep
	 * on the reset while that mutex is locked; report -EAGAIN instead.
	 */
	if (mutex_is_locked(&i915->drm.struct_mutex))
		return -EAGAIN;

	/* Sleep until the backoff bit is cleared, or the wait is cut short. */
	wait_err = wait_event_interruptible(gpu_error->reset_queue,
					    !test_bit(I915_RESET_BACKOFF,
						      &gpu_error->flags));
	if (wait_err)
		return -EINTR;

	/* The reset has resolved; re-sample the wedged state for the verdict. */
	if (__i915_wedged(gpu_error))
		return -EIO;

	return 0;
}
bool i915_reset_flush(struct drm_i915_private *i915)
{
int err;