@@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev,
 
 	wake_up_all(&ring->irq_queue);
 	if (i915_enable_hangcheck) {
-		dev_priv->gpu_error.hangcheck_count = 0;
 		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
			  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
 	}
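Note: with the per-ring hangcheck state introduced below, progress detection moves into i915_hangcheck_elapsed() itself (each ring's current seqno is compared against the snapshot kept in ring->hangcheck.seqno), so notify_ring() no longer needs to zero a device-global hangcheck_count on every completion interrupt; scores now decay gradually in the sampler instead of being reset outright.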
@@ -1656,7 +1655,7 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
 	struct drm_i915_gem_object *obj;
 	int i = 0;
 
-	list_for_each_entry(obj, head, gtt_list) {
+	list_for_each_entry(obj, head, global_list) {
 		if (obj->pin_count == 0)
 			continue;
 
@@ -1798,7 +1797,7 @@ static void i915_gem_record_active_context(struct intel_ring_buffer *ring,
 	if (ring->id != RCS || !error->ccid)
 		return;
 
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
 		if ((error->ccid & PAGE_MASK) == obj->gtt_offset) {
 			ering->ctx = i915_error_object_create_sized(dev_priv,
								    obj, 1);
@@ -1935,7 +1934,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 	list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list)
 		i++;
 	error->active_bo_count = i;
-	list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list)
+	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list)
 		if (obj->pin_count)
 			i++;
 	error->pinned_bo_count = i - error->active_bo_count;
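These three hunks pick up a driver-wide rename: the per-object link that threads every GEM object onto dev_priv->mm.bound_list is now called global_list instead of gtt_list, reflecting that the list spans bound and unbound objects rather than anything GTT-specific. The companion header change is presumably just the renamed member in i915_drv.h, roughly:

struct drm_i915_gem_object {
	/* ... */
	/* was: struct list_head gtt_list; */
	struct list_head global_list;	/* link in mm.bound_list / mm.unbound_list */
	/* ... */
};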
@@ -2315,38 +2314,28 @@ ring_last_seqno(struct intel_ring_buffer *ring)
			  struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
-				     u32 ring_seqno, bool *err)
+static bool
+ring_idle(struct intel_ring_buffer *ring, u32 seqno)
 {
-	if (list_empty(&ring->request_list) ||
-	    i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
-		/* Issue a wake-up to catch stuck h/w. */
-		if (waitqueue_active(&ring->irq_queue)) {
-			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-				  ring->name);
-			wake_up_all(&ring->irq_queue);
-			*err = true;
-		}
-		return true;
-	}
-	return false;
+	return (list_empty(&ring->request_list) ||
+		i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
-static bool semaphore_passed(struct intel_ring_buffer *ring)
+static struct intel_ring_buffer *
+semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
-	struct intel_ring_buffer *signaller;
-	u32 cmd, ipehr, acthd_min;
+	u32 cmd, ipehr, acthd, acthd_min;
 
 	ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
 	if ((ipehr & ~(0x3 << 16)) !=
	    (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
-		return false;
+		return NULL;
 
 	/* ACTHD is likely pointing to the dword after the actual command,
	 * so scan backwards until we find the MBOX.
	 */
+	acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
 	acthd_min = max((int)acthd - 3 * 4, 0);
 	do {
 		cmd = ioread32(ring->virtual_start + acthd);
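From here on the patch relies on per-ring hangcheck bookkeeping (ring->hangcheck.seqno, .acthd, .score, .action, .deadlock) and an intel_ring_hangcheck_action enum, neither of which is defined in this file. Inferred from their uses below, the companion intel_ringbuffer.h addition presumably looks like:

enum intel_ring_hangcheck_action { wait, active, kick, hung };

struct intel_ring_hangcheck {
	bool deadlock;	/* marked while walking a semaphore chain;
			 * revisiting a marked ring means a cycle */
	u32 seqno;	/* seqno sampled on the previous timer tick */
	u32 acthd;	/* ACTHD sampled on the previous timer tick */
	int score;	/* accumulated evidence of a hang */
	enum intel_ring_hangcheck_action action;
};

Note how the reworked semaphore_passed() (next hunk) uses the deadlock flag: it marks the current ring, then follows the wait to the signaller; reaching an already-marked ring means the semaphore chain loops back on itself, i.e. an unkickable deadlock, reported as -1 and ultimately mapped to the hung action.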
@@ -2355,128 +2344,216 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
 
 		acthd -= 4;
 		if (acthd < acthd_min)
-			return false;
+			return NULL;
 	} while (1);
 
-	signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-	return i915_seqno_passed(signaller->get_seqno(signaller, false),
-				 ioread32(ring->virtual_start+acthd+4)+1);
+	*seqno = ioread32(ring->virtual_start+acthd+4)+1;
+	return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
 }
 
-static bool kick_ring(struct intel_ring_buffer *ring)
+static int semaphore_passed(struct intel_ring_buffer *ring)
 {
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 tmp = I915_READ_CTL(ring);
-	if (tmp & RING_WAIT) {
-		DRM_ERROR("Kicking stuck wait on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct intel_ring_buffer *signaller;
+	u32 seqno, ctl;
 
-	if (INTEL_INFO(dev)->gen >= 6 &&
-	    tmp & RING_WAIT_SEMAPHORE &&
-	    semaphore_passed(ring)) {
-		DRM_ERROR("Kicking stuck semaphore on %s\n",
-			  ring->name);
-		I915_WRITE_CTL(ring, tmp);
-		return true;
-	}
-	return false;
+	ring->hangcheck.deadlock = true;
+
+	signaller = semaphore_waits_for(ring, &seqno);
+	if (signaller == NULL || signaller->hangcheck.deadlock)
+		return -1;
+
+	/* cursory check for an unkickable deadlock */
+	ctl = I915_READ_CTL(signaller);
+	if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
+		return -1;
+
+	return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
 }
 
-static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
|
|
|
+static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
- if (IS_GEN2(ring->dev))
|
|
|
- return false;
|
|
|
+ struct intel_ring_buffer *ring;
|
|
|
+ int i;
|
|
|
|
|
|
- /* Is the chip hanging on a WAIT_FOR_EVENT?
|
|
|
- * If so we can simply poke the RB_WAIT bit
|
|
|
- * and break the hang. This should work on
|
|
|
- * all but the second generation chipsets.
|
|
|
- */
|
|
|
- return !kick_ring(ring);
|
|
|
+ for_each_ring(ring, dev_priv, i)
|
|
|
+ ring->hangcheck.deadlock = false;
|
|
|
}
|
|
|
|
|
|
-static bool i915_hangcheck_hung(struct drm_device *dev)
+static enum intel_ring_hangcheck_action
+ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 {
-	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 tmp;
 
-	if (dev_priv->gpu_error.hangcheck_count++ > 1) {
-		bool hung = true;
-		struct intel_ring_buffer *ring;
-		int i;
+	if (ring->hangcheck.acthd != acthd)
+		return active;
 
-		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-		i915_handle_error(dev, true);
+	if (IS_GEN2(dev))
+		return hung;
 
-		for_each_ring(ring, dev_priv, i)
-			hung &= i915_hangcheck_ring_hung(ring);
+	/* Is the chip hanging on a WAIT_FOR_EVENT?
+	 * If so we can simply poke the RB_WAIT bit
+	 * and break the hang. This should work on
+	 * all but the second generation chipsets.
+	 */
+	tmp = I915_READ_CTL(ring);
+	if (tmp & RING_WAIT) {
+		DRM_ERROR("Kicking stuck wait on %s\n",
+			  ring->name);
+		I915_WRITE_CTL(ring, tmp);
+		return kick;
+	}
 
-		return hung;
+	if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
+		switch (semaphore_passed(ring)) {
+		default:
+			return hung;
+		case 1:
+			DRM_ERROR("Kicking stuck semaphore on %s\n",
+				  ring->name);
+			I915_WRITE_CTL(ring, tmp);
+			return kick;
+		case 0:
+			return wait;
+		}
 	}
 
-	return false;
+	return hung;
 }
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track of per-ring seqno progress,
+ * and if there is no progress the hangcheck score for that ring is
+ * increased. Further, acthd is inspected to see if the ring is stuck; if
+ * so, we kick the ring. If we see no progress on three subsequent calls,
+ * we assume the chip is wedged and try to fix it by resetting the chip.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
-	bool err = false, idle;
 	int i;
-	u32 seqno[I915_NUM_RINGS];
-	bool work_done;
+	int busy_count = 0, rings_hung = 0;
+	bool stuck[I915_NUM_RINGS] = { 0 };
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+#define FIRE 30
 
 	if (!i915_enable_hangcheck)
 		return;
 
-	idle = true;
 	for_each_ring(ring, dev_priv, i) {
-		seqno[i] = ring->get_seqno(ring, false);
-		idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-	}
-
-	/* If all work is done then ACTHD clearly hasn't advanced. */
-	if (idle) {
-		if (err) {
-			if (i915_hangcheck_hung(dev))
-				return;
-
-			goto repeat;
+		u32 seqno, acthd;
+		bool busy = true;
+
+		semaphore_clear_deadlocks(dev_priv);
+
+		seqno = ring->get_seqno(ring, false);
+		acthd = intel_ring_get_active_head(ring);
+
+		if (ring->hangcheck.seqno == seqno) {
+			if (ring_idle(ring, seqno)) {
+				if (waitqueue_active(&ring->irq_queue)) {
+					/* Issue a wake-up to catch stuck h/w. */
+					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+						  ring->name);
+					wake_up_all(&ring->irq_queue);
+					ring->hangcheck.score += HUNG;
+				} else
+					busy = false;
+			} else {
+				int score;
+
+				/* We always increment the hangcheck score
+				 * if the ring is busy and still processing
+				 * the same request, so that no single request
+				 * can run indefinitely (such as a chain of
+				 * batches). The only time we do not increment
+				 * the hangcheck score on this ring is when
+				 * the ring is in a legitimate wait for another
+				 * ring. In that case the waiting ring is a
+				 * victim and we want to be sure we catch the
+				 * right culprit. Then, every time we kick
+				 * the ring, we add a small increment to the
+				 * score so that we can catch a batch that is
+				 * being repeatedly kicked and so responsible
+				 * for stalling the machine.
+				 */
+				ring->hangcheck.action = ring_stuck(ring,
+								    acthd);
+
+				switch (ring->hangcheck.action) {
+				case wait:
+					score = 0;
+					break;
+				case active:
+					score = BUSY;
+					break;
+				case kick:
+					score = KICK;
+					break;
+				case hung:
+					score = HUNG;
+					stuck[i] = true;
+					break;
+				}
+				ring->hangcheck.score += score;
+			}
+		} else {
+			/* Gradually reduce the count so that we catch DoS
+			 * attempts across multiple batches.
+			 */
+			if (ring->hangcheck.score > 0)
+				ring->hangcheck.score--;
 		}
 
-		dev_priv->gpu_error.hangcheck_count = 0;
-		return;
+		ring->hangcheck.seqno = seqno;
+		ring->hangcheck.acthd = acthd;
+		busy_count += busy;
 	}
 
-	work_done = false;
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.seqno != seqno[i]) {
-			work_done = true;
-			ring->hangcheck.seqno = seqno[i];
+		if (ring->hangcheck.score > FIRE) {
+			DRM_ERROR("%s on %s\n",
+				  stuck[i] ? "stuck" : "no progress",
+				  ring->name);
+			rings_hung++;
 		}
 	}
 
-	if (!work_done) {
-		if (i915_hangcheck_hung(dev))
-			return;
-	} else {
-		dev_priv->gpu_error.hangcheck_count = 0;
-	}
+	if (rings_hung)
+		return i915_handle_error(dev, true);
 
-repeat:
-	/* Reset timer case chip hangs without another request being added */
-	mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-		  round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+	if (busy_count)
+		/* Reset timer in case chip hangs without another request
+		 * being added */
+		mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+			  round_jiffies_up(jiffies +
+					   DRM_I915_HANGCHECK_JIFFIES));
+}
+
+static void ibx_irq_preinstall(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (HAS_PCH_NOP(dev))
+		return;
+
+	/* south display irq */
+	I915_WRITE(SDEIMR, 0xffffffff);
+	/*
+	 * SDEIER is also touched by the interrupt handler to work around missed
+	 * PCH interrupts. Hence we can't update it after the interrupt handler
+	 * is enabled - instead we unconditionally enable all PCH interrupt
+	 * sources here, but then only unmask them as needed with SDEIMR.
+	 */
+	I915_WRITE(SDEIER, 0xffffffff);
+	POSTING_READ(SDEIER);
 }
 
 /* drm_dma.h hooks
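For a sense of the thresholds above, a back-of-the-envelope sketch, assuming the usual 1500 ms hangcheck period (DRM_I915_HANGCHECK_PERIOD) and that the timer re-arms every period while any ring is busy. The stand-alone helper below is purely illustrative and not part of the patch:

#include <stdio.h>

/* Illustrative only: ticks of the hangcheck timer before a ring whose
 * score grows by 'step' per tick exceeds FIRE (30) and triggers a reset.
 */
static int ticks_to_fire(int step)
{
	return 30 / step + 1;	/* score must exceed FIRE, not just reach it */
}

int main(void)
{
	/* hung (+HUNG = 20): 2 ticks, ~3 s */
	printf("hung: %d ticks\n", ticks_to_fire(20));
	/* repeatedly kicked (+KICK = 5): 7 ticks, ~10.5 s */
	printf("kick: %d ticks\n", ticks_to_fire(5));
	/* busy but no seqno progress (+BUSY = 1): 31 ticks, ~46 s */
	printf("busy: %d ticks\n", ticks_to_fire(1));
	return 0;
}

The steady BUSY drip is what bounds a single runaway batch chain, while the decay branch (score--) keeps intermittent stalls from accumulating into a false positive across many batches.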
@@ -2500,16 +2577,7 @@ static void ironlake_irq_preinstall(struct drm_device *dev)
 	I915_WRITE(GTIER, 0x0);
 	POSTING_READ(GTIER);
 
-	/* south display irq */
-	I915_WRITE(SDEIMR, 0xffffffff);
-	/*
-	 * SDEIER is also touched by the interrupt handler to work around missed
-	 * PCH interrupts. Hence we can't update it after the interrupt handler
-	 * is enabled - instead we unconditionally enable all PCH interrupt
-	 * sources here, but then only unmask them as needed with SDEIMR.
-	 */
-	I915_WRITE(SDEIER, 0xffffffff);
-	POSTING_READ(SDEIER);
+	ibx_irq_preinstall(dev);
 }
 
 static void ivybridge_irq_preinstall(struct drm_device *dev)
@@ -2536,19 +2604,7 @@ static void ivybridge_irq_preinstall(struct drm_device *dev)
 	I915_WRITE(GEN6_PMIER, 0x0);
 	POSTING_READ(GEN6_PMIER);
 
-	if (HAS_PCH_NOP(dev))
-		return;
-
-	/* south display irq */
-	I915_WRITE(SDEIMR, 0xffffffff);
-	/*
-	 * SDEIER is also touched by the interrupt handler to work around missed
-	 * PCH interrupts. Hence we can't update it after the interrupt handler
-	 * is enabled - instead we unconditionally enable all PCH interrupt
-	 * sources here, but then only unmask them as needed with SDEIMR.
-	 */
-	I915_WRITE(SDEIER, 0xffffffff);
-	POSTING_READ(SDEIER);
+	ibx_irq_preinstall(dev);
 }
 
 static void valleyview_irq_preinstall(struct drm_device *dev)