[08/15] drm/i915: Slaughter the thundering i915_wait_request herd

Submitted by Chris Wilson on Nov. 29, 2015, 8:48 a.m.

Details

Message ID 1448786893-2522-9-git-send-email-chris@chris-wilson.co.uk
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Chris Wilson Nov. 29, 2015, 8:48 a.m.
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one. Alternatively, we can have one kthread responsible for waking
after an interrupt, checking the seqno and only waking up the waiting
clients who are complete. The disadvantage is that in the uncontended
scenario (i.e. only one waiter) we incur an extra context switch in the
wakeup path - though that should be mitigated somewhat by the busy-wait
we do first before sleeping.

v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/Makefile            |   1 +
 drivers/gpu/drm/i915/i915_dma.c          |   9 +-
 drivers/gpu/drm/i915/i915_drv.h          |  35 ++++-
 drivers/gpu/drm/i915/i915_gem.c          |  99 +++---------
 drivers/gpu/drm/i915/i915_gpu_error.c    |   8 +-
 drivers/gpu/drm/i915/i915_irq.c          |  17 +--
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 253 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c         |   2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c  |   3 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h  |   3 +-
 10 files changed, 333 insertions(+), 97 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 0851de07bd13..d3b9d3618719 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -35,6 +35,7 @@  i915-y += i915_cmd_parser.o \
 	  i915_gem_userptr.o \
 	  i915_gpu_error.o \
 	  i915_trace_points.o \
+	  intel_breadcrumbs.o \
 	  intel_lrc.o \
 	  intel_mocs.o \
 	  intel_ringbuffer.o \
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index a81c76603544..93c6762d87b4 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -388,12 +388,16 @@  static int i915_load_modeset_init(struct drm_device *dev)
 	if (ret)
 		goto cleanup_vga_client;
 
+	ret = intel_breadcrumbs_init(dev_priv);
+	if (ret)
+		goto cleanup_vga_switcheroo;
+
 	/* Initialise stolen first so that we may reserve preallocated
 	 * objects for the BIOS to KMS transition.
 	 */
 	ret = i915_gem_init_stolen(dev);
 	if (ret)
-		goto cleanup_vga_switcheroo;
+		goto cleanup_breadcrumbs;
 
 	intel_power_domains_init_hw(dev_priv, false);
 
@@ -454,6 +458,8 @@  cleanup_irq:
 	drm_irq_uninstall(dev);
 cleanup_gem_stolen:
 	i915_gem_cleanup_stolen(dev);
+cleanup_breadcrumbs:
+	intel_breadcrumbs_fini(dev_priv);
 cleanup_vga_switcheroo:
 	vga_switcheroo_unregister_client(dev->pdev);
 cleanup_vga_client:
@@ -1193,6 +1199,7 @@  int i915_driver_unload(struct drm_device *dev)
 	mutex_unlock(&dev->struct_mutex);
 	intel_fbc_cleanup_cfb(dev_priv);
 	i915_gem_cleanup_stolen(dev);
+	intel_breadcrumbs_fini(dev_priv);
 
 	intel_csr_ucode_fini(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e1980212ba37..782d08ab6264 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -564,6 +564,7 @@  struct drm_i915_error_state {
 			long jiffies;
 			u32 seqno;
 			u32 tail;
+			u32 waiters;
 		} *requests;
 
 		struct {
@@ -1383,7 +1384,7 @@  struct i915_gpu_error {
 #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
 
 	/* For missed irq/seqno simulation. */
-	unsigned int test_irq_rings;
+	unsigned long test_irq_rings;
 };
 
 enum modeset_restore {
@@ -1695,6 +1696,27 @@  struct drm_i915_private {
 
 	struct intel_uncore uncore;
 
+	/* Rather than have every client wait upon all user interrupts,
+	 * with the herd waking after every interrupt and each doing the
+	 * heavyweight seqno dance, we delegate the task to a bottom-half
+	 * for the user interrupt (one bh handling all engines). Now,
+	 * just one task is woken up and does the coherent seqno read and
+	 * then wakes up all the waiters registered with the bottom-half
+	 * that are complete. This avoid the thundering herd problem
+	 * with lots of concurrent waiters, at the expense of an extra
+	 * context switch between the interrupt and the client. That should
+	 * be mitigated somewhat by the busyspin in the client before we go to
+	 * sleep waiting for an interrupt from the bottom-half.
+	 */
+	struct intel_breadcrumbs {
+		spinlock_t lock; /* protects the per-engine request list */
+		struct task_struct *task; /* bh for all user interrupts */
+		struct intel_breadcrumbs_engine {
+			struct rb_root requests; /* sorted by retirement */
+			struct drm_i915_gem_request *first;
+		} engine[I915_NUM_RINGS];
+	} breadcrumbs;
+
 	struct i915_virtual_gpu vgpu;
 
 	struct intel_guc guc;
@@ -2228,6 +2250,11 @@  struct drm_i915_gem_request {
 	/** global list entry for this request */
 	struct list_head list;
 
+	/** List of clients waiting for completion of this request */
+	wait_queue_head_t wait;
+	struct rb_node irq_node;
+	unsigned irq_count;
+
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
 	struct list_head client_list;
@@ -2800,6 +2827,12 @@  ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
 }
 
 
+/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
+int intel_breadcrumbs_init(struct drm_i915_private *i915);
+bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request);
+void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request);
+void intel_breadcrumbs_fini(struct drm_i915_private *i915);
+
 /* i915_gem.c */
 int i915_gem_create_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 65d101b47d8e..969592fb0969 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1126,17 +1126,6 @@  i915_gem_check_wedge(unsigned reset_counter,
 	return 0;
 }
 
-static void fake_irq(unsigned long data)
-{
-	wake_up_process((struct task_struct *)data);
-}
-
-static bool missed_irq(struct drm_i915_private *dev_priv,
-		       struct intel_engine_cs *ring)
-{
-	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
-}
-
 static unsigned long local_clock_us(unsigned *cpu)
 {
 	unsigned long t;
@@ -1184,9 +1173,6 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 	 * takes to sleep on a request, on the order of a microsecond.
 	 */
 
-	if (req->ring->irq_refcount)
-		return -EBUSY;
-
 	/* Only spin if we know the GPU is processing this request */
 	if (!i915_gem_request_started(req, false))
 		return -EAGAIN;
@@ -1232,26 +1218,19 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 			s64 *timeout,
 			struct intel_rps_client *rps)
 {
-	struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	const bool irq_test_in_progress =
-		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
 	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
 	DEFINE_WAIT(wait);
-	unsigned long timeout_expire;
+	unsigned long timeout_remain;
 	s64 before, now;
 	int ret;
 
-	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
-
 	if (list_empty(&req->list))
 		return 0;
 
 	if (i915_gem_request_completed(req, true))
 		return 0;
 
-	timeout_expire = 0;
+	timeout_remain = 0;
 	if (timeout) {
 		if (WARN_ON(*timeout < 0))
 			return -EINVAL;
@@ -1259,43 +1238,28 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 		if (*timeout == 0)
 			return -ETIME;
 
-		timeout_expire = jiffies + nsecs_to_jiffies_timeout(*timeout);
+		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
 	}
 
-	if (INTEL_INFO(dev_priv)->gen >= 6)
-		gen6_rps_boost(dev_priv, rps, req->emitted_jiffies);
-
 	/* Record current time in case interrupted by signal, or wedged */
 	trace_i915_gem_request_wait_begin(req);
 	before = ktime_get_raw_ns();
 
-	/* Optimistic spin for the next jiffie before touching IRQs */
-	ret = __i915_spin_request(req, state);
-	if (ret == 0)
-		goto out;
+	if (INTEL_INFO(req->i915)->gen >= 6)
+		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
 
-	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
-		ret = -ENODEV;
-		goto out;
+	/* Optimistic spin for the next jiffie before touching IRQs */
+	if (intel_breadcrumbs_add_waiter(req)) {
+		ret = __i915_spin_request(req, state);
+		if (ret == 0)
+			goto out;
 	}
 
 	for (;;) {
-		struct timer_list timer;
-
-		prepare_to_wait(&ring->irq_queue, &wait, state);
+		prepare_to_wait(&req->wait, &wait, state);
 
-		/* We need to check whether any gpu reset happened in between
-		 * the caller grabbing the seqno and now ... */
-		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
-			/* As we do not requeue the request over a GPU reset,
-			 * if one does occur we know that the request is
-			 * effectively complete.
-			 */
-			ret = 0;
-			break;
-		}
-
-		if (i915_gem_request_completed(req, false)) {
+		if (i915_gem_request_completed(req, true) ||
+		    req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) {
 			ret = 0;
 			break;
 		}
@@ -1305,36 +1269,19 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 			break;
 		}
 
-		if (timeout && time_after_eq(jiffies, timeout_expire)) {
-			ret = -ETIME;
-			break;
-		}
-
-		i915_queue_hangcheck(dev_priv);
-
-		timer.function = NULL;
-		if (timeout || missed_irq(dev_priv, ring)) {
-			unsigned long expire;
-
-			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
-			expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
-			mod_timer(&timer, expire);
-		}
-
-		io_schedule();
-
-		if (timer.function) {
-			del_singleshot_timer_sync(&timer);
-			destroy_timer_on_stack(&timer);
-		}
+		if (timeout) {
+			timeout_remain = io_schedule_timeout(timeout_remain);
+			if (timeout_remain == 0) {
+				ret = -ETIME;
+				break;
+			}
+		} else
+			io_schedule();
 	}
-	if (!irq_test_in_progress)
-		ring->irq_put(ring);
-
-	finish_wait(&ring->irq_queue, &wait);
-
+	finish_wait(&req->wait, &wait);
 out:
 	now = ktime_get_raw_ns();
+	intel_breadcrumbs_remove_waiter(req);
 	trace_i915_gem_request_wait_end(req);
 
 	if (timeout) {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 06ca4082735b..d1df405b905c 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -453,10 +453,11 @@  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 				   dev_priv->ring[i].name,
 				   error->ring[i].num_requests);
 			for (j = 0; j < error->ring[i].num_requests; j++) {
-				err_printf(m, "  seqno 0x%08x, emitted %ld, tail 0x%08x\n",
+				err_printf(m, "  seqno 0x%08x, emitted %ld, tail 0x%08x, waiters? %d\n",
 					   error->ring[i].requests[j].seqno,
 					   error->ring[i].requests[j].jiffies,
-					   error->ring[i].requests[j].tail);
+					   error->ring[i].requests[j].tail,
+					   error->ring[i].requests[j].waiters);
 			}
 		}
 
@@ -900,7 +901,7 @@  static void i915_record_ring_state(struct drm_device *dev,
 		ering->instdone = I915_READ(GEN2_INSTDONE);
 	}
 
-	ering->waiting = waitqueue_active(&ring->irq_queue);
+	ering->waiting = READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first);
 	ering->instpm = I915_READ(RING_INSTPM(ring->mmio_base));
 	ering->seqno = ring->get_seqno(ring, false);
 	ering->acthd = intel_ring_get_active_head(ring);
@@ -1105,6 +1106,7 @@  static void i915_gem_record_rings(struct drm_device *dev,
 			erq->seqno = request->seqno;
 			erq->jiffies = request->emitted_jiffies;
 			erq->tail = request->postfix;
+			erq->waiters = request->irq_count;
 		}
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 28c0329f8281..0c2390476bdb 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -996,12 +996,11 @@  static void ironlake_rps_change_irq_handler(struct drm_device *dev)
 
 static void notify_ring(struct intel_engine_cs *ring)
 {
-	if (!intel_ring_initialized(ring))
+	if (ring->i915 == NULL)
 		return;
 
 	trace_i915_gem_request_notify(ring);
-
-	wake_up_all(&ring->irq_queue);
+	wake_up_process(ring->i915->breadcrumbs.task);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
@@ -2399,9 +2398,6 @@  static irqreturn_t gen8_irq_handler(int irq, void *arg)
 static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 			       bool reset_completed)
 {
-	struct intel_engine_cs *ring;
-	int i;
-
 	/*
 	 * Notify all waiters for GPU completion events that reset state has
 	 * been changed, and that they need to restart their wait after
@@ -2410,8 +2406,7 @@  static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 	 */
 
 	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-	for_each_ring(ring, dev_priv, i)
-		wake_up_all(&ring->irq_queue);
+	wake_up_process(dev_priv->breadcrumbs.task);
 
 	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
 	wake_up_all(&dev_priv->pending_flip_queue);
@@ -2986,16 +2981,16 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			if (ring_idle(ring, seqno)) {
 				ring->hangcheck.action = HANGCHECK_IDLE;
 
-				if (waitqueue_active(&ring->irq_queue)) {
+				if (READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first)) {
 					/* Issue a wake-up to catch stuck h/w. */
 					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
+						if (!test_bit(ring->id, &dev_priv->gpu_error.test_irq_rings))
 							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
 								  ring->name);
 						else
 							DRM_INFO("Fake missed irq on %s\n",
 								 ring->name);
-						wake_up_all(&ring->irq_queue);
+						wake_up_process(dev_priv->breadcrumbs.task);
 					}
 					/* Safeguard against driver failure */
 					ring->hangcheck.score += BUSY;
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
new file mode 100644
index 000000000000..0a18405c8689
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -0,0 +1,253 @@ 
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/kthread.h>
+
+#include "i915_drv.h"
+
+static bool __irq_enable(struct intel_engine_cs *engine)
+{
+	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
+		return false;
+
+	if (!intel_irqs_enabled(engine->i915))
+		return false;
+
+	return engine->irq_get(engine);
+}
+
+static struct drm_i915_gem_request *to_request(struct rb_node *rb)
+{
+	if (rb == NULL)
+		return NULL;
+
+	return rb_entry(rb, struct drm_i915_gem_request, irq_node);
+}
+
+/*
+ * intel_breadcrumbs_irq() acts as the bottom-half for the user interrupt,
+ * which we use as the breadcrumb after every request. In order to scale
+ * to many concurrent waiters, we delegate the task of reading the coherent
+ * seqno to this bottom-half. (Otherwise every waiter is woken after each
+ * interrupt and they all attempt to perform the heavyweight coherent
+ * seqno check.) Individual clients register with the bottom-half when
+ * waiting on a request, and are then woken when the bottom-half notices
+ * their seqno is complete. This incurs an extra context switch from the
+ * interrupt to the client in the uncontended case.
+ */
+static int intel_breadcrumbs_irq(void *data)
+{
+	struct drm_i915_private *i915 = data;
+	struct intel_breadcrumbs *b = &i915->breadcrumbs;
+	unsigned irq_get = 0;
+	unsigned irq_enabled = 0;
+	int i;
+
+	while (!kthread_should_stop()) {
+		unsigned reset_counter = i915_reset_counter(&i915->gpu_error);
+		unsigned long timeout_jiffies;
+		bool idling = false;
+
+		/* On every tick, walk the seqno-ordered list of requests
+		 * and for each retired request wakeup its clients. If we
+		 * find an unfinished request, go back to sleep.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* Note carefully that we do not hold a reference for the
+		 * requests on this list. Therefore we only inspect them
+		 * whilst holding the spinlock to ensure that they are not
+		 * freed in the meantime, and the client must remove the
+		 * request from the list if it is interrupted (before it
+		 * itself releases its reference).
+		 */
+		spin_lock(&b->lock);
+		for (i = 0; i < I915_NUM_RINGS; i++) {
+			struct intel_engine_cs *engine = &i915->ring[i];
+			struct intel_breadcrumbs_engine *be = &b->engine[i];
+			struct drm_i915_gem_request *request = be->first;
+
+			if (request == NULL) {
+				if ((irq_get & (1 << i))) {
+					if (irq_enabled & (1 << i)) {
+						engine->irq_put(engine);
+						irq_enabled &= ~ (1 << i);
+					}
+					intel_runtime_pm_put(i915);
+					irq_get &= ~(1 << i);
+				}
+				continue;
+			}
+
+			if ((irq_get & (1 << i)) == 0) {
+				intel_runtime_pm_get(i915);
+				irq_enabled |= __irq_enable(engine) << i;
+				irq_get |= 1 << i;
+			}
+
+			do {
+				struct rb_node *next;
+
+				if (request->reset_counter == reset_counter &&
+				    !i915_gem_request_completed(request, false))
+					break;
+
+				next = rb_next(&request->irq_node);
+				rb_erase(&request->irq_node, &be->requests);
+				RB_CLEAR_NODE(&request->irq_node);
+
+				wake_up_all(&request->wait);
+
+				request = to_request(next);
+			} while (request);
+			be->first = request;
+			idling |= request == NULL;
+
+			/* Make sure the hangcheck timer is armed in case
+			 * the GPU hangs and we are never woken up.
+			 */
+			if (request)
+				i915_queue_hangcheck(i915);
+		}
+		spin_unlock(&b->lock);
+
+		/* If we don't have interrupts available (either we have
+		 * detected the hardware is missing interrupts or the
+		 * interrupt delivery was disabled by the user), wake up
+		 * every jiffie to check for request completion.
+		 *
+		 * If we have processed all requests for one engine, we
+		 * also wish to wake up in a jiffie to turn off the
+		 * breadcrumb interrupt for that engine. We delay
+		 * switching off the interrupt in order to allow another
+		 * waiter to start without incurring additional latency
+		 * enabling the interrupt.
+		 */
+		timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+		if (idling || i915->gpu_error.missed_irq_rings & irq_enabled)
+			timeout_jiffies = 1;
+
+		/* Unlike the individual clients, we do not want this
+		 * background thread to contribute to the system load,
+		 * i.e. we do not want to use io_schedule() here.
+		 */
+		schedule_timeout(timeout_jiffies);
+	}
+
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		if ((irq_get & (1 << i))) {
+			if (irq_enabled & (1 << i)) {
+				struct intel_engine_cs *engine = &i915->ring[i];
+				engine->irq_put(engine);
+			}
+			intel_runtime_pm_put(i915);
+		}
+	}
+
+	return 0;
+}
+
+bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request)
+{
+	struct intel_breadcrumbs *b = &request->i915->breadcrumbs;
+	bool first = false;
+
+	spin_lock(&b->lock);
+	if (request->irq_count++ == 0) {
+		struct intel_breadcrumbs_engine *be =
+			&b->engine[request->ring->id];
+		struct rb_node **p, *parent;
+
+		if (be->first == NULL)
+			wake_up_process(b->task);
+
+		init_waitqueue_head(&request->wait);
+
+		first = true;
+		parent = NULL;
+		p = &be->requests.rb_node;
+		while (*p) {
+			struct drm_i915_gem_request *__req;
+
+			parent = *p;
+			__req = rb_entry(parent, typeof(*__req), irq_node);
+
+			if (i915_seqno_passed(request->seqno, __req->seqno)) {
+				p = &parent->rb_right;
+				first = false;
+			} else
+				p = &parent->rb_left;
+		}
+		if (first)
+			be->first = request;
+
+		rb_link_node(&request->irq_node, parent, p);
+		rb_insert_color(&request->irq_node, &be->requests);
+	}
+	spin_unlock(&b->lock);
+
+	return first;
+}
+
+void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request)
+{
+	struct intel_breadcrumbs *b = &request->i915->breadcrumbs;
+
+	spin_lock(&b->lock);
+	if (--request->irq_count == 0 && !RB_EMPTY_NODE(&request->irq_node)) {
+		struct intel_breadcrumbs_engine *be =
+			&b->engine[request->ring->id];
+		if (be->first == request)
+			be->first = to_request(rb_next(&request->irq_node));
+		rb_erase(&request->irq_node, &be->requests);
+	}
+	spin_unlock(&b->lock);
+}
+
+int intel_breadcrumbs_init(struct drm_i915_private *i915)
+{
+	struct intel_breadcrumbs *b = &i915->breadcrumbs;
+	struct task_struct *task;
+
+	spin_lock_init(&b->lock);
+
+	task = kthread_run(intel_breadcrumbs_irq, i915, "irq/i915");
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	b->task = task;
+
+	return 0;
+}
+
+void intel_breadcrumbs_fini(struct drm_i915_private *i915)
+{
+	struct intel_breadcrumbs *b = &i915->breadcrumbs;
+
+	if (b->task == NULL)
+		return;
+
+	kthread_stop(b->task);
+	b->task = NULL;
+}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b40ffc3607e7..604c081b0a0f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1922,10 +1922,10 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 	ring->buffer = NULL;
 
 	ring->dev = dev;
+	ring->i915 = to_i915(dev);
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
-	init_waitqueue_head(&ring->irq_queue);
 
 	INIT_LIST_HEAD(&ring->buffers);
 	INIT_LIST_HEAD(&ring->execlist_queue);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 511efe556d73..5a5a7195dd54 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2157,6 +2157,7 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	WARN_ON(ring->buffer);
 
 	ring->dev = dev;
+	ring->i915 = to_i915(dev);
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	INIT_LIST_HEAD(&ring->execlist_queue);
@@ -2164,8 +2165,6 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
 	memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
 
-	init_waitqueue_head(&ring->irq_queue);
-
 	ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
 	if (IS_ERR(ringbuf))
 		return PTR_ERR(ringbuf);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d1eb206151d..49b5bded2767 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -157,6 +157,7 @@  struct  intel_engine_cs {
 #define LAST_USER_RING (VECS + 1)
 	u32		mmio_base;
 	struct		drm_device *dev;
+	struct drm_i915_private *i915;
 	struct intel_ringbuffer *buffer;
 	struct list_head buffers;
 
@@ -303,8 +304,6 @@  struct  intel_engine_cs {
 
 	bool gpu_caches_dirty;
 
-	wait_queue_head_t irq_queue;
-
 	struct intel_context *default_context;
 	struct intel_context *last_context;
 

Comments

On Sun, Nov 29, 2015 at 08:48:06AM +0000, Chris Wilson wrote:
> +	/* Optimistic spin for the next jiffie before touching IRQs */
> +	if (intel_breadcrumbs_add_waiter(req)) {
> +		ret = __i915_spin_request(req, state);
> +		if (ret == 0)
> +			goto out;

There are a couple of interesting side-effects here.  As we know start
up the irq in parallel and keep it running for longer, irq/i915 now
consumes a lot of CPU time (like 50-100%!) for synchronous batches, but
doesn't seem to interfere with latency (as spin_request is still nicely
running and catching the request completion). That should also still
work nicely on single cpu machines as the irq enabling should not
preempt us.  The second interesting side-effect is that the synchronous
loads that regressed with a 2us spin-request timeout are now happy again
at 2us. Also with the active-request and the can-spin check from
add_waiter, running a low fps game with a compositor is not burning the
CPU with any spinning.
-Chris
Hi,

On 29/11/15 08:48, Chris Wilson wrote:
> One particularly stressful scenario consists of many independent tasks
> all competing for GPU time and waiting upon the results (e.g. realtime
> transcoding of many, many streams). One bottleneck in particular is that
> each client waits on its own results, but every client is woken up after
> every batchbuffer - hence the thunder of hooves as then every client must
> do its heavyweight dance to read a coherent seqno to see if it is the
> lucky one. Alternatively, we can have one kthread responsible for waking
> after an interrupt, checking the seqno and only waking up the waiting
> clients who are complete. The disadvantage is that in the uncontended
> scenario (i.e. only one waiter) we incur an extra context switch in the
> wakeup path - though that should be mitigated somewhat by the busy-wait
> we do first before sleeping.
>
> v2: Convert from a kworker per engine into a dedicated kthread for the
> bottom-half.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/Makefile            |   1 +
>   drivers/gpu/drm/i915/i915_dma.c          |   9 +-
>   drivers/gpu/drm/i915/i915_drv.h          |  35 ++++-
>   drivers/gpu/drm/i915/i915_gem.c          |  99 +++---------
>   drivers/gpu/drm/i915/i915_gpu_error.c    |   8 +-
>   drivers/gpu/drm/i915/i915_irq.c          |  17 +--
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 253 +++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.c         |   2 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.c  |   3 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h  |   3 +-
>   10 files changed, 333 insertions(+), 97 deletions(-)
>   create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 0851de07bd13..d3b9d3618719 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \
>   	  i915_gem_userptr.o \
>   	  i915_gpu_error.o \
>   	  i915_trace_points.o \
> +	  intel_breadcrumbs.o \
>   	  intel_lrc.o \
>   	  intel_mocs.o \
>   	  intel_ringbuffer.o \
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index a81c76603544..93c6762d87b4 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -388,12 +388,16 @@ static int i915_load_modeset_init(struct drm_device *dev)
>   	if (ret)
>   		goto cleanup_vga_client;
>
> +	ret = intel_breadcrumbs_init(dev_priv);
> +	if (ret)
> +		goto cleanup_vga_switcheroo;
> +
>   	/* Initialise stolen first so that we may reserve preallocated
>   	 * objects for the BIOS to KMS transition.
>   	 */
>   	ret = i915_gem_init_stolen(dev);
>   	if (ret)
> -		goto cleanup_vga_switcheroo;
> +		goto cleanup_breadcrumbs;
>
>   	intel_power_domains_init_hw(dev_priv, false);
>
> @@ -454,6 +458,8 @@ cleanup_irq:
>   	drm_irq_uninstall(dev);
>   cleanup_gem_stolen:
>   	i915_gem_cleanup_stolen(dev);
> +cleanup_breadcrumbs:
> +	intel_breadcrumbs_fini(dev_priv);
>   cleanup_vga_switcheroo:
>   	vga_switcheroo_unregister_client(dev->pdev);
>   cleanup_vga_client:
> @@ -1193,6 +1199,7 @@ int i915_driver_unload(struct drm_device *dev)
>   	mutex_unlock(&dev->struct_mutex);
>   	intel_fbc_cleanup_cfb(dev_priv);
>   	i915_gem_cleanup_stolen(dev);
> +	intel_breadcrumbs_fini(dev_priv);
>
>   	intel_csr_ucode_fini(dev_priv);
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e1980212ba37..782d08ab6264 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -564,6 +564,7 @@ struct drm_i915_error_state {
>   			long jiffies;
>   			u32 seqno;
>   			u32 tail;
> +			u32 waiters;
>   		} *requests;
>
>   		struct {
> @@ -1383,7 +1384,7 @@ struct i915_gpu_error {
>   #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
>
>   	/* For missed irq/seqno simulation. */
> -	unsigned int test_irq_rings;
> +	unsigned long test_irq_rings;
>   };
>
>   enum modeset_restore {
> @@ -1695,6 +1696,27 @@ struct drm_i915_private {
>
>   	struct intel_uncore uncore;
>
> +	/* Rather than have every client wait upon all user interrupts,
> +	 * with the herd waking after every interrupt and each doing the
> +	 * heavyweight seqno dance, we delegate the task to a bottom-half
> +	 * for the user interrupt (one bh handling all engines). Now,
> +	 * just one task is woken up and does the coherent seqno read and
> +	 * then wakes up all the waiters registered with the bottom-half
> +	 * that are complete. This avoid the thundering herd problem
> +	 * with lots of concurrent waiters, at the expense of an extra
> +	 * context switch between the interrupt and the client. That should
> +	 * be mitigated somewhat by the busyspin in the client before we go to
> +	 * sleep waiting for an interrupt from the bottom-half.
> +	 */
> +	struct intel_breadcrumbs {
> +		spinlock_t lock; /* protects the per-engine request list */

s/list/tree/

> +		struct task_struct *task; /* bh for all user interrupts */
> +		struct intel_breadcrumbs_engine {
> +			struct rb_root requests; /* sorted by retirement */
> +			struct drm_i915_gem_request *first;

/* First request in the tree to be completed next by the hw. */

?

> +		} engine[I915_NUM_RINGS];
> +	} breadcrumbs;
> +
>   	struct i915_virtual_gpu vgpu;
>
>   	struct intel_guc guc;
> @@ -2228,6 +2250,11 @@ struct drm_i915_gem_request {
>   	/** global list entry for this request */
>   	struct list_head list;
>
> +	/** List of clients waiting for completion of this request */
> +	wait_queue_head_t wait;
> +	struct rb_node irq_node;
> +	unsigned irq_count;

To me irq prefix does not fit here that well.

req_tree_node?

waiter_count, waiters?

> +
>   	struct drm_i915_file_private *file_priv;
>   	/** file_priv list entry for this request */
>   	struct list_head client_list;
> @@ -2800,6 +2827,12 @@ ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
>   }
>
>
> +/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
> +int intel_breadcrumbs_init(struct drm_i915_private *i915);
> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request);
> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request);
> +void intel_breadcrumbs_fini(struct drm_i915_private *i915);
> +
>   /* i915_gem.c */
>   int i915_gem_create_ioctl(struct drm_device *dev, void *data,
>   			  struct drm_file *file_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 65d101b47d8e..969592fb0969 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1126,17 +1126,6 @@ i915_gem_check_wedge(unsigned reset_counter,
>   	return 0;
>   }
>
> -static void fake_irq(unsigned long data)
> -{
> -	wake_up_process((struct task_struct *)data);
> -}
> -
> -static bool missed_irq(struct drm_i915_private *dev_priv,
> -		       struct intel_engine_cs *ring)
> -{
> -	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> -}
> -
>   static unsigned long local_clock_us(unsigned *cpu)
>   {
>   	unsigned long t;
> @@ -1184,9 +1173,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   	 * takes to sleep on a request, on the order of a microsecond.
>   	 */
>
> -	if (req->ring->irq_refcount)
> -		return -EBUSY;
> -
>   	/* Only spin if we know the GPU is processing this request */
>   	if (!i915_gem_request_started(req, false))
>   		return -EAGAIN;
> @@ -1232,26 +1218,19 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			s64 *timeout,
>   			struct intel_rps_client *rps)
>   {
> -	struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	const bool irq_test_in_progress =
> -		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
>   	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
>   	DEFINE_WAIT(wait);
> -	unsigned long timeout_expire;
> +	unsigned long timeout_remain;
>   	s64 before, now;
>   	int ret;
>
> -	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
> -
>   	if (list_empty(&req->list))
>   		return 0;
>
>   	if (i915_gem_request_completed(req, true))
>   		return 0;
>
> -	timeout_expire = 0;
> +	timeout_remain = 0;
>   	if (timeout) {
>   		if (WARN_ON(*timeout < 0))
>   			return -EINVAL;
> @@ -1259,43 +1238,28 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   		if (*timeout == 0)
>   			return -ETIME;
>
> -		timeout_expire = jiffies + nsecs_to_jiffies_timeout(*timeout);
> +		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
>   	}
>
> -	if (INTEL_INFO(dev_priv)->gen >= 6)
> -		gen6_rps_boost(dev_priv, rps, req->emitted_jiffies);
>
>   	/* Record current time in case interrupted by signal, or wedged */
>   	trace_i915_gem_request_wait_begin(req);
>   	before = ktime_get_raw_ns();
>
> -	/* Optimistic spin for the next jiffie before touching IRQs */
> -	ret = __i915_spin_request(req, state);
> -	if (ret == 0)
> -		goto out;
> +	if (INTEL_INFO(req->i915)->gen >= 6)
> +		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
>
> -	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> -		ret = -ENODEV;
> -		goto out;
> +	/* Optimistic spin for the next jiffie before touching IRQs */
> +	if (intel_breadcrumbs_add_waiter(req)) {
> +		ret = __i915_spin_request(req, state);
> +		if (ret == 0)
> +			goto out;
>   	}
>
>   	for (;;) {
> -		struct timer_list timer;
> -
> -		prepare_to_wait(&ring->irq_queue, &wait, state);
> +		prepare_to_wait(&req->wait, &wait, state);
>
> -		/* We need to check whether any gpu reset happened in between
> -		 * the caller grabbing the seqno and now ... */
> -		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> -			/* As we do not requeue the request over a GPU reset,
> -			 * if one does occur we know that the request is
> -			 * effectively complete.
> -			 */
> -			ret = 0;
> -			break;
> -		}
> -
> -		if (i915_gem_request_completed(req, false)) {
> +		if (i915_gem_request_completed(req, true) ||

Why it is OK to change the lazy coherency mode here?

> +		    req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) {

It looks like it would be valuable to preserve the comment about no 
re-queuing etc on GPU reset.

>   			ret = 0;
>   			break;
>   		}
> @@ -1305,36 +1269,19 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			break;
>   		}
>
> -		if (timeout && time_after_eq(jiffies, timeout_expire)) {
> -			ret = -ETIME;
> -			break;
> -		}
> -
> -		i915_queue_hangcheck(dev_priv);
> -
> -		timer.function = NULL;
> -		if (timeout || missed_irq(dev_priv, ring)) {
> -			unsigned long expire;
> -
> -			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> -			expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
> -			mod_timer(&timer, expire);
> -		}
> -
> -		io_schedule();
> -
> -		if (timer.function) {
> -			del_singleshot_timer_sync(&timer);
> -			destroy_timer_on_stack(&timer);
> -		}
> +		if (timeout) {
> +			timeout_remain = io_schedule_timeout(timeout_remain);
> +			if (timeout_remain == 0) {
> +				ret = -ETIME;
> +				break;
> +			}
> +		} else
> +			io_schedule();

Could set timeout_remain to that MAX value when timeout == NULL and have 
a single call site to io_schedule_timeout and less indentation. It 
doesn't matter hugely, maybe it would be a bit easier to read.

>   	}
> -	if (!irq_test_in_progress)
> -		ring->irq_put(ring);
> -
> -	finish_wait(&ring->irq_queue, &wait);
> -
> +	finish_wait(&req->wait, &wait);
>   out:
>   	now = ktime_get_raw_ns();
> +	intel_breadcrumbs_remove_waiter(req);
>   	trace_i915_gem_request_wait_end(req);
>
>   	if (timeout) {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 06ca4082735b..d1df405b905c 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -453,10 +453,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>   				   dev_priv->ring[i].name,
>   				   error->ring[i].num_requests);
>   			for (j = 0; j < error->ring[i].num_requests; j++) {
> -				err_printf(m, "  seqno 0x%08x, emitted %ld, tail 0x%08x\n",
> +				err_printf(m, "  seqno 0x%08x, emitted %ld, tail 0x%08x, waiters? %d\n",
>   					   error->ring[i].requests[j].seqno,
>   					   error->ring[i].requests[j].jiffies,
> -					   error->ring[i].requests[j].tail);
> +					   error->ring[i].requests[j].tail,
> +					   error->ring[i].requests[j].waiters);
>   			}
>   		}
>
> @@ -900,7 +901,7 @@ static void i915_record_ring_state(struct drm_device *dev,
>   		ering->instdone = I915_READ(GEN2_INSTDONE);
>   	}
>
> -	ering->waiting = waitqueue_active(&ring->irq_queue);
> +	ering->waiting = READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first);

What does the READ_ONCE do here?

>   	ering->instpm = I915_READ(RING_INSTPM(ring->mmio_base));
>   	ering->seqno = ring->get_seqno(ring, false);
>   	ering->acthd = intel_ring_get_active_head(ring);
> @@ -1105,6 +1106,7 @@ static void i915_gem_record_rings(struct drm_device *dev,
>   			erq->seqno = request->seqno;
>   			erq->jiffies = request->emitted_jiffies;
>   			erq->tail = request->postfix;
> +			erq->waiters = request->irq_count;
>   		}
>   	}
>   }
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 28c0329f8281..0c2390476bdb 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -996,12 +996,11 @@ static void ironlake_rps_change_irq_handler(struct drm_device *dev)
>
>   static void notify_ring(struct intel_engine_cs *ring)
>   {
> -	if (!intel_ring_initialized(ring))
> +	if (ring->i915 == NULL)
>   		return;
>
>   	trace_i915_gem_request_notify(ring);
> -
> -	wake_up_all(&ring->irq_queue);
> +	wake_up_process(ring->i915->breadcrumbs.task);

For a later extension:

How would you view, some time in the future, adding a bool return to 
ring->put_irq() which would say to the thread that it failed to disable 
interrupts?

In this case thread would exit and would need to be restarted when the 
next waiter queues up. Possibly extending the idle->put_irq->exit 
durations as well then.

>   }
>
>   static void vlv_c0_read(struct drm_i915_private *dev_priv,
> @@ -2399,9 +2398,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>   static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>   			       bool reset_completed)
>   {
> -	struct intel_engine_cs *ring;
> -	int i;
> -
>   	/*
>   	 * Notify all waiters for GPU completion events that reset state has
>   	 * been changed, and that they need to restart their wait after
> @@ -2410,8 +2406,7 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>   	 */
>
>   	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
> -	for_each_ring(ring, dev_priv, i)
> -		wake_up_all(&ring->irq_queue);
> +	wake_up_process(dev_priv->breadcrumbs.task);
>
>   	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
>   	wake_up_all(&dev_priv->pending_flip_queue);
> @@ -2986,16 +2981,16 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   			if (ring_idle(ring, seqno)) {
>   				ring->hangcheck.action = HANGCHECK_IDLE;
>
> -				if (waitqueue_active(&ring->irq_queue)) {
> +				if (READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first)) {

READ_ONCE is again strange, but other than that I don't know the 
hangcheck code to really evaulate it.

Perhaps this also means this line needs a comment describing what 
condition/state is actually approximating with the check.

>   					/* Issue a wake-up to catch stuck h/w. */
>   					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> -						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
> +						if (!test_bit(ring->id, &dev_priv->gpu_error.test_irq_rings))
>   							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
>   								  ring->name);
>   						else
>   							DRM_INFO("Fake missed irq on %s\n",
>   								 ring->name);
> -						wake_up_all(&ring->irq_queue);
> +						wake_up_process(dev_priv->breadcrumbs.task);
>   					}
>   					/* Safeguard against driver failure */
>   					ring->hangcheck.score += BUSY;
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> new file mode 100644
> index 000000000000..0a18405c8689
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -0,0 +1,253 @@
> +/*
> + * Copyright © 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/kthread.h>
> +
> +#include "i915_drv.h"
> +
> +static bool __irq_enable(struct intel_engine_cs *engine)
> +{
> +	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
> +		return false;
> +
> +	if (!intel_irqs_enabled(engine->i915))
> +		return false;
> +
> +	return engine->irq_get(engine);
> +}
> +
> +static struct drm_i915_gem_request *to_request(struct rb_node *rb)
> +{
> +	if (rb == NULL)
> +		return NULL;
> +
> +	return rb_entry(rb, struct drm_i915_gem_request, irq_node);
> +}
> +
> +/*
> + * intel_breadcrumbs_irq() acts as the bottom-half for the user interrupt,
> + * which we use as the breadcrumb after every request. In order to scale
> + * to many concurrent waiters, we delegate the task of reading the coherent
> + * seqno to this bottom-half. (Otherwise every waiter is woken after each
> + * interrupt and they all attempt to perform the heavyweight coherent
> + * seqno check.) Individual clients register with the bottom-half when
> + * waiting on a request, and are then woken when the bottom-half notices
> + * their seqno is complete. This incurs an extra context switch from the
> + * interrupt to the client in the uncontended case.
> + */
> +static int intel_breadcrumbs_irq(void *data)
> +{
> +	struct drm_i915_private *i915 = data;
> +	struct intel_breadcrumbs *b = &i915->breadcrumbs;
> +	unsigned irq_get = 0;
> +	unsigned irq_enabled = 0;
> +	int i;
> +
> +	while (!kthread_should_stop()) {
> +		unsigned reset_counter = i915_reset_counter(&i915->gpu_error);
> +		unsigned long timeout_jiffies;
> +		bool idling = false;
> +
> +		/* On every tick, walk the seqno-ordered list of requests
> +		 * and for each retired request wakeup its clients. If we
> +		 * find an unfinished request, go back to sleep.
> +		 */

s/tick/loop/ ?

And if we don't find and unfinished request still go back to sleep. :)

> +		set_current_state(TASK_INTERRUPTIBLE);
> +
> +		/* Note carefully that we do not hold a reference for the
> +		 * requests on this list. Therefore we only inspect them
> +		 * whilst holding the spinlock to ensure that they are not
> +		 * freed in the meantime, and the client must remove the
> +		 * request from the list if it is interrupted (before it
> +		 * itself releases its reference).
> +		 */
> +		spin_lock(&b->lock);

This lock is more per engine that global in its nature unless you think 
it was more efficient to do fewer lock atomics vs potential overlap of 
waiter activity per engines?

Would need a new lock for req->irq_count, per request. So 
req->lock/req->irq_count and be->lock for the tree operations.

Thread would only need to deal with the later. Feels like it would work, 
just not sure if it is worth it.

> +		for (i = 0; i < I915_NUM_RINGS; i++) {
> +			struct intel_engine_cs *engine = &i915->ring[i];
> +			struct intel_breadcrumbs_engine *be = &b->engine[i];
> +			struct drm_i915_gem_request *request = be->first;
> +
> +			if (request == NULL) {
> +				if ((irq_get & (1 << i))) {
> +					if (irq_enabled & (1 << i)) {
> +						engine->irq_put(engine);
> +						irq_enabled &= ~ (1 << i);
> +					}
> +					intel_runtime_pm_put(i915);
> +					irq_get &= ~(1 << i);
> +				}
> +				continue;
> +			}
> +
> +			if ((irq_get & (1 << i)) == 0) {
> +				intel_runtime_pm_get(i915);
> +				irq_enabled |= __irq_enable(engine) << i;
> +				irq_get |= 1 << i;
> +			}

irq_get and irq_enabled are creating a little bit of a headache for me. 
And that would probably mean a comment is be in order to explain what 
they are for and how they work.

Like this I don't see the need for irq_get to persist across loops?

irq_get looks like "try to get these", irq_enabled is "these ones I 
got". Apart for the weird usage of it to balance the runtime pm gets and 
puts.

A bit confusing as I said.

> +			do {
> +				struct rb_node *next;
> +
> +				if (request->reset_counter == reset_counter &&
> +				    !i915_gem_request_completed(request, false))
> +					break;
> +
> +				next = rb_next(&request->irq_node);
> +				rb_erase(&request->irq_node, &be->requests);
> +				RB_CLEAR_NODE(&request->irq_node);
> +
> +				wake_up_all(&request->wait);
> +
> +				request = to_request(next);
> +			} while (request);
> +			be->first = request;
> +			idling |= request == NULL;
> +
> +			/* Make sure the hangcheck timer is armed in case
> +			 * the GPU hangs and we are never woken up.
> +			 */
> +			if (request)
> +				i915_queue_hangcheck(i915);
> +		}
> +		spin_unlock(&b->lock);
> +
> +		/* If we don't have interrupts available (either we have
> +		 * detected the hardware is missing interrupts or the
> +		 * interrupt delivery was disabled by the user), wake up
> +		 * every jiffie to check for request completion.
> +		 *
> +		 * If we have processed all requests for one engine, we
> +		 * also wish to wake up in a jiffie to turn off the
> +		 * breadcrumb interrupt for that engine. We delay
> +		 * switching off the interrupt in order to allow another
> +		 * waiter to start without incurring additional latency
> +		 * enabling the interrupt.
> +		 */
> +		timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
> +		if (idling || i915->gpu_error.missed_irq_rings & irq_enabled)
> +			timeout_jiffies = 1;
> +
> +		/* Unlike the individual clients, we do not want this
> +		 * background thread to contribute to the system load,
> +		 * i.e. we do not want to use io_schedule() here.
> +		 */
> +		schedule_timeout(timeout_jiffies);
> +	}
> +
> +	for (i = 0; i < I915_NUM_RINGS; i++) {
> +		if ((irq_get & (1 << i))) {
> +			if (irq_enabled & (1 << i)) {
> +				struct intel_engine_cs *engine = &i915->ring[i];
> +				engine->irq_put(engine);
> +			}
> +			intel_runtime_pm_put(i915);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request)
> +{
> +	struct intel_breadcrumbs *b = &request->i915->breadcrumbs;
> +	bool first = false;
> +
> +	spin_lock(&b->lock);
> +	if (request->irq_count++ == 0) {
> +		struct intel_breadcrumbs_engine *be =
> +			&b->engine[request->ring->id];
> +		struct rb_node **p, *parent;
> +
> +		if (be->first == NULL)
> +			wake_up_process(b->task);

With a global b->lock it makes no difference but in the logical sense 
this would usually go last, after the request has been inserted into the 
tree and waiter initialized.

> +
> +		init_waitqueue_head(&request->wait);
> +
> +		first = true;
> +		parent = NULL;
> +		p = &be->requests.rb_node;
> +		while (*p) {
> +			struct drm_i915_gem_request *__req;
> +
> +			parent = *p;
> +			__req = rb_entry(parent, typeof(*__req), irq_node);
> +
> +			if (i915_seqno_passed(request->seqno, __req->seqno)) {
> +				p = &parent->rb_right;
> +				first = false;
> +			} else
> +				p = &parent->rb_left;
> +		}
> +		if (first)
> +			be->first = request;
> +
> +		rb_link_node(&request->irq_node, parent, p);
> +		rb_insert_color(&request->irq_node, &be->requests);
> +	}
> +	spin_unlock(&b->lock);
> +
> +	return first;
> +}
> +
> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request)
> +{
> +	struct intel_breadcrumbs *b = &request->i915->breadcrumbs;
> +
> +	spin_lock(&b->lock);
> +	if (--request->irq_count == 0 && !RB_EMPTY_NODE(&request->irq_node)) {
> +		struct intel_breadcrumbs_engine *be =
> +			&b->engine[request->ring->id];
> +		if (be->first == request)
> +			be->first = to_request(rb_next(&request->irq_node));
> +		rb_erase(&request->irq_node, &be->requests);
> +	}
> +	spin_unlock(&b->lock);
> +}
> +
> +int intel_breadcrumbs_init(struct drm_i915_private *i915)
> +{
> +	struct intel_breadcrumbs *b = &i915->breadcrumbs;
> +	struct task_struct *task;
> +
> +	spin_lock_init(&b->lock);
> +
> +	task = kthread_run(intel_breadcrumbs_irq, i915, "irq/i915");
> +	if (IS_ERR(task))
> +		return PTR_ERR(task);
> +
> +	b->task = task;
> +
> +	return 0;
> +}
> +
> +void intel_breadcrumbs_fini(struct drm_i915_private *i915)
> +{
> +	struct intel_breadcrumbs *b = &i915->breadcrumbs;
> +
> +	if (b->task == NULL)
> +		return;
> +
> +	kthread_stop(b->task);
> +	b->task = NULL;
> +}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index b40ffc3607e7..604c081b0a0f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1922,10 +1922,10 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>   	ring->buffer = NULL;
>
>   	ring->dev = dev;
> +	ring->i915 = to_i915(dev);
>   	INIT_LIST_HEAD(&ring->active_list);
>   	INIT_LIST_HEAD(&ring->request_list);
>   	i915_gem_batch_pool_init(dev, &ring->batch_pool);
> -	init_waitqueue_head(&ring->irq_queue);
>
>   	INIT_LIST_HEAD(&ring->buffers);
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 511efe556d73..5a5a7195dd54 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2157,6 +2157,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	WARN_ON(ring->buffer);
>
>   	ring->dev = dev;
> +	ring->i915 = to_i915(dev);
>   	INIT_LIST_HEAD(&ring->active_list);
>   	INIT_LIST_HEAD(&ring->request_list);
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> @@ -2164,8 +2165,6 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	i915_gem_batch_pool_init(dev, &ring->batch_pool);
>   	memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
>
> -	init_waitqueue_head(&ring->irq_queue);
> -
>   	ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
>   	if (IS_ERR(ringbuf))
>   		return PTR_ERR(ringbuf);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb206151d..49b5bded2767 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -157,6 +157,7 @@ struct  intel_engine_cs {
>   #define LAST_USER_RING (VECS + 1)
>   	u32		mmio_base;
>   	struct		drm_device *dev;
> +	struct drm_i915_private *i915;
>   	struct intel_ringbuffer *buffer;
>   	struct list_head buffers;
>
> @@ -303,8 +304,6 @@ struct  intel_engine_cs {
>
>   	bool gpu_caches_dirty;
>
> -	wait_queue_head_t irq_queue;
> -
>   	struct intel_context *default_context;
>   	struct intel_context *last_context;
>
>

Regards,

Tvrtko
Hi,

On 30/11/15 10:53, Chris Wilson wrote:
> On Sun, Nov 29, 2015 at 08:48:06AM +0000, Chris Wilson wrote:
>> +	/* Optimistic spin for the next jiffie before touching IRQs */
>> +	if (intel_breadcrumbs_add_waiter(req)) {
>> +		ret = __i915_spin_request(req, state);
>> +		if (ret == 0)
>> +			goto out;
>
> There are a couple of interesting side-effects here.  As we know start
> up the irq in parallel and keep it running for longer, irq/i915 now
> consumes a lot of CPU time (like 50-100%!) for synchronous batches, but
> doesn't seem to interfere with latency (as spin_request is still nicely
> running and catching the request completion). That should also still
> work nicely on single cpu machines as the irq enabling should not
> preempt us.  The second interesting side-effect is that the synchronous
> loads that regressed with a 2us spin-request timeout are now happy again
> at 2us. Also with the active-request and the can-spin check from
> add_waiter, running a low fps game with a compositor is not burning the
> CPU with any spinning.

Interesting? :) Sounds bad the way you presented it.

Why and where is the thread burning so much CPU? Would per engine req 
tree locks help?

Is this new CPU time or the one which would previously be accounted 
against each waiter, polling in wait request?

Regards,

Tvrtko
On Mon, Nov 30, 2015 at 12:05:50PM +0000, Tvrtko Ursulin wrote:
> >+	struct intel_breadcrumbs {
> >+		spinlock_t lock; /* protects the per-engine request list */
> 
> s/list/tree/

list. We use the tree as a list!

> >+		struct task_struct *task; /* bh for all user interrupts */
> >+		struct intel_breadcrumbs_engine {
> >+			struct rb_root requests; /* sorted by retirement */
> >+			struct drm_i915_gem_request *first;
> 
> /* First request in the tree to be completed next by the hw. */
> 
> ?

/* first */ ?
 
> >+		} engine[I915_NUM_RINGS];
> >+	} breadcrumbs;
> >+
> >  	struct i915_virtual_gpu vgpu;
> >
> >  	struct intel_guc guc;
> >@@ -2228,6 +2250,11 @@ struct drm_i915_gem_request {
> >  	/** global list entry for this request */
> >  	struct list_head list;
> >
> >+	/** List of clients waiting for completion of this request */
> >+	wait_queue_head_t wait;
> >+	struct rb_node irq_node;
> >+	unsigned irq_count;
> 
> To me irq prefix does not fit here that well.
> 
> req_tree_node?
> 
> waiter_count, waiters?

waiters, breadcrumb_node, waiters_count ?

> >  	for (;;) {
> >-		struct timer_list timer;
> >-
> >-		prepare_to_wait(&ring->irq_queue, &wait, state);
> >+		prepare_to_wait(&req->wait, &wait, state);
> >
> >-		/* We need to check whether any gpu reset happened in between
> >-		 * the caller grabbing the seqno and now ... */
> >-		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> >-			/* As we do not requeue the request over a GPU reset,
> >-			 * if one does occur we know that the request is
> >-			 * effectively complete.
> >-			 */
> >-			ret = 0;
> >-			break;
> >-		}
> >-
> >-		if (i915_gem_request_completed(req, false)) {
> >+		if (i915_gem_request_completed(req, true) ||
> 
> Why it is OK to change the lazy coherency mode here?

We bang on the coherency after the IRQ. Here, we are woken by the IRQ
waiter so we know that the IRQ/seqno coherency is fine and we can just
check the CPU cache.
 
> >+		    req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) {
> 
> It looks like it would be valuable to preserve the comment about no
> re-queuing etc on GPU reset.

I can drop it from the previous patch :)

> >+		if (timeout) {
> >+			timeout_remain = io_schedule_timeout(timeout_remain);
> >+			if (timeout_remain == 0) {
> >+				ret = -ETIME;
> >+				break;
> >+			}
> >+		} else
> >+			io_schedule();
> 
> Could set timeout_remain to that MAX value when timeout == NULL and
> have a single call site to io_schedule_timeout and less indentation.
> It doesn't matter hugely, maybe it would be a bit easier to read.

Done. io_schedule() calls io_schedule_timeout(), whereas
schedule_timeout() calls schedule()!

> >@@ -900,7 +901,7 @@ static void i915_record_ring_state(struct drm_device *dev,
> >  		ering->instdone = I915_READ(GEN2_INSTDONE);
> >  	}
> >
> >-	ering->waiting = waitqueue_active(&ring->irq_queue);
> >+	ering->waiting = READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first);
> 
> What does the READ_ONCE do here?

This element is protected by the breadcrumbs spinlock, the READ_ONCE is
documentation that we don't care about the lock here.

> >  static void notify_ring(struct intel_engine_cs *ring)
> >  {
> >-	if (!intel_ring_initialized(ring))
> >+	if (ring->i915 == NULL)
> >  		return;
> >
> >  	trace_i915_gem_request_notify(ring);
> >-
> >-	wake_up_all(&ring->irq_queue);
> >+	wake_up_process(ring->i915->breadcrumbs.task);
> 
> For a later extension:
> 
> How would you view, some time in the future, adding a bool return to
> ring->put_irq() which would say to the thread that it failed to
> disable interrupts?
> 
> In this case thread would exit and would need to be restarted when
> the next waiter queues up. Possibly extending the
> idle->put_irq->exit durations as well then.

Honestly, not keen on the idea that the lowlevel put_irq can fail (makes
cleanup much more fraught). I don't see what improvement you intend
here, maybe clarifying that would help explain what you mean.

> >@@ -2986,16 +2981,16 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >  			if (ring_idle(ring, seqno)) {
> >  				ring->hangcheck.action = HANGCHECK_IDLE;
> >
> >-				if (waitqueue_active(&ring->irq_queue)) {
> >+				if (READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first)) {
> 
> READ_ONCE is again strange, but other than that I don't know the
> hangcheck code to really evaulate it.
> 
> Perhaps this also means this line needs a comment describing what
> condition/state is actually approximating with the check.

All we are doing is asking if anyone is waiting on the GPU, because the
GPU has finished all of its work. If so, the IRQ handling is suspect
(modulo the pre-existing race condition clearly earmarked by READ_ONCE).

> >+	while (!kthread_should_stop()) {
> >+		unsigned reset_counter = i915_reset_counter(&i915->gpu_error);
> >+		unsigned long timeout_jiffies;
> >+		bool idling = false;
> >+
> >+		/* On every tick, walk the seqno-ordered list of requests
> >+		 * and for each retired request wakeup its clients. If we
> >+		 * find an unfinished request, go back to sleep.
> >+		 */
> 
> s/tick/loop/ ?
s/tick/iteration/ I'm sticking with tick
 
> And if we don't find and unfinished request still go back to sleep. :)

Ok.
 
> >+		set_current_state(TASK_INTERRUPTIBLE);
> >+
> >+		/* Note carefully that we do not hold a reference for the
> >+		 * requests on this list. Therefore we only inspect them
> >+		 * whilst holding the spinlock to ensure that they are not
> >+		 * freed in the meantime, and the client must remove the
> >+		 * request from the list if it is interrupted (before it
> >+		 * itself releases its reference).
> >+		 */
> >+		spin_lock(&b->lock);
> 
> This lock is more per engine that global in its nature unless you
> think it was more efficient to do fewer lock atomics vs potential
> overlap of waiter activity per engines?

Indeed. I decided not to care about making it per-engine.
 
> Would need a new lock for req->irq_count, per request. So
> req->lock/req->irq_count and be->lock for the tree operations.

Why? A request is tied to an individual engine, therefore we can use
that breadcrumb_engine.lock for the request.
 
> Thread would only need to deal with the later. Feels like it would
> work, just not sure if it is worth it.

That was my feeling as well. Not sure if we will have huge contention
that would be solved with per-engine spinlocks, whilst we still have
struct_mutex.

> >+		for (i = 0; i < I915_NUM_RINGS; i++) {
> >+			struct intel_engine_cs *engine = &i915->ring[i];
> >+			struct intel_breadcrumbs_engine *be = &b->engine[i];
> >+			struct drm_i915_gem_request *request = be->first;
> >+
> >+			if (request == NULL) {
> >+				if ((irq_get & (1 << i))) {
> >+					if (irq_enabled & (1 << i)) {
> >+						engine->irq_put(engine);
> >+						irq_enabled &= ~ (1 << i);
> >+					}
> >+					intel_runtime_pm_put(i915);
> >+					irq_get &= ~(1 << i);
> >+				}
> >+				continue;
> >+			}
> >+
> >+			if ((irq_get & (1 << i)) == 0) {
> >+				intel_runtime_pm_get(i915);
> >+				irq_enabled |= __irq_enable(engine) << i;
> >+				irq_get |= 1 << i;
> >+			}
> 
> irq_get and irq_enabled are creating a little bit of a headache for
> me. And that would probably mean a comment is be in order to explain
> what they are for and how they work.
> 
> Like this I don't see the need for irq_get to persist across loops?

Because getting the irq is quite slow, we add a jiffie shadow.
 
> irq_get looks like "try to get these", irq_enabled is "these ones I
> got". Apart for the weird usage of it to balance the runtime pm gets
> and puts.
> 
> A bit confusing as I said.

I don't like the names, but haven't thought of anything better.
 
> >+bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request)
> >+{
> >+	struct intel_breadcrumbs *b = &request->i915->breadcrumbs;
> >+	bool first = false;
> >+
> >+	spin_lock(&b->lock);
> >+	if (request->irq_count++ == 0) {
> >+		struct intel_breadcrumbs_engine *be =
> >+			&b->engine[request->ring->id];
> >+		struct rb_node **p, *parent;
> >+
> >+		if (be->first == NULL)
> >+			wake_up_process(b->task);
> 
> With a global b->lock

or even per engine

> it makes no difference but in the logical
> sense this would usually go last, after the request has been
> inserted into the tree and waiter initialized.

Except that the code is simpler with kicking the task first.

Since we both thought it might make sense with a per-engine lock, let's
go with it. Maybe one day it will pay off.
-Chris
On Mon, Nov 30, 2015 at 12:09:30PM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 30/11/15 10:53, Chris Wilson wrote:
> >On Sun, Nov 29, 2015 at 08:48:06AM +0000, Chris Wilson wrote:
> >>+	/* Optimistic spin for the next jiffie before touching IRQs */
> >>+	if (intel_breadcrumbs_add_waiter(req)) {
> >>+		ret = __i915_spin_request(req, state);
> >>+		if (ret == 0)
> >>+			goto out;
> >
> >There are a couple of interesting side-effects here.  As we know start
> >up the irq in parallel and keep it running for longer, irq/i915 now
> >consumes a lot of CPU time (like 50-100%!) for synchronous batches, but
> >doesn't seem to interfere with latency (as spin_request is still nicely
> >running and catching the request completion). That should also still
> >work nicely on single cpu machines as the irq enabling should not
> >preempt us.  The second interesting side-effect is that the synchronous
> >loads that regressed with a 2us spin-request timeout are now happy again
> >at 2us. Also with the active-request and the can-spin check from
> >add_waiter, running a low fps game with a compositor is not burning the
> >CPU with any spinning.
> 
> Interesting? :) Sounds bad the way you presented it.
> 
> Why and where is the thread burning so much CPU? Would per engine
> req tree locks help?

Just simply being woken up after every batch and checking the seqno is
that expensive. Almost as expensive as the IRQ handler itself! (I expect
top is adding the IRQ handler time to the irq/i915 in this case, perf
says that more time is spent in the IRQ than the bottom-half.) Almost
all waiters will be on the same engine, so I don't expect finer grained
spinlocks to be hugely important.
 
> Is this new CPU time or the one which would previously be accounted
> against each waiter, polling in wait request?

This is CPU time that used to be absorbed in i915_wait_request(),
currently hidden by i915_spin_request(). It is the reason why having
every waiter doing the check after every interrupt is such a big issue
for some workloads.
-Chris
On 30/11/15 12:30, Chris Wilson wrote:
> On Mon, Nov 30, 2015 at 12:05:50PM +0000, Tvrtko Ursulin wrote:
>>> +	struct intel_breadcrumbs {
>>> +		spinlock_t lock; /* protects the per-engine request list */
>>
>> s/list/tree/
>
> list. We use the tree as a list!

Maybe but it confuses the reader a bit who goes and looks for some list 
then.

Related, when I was playing with this I was piggy backing on the 
existing per engine request list.

If they are seqno ordered there, and I think they are, then the waker 
thread just traverses it in order until hitting the not completed one, 
waking up every req->wait_queue as it goes.

In this case it doesn't matter that the waiters come in in 
indeterministic order so we don't need a (new) tree.

But the downside is the thread needs struct mutex. And that the 
retirement from that list is so lazy..

>>> +		struct task_struct *task; /* bh for all user interrupts */
>>> +		struct intel_breadcrumbs_engine {
>>> +			struct rb_root requests; /* sorted by retirement */
>>> +			struct drm_i915_gem_request *first;
>>
>> /* First request in the tree to be completed next by the hw. */
>>
>> ?
>
> /* first */ ?

Funny? :) First what? Why make the reader reverse engineer it?

>>> +		} engine[I915_NUM_RINGS];
>>> +	} breadcrumbs;
>>> +
>>>   	struct i915_virtual_gpu vgpu;
>>>
>>>   	struct intel_guc guc;
>>> @@ -2228,6 +2250,11 @@ struct drm_i915_gem_request {
>>>   	/** global list entry for this request */
>>>   	struct list_head list;
>>>
>>> +	/** List of clients waiting for completion of this request */
>>> +	wait_queue_head_t wait;
>>> +	struct rb_node irq_node;
>>> +	unsigned irq_count;
>>
>> To me irq prefix does not fit here that well.
>>
>> req_tree_node?
>>
>> waiter_count, waiters?
>
> waiters, breadcrumb_node, waiters_count ?

Deal.

>>>   	for (;;) {
>>> -		struct timer_list timer;
>>> -
>>> -		prepare_to_wait(&ring->irq_queue, &wait, state);
>>> +		prepare_to_wait(&req->wait, &wait, state);
>>>
>>> -		/* We need to check whether any gpu reset happened in between
>>> -		 * the caller grabbing the seqno and now ... */
>>> -		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
>>> -			/* As we do not requeue the request over a GPU reset,
>>> -			 * if one does occur we know that the request is
>>> -			 * effectively complete.
>>> -			 */
>>> -			ret = 0;
>>> -			break;
>>> -		}
>>> -
>>> -		if (i915_gem_request_completed(req, false)) {
>>> +		if (i915_gem_request_completed(req, true) ||
>>
>> Why it is OK to change the lazy coherency mode here?
>
> We bang on the coherency after the IRQ. Here, we are woken by the IRQ
> waiter so we know that the IRQ/seqno coherency is fine and we can just
> check the CPU cache.

Makes sense, put into comment? There is space now since function is much 
shorter. :)

>>> +		    req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) {
>>
>> It looks like it would be valuable to preserve the comment about no
>> re-queuing etc on GPU reset.
>
> I can drop it from the previous patch :)

No it is useful!

>>> +		if (timeout) {
>>> +			timeout_remain = io_schedule_timeout(timeout_remain);
>>> +			if (timeout_remain == 0) {
>>> +				ret = -ETIME;
>>> +				break;
>>> +			}
>>> +		} else
>>> +			io_schedule();
>>
>> Could set timeout_remain to that MAX value when timeout == NULL and
>> have a single call site to io_schedule_timeout and less indentation.
>> It doesn't matter hugely, maybe it would be a bit easier to read.
>
> Done. io_schedule() calls io_schedule_timeout(), whereas
> schedule_timeout() calls schedule()!

Who disagreed with that? :) Ok!

>>> @@ -900,7 +901,7 @@ static void i915_record_ring_state(struct drm_device *dev,
>>>   		ering->instdone = I915_READ(GEN2_INSTDONE);
>>>   	}
>>>
>>> -	ering->waiting = waitqueue_active(&ring->irq_queue);
>>> +	ering->waiting = READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first);
>>
>> What does the READ_ONCE do here?
>
> This element is protected by the breadcrumbs spinlock, the READ_ONCE is
> documentation that we don't care about the lock here.

Hm I find it confusing but OK.

>>>   static void notify_ring(struct intel_engine_cs *ring)
>>>   {
>>> -	if (!intel_ring_initialized(ring))
>>> +	if (ring->i915 == NULL)
>>>   		return;
>>>
>>>   	trace_i915_gem_request_notify(ring);
>>> -
>>> -	wake_up_all(&ring->irq_queue);
>>> +	wake_up_process(ring->i915->breadcrumbs.task);
>>
>> For a later extension:
>>
>> How would you view, some time in the future, adding a bool return to
>> ring->put_irq() which would say to the thread that it failed to
>> disable interrupts?
>>
>> In this case thread would exit and would need to be restarted when
>> the next waiter queues up. Possibly extending the
>> idle->put_irq->exit durations as well then.
>
> Honestly, not keen on the idea that the lowlevel put_irq can fail (makes
> cleanup much more fraught). I don't see what improvement you intend
> here, maybe clarifying that would help explain what you mean.

Don't know, maybe reflecting the fact it wasn't the last put_irq call so 
let the caller handle it if they want. We can leave this discussion for 
the future.

>>> @@ -2986,16 +2981,16 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>>>   			if (ring_idle(ring, seqno)) {
>>>   				ring->hangcheck.action = HANGCHECK_IDLE;
>>>
>>> -				if (waitqueue_active(&ring->irq_queue)) {
>>> +				if (READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first)) {
>>
>> READ_ONCE is again strange, but other than that I don't know the
>> hangcheck code to really evaulate it.
>>
>> Perhaps this also means this line needs a comment describing what
>> condition/state is actually approximating with the check.
>
> All we are doing is asking if anyone is waiting on the GPU, because the
> GPU has finished all of its work. If so, the IRQ handling is suspect
> (modulo the pre-existing race condition clearly earmarked by READ_ONCE).

Cool, /* Check if someone is waiting on the GPU */ then would make me happy.

>>> +	while (!kthread_should_stop()) {
>>> +		unsigned reset_counter = i915_reset_counter(&i915->gpu_error);
>>> +		unsigned long timeout_jiffies;
>>> +		bool idling = false;
>>> +
>>> +		/* On every tick, walk the seqno-ordered list of requests
>>> +		 * and for each retired request wakeup its clients. If we
>>> +		 * find an unfinished request, go back to sleep.
>>> +		 */
>>
>> s/tick/loop/ ?
> s/tick/iteration/ I'm sticking with tick

Tick makes me tick of a scheduler tick so to me it is the worst of the 
three. Iteration sounds really good. Whether that will be a 
tick/jiffie/orsomething is definite a bit lower in the code.

>> And if we don't find and unfinished request still go back to sleep. :)
>
> Ok.
>
>>> +		set_current_state(TASK_INTERRUPTIBLE);
>>> +
>>> +		/* Note carefully that we do not hold a reference for the
>>> +		 * requests on this list. Therefore we only inspect them
>>> +		 * whilst holding the spinlock to ensure that they are not
>>> +		 * freed in the meantime, and the client must remove the
>>> +		 * request from the list if it is interrupted (before it
>>> +		 * itself releases its reference).
>>> +		 */
>>> +		spin_lock(&b->lock);
>>
>> This lock is more per engine that global in its nature unless you
>> think it was more efficient to do fewer lock atomics vs potential
>> overlap of waiter activity per engines?
>
> Indeed. I decided not to care about making it per-engine.
>
>> Would need a new lock for req->irq_count, per request. So
>> req->lock/req->irq_count and be->lock for the tree operations.
>
> Why? A request is tied to an individual engine, therefore we can use
> that breadcrumb_engine.lock for the request.

Just so 2nd+ waiters would not touch the per engine lock.

>> Thread would only need to deal with the later. Feels like it would
>> work, just not sure if it is worth it.
>
> That was my feeling as well. Not sure if we will have huge contention
> that would be solved with per-engine spinlocks, whilst we still have
> struct_mutex.
>
>>> +		for (i = 0; i < I915_NUM_RINGS; i++) {
>>> +			struct intel_engine_cs *engine = &i915->ring[i];
>>> +			struct intel_breadcrumbs_engine *be = &b->engine[i];
>>> +			struct drm_i915_gem_request *request = be->first;
>>> +
>>> +			if (request == NULL) {
>>> +				if ((irq_get & (1 << i))) {
>>> +					if (irq_enabled & (1 << i)) {
>>> +						engine->irq_put(engine);
>>> +						irq_enabled &= ~ (1 << i);
>>> +					}
>>> +					intel_runtime_pm_put(i915);
>>> +					irq_get &= ~(1 << i);
>>> +				}
>>> +				continue;
>>> +			}
>>> +
>>> +			if ((irq_get & (1 << i)) == 0) {
>>> +				intel_runtime_pm_get(i915);
>>> +				irq_enabled |= __irq_enable(engine) << i;
>>> +				irq_get |= 1 << i;
>>> +			}
>>
>> irq_get and irq_enabled are creating a little bit of a headache for
>> me. And that would probably mean a comment is be in order to explain
>> what they are for and how they work.
>>
>> Like this I don't see the need for irq_get to persist across loops?
>
> Because getting the irq is quite slow, we add a jiffie shadow.

But it is not a shadow in the sense of the same bitmask from the 
previous iteration. The bits are different depending on special cases in 
__irq_enable.

So it needs some comments I think.

>> irq_get looks like "try to get these", irq_enabled is "these ones I
>> got". Apart for the weird usage of it to balance the runtime pm gets
>> and puts.
>>
>> A bit confusing as I said.
>
> I don't like the names, but haven't thought of anything better.
>
>>> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request)
>>> +{
>>> +	struct intel_breadcrumbs *b = &request->i915->breadcrumbs;
>>> +	bool first = false;
>>> +
>>> +	spin_lock(&b->lock);
>>> +	if (request->irq_count++ == 0) {
>>> +		struct intel_breadcrumbs_engine *be =
>>> +			&b->engine[request->ring->id];
>>> +		struct rb_node **p, *parent;
>>> +
>>> +		if (be->first == NULL)
>>> +			wake_up_process(b->task);
>>
>> With a global b->lock
>
> or even per engine
>
>> it makes no difference but in the logical
>> sense this would usually go last, after the request has been
>> inserted into the tree and waiter initialized.
>
> Except that the code is simpler with kicking the task first.
>
> Since we both thought it might make sense with a per-engine lock, let's
> go with it. Maybe one day it will pay off.

Ok.

Regards,

Tvrtko
On 30/11/15 12:38, Chris Wilson wrote:
> On Mon, Nov 30, 2015 at 12:09:30PM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 30/11/15 10:53, Chris Wilson wrote:
>>> On Sun, Nov 29, 2015 at 08:48:06AM +0000, Chris Wilson wrote:
>>>> +	/* Optimistic spin for the next jiffie before touching IRQs */
>>>> +	if (intel_breadcrumbs_add_waiter(req)) {
>>>> +		ret = __i915_spin_request(req, state);
>>>> +		if (ret == 0)
>>>> +			goto out;
>>>
>>> There are a couple of interesting side-effects here.  As we know start
>>> up the irq in parallel and keep it running for longer, irq/i915 now
>>> consumes a lot of CPU time (like 50-100%!) for synchronous batches, but
>>> doesn't seem to interfere with latency (as spin_request is still nicely
>>> running and catching the request completion). That should also still
>>> work nicely on single cpu machines as the irq enabling should not
>>> preempt us.  The second interesting side-effect is that the synchronous
>>> loads that regressed with a 2us spin-request timeout are now happy again
>>> at 2us. Also with the active-request and the can-spin check from
>>> add_waiter, running a low fps game with a compositor is not burning the
>>> CPU with any spinning.
>>
>> Interesting? :) Sounds bad the way you presented it.
>>
>> Why and where is the thread burning so much CPU? Would per engine
>> req tree locks help?
>
> Just simply being woken up after every batch and checking the seqno is
> that expensive. Almost as expensive as the IRQ handler itself! (I expect
> top is adding the IRQ handler time to the irq/i915 in this case, perf
> says that more time is spent in the IRQ than the bottom-half.) Almost
> all waiters will be on the same engine, so I don't expect finer grained
> spinlocks to be hugely important.
>
>> Is this new CPU time or the one which would previously be accounted
>> against each waiter, polling in wait request?
>
> This is CPU time that used to be absorbed in i915_wait_request(),
> currently hidden by i915_spin_request(). It is the reason why having
> every waiter doing the check after every interrupt is such a big issue
> for some workloads.

So overall total CPU time is similar, just split differently?

Regards,

Tvrtko
On Mon, Nov 30, 2015 at 01:32:16PM +0000, Tvrtko Ursulin wrote:
> 
> On 30/11/15 12:30, Chris Wilson wrote:
> >On Mon, Nov 30, 2015 at 12:05:50PM +0000, Tvrtko Ursulin wrote:
> >>>+	struct intel_breadcrumbs {
> >>>+		spinlock_t lock; /* protects the per-engine request list */
> >>
> >>s/list/tree/
> >
> >list. We use the tree as a list!
> 
> Maybe but it confuses the reader a bit who goes and looks for some
> list then.
> 
> Related, when I was playing with this I was piggy backing on the
> existing per engine request list.

A separate list with separate locking only used when waiting, that's the
appeal for me.
 
> If they are seqno ordered there, and I think they are, then the
> waker thread just traverses it in order until hitting the not
> completed one, waking up every req->wait_queue as it goes.
>
> In this case it doesn't matter that the waiters come in in
> indeterministic order so we don't need a (new) tree.
> 
> But the downside is the thread needs struct mutex. And that the
> retirement from that list is so lazy..

And we can't even take struct_mutex!

> 
> >>>+		struct task_struct *task; /* bh for all user interrupts */
> >>>+		struct intel_breadcrumbs_engine {
> >>>+			struct rb_root requests; /* sorted by retirement */
> >>>+			struct drm_i915_gem_request *first;
> >>
> >>/* First request in the tree to be completed next by the hw. */
> >>
> >>?
> >
> >/* first */ ?
> 
> Funny? :) First what? Why make the reader reverse engineer it?

struct drm_i915_gem_request *first_waiter; ?

Please don't make me add /* ^^^ */

> >>For a later extension:
> >>
> >>How would you view, some time in the future, adding a bool return to
> >>ring->put_irq() which would say to the thread that it failed to
> >>disable interrupts?
> >>
> >>In this case thread would exit and would need to be restarted when
> >>the next waiter queues up. Possibly extending the
> >>idle->put_irq->exit durations as well then.
> >
> >Honestly, not keen on the idea that the lowlevel put_irq can fail (makes
> >cleanup much more fraught). I don't see what improvement you intend
> >here, maybe clarifying that would help explain what you mean.
> 
> Don't know, maybe reflecting the fact it wasn't the last put_irq
> call so let the caller handle it if they want. We can leave this
> discussion for the future.

Hmm. The only other use is the trace_irq. It would be nice to eliminate
that and the ring->irq_count.
> 
> >>>@@ -2986,16 +2981,16 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> >>>  			if (ring_idle(ring, seqno)) {
> >>>  				ring->hangcheck.action = HANGCHECK_IDLE;
> >>>
> >>>-				if (waitqueue_active(&ring->irq_queue)) {
> >>>+				if (READ_ONCE(dev_priv->breadcrumbs.engine[ring->id].first)) {
> >>
> >>READ_ONCE is again strange, but other than that I don't know the
> >>hangcheck code to really evaulate it.
> >>
> >>Perhaps this also means this line needs a comment describing what
> >>condition/state is actually approximating with the check.
> >
> >All we are doing is asking if anyone is waiting on the GPU, because the
> >GPU has finished all of its work. If so, the IRQ handling is suspect
> >(modulo the pre-existing race condition clearly earmarked by READ_ONCE).
> 
> Cool, /* Check if someone is waiting on the GPU */ then would make me happy.

+static bool any_waiters_on_engine(struct intel_engine_cs *engine)
+{
+       return READ_ONCE(engine->i915->breadcrumbs.engine[engine->id].first_waiter);
+}
+
 static bool any_waiters(struct drm_i915_private *dev_priv)
 {
        struct intel_engine_cs *ring;
        int i;
 
        for_each_ring(ring, dev_priv, i)
-               if (ring->irq_refcount)
+               if (any_waiters_on_engine(engine))
                        return true;
 
        return false;

> >>>+	while (!kthread_should_stop()) {
> >>>+		unsigned reset_counter = i915_reset_counter(&i915->gpu_error);
> >>>+		unsigned long timeout_jiffies;
> >>>+		bool idling = false;
> >>>+
> >>>+		/* On every tick, walk the seqno-ordered list of requests
> >>>+		 * and for each retired request wakeup its clients. If we
> >>>+		 * find an unfinished request, go back to sleep.
> >>>+		 */
> >>
> >>s/tick/loop/ ?
> >s/tick/iteration/ I'm sticking with tick
> 
> Tick makes me tick of a scheduler tick so to me it is the worst of
> the three. Iteration sounds really good. Whether that will be a
> tick/jiffie/orsomething is definite a bit lower in the code.

Hmm, that was the connection I liked with tick.
 
> >>And if we don't find and unfinished request still go back to sleep. :)
> >
> >Ok.
> >
> >>>+		set_current_state(TASK_INTERRUPTIBLE);
> >>>+
> >>>+		/* Note carefully that we do not hold a reference for the
> >>>+		 * requests on this list. Therefore we only inspect them
> >>>+		 * whilst holding the spinlock to ensure that they are not
> >>>+		 * freed in the meantime, and the client must remove the
> >>>+		 * request from the list if it is interrupted (before it
> >>>+		 * itself releases its reference).
> >>>+		 */
> >>>+		spin_lock(&b->lock);
> >>
> >>This lock is more per engine that global in its nature unless you
> >>think it was more efficient to do fewer lock atomics vs potential
> >>overlap of waiter activity per engines?
> >
> >Indeed. I decided not to care about making it per-engine.
> >
> >>Would need a new lock for req->irq_count, per request. So
> >>req->lock/req->irq_count and be->lock for the tree operations.
> >
> >Why? A request is tied to an individual engine, therefore we can use
> >that breadcrumb_engine.lock for the request.
> 
> Just so 2nd+ waiters would not touch the per engine lock.

Ah. I am not expecting that much contention and even fewer multiple
waiters for a request.

> >>>+		for (i = 0; i < I915_NUM_RINGS; i++) {
> >>>+			struct intel_engine_cs *engine = &i915->ring[i];
> >>>+			struct intel_breadcrumbs_engine *be = &b->engine[i];
> >>>+			struct drm_i915_gem_request *request = be->first;
> >>>+
> >>>+			if (request == NULL) {
> >>>+				if ((irq_get & (1 << i))) {
> >>>+					if (irq_enabled & (1 << i)) {
> >>>+						engine->irq_put(engine);
> >>>+						irq_enabled &= ~ (1 << i);
> >>>+					}
> >>>+					intel_runtime_pm_put(i915);
> >>>+					irq_get &= ~(1 << i);
> >>>+				}
> >>>+				continue;
> >>>+			}
> >>>+
> >>>+			if ((irq_get & (1 << i)) == 0) {
> >>>+				intel_runtime_pm_get(i915);
> >>>+				irq_enabled |= __irq_enable(engine) << i;
> >>>+				irq_get |= 1 << i;
> >>>+			}
> >>
> >>irq_get and irq_enabled are creating a little bit of a headache for
> >>me. And that would probably mean a comment is be in order to explain
> >>what they are for and how they work.
> >>
> >>Like this I don't see the need for irq_get to persist across loops?
> >
> >Because getting the irq is quite slow, we add a jiffie shadow.
> 
> But it is not a shadow in the sense of the same bitmask from the
> previous iteration. The bits are different depending on special
> cases in __irq_enable.
> 
> So it needs some comments I think.

Didn't you get that from idling and its comments?

First, how about engines_active and irqs_enabled?

...
                        /* When enabling waiting upon an engine, we need
                         * to unmask the user interrupt, and before we
                         * do so we need to hold a wakelock for our
                         * hardware access (and the interrupts thereafter).
                         */
                        if ((engines_active & (1 << i)) == 0) {
...
                        if (request == NULL) {
                                /* No more waiters, mask the user interrupt
                                 * (if we were able to enable it!) and
                                 * release the wakelock.
                                 */
                                if (engines_active & (1 << i)) {


with a bit of work, I can then get the irq and rpm references out of the
spinlock.
-Chris
On Mon, Nov 30, 2015 at 01:32:16PM +0000, Tvrtko Ursulin wrote:
> 
> On 30/11/15 12:30, Chris Wilson wrote:
> >On Mon, Nov 30, 2015 at 12:05:50PM +0000, Tvrtko Ursulin wrote:
> >>>+	while (!kthread_should_stop()) {
> >>>+		unsigned reset_counter = i915_reset_counter(&i915->gpu_error);
> >>>+		unsigned long timeout_jiffies;
> >>>+		bool idling = false;
> >>>+
> >>>+		/* On every tick, walk the seqno-ordered list of requests
> >>>+		 * and for each retired request wakeup its clients. If we
> >>>+		 * find an unfinished request, go back to sleep.
> >>>+		 */
> >>
> >>s/tick/loop/ ?
> >s/tick/iteration/ I'm sticking with tick
> 
> Tick makes me tick of a scheduler tick so to me it is the worst of
> the three. Iteration sounds really good. Whether that will be a
> tick/jiffie/orsomething is definite a bit lower in the code.

wakeup? Going with "On every wakeup". The duplication of wakeup then
emphasises the extra wakeup we introduction into i915_wait_request.
-Chris
On Mon, Nov 30, 2015 at 01:33:50PM +0000, Tvrtko Ursulin wrote:
> 
> On 30/11/15 12:38, Chris Wilson wrote:
> >On Mon, Nov 30, 2015 at 12:09:30PM +0000, Tvrtko Ursulin wrote:
> >>
> >>Hi,
> >>
> >>On 30/11/15 10:53, Chris Wilson wrote:
> >>>On Sun, Nov 29, 2015 at 08:48:06AM +0000, Chris Wilson wrote:
> >>>>+	/* Optimistic spin for the next jiffie before touching IRQs */
> >>>>+	if (intel_breadcrumbs_add_waiter(req)) {
> >>>>+		ret = __i915_spin_request(req, state);
> >>>>+		if (ret == 0)
> >>>>+			goto out;
> >>>
> >>>There are a couple of interesting side-effects here.  As we know start
> >>>up the irq in parallel and keep it running for longer, irq/i915 now
> >>>consumes a lot of CPU time (like 50-100%!) for synchronous batches, but
> >>>doesn't seem to interfere with latency (as spin_request is still nicely
> >>>running and catching the request completion). That should also still
> >>>work nicely on single cpu machines as the irq enabling should not
> >>>preempt us.  The second interesting side-effect is that the synchronous
> >>>loads that regressed with a 2us spin-request timeout are now happy again
> >>>at 2us. Also with the active-request and the can-spin check from
> >>>add_waiter, running a low fps game with a compositor is not burning the
> >>>CPU with any spinning.
> >>
> >>Interesting? :) Sounds bad the way you presented it.
> >>
> >>Why and where is the thread burning so much CPU? Would per engine
> >>req tree locks help?
> >
> >Just simply being woken up after every batch and checking the seqno is
> >that expensive. Almost as expensive as the IRQ handler itself! (I expect
> >top is adding the IRQ handler time to the irq/i915 in this case, perf
> >says that more time is spent in the IRQ than the bottom-half.) Almost
> >all waiters will be on the same engine, so I don't expect finer grained
> >spinlocks to be hugely important.
> >
> >>Is this new CPU time or the one which would previously be accounted
> >>against each waiter, polling in wait request?
> >
> >This is CPU time that used to be absorbed in i915_wait_request(),
> >currently hidden by i915_spin_request(). It is the reason why having
> >every waiter doing the check after every interrupt is such a big issue
> >for some workloads.
> 
> So overall total CPU time is similar, just split differently?

With the pre-enable + spin, we increase CPU time. On those synchronous
tests, the runtime is the same but we are now at 100% CPU in the test
and an additional 50-100% in the irq/i915. Afaict, this is only
affecting those igt, my desktop usage doesn't spin as much.
-Chris
On 30/11/15 14:18, Chris Wilson wrote:
> On Mon, Nov 30, 2015 at 01:32:16PM +0000, Tvrtko Ursulin wrote:
>>
>> On 30/11/15 12:30, Chris Wilson wrote:
>>> On Mon, Nov 30, 2015 at 12:05:50PM +0000, Tvrtko Ursulin wrote:
>>>>> +	struct intel_breadcrumbs {
>>>>> +		spinlock_t lock; /* protects the per-engine request list */
>>>>
>>>> s/list/tree/
>>>
>>> list. We use the tree as a list!
>>
>> Maybe but it confuses the reader a bit who goes and looks for some
>> list then.
>>
>> Related, when I was playing with this I was piggy backing on the
>> existing per engine request list.
>
> A separate list with separate locking only used when waiting, that's the
> appeal for me.

The patchset recently posted by John Harrison
"[RFC 00/12] Convert requests to use struct fence"
provides exactly that. No struct_mutex involved,
and only the relevant waiters get woken up.

.Dave.