[v9,2/6] drm/i915: Convert requests to use struct fence

Submitted by John.C.Harrison@Intel.com on June 1, 2016, 5:07 p.m.

Details

Message ID 1464800848-36672-3-git-send-email-John.C.Harrison@Intel.com
State New
Headers show
Series "Convert requests to use struct fence" ( rev: 6 ) in Intel GFX

Not browsing as part of any series.

Commit Message

John.C.Harrison@Intel.com June 1, 2016, 5:07 p.m.
From: John Harrison <John.C.Harrison@Intel.com>

There is a construct in the linux kernel called 'struct fence' that is
intended to keep track of work that is executed on hardware. I.e. it
solves the basic problem that the drivers 'struct
drm_i915_gem_request' is trying to address. The request structure does
quite a lot more than simply track the execution progress so is very
definitely still required. However, the basic completion status side
could be updated to use the ready made fence implementation and gain
all the advantages that provides.

This patch makes the first step of integrating a struct fence into the
request. It replaces the explicit reference count with that of the
fence. It also replaces the 'is completed' test with the fence's
equivalent. Currently, that simply chains on to the original request
implementation. A future patch will improve this.

v3: Updated after review comments by Tvrtko Ursulin. Added fence
context/seqno pair to the debugfs request info. Renamed fence 'driver
name' to just 'i915'. Removed BUG_ONs.

v5: Changed seqno format in debugfs to %x rather than %u as that is
apparently the preferred appearance. Line wrapped some long lines to
keep the style checker happy.

v6: Updated to newer nigthly and resolved conflicts. The biggest issue
was with the re-worked busy spin precursor to waiting on a request. In
particular, the addition of a 'request_started' helper function. This
has no corresponding concept within the fence framework. However, it
is only ever used in one place and the whole point of that place is to
always directly read the seqno for absolutely lowest latency possible.
So the simple solution is to just make the seqno test explicit at that
point now rather than later in the series (it was previously being
done anyway when fences become interrupt driven).

v7: Rebased to newer nightly - lots of ring -> engine renaming and
interface change to get_seqno().

v8: Rebased to newer nightly - no longer needs to worry about mutex
locking in the request free code path. Moved to after fence timeline
patch so no longer needs to add a horrid hack timeline.

Removed commented out code block. Added support for possible RCU usage
of fence object (Review comments by Maarten Lankhorst).

For: VIZ-5190
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
 drivers/gpu/drm/i915/i915_drv.h         |  43 +++++---------
 drivers/gpu/drm/i915/i915_gem.c         | 101 +++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/intel_lrc.c        |   1 +
 drivers/gpu/drm/i915/intel_ringbuffer.c |   1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
 6 files changed, 115 insertions(+), 38 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index ac7e569..844cc4b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -767,11 +767,12 @@  static int i915_gem_request_info(struct seq_file *m, void *data)
 			task = NULL;
 			if (req->pid)
 				task = pid_task(req->pid, PIDTYPE_PID);
-			seq_printf(m, "    %x @ %d: %s [%d]\n",
+			seq_printf(m, "    %x @ %d: %s [%d], fence = %x:%x\n",
 				   req->seqno,
 				   (int) (jiffies - req->emitted_jiffies),
 				   task ? task->comm : "<unknown>",
-				   task ? task->pid : -1);
+				   task ? task->pid : -1,
+				   req->fence.context, req->fence.seqno);
 			rcu_read_unlock();
 		}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a5f8ad8..905feae 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -42,6 +42,7 @@ 
 #include <linux/kref.h>
 #include <linux/pm_qos.h>
 #include <linux/shmem_fs.h>
+#include <linux/fence.h>
 
 #include <drm/drmP.h>
 #include <drm/intel-gtt.h>
@@ -2353,7 +2354,11 @@  static inline struct scatterlist *__sg_next(struct scatterlist *sg)
  * initial reference taken using kref_init
  */
 struct drm_i915_gem_request {
-	struct kref ref;
+	/**
+	 * Underlying object for implementing the signal/wait stuff.
+	 */
+	struct fence fence;
+	struct rcu_head rcu_head;
 
 	/** On Which ring this request was generated */
 	struct drm_i915_private *i915;
@@ -2455,7 +2460,13 @@  struct drm_i915_gem_request {
 struct drm_i915_gem_request * __must_check
 i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       struct i915_gem_context *ctx);
-void i915_gem_request_free(struct kref *req_ref);
+
+static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
+					      bool lazy_coherency)
+{
+	return fence_is_signaled(&req->fence);
+}
+
 int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
 				   struct drm_file *file);
 
@@ -2475,14 +2486,14 @@  static inline struct drm_i915_gem_request *
 i915_gem_request_reference(struct drm_i915_gem_request *req)
 {
 	if (req)
-		kref_get(&req->ref);
+		fence_get(&req->fence);
 	return req;
 }
 
 static inline void
 i915_gem_request_unreference(struct drm_i915_gem_request *req)
 {
-	kref_put(&req->ref, i915_gem_request_free);
+	fence_put(&req->fence);
 }
 
 static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
@@ -2498,12 +2509,6 @@  static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
 }
 
 /*
- * XXX: i915_gem_request_completed should be here but currently needs the
- * definition of i915_seqno_passed() which is below. It will be moved in
- * a later patch when the call to i915_seqno_passed() is obsoleted...
- */
-
-/*
  * A command that requires special handling by the command parser.
  */
 struct drm_i915_cmd_descriptor {
@@ -3211,24 +3216,6 @@  i915_seqno_passed(uint32_t seq1, uint32_t seq2)
 	return (int32_t)(seq1 - seq2) >= 0;
 }
 
-static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
-					   bool lazy_coherency)
-{
-	if (!lazy_coherency && req->engine->irq_seqno_barrier)
-		req->engine->irq_seqno_barrier(req->engine);
-	return i915_seqno_passed(req->engine->get_seqno(req->engine),
-				 req->previous_seqno);
-}
-
-static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
-					      bool lazy_coherency)
-{
-	if (!lazy_coherency && req->engine->irq_seqno_barrier)
-		req->engine->irq_seqno_barrier(req->engine);
-	return i915_seqno_passed(req->engine->get_seqno(req->engine),
-				 req->seqno);
-}
-
 int __must_check i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno);
 int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 57d3593..b67fd7c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1170,6 +1170,7 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 {
 	unsigned long timeout;
 	unsigned cpu;
+	uint32_t seqno;
 
 	/* When waiting for high frequency requests, e.g. during synchronous
 	 * rendering split between the CPU and GPU, the finite amount of time
@@ -1185,12 +1186,14 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 		return -EBUSY;
 
 	/* Only spin if we know the GPU is processing this request */
-	if (!i915_gem_request_started(req, true))
+	seqno = req->engine->get_seqno(req->engine);
+	if (!i915_seqno_passed(seqno, req->previous_seqno))
 		return -EAGAIN;
 
 	timeout = local_clock_us(&cpu) + 5;
 	while (!need_resched()) {
-		if (i915_gem_request_completed(req, true))
+		seqno = req->engine->get_seqno(req->engine);
+		if (i915_seqno_passed(seqno, req->seqno))
 			return 0;
 
 		if (signal_pending_state(state, current))
@@ -1202,7 +1205,10 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 		cpu_relax_lowlatency();
 	}
 
-	if (i915_gem_request_completed(req, false))
+	if (req->engine->irq_seqno_barrier)
+		req->engine->irq_seqno_barrier(req->engine);
+	seqno = req->engine->get_seqno(req->engine);
+	if (i915_seqno_passed(seqno, req->seqno))
 		return 0;
 
 	return -EAGAIN;
@@ -2736,13 +2742,89 @@  static void i915_set_reset_status(struct drm_i915_private *dev_priv,
 	}
 }
 
-void i915_gem_request_free(struct kref *req_ref)
+static void i915_gem_request_free_rcu(struct rcu_head *head)
 {
-	struct drm_i915_gem_request *req = container_of(req_ref,
-						 typeof(*req), ref);
+	struct drm_i915_gem_request *req;
+
+	req = container_of(head, typeof(*req), rcu_head);
 	kmem_cache_free(req->i915->requests, req);
 }
 
+static void i915_gem_request_free(struct fence *req_fence)
+{
+	struct drm_i915_gem_request *req;
+
+	req = container_of(req_fence, typeof(*req), fence);
+	call_rcu(&req->rcu_head, i915_gem_request_free_rcu);
+}
+
+static bool i915_gem_request_enable_signaling(struct fence *req_fence)
+{
+	/* Interrupt driven fences are not implemented yet.*/
+	WARN(true, "This should not be called!");
+	return true;
+}
+
+static bool i915_gem_request_is_completed(struct fence *req_fence)
+{
+	struct drm_i915_gem_request *req = container_of(req_fence,
+						 typeof(*req), fence);
+	u32 seqno;
+
+	seqno = req->engine->get_seqno(req->engine);
+
+	return i915_seqno_passed(seqno, req->seqno);
+}
+
+static const char *i915_gem_request_get_driver_name(struct fence *req_fence)
+{
+	return "i915";
+}
+
+static const char *i915_gem_request_get_timeline_name(struct fence *req_fence)
+{
+	struct drm_i915_gem_request *req;
+	struct i915_fence_timeline *timeline;
+
+	req = container_of(req_fence, typeof(*req), fence);
+	timeline = &req->ctx->engine[req->engine->id].fence_timeline;
+
+	return timeline->name;
+}
+
+static void i915_gem_request_timeline_value_str(struct fence *req_fence,
+						char *str, int size)
+{
+	struct drm_i915_gem_request *req;
+
+	req = container_of(req_fence, typeof(*req), fence);
+
+	/* Last signalled timeline value ??? */
+	snprintf(str, size, "? [%d]"/*, timeline->value*/,
+		 req->engine->get_seqno(req->engine));
+}
+
+static void i915_gem_request_fence_value_str(struct fence *req_fence,
+					     char *str, int size)
+{
+	struct drm_i915_gem_request *req;
+
+	req = container_of(req_fence, typeof(*req), fence);
+
+	snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);
+}
+
+static const struct fence_ops i915_gem_request_fops = {
+	.enable_signaling	= i915_gem_request_enable_signaling,
+	.signaled		= i915_gem_request_is_completed,
+	.wait			= fence_default_wait,
+	.release		= i915_gem_request_free,
+	.get_driver_name	= i915_gem_request_get_driver_name,
+	.get_timeline_name	= i915_gem_request_get_timeline_name,
+	.fence_value_str	= i915_gem_request_fence_value_str,
+	.timeline_value_str	= i915_gem_request_timeline_value_str,
+};
+
 int i915_create_fence_timeline(struct drm_device *dev,
 			       struct i915_gem_context *ctx,
 			       struct intel_engine_cs *engine)
@@ -2770,7 +2852,7 @@  int i915_create_fence_timeline(struct drm_device *dev,
 	return 0;
 }
 
-unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
+static unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
 {
 	unsigned seqno;
 
@@ -2814,13 +2896,16 @@  __i915_gem_request_alloc(struct intel_engine_cs *engine,
 	if (ret)
 		goto err;
 
-	kref_init(&req->ref);
 	req->i915 = dev_priv;
 	req->engine = engine;
 	req->reset_counter = reset_counter;
 	req->ctx  = ctx;
 	i915_gem_context_reference(req->ctx);
 
+	fence_init(&req->fence, &i915_gem_request_fops, &engine->fence_lock,
+		   ctx->engine[engine->id].fence_timeline.fence_context,
+		   i915_fence_timeline_get_next_seqno(&ctx->engine[engine->id].fence_timeline));
+
 	/*
 	 * Reserve space in the ring buffer for all the commands required to
 	 * eventually emit this request. This is to guarantee that the
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 14bcfb7..f126bcb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2030,6 +2030,7 @@  logical_ring_setup(struct drm_device *dev, enum intel_engine_id id)
 	INIT_LIST_HEAD(&engine->buffers);
 	INIT_LIST_HEAD(&engine->execlist_queue);
 	spin_lock_init(&engine->execlist_lock);
+	spin_lock_init(&engine->fence_lock);
 
 	tasklet_init(&engine->irq_tasklet,
 		     intel_lrc_irq_handler, (unsigned long)engine);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8d35a39..fbd3f12 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2254,6 +2254,7 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	INIT_LIST_HEAD(&engine->request_list);
 	INIT_LIST_HEAD(&engine->execlist_queue);
 	INIT_LIST_HEAD(&engine->buffers);
+	spin_lock_init(&engine->fence_lock);
 	i915_gem_batch_pool_init(dev, &engine->batch_pool);
 	memset(engine->semaphore.sync_seqno, 0,
 	       sizeof(engine->semaphore.sync_seqno));
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b33c876..3f39daf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -345,6 +345,8 @@  struct intel_engine_cs {
 	 * to encode the command length in the header).
 	 */
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
+
+	spinlock_t fence_lock;
 };
 
 static inline bool

Comments

On 01/06/16 18:07, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> There is a construct in the linux kernel called 'struct fence' that is
> intended to keep track of work that is executed on hardware. I.e. it
> solves the basic problem that the drivers 'struct
> drm_i915_gem_request' is trying to address. The request structure does
> quite a lot more than simply track the execution progress so is very
> definitely still required. However, the basic completion status side
> could be updated to use the ready made fence implementation and gain
> all the advantages that provides.
>
> This patch makes the first step of integrating a struct fence into the
> request. It replaces the explicit reference count with that of the
> fence. It also replaces the 'is completed' test with the fence's
> equivalent. Currently, that simply chains on to the original request
> implementation. A future patch will improve this.
>
> v3: Updated after review comments by Tvrtko Ursulin. Added fence
> context/seqno pair to the debugfs request info. Renamed fence 'driver
> name' to just 'i915'. Removed BUG_ONs.
>
> v5: Changed seqno format in debugfs to %x rather than %u as that is
> apparently the preferred appearance. Line wrapped some long lines to
> keep the style checker happy.
>
> v6: Updated to newer nigthly and resolved conflicts. The biggest issue
> was with the re-worked busy spin precursor to waiting on a request. In
> particular, the addition of a 'request_started' helper function. This
> has no corresponding concept within the fence framework. However, it
> is only ever used in one place and the whole point of that place is to
> always directly read the seqno for absolutely lowest latency possible.
> So the simple solution is to just make the seqno test explicit at that
> point now rather than later in the series (it was previously being
> done anyway when fences become interrupt driven).
>
> v7: Rebased to newer nightly - lots of ring -> engine renaming and
> interface change to get_seqno().
>
> v8: Rebased to newer nightly - no longer needs to worry about mutex
> locking in the request free code path. Moved to after fence timeline
> patch so no longer needs to add a horrid hack timeline.
>
> Removed commented out code block. Added support for possible RCU usage
> of fence object (Review comments by Maarten Lankhorst).
>
> For: VIZ-5190
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>

Was it an r-b or an ack from Jesse? If the former does it need a "(v?)" 
suffix, depending on the amount of code changes after his r-b?

> ---
>   drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
>   drivers/gpu/drm/i915/i915_drv.h         |  43 +++++---------
>   drivers/gpu/drm/i915/i915_gem.c         | 101 +++++++++++++++++++++++++++++---
>   drivers/gpu/drm/i915/intel_lrc.c        |   1 +
>   drivers/gpu/drm/i915/intel_ringbuffer.c |   1 +
>   drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
>   6 files changed, 115 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index ac7e569..844cc4b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -767,11 +767,12 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
>   			task = NULL;
>   			if (req->pid)
>   				task = pid_task(req->pid, PIDTYPE_PID);
> -			seq_printf(m, "    %x @ %d: %s [%d]\n",
> +			seq_printf(m, "    %x @ %d: %s [%d], fence = %x:%x\n",

In the previous patch fence context and seqno were %d in the 
timeline->name so it would probably be more consistent.

>   				   req->seqno,
>   				   (int) (jiffies - req->emitted_jiffies),
>   				   task ? task->comm : "<unknown>",
> -				   task ? task->pid : -1);
> +				   task ? task->pid : -1,
> +				   req->fence.context, req->fence.seqno);
>   			rcu_read_unlock();
>   		}
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a5f8ad8..905feae 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -42,6 +42,7 @@
>   #include <linux/kref.h>
>   #include <linux/pm_qos.h>
>   #include <linux/shmem_fs.h>
> +#include <linux/fence.h>
>
>   #include <drm/drmP.h>
>   #include <drm/intel-gtt.h>
> @@ -2353,7 +2354,11 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
>    * initial reference taken using kref_init
>    */
>   struct drm_i915_gem_request {
> -	struct kref ref;
> +	/**
> +	 * Underlying object for implementing the signal/wait stuff.
> +	 */
> +	struct fence fence;
> +	struct rcu_head rcu_head;
>
>   	/** On Which ring this request was generated */
>   	struct drm_i915_private *i915;
> @@ -2455,7 +2460,13 @@ struct drm_i915_gem_request {
>   struct drm_i915_gem_request * __must_check
>   i915_gem_request_alloc(struct intel_engine_cs *engine,
>   		       struct i915_gem_context *ctx);
> -void i915_gem_request_free(struct kref *req_ref);
> +
> +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
> +					      bool lazy_coherency)
> +{
> +	return fence_is_signaled(&req->fence);
> +}

I would squash the following patch into this one, it makes no sense to 
keep a function with an unused parameter. And fewer patches in the 
series makes it less scary to review. :) Of course if they are also not 
too big. :D

> +
>   int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
>   				   struct drm_file *file);
>
> @@ -2475,14 +2486,14 @@ static inline struct drm_i915_gem_request *
>   i915_gem_request_reference(struct drm_i915_gem_request *req)
>   {
>   	if (req)
> -		kref_get(&req->ref);
> +		fence_get(&req->fence);
>   	return req;
>   }
>
>   static inline void
>   i915_gem_request_unreference(struct drm_i915_gem_request *req)
>   {
> -	kref_put(&req->ref, i915_gem_request_free);
> +	fence_put(&req->fence);
>   }
>
>   static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
> @@ -2498,12 +2509,6 @@ static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>   }
>
>   /*
> - * XXX: i915_gem_request_completed should be here but currently needs the
> - * definition of i915_seqno_passed() which is below. It will be moved in
> - * a later patch when the call to i915_seqno_passed() is obsoleted...
> - */
> -
> -/*
>    * A command that requires special handling by the command parser.
>    */
>   struct drm_i915_cmd_descriptor {
> @@ -3211,24 +3216,6 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2)
>   	return (int32_t)(seq1 - seq2) >= 0;
>   }
>
> -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
> -					   bool lazy_coherency)
> -{
> -	if (!lazy_coherency && req->engine->irq_seqno_barrier)
> -		req->engine->irq_seqno_barrier(req->engine);
> -	return i915_seqno_passed(req->engine->get_seqno(req->engine),
> -				 req->previous_seqno);
> -}
> -
> -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
> -					      bool lazy_coherency)
> -{
> -	if (!lazy_coherency && req->engine->irq_seqno_barrier)
> -		req->engine->irq_seqno_barrier(req->engine);
> -	return i915_seqno_passed(req->engine->get_seqno(req->engine),
> -				 req->seqno);
> -}
> -
>   int __must_check i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno);
>   int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 57d3593..b67fd7c 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1170,6 +1170,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   {
>   	unsigned long timeout;
>   	unsigned cpu;
> +	uint32_t seqno;
>
>   	/* When waiting for high frequency requests, e.g. during synchronous
>   	 * rendering split between the CPU and GPU, the finite amount of time
> @@ -1185,12 +1186,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   		return -EBUSY;
>
>   	/* Only spin if we know the GPU is processing this request */
> -	if (!i915_gem_request_started(req, true))
> +	seqno = req->engine->get_seqno(req->engine);
> +	if (!i915_seqno_passed(seqno, req->previous_seqno))
>   		return -EAGAIN;
>
>   	timeout = local_clock_us(&cpu) + 5;
>   	while (!need_resched()) {
> -		if (i915_gem_request_completed(req, true))
> +		seqno = req->engine->get_seqno(req->engine);
> +		if (i915_seqno_passed(seqno, req->seqno))
>   			return 0;
>
>   		if (signal_pending_state(state, current))
> @@ -1202,7 +1205,10 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   		cpu_relax_lowlatency();
>   	}
>
> -	if (i915_gem_request_completed(req, false))
> +	if (req->engine->irq_seqno_barrier)
> +		req->engine->irq_seqno_barrier(req->engine);
> +	seqno = req->engine->get_seqno(req->engine);
> +	if (i915_seqno_passed(seqno, req->seqno))
>   		return 0;
>
>   	return -EAGAIN;
> @@ -2736,13 +2742,89 @@ static void i915_set_reset_status(struct drm_i915_private *dev_priv,
>   	}
>   }
>
> -void i915_gem_request_free(struct kref *req_ref)
> +static void i915_gem_request_free_rcu(struct rcu_head *head)
>   {
> -	struct drm_i915_gem_request *req = container_of(req_ref,
> -						 typeof(*req), ref);
> +	struct drm_i915_gem_request *req;
> +
> +	req = container_of(head, typeof(*req), rcu_head);
>   	kmem_cache_free(req->i915->requests, req);
>   }
>
> +static void i915_gem_request_free(struct fence *req_fence)
> +{
> +	struct drm_i915_gem_request *req;
> +
> +	req = container_of(req_fence, typeof(*req), fence);
> +	call_rcu(&req->rcu_head, i915_gem_request_free_rcu);
> +}
> +
> +static bool i915_gem_request_enable_signaling(struct fence *req_fence)
> +{
> +	/* Interrupt driven fences are not implemented yet.*/
> +	WARN(true, "This should not be called!");
> +	return true;
> +}
> +
> +static bool i915_gem_request_is_completed(struct fence *req_fence)
> +{
> +	struct drm_i915_gem_request *req = container_of(req_fence,
> +						 typeof(*req), fence);
> +	u32 seqno;
> +
> +	seqno = req->engine->get_seqno(req->engine);
> +
> +	return i915_seqno_passed(seqno, req->seqno);
> +}
> +
> +static const char *i915_gem_request_get_driver_name(struct fence *req_fence)
> +{
> +	return "i915";
> +}
> +
> +static const char *i915_gem_request_get_timeline_name(struct fence *req_fence)
> +{
> +	struct drm_i915_gem_request *req;
> +	struct i915_fence_timeline *timeline;
> +
> +	req = container_of(req_fence, typeof(*req), fence);
> +	timeline = &req->ctx->engine[req->engine->id].fence_timeline;
> +
> +	return timeline->name;
> +}
> +
> +static void i915_gem_request_timeline_value_str(struct fence *req_fence,
> +						char *str, int size)
> +{
> +	struct drm_i915_gem_request *req;
> +
> +	req = container_of(req_fence, typeof(*req), fence);
> +
> +	/* Last signalled timeline value ??? */
> +	snprintf(str, size, "? [%d]"/*, timeline->value*/,

Reference to timeline->value a leftover from the past?

Is the string format defined by the API? Asking because "? [%d]" looks 
intriguing.

> +		 req->engine->get_seqno(req->engine));
> +}
> +
> +static void i915_gem_request_fence_value_str(struct fence *req_fence,
> +					     char *str, int size)
> +{
> +	struct drm_i915_gem_request *req;
> +
> +	req = container_of(req_fence, typeof(*req), fence);
> +
> +	snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);

Is it OK to put req->seqno in this one? OR it is just for debug anyway 
so it helps us and fence framework does not care?

> +}
> +
> +static const struct fence_ops i915_gem_request_fops = {
> +	.enable_signaling	= i915_gem_request_enable_signaling,
> +	.signaled		= i915_gem_request_is_completed,
> +	.wait			= fence_default_wait,
> +	.release		= i915_gem_request_free,
> +	.get_driver_name	= i915_gem_request_get_driver_name,
> +	.get_timeline_name	= i915_gem_request_get_timeline_name,
> +	.fence_value_str	= i915_gem_request_fence_value_str,
> +	.timeline_value_str	= i915_gem_request_timeline_value_str,
> +};
> +
>   int i915_create_fence_timeline(struct drm_device *dev,
>   			       struct i915_gem_context *ctx,
>   			       struct intel_engine_cs *engine)
> @@ -2770,7 +2852,7 @@ int i915_create_fence_timeline(struct drm_device *dev,
>   	return 0;
>   }
>
> -unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
> +static unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>   {
>   	unsigned seqno;
>
> @@ -2814,13 +2896,16 @@ __i915_gem_request_alloc(struct intel_engine_cs *engine,
>   	if (ret)
>   		goto err;
>
> -	kref_init(&req->ref);
>   	req->i915 = dev_priv;
>   	req->engine = engine;
>   	req->reset_counter = reset_counter;
>   	req->ctx  = ctx;
>   	i915_gem_context_reference(req->ctx);
>
> +	fence_init(&req->fence, &i915_gem_request_fops, &engine->fence_lock,
> +		   ctx->engine[engine->id].fence_timeline.fence_context,
> +		   i915_fence_timeline_get_next_seqno(&ctx->engine[engine->id].fence_timeline));
> +
>   	/*
>   	 * Reserve space in the ring buffer for all the commands required to
>   	 * eventually emit this request. This is to guarantee that the
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 14bcfb7..f126bcb 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2030,6 +2030,7 @@ logical_ring_setup(struct drm_device *dev, enum intel_engine_id id)
>   	INIT_LIST_HEAD(&engine->buffers);
>   	INIT_LIST_HEAD(&engine->execlist_queue);
>   	spin_lock_init(&engine->execlist_lock);
> +	spin_lock_init(&engine->fence_lock);
>
>   	tasklet_init(&engine->irq_tasklet,
>   		     intel_lrc_irq_handler, (unsigned long)engine);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 8d35a39..fbd3f12 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2254,6 +2254,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	INIT_LIST_HEAD(&engine->request_list);
>   	INIT_LIST_HEAD(&engine->execlist_queue);
>   	INIT_LIST_HEAD(&engine->buffers);
> +	spin_lock_init(&engine->fence_lock);
>   	i915_gem_batch_pool_init(dev, &engine->batch_pool);
>   	memset(engine->semaphore.sync_seqno, 0,
>   	       sizeof(engine->semaphore.sync_seqno));
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index b33c876..3f39daf 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -345,6 +345,8 @@ struct intel_engine_cs {
>   	 * to encode the command length in the header).
>   	 */
>   	u32 (*get_cmd_length_mask)(u32 cmd_header);
> +
> +	spinlock_t fence_lock;

Why is this lock per-engine, and not for example per timeline? Aren't 
fencees living completely isolated in their per-context-per-engine 
domains? So presumably there is something somewhere which is shared 
outside that domain to need a lock at the engine level?

>   };
>
>   static inline bool
>

Regards,

Tvrtko
Op 02-06-16 om 13:07 schreef Tvrtko Ursulin:
>
> On 01/06/16 18:07, John.C.Harrison@Intel.com wrote:
>> From: John Harrison <John.C.Harrison@Intel.com>
>>
>> There is a construct in the linux kernel called 'struct fence' that is
>> intended to keep track of work that is executed on hardware. I.e. it
>> solves the basic problem that the drivers 'struct
>> drm_i915_gem_request' is trying to address. The request structure does
>> quite a lot more than simply track the execution progress so is very
>> definitely still required. However, the basic completion status side
>> could be updated to use the ready made fence implementation and gain
>> all the advantages that provides.
>>
>> This patch makes the first step of integrating a struct fence into the
>> request. It replaces the explicit reference count with that of the
>> fence. It also replaces the 'is completed' test with the fence's
>> equivalent. Currently, that simply chains on to the original request
>> implementation. A future patch will improve this.
>>
>> v3: Updated after review comments by Tvrtko Ursulin. Added fence
>> context/seqno pair to the debugfs request info. Renamed fence 'driver
>> name' to just 'i915'. Removed BUG_ONs.
>>
>> v5: Changed seqno format in debugfs to %x rather than %u as that is
>> apparently the preferred appearance. Line wrapped some long lines to
>> keep the style checker happy.
>>
>> v6: Updated to newer nigthly and resolved conflicts. The biggest issue
>> was with the re-worked busy spin precursor to waiting on a request. In
>> particular, the addition of a 'request_started' helper function. This
>> has no corresponding concept within the fence framework. However, it
>> is only ever used in one place and the whole point of that place is to
>> always directly read the seqno for absolutely lowest latency possible.
>> So the simple solution is to just make the seqno test explicit at that
>> point now rather than later in the series (it was previously being
>> done anyway when fences become interrupt driven).
>>
>> v7: Rebased to newer nightly - lots of ring -> engine renaming and
>> interface change to get_seqno().
>>
>> v8: Rebased to newer nightly - no longer needs to worry about mutex
>> locking in the request free code path. Moved to after fence timeline
>> patch so no longer needs to add a horrid hack timeline.
>>
>> Removed commented out code block. Added support for possible RCU usage
>> of fence object (Review comments by Maarten Lankhorst).
>>
>> For: VIZ-5190
>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
>
> Was it an r-b or an ack from Jesse? If the former does it need a "(v?)" suffix, depending on the amount of code changes after his r-b?
>
>> ---
>>   drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
>>   drivers/gpu/drm/i915/i915_drv.h         |  43 +++++---------
>>   drivers/gpu/drm/i915/i915_gem.c         | 101 +++++++++++++++++++++++++++++---
>>   drivers/gpu/drm/i915/intel_lrc.c        |   1 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.c |   1 +
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
>>   6 files changed, 115 insertions(+), 38 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> index ac7e569..844cc4b 100644
>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> @@ -767,11 +767,12 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
>>               task = NULL;
>>               if (req->pid)
>>                   task = pid_task(req->pid, PIDTYPE_PID);
>> -            seq_printf(m, "    %x @ %d: %s [%d]\n",
>> +            seq_printf(m, "    %x @ %d: %s [%d], fence = %x:%x\n",
>
> In the previous patch fence context and seqno were %d in the timeline->name so it would probably be more consistent.
>
>>                      req->seqno,
>>                      (int) (jiffies - req->emitted_jiffies),
>>                      task ? task->comm : "<unknown>",
>> -                   task ? task->pid : -1);
>> +                   task ? task->pid : -1,
>> +                   req->fence.context, req->fence.seqno);
req->fence.context is 64-bits, will probably cause a compiler warning.
>>               rcu_read_unlock();
>>           }
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index a5f8ad8..905feae 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -42,6 +42,7 @@
>>   #include <linux/kref.h>
>>   #include <linux/pm_qos.h>
>>   #include <linux/shmem_fs.h>
>> +#include <linux/fence.h>
>>
>>   #include <drm/drmP.h>
>>   #include <drm/intel-gtt.h>
>> @@ -2353,7 +2354,11 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
>>    * initial reference taken using kref_init
>>    */
>>   struct drm_i915_gem_request {
>> -    struct kref ref;
>> +    /**
>> +     * Underlying object for implementing the signal/wait stuff.
>> +     */
>> +    struct fence fence;
>> +    struct rcu_head rcu_head;
fenece.rcu can be used, no need to duplicate. :)
>>       /** On Which ring this request was generated */
>>       struct drm_i915_private *i915;
>> @@ -2455,7 +2460,13 @@ struct drm_i915_gem_request {
>>   struct drm_i915_gem_request * __must_check
>>   i915_gem_request_alloc(struct intel_engine_cs *engine,
>>                  struct i915_gem_context *ctx);
>> -void i915_gem_request_free(struct kref *req_ref);
>> +
>> +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>> +                          bool lazy_coherency)
>> +{
>> +    return fence_is_signaled(&req->fence);
>> +}
>
> I would squash the following patch into this one, it makes no sense to keep a function with an unused parameter. And fewer patches in the series makes it less scary to review. :) Of course if they are also not too big. :D
It's easier to read with all the function parameter changes in a separate patch.
>> +
>>   int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
>>                      struct drm_file *file);
>>
>> @@ -2475,14 +2486,14 @@ static inline struct drm_i915_gem_request *
>>   i915_gem_request_reference(struct drm_i915_gem_request *req)
>>   {
>>       if (req)
>> -        kref_get(&req->ref);
>> +        fence_get(&req->fence);
>>       return req;
>>   }
>>
>>   static inline void
>>   i915_gem_request_unreference(struct drm_i915_gem_request *req)
>>   {
>> -    kref_put(&req->ref, i915_gem_request_free);
>> +    fence_put(&req->fence);
>>   }
>>
>>   static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>> @@ -2498,12 +2509,6 @@ static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>>   }
>>
>>   /*
>> - * XXX: i915_gem_request_completed should be here but currently needs the
>> - * definition of i915_seqno_passed() which is below. It will be moved in
>> - * a later patch when the call to i915_seqno_passed() is obsoleted...
>> - */
>> -
>> -/*
>>    * A command that requires special handling by the command parser.
>>    */
>>   struct drm_i915_cmd_descriptor {
>> @@ -3211,24 +3216,6 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2)
>>       return (int32_t)(seq1 - seq2) >= 0;
>>   }
>>
>> -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
>> -                       bool lazy_coherency)
>> -{
>> -    if (!lazy_coherency && req->engine->irq_seqno_barrier)
>> -        req->engine->irq_seqno_barrier(req->engine);
>> -    return i915_seqno_passed(req->engine->get_seqno(req->engine),
>> -                 req->previous_seqno);
>> -}
>> -
>> -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>> -                          bool lazy_coherency)
>> -{
>> -    if (!lazy_coherency && req->engine->irq_seqno_barrier)
>> -        req->engine->irq_seqno_barrier(req->engine);
>> -    return i915_seqno_passed(req->engine->get_seqno(req->engine),
>> -                 req->seqno);
>> -}
>> -
>>   int __must_check i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno);
>>   int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index 57d3593..b67fd7c 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -1170,6 +1170,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>   {
>>       unsigned long timeout;
>>       unsigned cpu;
>> +    uint32_t seqno;
>>
>>       /* When waiting for high frequency requests, e.g. during synchronous
>>        * rendering split between the CPU and GPU, the finite amount of time
>> @@ -1185,12 +1186,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>           return -EBUSY;
>>
>>       /* Only spin if we know the GPU is processing this request */
>> -    if (!i915_gem_request_started(req, true))
>> +    seqno = req->engine->get_seqno(req->engine);
>> +    if (!i915_seqno_passed(seqno, req->previous_seqno))
>>           return -EAGAIN;
>>
>>       timeout = local_clock_us(&cpu) + 5;
>>       while (!need_resched()) {
>> -        if (i915_gem_request_completed(req, true))
>> +        seqno = req->engine->get_seqno(req->engine);
>> +        if (i915_seqno_passed(seqno, req->seqno))
>>               return 0;
>>
>>           if (signal_pending_state(state, current))
>> @@ -1202,7 +1205,10 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>           cpu_relax_lowlatency();
>>       }
>>
>> -    if (i915_gem_request_completed(req, false))
>> +    if (req->engine->irq_seqno_barrier)
>> +        req->engine->irq_seqno_barrier(req->engine);
>> +    seqno = req->engine->get_seqno(req->engine);
>> +    if (i915_seqno_passed(seqno, req->seqno))
>>           return 0;
>>
>>       return -EAGAIN;
>> @@ -2736,13 +2742,89 @@ static void i915_set_reset_status(struct drm_i915_private *dev_priv,
>>       }
>>   }
>>
>> -void i915_gem_request_free(struct kref *req_ref)
>> +static void i915_gem_request_free_rcu(struct rcu_head *head)
>>   {
>> -    struct drm_i915_gem_request *req = container_of(req_ref,
>> -                         typeof(*req), ref);
>> +    struct drm_i915_gem_request *req;
>> +
>> +    req = container_of(head, typeof(*req), rcu_head);
>>       kmem_cache_free(req->i915->requests, req);
>>   }
>>
>> +static void i915_gem_request_free(struct fence *req_fence)
>> +{
>> +    struct drm_i915_gem_request *req;
>> +
>> +    req = container_of(req_fence, typeof(*req), fence);
>> +    call_rcu(&req->rcu_head, i915_gem_request_free_rcu);
>> +}
>> +
>> +static bool i915_gem_request_enable_signaling(struct fence *req_fence)
>> +{
>> +    /* Interrupt driven fences are not implemented yet.*/
>> +    WARN(true, "This should not be called!");
>> +    return true;
>> +}
>> +
>> +static bool i915_gem_request_is_completed(struct fence *req_fence)
>> +{
>> +    struct drm_i915_gem_request *req = container_of(req_fence,
>> +                         typeof(*req), fence);
>> +    u32 seqno;
>> +
>> +    seqno = req->engine->get_seqno(req->engine);
>> +
>> +    return i915_seqno_passed(seqno, req->seqno);
>> +}
>> +
>> +static const char *i915_gem_request_get_driver_name(struct fence *req_fence)
>> +{
>> +    return "i915";
>> +}
>> +
>> +static const char *i915_gem_request_get_timeline_name(struct fence *req_fence)
>> +{
>> +    struct drm_i915_gem_request *req;
>> +    struct i915_fence_timeline *timeline;
>> +
>> +    req = container_of(req_fence, typeof(*req), fence);
>> +    timeline = &req->ctx->engine[req->engine->id].fence_timeline;
>> +
>> +    return timeline->name;
>> +}
>> +
>> +static void i915_gem_request_timeline_value_str(struct fence *req_fence,
>> +                        char *str, int size)
>> +{
>> +    struct drm_i915_gem_request *req;
>> +
>> +    req = container_of(req_fence, typeof(*req), fence);
>> +
>> +    /* Last signalled timeline value ??? */
>> +    snprintf(str, size, "? [%d]"/*, timeline->value*/,
>
> Reference to timeline->value a leftover from the past?
>
> Is the string format defined by the API? Asking because "? [%d]" looks intriguing.
>
>> +         req->engine->get_seqno(req->engine));
>> +}
>> +
>> +static void i915_gem_request_fence_value_str(struct fence *req_fence,
>> +                         char *str, int size)
>> +{
>> +    struct drm_i915_gem_request *req;
>> +
>> +    req = container_of(req_fence, typeof(*req), fence);
>> +
>> +    snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);
>
> Is it OK to put req->seqno in this one? OR it is just for debug anyway so it helps us and fence framework does not care?
I think this is used for getting all info from debugfs only, so req->seqno is fine.
>> +}
>> +
>> +static const struct fence_ops i915_gem_request_fops = {
>> +    .enable_signaling    = i915_gem_request_enable_signaling,
>> +    .signaled        = i915_gem_request_is_completed,
>> +    .wait            = fence_default_wait,
>> +    .release        = i915_gem_request_free,
>> +    .get_driver_name    = i915_gem_request_get_driver_name,
>> +    .get_timeline_name    = i915_gem_request_get_timeline_name,
>> +    .fence_value_str    = i915_gem_request_fence_value_str,
>> +    .timeline_value_str    = i915_gem_request_timeline_value_str,
>> +};
>> +
>>   int i915_create_fence_timeline(struct drm_device *dev,
>>                      struct i915_gem_context *ctx,
>>                      struct intel_engine_cs *engine)
>> @@ -2770,7 +2852,7 @@ int i915_create_fence_timeline(struct drm_device *dev,
>>       return 0;
>>   }
>>
>> -unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>> +static unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>>   {
>>       unsigned seqno;
>>
>> @@ -2814,13 +2896,16 @@ __i915_gem_request_alloc(struct intel_engine_cs *engine,
>>       if (ret)
>>           goto err;
>>
>> -    kref_init(&req->ref);
>>       req->i915 = dev_priv;
>>       req->engine = engine;
>>       req->reset_counter = reset_counter;
>>       req->ctx  = ctx;
>>       i915_gem_context_reference(req->ctx);
>>
>> +    fence_init(&req->fence, &i915_gem_request_fops, &engine->fence_lock,
>> +           ctx->engine[engine->id].fence_timeline.fence_context,
>> +           i915_fence_timeline_get_next_seqno(&ctx->engine[engine->id].fence_timeline));
>> +
>>       /*
>>        * Reserve space in the ring buffer for all the commands required to
>>        * eventually emit this request. This is to guarantee that the
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 14bcfb7..f126bcb 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -2030,6 +2030,7 @@ logical_ring_setup(struct drm_device *dev, enum intel_engine_id id)
>>       INIT_LIST_HEAD(&engine->buffers);
>>       INIT_LIST_HEAD(&engine->execlist_queue);
>>       spin_lock_init(&engine->execlist_lock);
>> +    spin_lock_init(&engine->fence_lock);
>>
>>       tasklet_init(&engine->irq_tasklet,
>>                intel_lrc_irq_handler, (unsigned long)engine);
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 8d35a39..fbd3f12 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -2254,6 +2254,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>>       INIT_LIST_HEAD(&engine->request_list);
>>       INIT_LIST_HEAD(&engine->execlist_queue);
>>       INIT_LIST_HEAD(&engine->buffers);
>> +    spin_lock_init(&engine->fence_lock);
>>       i915_gem_batch_pool_init(dev, &engine->batch_pool);
>>       memset(engine->semaphore.sync_seqno, 0,
>>              sizeof(engine->semaphore.sync_seqno));
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index b33c876..3f39daf 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -345,6 +345,8 @@ struct intel_engine_cs {
>>        * to encode the command length in the header).
>>        */
>>       u32 (*get_cmd_length_mask)(u32 cmd_header);
>> +
>> +    spinlock_t fence_lock;
>
> Why is this lock per-engine, and not for example per timeline? Aren't fencees living completely isolated in their per-context-per-engine domains? So presumably there is something somewhere which is shared outside that domain to need a lock at the engine level? 
All outstanding requests are added to engine->fence_signal_list in patch 4, which means a per engine lock is required.
On 07/06/16 12:42, Maarten Lankhorst wrote:
> Op 02-06-16 om 13:07 schreef Tvrtko Ursulin:

[snip]

>>> +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>>> +                          bool lazy_coherency)
>>> +{
>>> +    return fence_is_signaled(&req->fence);
>>> +}
>>
>> I would squash the following patch into this one, it makes no sense to keep a function with an unused parameter. And fewer patches in the series makes it less scary to review. :) Of course if they are also not too big. :D
> It's easier to read with all the function parameter changes in a separate patch.

I do not think so, but it is not even near a blocking commit so OK.

>>>        u32 (*get_cmd_length_mask)(u32 cmd_header);
>>> +
>>> +    spinlock_t fence_lock;
>>
>> Why is this lock per-engine, and not for example per timeline? Aren't fencees living completely isolated in their per-context-per-engine domains? So presumably there is something somewhere which is shared outside that domain to need a lock at the engine level?
> All outstanding requests are added to engine->fence_signal_list in patch 4, which means a per engine lock is required.

Okay, a comment is required here to describe the lock then. All what it 
protects and how and when it needs to be taken. Both from the i915 point 
of view and from the fence API side.

Regards,

Tvrtko
On 07/06/2016 12:42, Maarten Lankhorst wrote:
> Op 02-06-16 om 13:07 schreef Tvrtko Ursulin:
>> On 01/06/16 18:07, John.C.Harrison@Intel.com wrote:
>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>
>>> There is a construct in the linux kernel called 'struct fence' that is
>>> intended to keep track of work that is executed on hardware. I.e. it
>>> solves the basic problem that the drivers 'struct
>>> drm_i915_gem_request' is trying to address. The request structure does
>>> quite a lot more than simply track the execution progress so is very
>>> definitely still required. However, the basic completion status side
>>> could be updated to use the ready made fence implementation and gain
>>> all the advantages that provides.
>>>
>>> This patch makes the first step of integrating a struct fence into the
>>> request. It replaces the explicit reference count with that of the
>>> fence. It also replaces the 'is completed' test with the fence's
>>> equivalent. Currently, that simply chains on to the original request
>>> implementation. A future patch will improve this.
>>>
>>> v3: Updated after review comments by Tvrtko Ursulin. Added fence
>>> context/seqno pair to the debugfs request info. Renamed fence 'driver
>>> name' to just 'i915'. Removed BUG_ONs.
>>>
>>> v5: Changed seqno format in debugfs to %x rather than %u as that is
>>> apparently the preferred appearance. Line wrapped some long lines to
>>> keep the style checker happy.
>>>
>>> v6: Updated to newer nigthly and resolved conflicts. The biggest issue
>>> was with the re-worked busy spin precursor to waiting on a request. In
>>> particular, the addition of a 'request_started' helper function. This
>>> has no corresponding concept within the fence framework. However, it
>>> is only ever used in one place and the whole point of that place is to
>>> always directly read the seqno for absolutely lowest latency possible.
>>> So the simple solution is to just make the seqno test explicit at that
>>> point now rather than later in the series (it was previously being
>>> done anyway when fences become interrupt driven).
>>>
>>> v7: Rebased to newer nightly - lots of ring -> engine renaming and
>>> interface change to get_seqno().
>>>
>>> v8: Rebased to newer nightly - no longer needs to worry about mutex
>>> locking in the request free code path. Moved to after fence timeline
>>> patch so no longer needs to add a horrid hack timeline.
>>>
>>> Removed commented out code block. Added support for possible RCU usage
>>> of fence object (Review comments by Maarten Lankhorst).
>>>
>>> For: VIZ-5190
>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
>> Was it an r-b or an ack from Jesse? If the former does it need a "(v?)" suffix, depending on the amount of code changes after his r-b?
Going back through the old emails it looks like you are right, it was 
actually an ack on v3. What is the official tag for that?

>>
>>> ---
>>>    drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
>>>    drivers/gpu/drm/i915/i915_drv.h         |  43 +++++---------
>>>    drivers/gpu/drm/i915/i915_gem.c         | 101 +++++++++++++++++++++++++++++---
>>>    drivers/gpu/drm/i915/intel_lrc.c        |   1 +
>>>    drivers/gpu/drm/i915/intel_ringbuffer.c |   1 +
>>>    drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
>>>    6 files changed, 115 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>>> index ac7e569..844cc4b 100644
>>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>>> @@ -767,11 +767,12 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
>>>                task = NULL;
>>>                if (req->pid)
>>>                    task = pid_task(req->pid, PIDTYPE_PID);
>>> -            seq_printf(m, "    %x @ %d: %s [%d]\n",
>>> +            seq_printf(m, "    %x @ %d: %s [%d], fence = %x:%x\n",
>> In the previous patch fence context and seqno were %d in the timeline->name so it would probably be more consistent.
It is trying to be consistent with the surroundings. Requests used to be 
printed as %d but for some reason got changed to be %x recently. Whereas 
the fence debug output is all %d still. Currently, the request::seqno 
and fence::seqno are different so maybe it doesn't really matter that 
they are printed differently but the ultimate aim is to merge the two 
into a single value. At which point all i915 code would be showing the 
value in one format but all fence code in another.

>>
>>>                       req->seqno,
>>>                       (int) (jiffies - req->emitted_jiffies),
>>>                       task ? task->comm : "<unknown>",
>>> -                   task ? task->pid : -1);
>>> +                   task ? task->pid : -1,
>>> +                   req->fence.context, req->fence.seqno);
> req->fence.context is 64-bits, will probably cause a compiler warning.
Not at the point the above was written. Have rebased to the newer tree 
and updated to u64 / %llx.

>>>                rcu_read_unlock();
>>>            }
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index a5f8ad8..905feae 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -42,6 +42,7 @@
>>>    #include <linux/kref.h>
>>>    #include <linux/pm_qos.h>
>>>    #include <linux/shmem_fs.h>
>>> +#include <linux/fence.h>
>>>
>>>    #include <drm/drmP.h>
>>>    #include <drm/intel-gtt.h>
>>> @@ -2353,7 +2354,11 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
>>>     * initial reference taken using kref_init
>>>     */
>>>    struct drm_i915_gem_request {
>>> -    struct kref ref;
>>> +    /**
>>> +     * Underlying object for implementing the signal/wait stuff.
>>> +     */
>>> +    struct fence fence;
>>> +    struct rcu_head rcu_head;
> fenece.rcu can be used, no need to duplicate. :)
Is that true? Does it not matter if someone else is already using the 
one in the fence structure for some other operation? Or is that one 
specifically there for the creator (us) to use and so guaranteed not to 
be used elsewhere?

>>>        /** On Which ring this request was generated */
>>>        struct drm_i915_private *i915;
>>> @@ -2455,7 +2460,13 @@ struct drm_i915_gem_request {
>>>    struct drm_i915_gem_request * __must_check
>>>    i915_gem_request_alloc(struct intel_engine_cs *engine,
>>>                   struct i915_gem_context *ctx);
>>> -void i915_gem_request_free(struct kref *req_ref);
>>> +
>>> +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>>> +                          bool lazy_coherency)
>>> +{
>>> +    return fence_is_signaled(&req->fence);
>>> +}
>> I would squash the following patch into this one, it makes no sense to keep a function with an unused parameter. And fewer patches in the series makes it less scary to review. :) Of course if they are also not too big. :D
> It's easier to read with all the function parameter changes in a separate patch.
Yeah, the guidance from on high has been that such things should be in 
separate patches.

>>> +
>>>    int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
>>>                       struct drm_file *file);
>>>
>>> @@ -2475,14 +2486,14 @@ static inline struct drm_i915_gem_request *
>>>    i915_gem_request_reference(struct drm_i915_gem_request *req)
>>>    {
>>>        if (req)
>>> -        kref_get(&req->ref);
>>> +        fence_get(&req->fence);
>>>        return req;
>>>    }
>>>
>>>    static inline void
>>>    i915_gem_request_unreference(struct drm_i915_gem_request *req)
>>>    {
>>> -    kref_put(&req->ref, i915_gem_request_free);
>>> +    fence_put(&req->fence);
>>>    }
>>>
>>>    static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>>> @@ -2498,12 +2509,6 @@ static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>>>    }
>>>
>>>    /*
>>> - * XXX: i915_gem_request_completed should be here but currently needs the
>>> - * definition of i915_seqno_passed() which is below. It will be moved in
>>> - * a later patch when the call to i915_seqno_passed() is obsoleted...
>>> - */
>>> -
>>> -/*
>>>     * A command that requires special handling by the command parser.
>>>     */
>>>    struct drm_i915_cmd_descriptor {
>>> @@ -3211,24 +3216,6 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2)
>>>        return (int32_t)(seq1 - seq2) >= 0;
>>>    }
>>>
>>> -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
>>> -                       bool lazy_coherency)
>>> -{
>>> -    if (!lazy_coherency && req->engine->irq_seqno_barrier)
>>> -        req->engine->irq_seqno_barrier(req->engine);
>>> -    return i915_seqno_passed(req->engine->get_seqno(req->engine),
>>> -                 req->previous_seqno);
>>> -}
>>> -
>>> -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>>> -                          bool lazy_coherency)
>>> -{
>>> -    if (!lazy_coherency && req->engine->irq_seqno_barrier)
>>> -        req->engine->irq_seqno_barrier(req->engine);
>>> -    return i915_seqno_passed(req->engine->get_seqno(req->engine),
>>> -                 req->seqno);
>>> -}
>>> -
>>>    int __must_check i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno);
>>>    int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index 57d3593..b67fd7c 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -1170,6 +1170,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>    {
>>>        unsigned long timeout;
>>>        unsigned cpu;
>>> +    uint32_t seqno;
>>>
>>>        /* When waiting for high frequency requests, e.g. during synchronous
>>>         * rendering split between the CPU and GPU, the finite amount of time
>>> @@ -1185,12 +1186,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>            return -EBUSY;
>>>
>>>        /* Only spin if we know the GPU is processing this request */
>>> -    if (!i915_gem_request_started(req, true))
>>> +    seqno = req->engine->get_seqno(req->engine);
>>> +    if (!i915_seqno_passed(seqno, req->previous_seqno))
>>>            return -EAGAIN;
>>>
>>>        timeout = local_clock_us(&cpu) + 5;
>>>        while (!need_resched()) {
>>> -        if (i915_gem_request_completed(req, true))
>>> +        seqno = req->engine->get_seqno(req->engine);
>>> +        if (i915_seqno_passed(seqno, req->seqno))
>>>                return 0;
>>>
>>>            if (signal_pending_state(state, current))
>>> @@ -1202,7 +1205,10 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>            cpu_relax_lowlatency();
>>>        }
>>>
>>> -    if (i915_gem_request_completed(req, false))
>>> +    if (req->engine->irq_seqno_barrier)
>>> +        req->engine->irq_seqno_barrier(req->engine);
>>> +    seqno = req->engine->get_seqno(req->engine);
>>> +    if (i915_seqno_passed(seqno, req->seqno))
>>>            return 0;
>>>
>>>        return -EAGAIN;
>>> @@ -2736,13 +2742,89 @@ static void i915_set_reset_status(struct drm_i915_private *dev_priv,
>>>        }
>>>    }
>>>
>>> -void i915_gem_request_free(struct kref *req_ref)
>>> +static void i915_gem_request_free_rcu(struct rcu_head *head)
>>>    {
>>> -    struct drm_i915_gem_request *req = container_of(req_ref,
>>> -                         typeof(*req), ref);
>>> +    struct drm_i915_gem_request *req;
>>> +
>>> +    req = container_of(head, typeof(*req), rcu_head);
>>>        kmem_cache_free(req->i915->requests, req);
>>>    }
>>>
>>> +static void i915_gem_request_free(struct fence *req_fence)
>>> +{
>>> +    struct drm_i915_gem_request *req;
>>> +
>>> +    req = container_of(req_fence, typeof(*req), fence);
>>> +    call_rcu(&req->rcu_head, i915_gem_request_free_rcu);
>>> +}
>>> +
>>> +static bool i915_gem_request_enable_signaling(struct fence *req_fence)
>>> +{
>>> +    /* Interrupt driven fences are not implemented yet.*/
>>> +    WARN(true, "This should not be called!");
>>> +    return true;
>>> +}
>>> +
>>> +static bool i915_gem_request_is_completed(struct fence *req_fence)
>>> +{
>>> +    struct drm_i915_gem_request *req = container_of(req_fence,
>>> +                         typeof(*req), fence);
>>> +    u32 seqno;
>>> +
>>> +    seqno = req->engine->get_seqno(req->engine);
>>> +
>>> +    return i915_seqno_passed(seqno, req->seqno);
>>> +}
>>> +
>>> +static const char *i915_gem_request_get_driver_name(struct fence *req_fence)
>>> +{
>>> +    return "i915";
>>> +}
>>> +
>>> +static const char *i915_gem_request_get_timeline_name(struct fence *req_fence)
>>> +{
>>> +    struct drm_i915_gem_request *req;
>>> +    struct i915_fence_timeline *timeline;
>>> +
>>> +    req = container_of(req_fence, typeof(*req), fence);
>>> +    timeline = &req->ctx->engine[req->engine->id].fence_timeline;
>>> +
>>> +    return timeline->name;
>>> +}
>>> +
>>> +static void i915_gem_request_timeline_value_str(struct fence *req_fence,
>>> +                        char *str, int size)
>>> +{
>>> +    struct drm_i915_gem_request *req;
>>> +
>>> +    req = container_of(req_fence, typeof(*req), fence);
>>> +
>>> +    /* Last signalled timeline value ??? */
>>> +    snprintf(str, size, "? [%d]"/*, timeline->value*/,
>> Reference to timeline->value a leftover from the past?
>>
>> Is the string format defined by the API? Asking because "? [%d]" looks intriguing.
It is basically just a debug string so can say anything we want. 
Convention is that it tells you where the timeline is up to. 
Unfortunately, there is no actual way to know that at present. The 
original Android implementation had the fence notification done via the 
timeline so engine->get_seqno() would be equivalent to 
timeline->current_value. However, all of that automatic update got 
removed with the switch to the 'official' struct fence instead of the 
Android only one. So now it is up to the timeline implementer do things 
how they see fit. And right now, keeping the timeline updated is 
unnecessary extra complication and overhead. When fence::seqno and 
request::seqno are unified then get_seqno() will be sufficient. Until 
then, it is just a pseudo value - hence the '? [%d]'. I have updated the 
comment in the code to explain it a bit better.


>>
>>> +         req->engine->get_seqno(req->engine));
>>> +}
>>> +
>>> +static void i915_gem_request_fence_value_str(struct fence *req_fence,
>>> +                         char *str, int size)
>>> +{
>>> +    struct drm_i915_gem_request *req;
>>> +
>>> +    req = container_of(req_fence, typeof(*req), fence);
>>> +
>>> +    snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);
>> Is it OK to put req->seqno in this one? OR it is just for debug anyway so it helps us and fence framework does not care?
> I think this is used for getting all info from debugfs only, so req->seqno is fine.
Yes, it is just for debugfs and trace output. And until the two values 
are unified, it is really useful to have them both present.

>>> +}
>>> +
>>> +static const struct fence_ops i915_gem_request_fops = {
>>> +    .enable_signaling    = i915_gem_request_enable_signaling,
>>> +    .signaled        = i915_gem_request_is_completed,
>>> +    .wait            = fence_default_wait,
>>> +    .release        = i915_gem_request_free,
>>> +    .get_driver_name    = i915_gem_request_get_driver_name,
>>> +    .get_timeline_name    = i915_gem_request_get_timeline_name,
>>> +    .fence_value_str    = i915_gem_request_fence_value_str,
>>> +    .timeline_value_str    = i915_gem_request_timeline_value_str,
>>> +};
>>> +
>>>    int i915_create_fence_timeline(struct drm_device *dev,
>>>                       struct i915_gem_context *ctx,
>>>                       struct intel_engine_cs *engine)
>>> @@ -2770,7 +2852,7 @@ int i915_create_fence_timeline(struct drm_device *dev,
>>>        return 0;
>>>    }
>>>
>>> -unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>>> +static unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>>>    {
>>>        unsigned seqno;
>>>
>>> @@ -2814,13 +2896,16 @@ __i915_gem_request_alloc(struct intel_engine_cs *engine,
>>>        if (ret)
>>>            goto err;
>>>
>>> -    kref_init(&req->ref);
>>>        req->i915 = dev_priv;
>>>        req->engine = engine;
>>>        req->reset_counter = reset_counter;
>>>        req->ctx  = ctx;
>>>        i915_gem_context_reference(req->ctx);
>>>
>>> +    fence_init(&req->fence, &i915_gem_request_fops, &engine->fence_lock,
>>> +           ctx->engine[engine->id].fence_timeline.fence_context,
>>> +           i915_fence_timeline_get_next_seqno(&ctx->engine[engine->id].fence_timeline));
>>> +
>>>        /*
>>>         * Reserve space in the ring buffer for all the commands required to
>>>         * eventually emit this request. This is to guarantee that the
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 14bcfb7..f126bcb 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -2030,6 +2030,7 @@ logical_ring_setup(struct drm_device *dev, enum intel_engine_id id)
>>>        INIT_LIST_HEAD(&engine->buffers);
>>>        INIT_LIST_HEAD(&engine->execlist_queue);
>>>        spin_lock_init(&engine->execlist_lock);
>>> +    spin_lock_init(&engine->fence_lock);
>>>
>>>        tasklet_init(&engine->irq_tasklet,
>>>                 intel_lrc_irq_handler, (unsigned long)engine);
>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> index 8d35a39..fbd3f12 100644
>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>> @@ -2254,6 +2254,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>>>        INIT_LIST_HEAD(&engine->request_list);
>>>        INIT_LIST_HEAD(&engine->execlist_queue);
>>>        INIT_LIST_HEAD(&engine->buffers);
>>> +    spin_lock_init(&engine->fence_lock);
>>>        i915_gem_batch_pool_init(dev, &engine->batch_pool);
>>>        memset(engine->semaphore.sync_seqno, 0,
>>>               sizeof(engine->semaphore.sync_seqno));
>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> index b33c876..3f39daf 100644
>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>> @@ -345,6 +345,8 @@ struct intel_engine_cs {
>>>         * to encode the command length in the header).
>>>         */
>>>        u32 (*get_cmd_length_mask)(u32 cmd_header);
>>> +
>>> +    spinlock_t fence_lock;
>> Why is this lock per-engine, and not for example per timeline? Aren't fencees living completely isolated in their per-context-per-engine domains? So presumably there is something somewhere which is shared outside that domain to need a lock at the engine level?
> All outstanding requests are added to engine->fence_signal_list in patch 4, which means a per engine lock is required.

 > Okay, a comment is required here to describe the lock then. All what it
 > protects and how and when it needs to be taken. Both from the i915
 > point of view and from the fence API side.

Will add a comment to say that the lock is used for the signal list as 
well as the fence itself.


>
>
Op 10-06-16 om 13:26 schreef John Harrison:
> On 07/06/2016 12:42, Maarten Lankhorst wrote:
>> Op 02-06-16 om 13:07 schreef Tvrtko Ursulin:
>>> On 01/06/16 18:07, John.C.Harrison@Intel.com wrote:
>>>> From: John Harrison <John.C.Harrison@Intel.com>
>>>>
>>>> There is a construct in the linux kernel called 'struct fence' that is
>>>> intended to keep track of work that is executed on hardware. I.e. it
>>>> solves the basic problem that the drivers 'struct
>>>> drm_i915_gem_request' is trying to address. The request structure does
>>>> quite a lot more than simply track the execution progress so is very
>>>> definitely still required. However, the basic completion status side
>>>> could be updated to use the ready made fence implementation and gain
>>>> all the advantages that provides.
>>>>
>>>> This patch makes the first step of integrating a struct fence into the
>>>> request. It replaces the explicit reference count with that of the
>>>> fence. It also replaces the 'is completed' test with the fence's
>>>> equivalent. Currently, that simply chains on to the original request
>>>> implementation. A future patch will improve this.
>>>>
>>>> v3: Updated after review comments by Tvrtko Ursulin. Added fence
>>>> context/seqno pair to the debugfs request info. Renamed fence 'driver
>>>> name' to just 'i915'. Removed BUG_ONs.
>>>>
>>>> v5: Changed seqno format in debugfs to %x rather than %u as that is
>>>> apparently the preferred appearance. Line wrapped some long lines to
>>>> keep the style checker happy.
>>>>
>>>> v6: Updated to newer nigthly and resolved conflicts. The biggest issue
>>>> was with the re-worked busy spin precursor to waiting on a request. In
>>>> particular, the addition of a 'request_started' helper function. This
>>>> has no corresponding concept within the fence framework. However, it
>>>> is only ever used in one place and the whole point of that place is to
>>>> always directly read the seqno for absolutely lowest latency possible.
>>>> So the simple solution is to just make the seqno test explicit at that
>>>> point now rather than later in the series (it was previously being
>>>> done anyway when fences become interrupt driven).
>>>>
>>>> v7: Rebased to newer nightly - lots of ring -> engine renaming and
>>>> interface change to get_seqno().
>>>>
>>>> v8: Rebased to newer nightly - no longer needs to worry about mutex
>>>> locking in the request free code path. Moved to after fence timeline
>>>> patch so no longer needs to add a horrid hack timeline.
>>>>
>>>> Removed commented out code block. Added support for possible RCU usage
>>>> of fence object (Review comments by Maarten Lankhorst).
>>>>
>>>> For: VIZ-5190
>>>> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
>>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>>> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>
>>> Was it an r-b or an ack from Jesse? If the former does it need a "(v?)" suffix, depending on the amount of code changes after his r-b?
> Going back through the old emails it looks like you are right, it was actually an ack on v3. What is the official tag for that?
>
>>>
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_debugfs.c     |   5 +-
>>>>    drivers/gpu/drm/i915/i915_drv.h         |  43 +++++---------
>>>>    drivers/gpu/drm/i915/i915_gem.c         | 101 +++++++++++++++++++++++++++++---
>>>>    drivers/gpu/drm/i915/intel_lrc.c        |   1 +
>>>>    drivers/gpu/drm/i915/intel_ringbuffer.c |   1 +
>>>>    drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
>>>>    6 files changed, 115 insertions(+), 38 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>>>> index ac7e569..844cc4b 100644
>>>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>>>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>>>> @@ -767,11 +767,12 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
>>>>                task = NULL;
>>>>                if (req->pid)
>>>>                    task = pid_task(req->pid, PIDTYPE_PID);
>>>> -            seq_printf(m, "    %x @ %d: %s [%d]\n",
>>>> +            seq_printf(m, "    %x @ %d: %s [%d], fence = %x:%x\n",
>>> In the previous patch fence context and seqno were %d in the timeline->name so it would probably be more consistent.
> It is trying to be consistent with the surroundings. Requests used to be printed as %d but for some reason got changed to be %x recently. Whereas the fence debug output is all %d still. Currently, the request::seqno and fence::seqno are different so maybe it doesn't really matter that they are printed differently but the ultimate aim is to merge the two into a single value. At which point all i915 code would be showing the value in one format but all fence code in another.
>
>>>
>>>>                       req->seqno,
>>>>                       (int) (jiffies - req->emitted_jiffies),
>>>>                       task ? task->comm : "<unknown>",
>>>> -                   task ? task->pid : -1);
>>>> +                   task ? task->pid : -1,
>>>> +                   req->fence.context, req->fence.seqno);
>> req->fence.context is 64-bits, will probably cause a compiler warning.
> Not at the point the above was written. Have rebased to the newer tree and updated to u64 / %llx.
>
>>>>                rcu_read_unlock();
>>>>            }
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>>> index a5f8ad8..905feae 100644
>>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>>> @@ -42,6 +42,7 @@
>>>>    #include <linux/kref.h>
>>>>    #include <linux/pm_qos.h>
>>>>    #include <linux/shmem_fs.h>
>>>> +#include <linux/fence.h>
>>>>
>>>>    #include <drm/drmP.h>
>>>>    #include <drm/intel-gtt.h>
>>>> @@ -2353,7 +2354,11 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
>>>>     * initial reference taken using kref_init
>>>>     */
>>>>    struct drm_i915_gem_request {
>>>> -    struct kref ref;
>>>> +    /**
>>>> +     * Underlying object for implementing the signal/wait stuff.
>>>> +     */
>>>> +    struct fence fence;
>>>> +    struct rcu_head rcu_head;
>> fenece.rcu can be used, no need to duplicate. :)
> Is that true? Does it not matter if someone else is already using the one in the fence structure for some other operation? Or is that one specifically there for the creator (us) to use and so guaranteed not to be used elsewhere?
Yes, it's unused when you implement your own ->release function and don't call fence_free.
>>>>        /** On Which ring this request was generated */
>>>>        struct drm_i915_private *i915;
>>>> @@ -2455,7 +2460,13 @@ struct drm_i915_gem_request {
>>>>    struct drm_i915_gem_request * __must_check
>>>>    i915_gem_request_alloc(struct intel_engine_cs *engine,
>>>>                   struct i915_gem_context *ctx);
>>>> -void i915_gem_request_free(struct kref *req_ref);
>>>> +
>>>> +static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>>>> +                          bool lazy_coherency)
>>>> +{
>>>> +    return fence_is_signaled(&req->fence);
>>>> +}
>>> I would squash the following patch into this one, it makes no sense to keep a function with an unused parameter. And fewer patches in the series makes it less scary to review. :) Of course if they are also not too big. :D
>> It's easier to read with all the function parameter changes in a separate patch.
> Yeah, the guidance from on high has been that such things should be in separate patches.
>
>>>> +
>>>>    int i915_gem_request_add_to_client(struct drm_i915_gem_request *req,
>>>>                       struct drm_file *file);
>>>>
>>>> @@ -2475,14 +2486,14 @@ static inline struct drm_i915_gem_request *
>>>>    i915_gem_request_reference(struct drm_i915_gem_request *req)
>>>>    {
>>>>        if (req)
>>>> -        kref_get(&req->ref);
>>>> +        fence_get(&req->fence);
>>>>        return req;
>>>>    }
>>>>
>>>>    static inline void
>>>>    i915_gem_request_unreference(struct drm_i915_gem_request *req)
>>>>    {
>>>> -    kref_put(&req->ref, i915_gem_request_free);
>>>> +    fence_put(&req->fence);
>>>>    }
>>>>
>>>>    static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>>>> @@ -2498,12 +2509,6 @@ static inline void i915_gem_request_assign(struct drm_i915_gem_request **pdst,
>>>>    }
>>>>
>>>>    /*
>>>> - * XXX: i915_gem_request_completed should be here but currently needs the
>>>> - * definition of i915_seqno_passed() which is below. It will be moved in
>>>> - * a later patch when the call to i915_seqno_passed() is obsoleted...
>>>> - */
>>>> -
>>>> -/*
>>>>     * A command that requires special handling by the command parser.
>>>>     */
>>>>    struct drm_i915_cmd_descriptor {
>>>> @@ -3211,24 +3216,6 @@ i915_seqno_passed(uint32_t seq1, uint32_t seq2)
>>>>        return (int32_t)(seq1 - seq2) >= 0;
>>>>    }
>>>>
>>>> -static inline bool i915_gem_request_started(struct drm_i915_gem_request *req,
>>>> -                       bool lazy_coherency)
>>>> -{
>>>> -    if (!lazy_coherency && req->engine->irq_seqno_barrier)
>>>> -        req->engine->irq_seqno_barrier(req->engine);
>>>> -    return i915_seqno_passed(req->engine->get_seqno(req->engine),
>>>> -                 req->previous_seqno);
>>>> -}
>>>> -
>>>> -static inline bool i915_gem_request_completed(struct drm_i915_gem_request *req,
>>>> -                          bool lazy_coherency)
>>>> -{
>>>> -    if (!lazy_coherency && req->engine->irq_seqno_barrier)
>>>> -        req->engine->irq_seqno_barrier(req->engine);
>>>> -    return i915_seqno_passed(req->engine->get_seqno(req->engine),
>>>> -                 req->seqno);
>>>> -}
>>>> -
>>>>    int __must_check i915_gem_get_seqno(struct drm_i915_private *dev_priv, u32 *seqno);
>>>>    int __must_check i915_gem_set_seqno(struct drm_device *dev, u32 seqno);
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>>> index 57d3593..b67fd7c 100644
>>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>>> @@ -1170,6 +1170,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>>    {
>>>>        unsigned long timeout;
>>>>        unsigned cpu;
>>>> +    uint32_t seqno;
>>>>
>>>>        /* When waiting for high frequency requests, e.g. during synchronous
>>>>         * rendering split between the CPU and GPU, the finite amount of time
>>>> @@ -1185,12 +1186,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>>            return -EBUSY;
>>>>
>>>>        /* Only spin if we know the GPU is processing this request */
>>>> -    if (!i915_gem_request_started(req, true))
>>>> +    seqno = req->engine->get_seqno(req->engine);
>>>> +    if (!i915_seqno_passed(seqno, req->previous_seqno))
>>>>            return -EAGAIN;
>>>>
>>>>        timeout = local_clock_us(&cpu) + 5;
>>>>        while (!need_resched()) {
>>>> -        if (i915_gem_request_completed(req, true))
>>>> +        seqno = req->engine->get_seqno(req->engine);
>>>> +        if (i915_seqno_passed(seqno, req->seqno))
>>>>                return 0;
>>>>
>>>>            if (signal_pending_state(state, current))
>>>> @@ -1202,7 +1205,10 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>>            cpu_relax_lowlatency();
>>>>        }
>>>>
>>>> -    if (i915_gem_request_completed(req, false))
>>>> +    if (req->engine->irq_seqno_barrier)
>>>> +        req->engine->irq_seqno_barrier(req->engine);
>>>> +    seqno = req->engine->get_seqno(req->engine);
>>>> +    if (i915_seqno_passed(seqno, req->seqno))
>>>>            return 0;
>>>>
>>>>        return -EAGAIN;
>>>> @@ -2736,13 +2742,89 @@ static void i915_set_reset_status(struct drm_i915_private *dev_priv,
>>>>        }
>>>>    }
>>>>
>>>> -void i915_gem_request_free(struct kref *req_ref)
>>>> +static void i915_gem_request_free_rcu(struct rcu_head *head)
>>>>    {
>>>> -    struct drm_i915_gem_request *req = container_of(req_ref,
>>>> -                         typeof(*req), ref);
>>>> +    struct drm_i915_gem_request *req;
>>>> +
>>>> +    req = container_of(head, typeof(*req), rcu_head);
>>>>        kmem_cache_free(req->i915->requests, req);
>>>>    }
>>>>
>>>> +static void i915_gem_request_free(struct fence *req_fence)
>>>> +{
>>>> +    struct drm_i915_gem_request *req;
>>>> +
>>>> +    req = container_of(req_fence, typeof(*req), fence);
>>>> +    call_rcu(&req->rcu_head, i915_gem_request_free_rcu);
>>>> +}
>>>> +
>>>> +static bool i915_gem_request_enable_signaling(struct fence *req_fence)
>>>> +{
>>>> +    /* Interrupt driven fences are not implemented yet.*/
>>>> +    WARN(true, "This should not be called!");
>>>> +    return true;
>>>> +}
>>>> +
>>>> +static bool i915_gem_request_is_completed(struct fence *req_fence)
>>>> +{
>>>> +    struct drm_i915_gem_request *req = container_of(req_fence,
>>>> +                         typeof(*req), fence);
>>>> +    u32 seqno;
>>>> +
>>>> +    seqno = req->engine->get_seqno(req->engine);
>>>> +
>>>> +    return i915_seqno_passed(seqno, req->seqno);
>>>> +}
>>>> +
>>>> +static const char *i915_gem_request_get_driver_name(struct fence *req_fence)
>>>> +{
>>>> +    return "i915";
>>>> +}
>>>> +
>>>> +static const char *i915_gem_request_get_timeline_name(struct fence *req_fence)
>>>> +{
>>>> +    struct drm_i915_gem_request *req;
>>>> +    struct i915_fence_timeline *timeline;
>>>> +
>>>> +    req = container_of(req_fence, typeof(*req), fence);
>>>> +    timeline = &req->ctx->engine[req->engine->id].fence_timeline;
>>>> +
>>>> +    return timeline->name;
>>>> +}
>>>> +
>>>> +static void i915_gem_request_timeline_value_str(struct fence *req_fence,
>>>> +                        char *str, int size)
>>>> +{
>>>> +    struct drm_i915_gem_request *req;
>>>> +
>>>> +    req = container_of(req_fence, typeof(*req), fence);
>>>> +
>>>> +    /* Last signalled timeline value ??? */
>>>> +    snprintf(str, size, "? [%d]"/*, timeline->value*/,
>>> Reference to timeline->value a leftover from the past?
>>>
>>> Is the string format defined by the API? Asking because "? [%d]" looks intriguing.
> It is basically just a debug string so can say anything we want. Convention is that it tells you where the timeline is up to. Unfortunately, there is no actual way to know that at present. The original Android implementation had the fence notification done via the timeline so engine->get_seqno() would be equivalent to timeline->current_value. However, all of that automatic update got removed with the switch to the 'official' struct fence instead of the Android only one. So now it is up to the timeline implementer do things how they see fit. And right now, keeping the timeline updated is unnecessary extra complication and overhead. When fence::seqno and request::seqno are unified then get_seqno() will be sufficient. Until then, it is just a pseudo value - hence the '? [%d]'. I have updated the comment in the code to explain it a bit better.
>
>
>>>
>>>> +         req->engine->get_seqno(req->engine));
>>>> +}
>>>> +
>>>> +static void i915_gem_request_fence_value_str(struct fence *req_fence,
>>>> +                         char *str, int size)
>>>> +{
>>>> +    struct drm_i915_gem_request *req;
>>>> +
>>>> +    req = container_of(req_fence, typeof(*req), fence);
>>>> +
>>>> +    snprintf(str, size, "%d [%d]", req->fence.seqno, req->seqno);
>>> Is it OK to put req->seqno in this one? OR it is just for debug anyway so it helps us and fence framework does not care?
>> I think this is used for getting all info from debugfs only, so req->seqno is fine.
> Yes, it is just for debugfs and trace output. And until the two values are unified, it is really useful to have them both present.
>
>>>> +}
>>>> +
>>>> +static const struct fence_ops i915_gem_request_fops = {
>>>> +    .enable_signaling    = i915_gem_request_enable_signaling,
>>>> +    .signaled        = i915_gem_request_is_completed,
>>>> +    .wait            = fence_default_wait,
>>>> +    .release        = i915_gem_request_free,
>>>> +    .get_driver_name    = i915_gem_request_get_driver_name,
>>>> +    .get_timeline_name    = i915_gem_request_get_timeline_name,
>>>> +    .fence_value_str    = i915_gem_request_fence_value_str,
>>>> +    .timeline_value_str    = i915_gem_request_timeline_value_str,
>>>> +};
>>>> +
>>>>    int i915_create_fence_timeline(struct drm_device *dev,
>>>>                       struct i915_gem_context *ctx,
>>>>                       struct intel_engine_cs *engine)
>>>> @@ -2770,7 +2852,7 @@ int i915_create_fence_timeline(struct drm_device *dev,
>>>>        return 0;
>>>>    }
>>>>
>>>> -unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>>>> +static unsigned i915_fence_timeline_get_next_seqno(struct i915_fence_timeline *timeline)
>>>>    {
>>>>        unsigned seqno;
>>>>
>>>> @@ -2814,13 +2896,16 @@ __i915_gem_request_alloc(struct intel_engine_cs *engine,
>>>>        if (ret)
>>>>            goto err;
>>>>
>>>> -    kref_init(&req->ref);
>>>>        req->i915 = dev_priv;
>>>>        req->engine = engine;
>>>>        req->reset_counter = reset_counter;
>>>>        req->ctx  = ctx;
>>>>        i915_gem_context_reference(req->ctx);
>>>>
>>>> +    fence_init(&req->fence, &i915_gem_request_fops, &engine->fence_lock,
>>>> +           ctx->engine[engine->id].fence_timeline.fence_context,
>>>> +           i915_fence_timeline_get_next_seqno(&ctx->engine[engine->id].fence_timeline));
>>>> +
>>>>        /*
>>>>         * Reserve space in the ring buffer for all the commands required to
>>>>         * eventually emit this request. This is to guarantee that the
>>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>>> index 14bcfb7..f126bcb 100644
>>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>>> @@ -2030,6 +2030,7 @@ logical_ring_setup(struct drm_device *dev, enum intel_engine_id id)
>>>>        INIT_LIST_HEAD(&engine->buffers);
>>>>        INIT_LIST_HEAD(&engine->execlist_queue);
>>>>        spin_lock_init(&engine->execlist_lock);
>>>> +    spin_lock_init(&engine->fence_lock);
>>>>
>>>>        tasklet_init(&engine->irq_tasklet,
>>>>                 intel_lrc_irq_handler, (unsigned long)engine);
>>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>>> index 8d35a39..fbd3f12 100644
>>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>>>> @@ -2254,6 +2254,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>>>>        INIT_LIST_HEAD(&engine->request_list);
>>>>        INIT_LIST_HEAD(&engine->execlist_queue);
>>>>        INIT_LIST_HEAD(&engine->buffers);
>>>> +    spin_lock_init(&engine->fence_lock);
>>>>        i915_gem_batch_pool_init(dev, &engine->batch_pool);
>>>>        memset(engine->semaphore.sync_seqno, 0,
>>>>               sizeof(engine->semaphore.sync_seqno));
>>>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>>> index b33c876..3f39daf 100644
>>>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>>>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>>>> @@ -345,6 +345,8 @@ struct intel_engine_cs {
>>>>         * to encode the command length in the header).
>>>>         */
>>>>        u32 (*get_cmd_length_mask)(u32 cmd_header);
>>>> +
>>>> +    spinlock_t fence_lock;
>>> Why is this lock per-engine, and not for example per timeline? Aren't fencees living completely isolated in their per-context-per-engine domains? So presumably there is something somewhere which is shared outside that domain to need a lock at the engine level?
>> All outstanding requests are added to engine->fence_signal_list in patch 4, which means a per engine lock is required.
>
> > Okay, a comment is required here to describe the lock then. All what it
> > protects and how and when it needs to be taken. Both from the i915
> > point of view and from the fence API side.
>
> Will add a comment to say that the lock is used for the signal list as well as the fence itself.
>
>
>>
>>
>