[RFC,4/4] drm/i915: Stop tracking execlists retired requests

Submitted by Tvrtko Ursulin on April 8, 2016, 1:54 p.m.

Details

Message ID 1460123698-16832-5-git-send-email-tvrtko.ursulin@linux.intel.com
State New
Headers show
Series "Eliminating the execlist retired request queue" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Tvrtko Ursulin April 8, 2016, 1:54 p.m.
From: Tvrtko Ursulin <tvrtko@ursulin.net>

With the previous patch having extended the pinned lifetime of
contexts by referencing the previous context from the current
request until the latter is retired (completed by the GPU),
we can now remove usage of execlist retired queue entirely.

This is because the above now guarantees that all execlist
object access requirements are satisfied by this new tracking,
and we can stop taking additional references and stop keeping
requests on the execlists retired queue.

The latter was a source of significant scalability issues in
the driver, causing performance hits on some tests. The most
dramatic of these was igt/gem_close_race, whose run time of
tens of minutes is now reduced to tens of seconds.

Signed-off-by: Tvrtko Ursulin <tvrtko@ursulin.net>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c         | 10 +------
 drivers/gpu/drm/i915/intel_lrc.c        | 46 ++++++++++-----------------------
 drivers/gpu/drm/i915/intel_lrc.h        |  2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
 4 files changed, 16 insertions(+), 43 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 118911ce7231..2f4bfe93b20c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2901,13 +2901,7 @@  static void i915_gem_reset_engine_cleanup(struct drm_i915_private *dev_priv,
 		/* Ensure irq handler finishes or is cancelled. */
 		tasklet_kill(&engine->irq_tasklet);
 
-		spin_lock_bh(&engine->execlist_lock);
-		/* list_splice_tail_init checks for empty lists */
-		list_splice_tail_init(&engine->execlist_queue,
-				      &engine->execlist_retired_req_list);
-		spin_unlock_bh(&engine->execlist_lock);
-
-		intel_execlists_retire_requests(engine);
+		intel_execlists_cancel_requests(engine);
 	}
 
 	/*
@@ -3029,8 +3023,6 @@  i915_gem_retire_requests(struct drm_device *dev)
 			spin_lock_bh(&engine->execlist_lock);
 			idle &= list_empty(&engine->execlist_queue);
 			spin_unlock_bh(&engine->execlist_lock);
-
-			intel_execlists_retire_requests(engine);
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e967943ce49..f241cf6e4227 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -456,8 +456,8 @@  static void execlists_context_unqueue(struct intel_engine_cs *engine)
 			/* Same ctx: ignore first request, as second request
 			 * will update tail past first request's workload */
 			cursor->elsp_submitted = req0->elsp_submitted;
-			list_move_tail(&req0->execlist_link,
-				       &engine->execlist_retired_req_list);
+			list_del(&req0->execlist_link);
+			i915_gem_request_unreference(req0);
 			req0 = cursor;
 		} else {
 			req1 = cursor;
@@ -499,19 +499,17 @@  execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
 					    struct drm_i915_gem_request,
 					    execlist_link);
 
-	if (!head_req)
-		return 0;
-
-	if (unlikely(intel_execlists_ctx_id(head_req->ctx, engine) != request_id))
-		return 0;
+	if (WARN_ON(!head_req ||
+	    (intel_execlists_ctx_id(head_req->ctx, engine) != request_id)))
+               return 0;
 
 	WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
 
 	if (--head_req->elsp_submitted > 0)
 		return 0;
 
-	list_move_tail(&head_req->execlist_link,
-		       &engine->execlist_retired_req_list);
+	list_del(&head_req->execlist_link);
+	i915_gem_request_unreference(head_req);
 
 	return 1;
 }
@@ -615,11 +613,6 @@  static void execlists_context_queue(struct drm_i915_gem_request *request)
 	struct drm_i915_gem_request *cursor;
 	int num_elements = 0;
 
-	if (request->ctx != request->i915->kernel_context)
-		intel_lr_context_pin(request->ctx, engine);
-
-	i915_gem_request_reference(request);
-
 	spin_lock_bh(&engine->execlist_lock);
 
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
@@ -636,11 +629,12 @@  static void execlists_context_queue(struct drm_i915_gem_request *request)
 		if (request->ctx == tail_req->ctx) {
 			WARN(tail_req->elsp_submitted != 0,
 				"More than 2 already-submitted reqs queued\n");
-			list_move_tail(&tail_req->execlist_link,
-				       &engine->execlist_retired_req_list);
+			list_del(&tail_req->execlist_link);
+			i915_gem_request_unreference(tail_req);
 		}
 	}
 
+	i915_gem_request_reference(request);
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (num_elements == 0)
 		execlists_context_unqueue(engine);
@@ -1039,28 +1033,18 @@  int intel_execlists_submission(struct i915_execbuffer_params *params,
 	return 0;
 }
 
-void intel_execlists_retire_requests(struct intel_engine_cs *engine)
+void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *req, *tmp;
-	struct list_head retired_list;
+	LIST_HEAD(cancel_list);
 
 	WARN_ON(!mutex_is_locked(&engine->dev->struct_mutex));
-	if (list_empty(&engine->execlist_retired_req_list))
-		return;
 
-	INIT_LIST_HEAD(&retired_list);
 	spin_lock_bh(&engine->execlist_lock);
-	list_replace_init(&engine->execlist_retired_req_list, &retired_list);
+	list_replace_init(&engine->execlist_queue, &cancel_list);
 	spin_unlock_bh(&engine->execlist_lock);
 
-	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
-		struct intel_context *ctx = req->ctx;
-		struct drm_i915_gem_object *ctx_obj =
-				ctx->engine[engine->id].state;
-
-		if (ctx_obj && (ctx != req->i915->kernel_context))
-			intel_lr_context_unpin(ctx, engine);
-
+	list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
 		list_del(&req->execlist_link);
 		i915_gem_request_unreference(req);
 	}
@@ -1180,7 +1164,6 @@  void intel_lr_context_unpin(struct intel_context *ctx,
 		intel_unpin_ringbuffer_obj(ctx->engine[engine->id].ringbuf);
 		i915_gem_object_ggtt_unpin(ctx_obj);
 		ctx->engine[engine->id].lrc_vma = NULL;
-		ctx->engine[engine->id].lrc_desc = 0;
 		ctx->engine[engine->id].lrc_reg_state = NULL;
 
 		i915_gem_context_unreference(ctx);
@@ -2110,7 +2093,6 @@  logical_ring_init(struct drm_device *dev, struct intel_engine_cs *engine)
 
 	INIT_LIST_HEAD(&engine->buffers);
 	INIT_LIST_HEAD(&engine->execlist_queue);
-	INIT_LIST_HEAD(&engine->execlist_retired_req_list);
 	spin_lock_init(&engine->execlist_lock);
 
 	tasklet_init(&engine->irq_tasklet,
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 8de1ea536ad4..04cee14f2bb2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -119,6 +119,6 @@  int intel_execlists_submission(struct i915_execbuffer_params *params,
 			       struct drm_i915_gem_execbuffer2 *args,
 			       struct list_head *vmas);
 
-void intel_execlists_retire_requests(struct intel_engine_cs *engine);
+void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
 
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3a11705222fc..712c754bbd4e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -269,7 +269,6 @@  struct  intel_engine_cs {
 	struct tasklet_struct irq_tasklet;
 	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
 	struct list_head execlist_queue;
-	struct list_head execlist_retired_req_list;
 	unsigned int fw_domains;
 	unsigned int next_context_status_buffer;
 	unsigned int idle_lite_restore_wa;

Comments

On Fri, Apr 08, 2016 at 02:54:58PM +0100, Tvrtko Ursulin wrote:
> @@ -615,11 +613,6 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
>  	struct drm_i915_gem_request *cursor;
>  	int num_elements = 0;
>  
> -	if (request->ctx != request->i915->kernel_context)
> -		intel_lr_context_pin(request->ctx, engine);
> -
> -	i915_gem_request_reference(request);
> -
>  	spin_lock_bh(&engine->execlist_lock);
>  
>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
> @@ -636,11 +629,12 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
>  		if (request->ctx == tail_req->ctx) {
>  			WARN(tail_req->elsp_submitted != 0,
>  				"More than 2 already-submitted reqs queued\n");
> -			list_move_tail(&tail_req->execlist_link,
> -				       &engine->execlist_retired_req_list);
> +			list_del(&tail_req->execlist_link);
> +			i915_gem_request_unreference(tail_req);
>  		}
>  	}
>  
> +	i915_gem_request_reference(request);
>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);

If you want to get truly radical, we do not need the ref on the request
until it is submitted to hardware. (As the request cannot be retired
until it has done so, it can leave the execlist_queue until we commit it
to hw, or perform the cancel).

Lgtm,
-Chris
On 08/04/16 15:57, Chris Wilson wrote:
> On Fri, Apr 08, 2016 at 02:54:58PM +0100, Tvrtko Ursulin wrote:
>> @@ -615,11 +613,6 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
>>   	struct drm_i915_gem_request *cursor;
>>   	int num_elements = 0;
>>
>> -	if (request->ctx != request->i915->kernel_context)
>> -		intel_lr_context_pin(request->ctx, engine);
>> -
>> -	i915_gem_request_reference(request);
>> -
>>   	spin_lock_bh(&engine->execlist_lock);
>>
>>   	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
>> @@ -636,11 +629,12 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
>>   		if (request->ctx == tail_req->ctx) {
>>   			WARN(tail_req->elsp_submitted != 0,
>>   				"More than 2 already-submitted reqs queued\n");
>> -			list_move_tail(&tail_req->execlist_link,
>> -				       &engine->execlist_retired_req_list);
>> +			list_del(&tail_req->execlist_link);
>> +			i915_gem_request_unreference(tail_req);
>>   		}
>>   	}
>>
>> +	i915_gem_request_reference(request);
>>   	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>
> If you want to get truly radical, we do not need the ref on the request
> until it is submitted to hardware. (As the request cannot be retired
> until it has done so, it can leave the execlist_queue until we commit it
> to hw, or perform the cancel).

Don't know. It is simple and nice that reference is tied to presence on 
execlist_queue.

More importantly, the patch as presented has a flaw in that it dereferences 
req->ctx from execlists_check_remove_request, where the context pin may 
already have disappeared due to context complete interrupts falling behind 
when coalescing.

I will need to cache the ctx_id in the request, I think. It is fine to do 
that since the ctx id must be unique and stable for a request.

Maybe even I should pull in your patch which makes ctx ids stable and 
persistent aligned with context existence and not pin. Think I've seen 
something like that somewhere.

Regards,

Tvrtko
On Mon, Apr 11, 2016 at 10:10:56AM +0100, Tvrtko Ursulin wrote:
> 
> On 08/04/16 15:57, Chris Wilson wrote:
> >On Fri, Apr 08, 2016 at 02:54:58PM +0100, Tvrtko Ursulin wrote:
> >>@@ -615,11 +613,6 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
> >>  	struct drm_i915_gem_request *cursor;
> >>  	int num_elements = 0;
> >>
> >>-	if (request->ctx != request->i915->kernel_context)
> >>-		intel_lr_context_pin(request->ctx, engine);
> >>-
> >>-	i915_gem_request_reference(request);
> >>-
> >>  	spin_lock_bh(&engine->execlist_lock);
> >>
> >>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
> >>@@ -636,11 +629,12 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
> >>  		if (request->ctx == tail_req->ctx) {
> >>  			WARN(tail_req->elsp_submitted != 0,
> >>  				"More than 2 already-submitted reqs queued\n");
> >>-			list_move_tail(&tail_req->execlist_link,
> >>-				       &engine->execlist_retired_req_list);
> >>+			list_del(&tail_req->execlist_link);
> >>+			i915_gem_request_unreference(tail_req);
> >>  		}
> >>  	}
> >>
> >>+	i915_gem_request_reference(request);
> >>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> >
> >If you want to get truly radical, we do not need the ref on the request
> >until it is submitted to hardware. (As the request cannot be retired
> >until it has done so, it can leave the execlist_queue until we commit it
> >to hw, or perform the cancel).
> 
> Don't know. It is simple and nice that reference is tied to presence
> on execlist_queue.
> 
> More importantly, the patch as presented has a flaw that it
> dereferences req->ctx from execlists_check_remove_request where the
> context pin may have disappeared already due context complete
> interrupts getting behind when coallescing.
> 
> I will need to cache ctx_id in the request I think. It is fine do to
> that since ctx id must be unique and stable for a request.
> 
> Maybe even I should pull in your patch which makes ctx ids stable
> and persistent aligned with context existence and not pin. Think
> I've seen something like that somewhere.

Yes, both OA, PASID and vGPU depend upon having stable ctx-id for the
lifetime of the context. Dave expressed a concern that the GuC maybe
interpretting it as the LRCA, but as far as I can see, lrc->ring_lcra
and lrc->context_id are distinct and we never use the global context
identifier itself (the closest is the low 32bits of lrc_desc which do
not include our unique id).
-Chris
On Mon, Apr 11, 2016 at 10:10:56AM +0100, Tvrtko Ursulin wrote:
> 
> On 08/04/16 15:57, Chris Wilson wrote:
> >On Fri, Apr 08, 2016 at 02:54:58PM +0100, Tvrtko Ursulin wrote:
> >>@@ -615,11 +613,6 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
> >>  	struct drm_i915_gem_request *cursor;
> >>  	int num_elements = 0;
> >>
> >>-	if (request->ctx != request->i915->kernel_context)
> >>-		intel_lr_context_pin(request->ctx, engine);
> >>-
> >>-	i915_gem_request_reference(request);
> >>-
> >>  	spin_lock_bh(&engine->execlist_lock);
> >>
> >>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
> >>@@ -636,11 +629,12 @@ static void execlists_context_queue(struct drm_i915_gem_request *request)
> >>  		if (request->ctx == tail_req->ctx) {
> >>  			WARN(tail_req->elsp_submitted != 0,
> >>  				"More than 2 already-submitted reqs queued\n");
> >>-			list_move_tail(&tail_req->execlist_link,
> >>-				       &engine->execlist_retired_req_list);
> >>+			list_del(&tail_req->execlist_link);
> >>+			i915_gem_request_unreference(tail_req);
> >>  		}
> >>  	}
> >>
> >>+	i915_gem_request_reference(request);
> >>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> >
> >If you want to get truly radical, we do not need the ref on the request
> >until it is submitted to hardware. (As the request cannot be retired
> >until it has done so, it can leave the execlist_queue until we commit it
> >to hw, or perform the cancel).
> 
> Don't know. It is simple and nice that reference is tied to presence
> on execlist_queue.
> 
> More importantly, the patch as presented has a flaw that it
> dereferences req->ctx from execlists_check_remove_request where the
> context pin may have disappeared already due context complete
> interrupts getting behind when coallescing.
> 
> I will need to cache ctx_id in the request I think. It is fine do to
> that since ctx id must be unique and stable for a request.

For comparison, tracking by requests rather than contexts:

https://cgit.freedesktop.org/~ickle/linux-2.6/commit/?h=tasklet&id=e23791ef88636aa6e3f850b61ab2c4e027af0e52
-Chris
>