[v7,5/9] drm/i915: vgpu context submission pv optimization

Submitted by Xiaolin Zhang on July 8, 2019, 1:35 a.m.

Details

Message ID 1562549722-27046-6-git-send-email-xiaolin.zhang@intel.com
State New
Headers show
Series "i915 vgpu PV to improve vgpu performance" ( rev: 1 ) in Intel GVT devel

Not browsing as part of any series.

Commit Message

Xiaolin Zhang July 8, 2019, 1:35 a.m.
This is a performance optimization that overrides the actual submission backend
in order to eliminate execlists CSB processing and reduce the number of mmio
traps for workload submission, avoiding the context switch interrupt by talking
to GVT via a PV submission notification mechanism between guest and GVT.

Use PV_SUBMISSION to control this level of pv optimization.

v0: RFC.
v1: rebase.
v2: added pv ops for pv context submission. to maximize code reuse,
introduced 2 more ops (submit_ports & preempt_context) instead of 1 op
(set_default_submission) in engine structure. pv version of
submit_ports and preempt_context implemented.
v3:
1. to reduce more code duplication, code refactor and replaced 2 ops
"submit_ports & preempt_context" from v2 by 1 op "write_desc"
in engine structure. pv version of write_desc implemented.
2. added VGT_G2V_ELSP_SUBMIT for g2v pv notification.
v4: implemented pv elsp submission tasklet as the backend workload
submission by talking to GVT with the PV notification mechanism and renamed
VGT_G2V_ELSP_SUBMIT to VGT_G2V_PV_SUBMISSION.
v5: addressed v4 comments from Chris, intel_pv_submission.c added.
v6: addressed v5 comments from Chris, replaced engine id by hw_id.
v7: rebase.

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 drivers/gpu/drm/i915/Makefile              |   2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c        |   8 +-
 drivers/gpu/drm/i915/i915_vgpu.c           |  15 ++-
 drivers/gpu/drm/i915/i915_vgpu.h           |  15 +++
 drivers/gpu/drm/i915/intel_pv_submission.c | 189 +++++++++++++++++++++++++++++
 5 files changed, 225 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_pv_submission.c

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 5266dbe..6e13f7c 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -244,7 +244,7 @@  i915-$(CONFIG_DRM_I915_SELFTEST) += \
 	selftests/igt_spinner.o
 
 # virtual gpu code
-i915-y += i915_vgpu.o
+i915-y += i915_vgpu.o intel_pv_submission.o
 
 ifeq ($(CONFIG_DRM_I915_GVT),y)
 i915-y += intel_gvt.o
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e1ae139..48a9b28 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2702,10 +2702,14 @@  void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 	engine->unpark = NULL;
 
 	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
-	if (!intel_vgpu_active(engine->i915))
-		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
+	engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
 	if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
 		engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+
+	if (intel_vgpu_active(engine->i915)) {
+		engine->flags &= ~I915_ENGINE_HAS_SEMAPHORES;
+		intel_vgpu_config_pv_caps(engine->i915,	PV_SUBMISSION, engine);
+	}
 }
 
 static void execlists_destroy(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index 2aad0b8..c628be8 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -97,7 +97,7 @@  void i915_detect_vgpu(struct drm_i915_private *dev_priv)
 	dev_priv->vgpu.active = true;
 
 	/* guest driver PV capability */
-	dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE;
+	dev_priv->vgpu.pv_caps = PV_PPGTT_UPDATE | PV_SUBMISSION;
 
 	if (!intel_vgpu_check_pv_caps(dev_priv, shared_area)) {
 		DRM_INFO("Virtual GPU for Intel GVT-g detected.\n");
@@ -380,6 +380,7 @@  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		enum pv_caps cap, void *data)
 {
 	struct i915_ppgtt *ppgtt;
+	struct intel_engine_cs *engine;
 
 	if (!intel_vgpu_enabled_pv_caps(dev_priv, cap))
 		return;
@@ -390,6 +391,11 @@  void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		ppgtt->vm.insert_entries = gen8_ppgtt_insert_4lvl_pv;
 		ppgtt->vm.clear_range = gen8_ppgtt_clear_4lvl_pv;
 	}
+
+	if (cap == PV_SUBMISSION) {
+		engine = (struct intel_engine_cs *)data;
+		vgpu_set_pv_submission(engine);
+	}
 }
 
 /**
@@ -584,6 +590,8 @@  static int intel_vgpu_setup_shared_page(struct drm_i915_private *dev_priv,
 	struct gvt_shared_page *base;
 	u64 gpa;
 	u16 ver_maj, ver_min;
+	int i;
+	u32 size;
 
 	/* We allocate 1 page shared between guest and GVT for data exchange.
 	 *       ___________.....................
@@ -657,6 +665,11 @@  static int intel_vgpu_setup_shared_page(struct drm_i915_private *dev_priv,
 	pv->notify = intel_vgpu_pv_notify_mmio;
 	mutex_init(&pv->send_mutex);
 
+	/* setup PV per engine data exchange ptr */
+	size = sizeof(struct pv_submission);
+	for (i = 0; i < PV_MAX_ENGINES_NUM; i++)
+		pv->pv_elsp[i] = (void *)base + PV_ELSP_OFF +  size * i;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_vgpu.h b/drivers/gpu/drm/i915/i915_vgpu.h
index 92c84d3..2cafb30 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.h
+++ b/drivers/gpu/drm/i915/i915_vgpu.h
@@ -29,6 +29,8 @@ 
 
 #define PV_MAJOR		1
 #define PV_MINOR		0
+#define PV_MAX_ENGINES_NUM	(VECS1_HW + 1)
+#define PV_ELSP_OFF		(PAGE_SIZE/8)
 #define PV_DESC_OFF		(PAGE_SIZE/4)
 #define PV_CMD_OFF		(PAGE_SIZE/2)
 
@@ -37,6 +39,7 @@ 
  */
 enum pv_caps {
 	PV_PPGTT_UPDATE = 0x1,
+	PV_SUBMISSION = 0x2,
 };
 
 /* PV actions */
@@ -45,6 +48,7 @@  enum intel_vgpu_pv_action {
 	PV_ACTION_PPGTT_L4_ALLOC,
 	PV_ACTION_PPGTT_L4_CLEAR,
 	PV_ACTION_PPGTT_L4_INSERT,
+	PV_ACTION_ELSP_SUBMISSION,
 };
 
 /*
@@ -56,6 +60,12 @@  struct gvt_shared_page {
 	u16 ver_minor;
 };
 
+/*
+ * Per-engine workload submission slot shared between guest and GVT.
+ * The guest fills @descs with the ELSP context descriptors and then
+ * notifies GVT; GVT sets @submitted as the ack, which the guest clears
+ * before the next submission (see pv_submit()).
+ */
+struct pv_submission {
+	u64 descs[EXECLIST_MAX_PORTS];
+	bool submitted;
+};
+
 /*
  * Definition of the command transport message header (DW0)
  *
@@ -100,6 +110,9 @@  struct i915_virtual_gpu_pv {
 	struct gvt_shared_page *shared_page;
 	bool enabled;
 
+	/* per engine PV workload submission data */
+	struct pv_submission *pv_elsp[PV_MAX_ENGINES_NUM];
+
 	/* PV command buffer support */
 	struct vgpu_pv_ct_buffer ctb;
 	u32 next_fence;
@@ -163,4 +176,6 @@  bool intel_vgpu_check_pv_caps(struct drm_i915_private *dev_priv,
 		void __iomem *shared_area);
 void intel_vgpu_config_pv_caps(struct drm_i915_private *dev_priv,
 		enum pv_caps cap, void *data);
+void vgpu_set_pv_submission(struct intel_engine_cs *engine);
+
 #endif /* _I915_VGPU_H_ */
diff --git a/drivers/gpu/drm/i915/intel_pv_submission.c b/drivers/gpu/drm/i915/intel_pv_submission.c
new file mode 100644
index 0000000..18b491b
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_pv_submission.c
@@ -0,0 +1,189 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include "intel_drv.h"
+#include "i915_vgpu.h"
+#include "gt/intel_lrc_reg.h"
+#include "gt/intel_engine_pm.h"
+
+/*
+ * Propagate the request's ring tail into its saved context image and
+ * return the context descriptor to be handed to GVT for submission.
+ */
+static u64 execlists_update_context(struct i915_request *rq)
+{
+	struct intel_context *ce = rq->hw_context;
+	u32 *reg_state = ce->lrc_reg_state;
+
+	/* CTX_RING_TAIL+1 is the value slot of the (reg, value) pair */
+	reg_state[CTX_RING_TAIL+1] = intel_ring_set_tail(rq->ring, rq->tail);
+
+	return ce->lrc_desc;
+}
+
+/* Convert a node of the execlists priority rbtree back to its priolist. */
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+	return rb_entry(rb, struct i915_priolist, node);
+}
+
+/*
+ * Hand the [out, end) run of requests to GVT: write their context
+ * descriptors into this engine's shared-page slot and trap a single
+ * mmio write as the g2v notification, then spin for the host's ack.
+ *
+ * Called from the submission tasklet with engine->active.lock held,
+ * so we must not sleep while waiting for the ack.
+ */
+static void pv_submit(struct intel_engine_cs *engine,
+	       struct i915_request **out,
+	       struct i915_request **end)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
+	struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
+	struct i915_request *rq;
+	int n, err;
+
+	/*
+	 * Write the descriptors directly into the shared-page slot;
+	 * clear any unused ports so GVT sees a terminated list.
+	 */
+	n = 0;
+	do {
+		rq = *out++;
+		pv_elsp->descs[n++] = execlists_update_context(rq);
+	} while (out != end);
+
+	while (n < execlists_num_ports(execlists))
+		pv_elsp->descs[n++] = 0;
+
+	writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
+
+	/*
+	 * We hold a spinlock here, so use the atomic poll (no sleeping)
+	 * with a single 1ms timeout instead of a two-stage wait.
+	 */
+#define done (READ_ONCE(pv_elsp->submitted) == true)
+	err = wait_for_atomic_us(done, 1000);
+#undef done
+
+	if (unlikely(err))
+		DRM_ERROR("PV (%s) workload submission failed\n", engine->name);
+
+	pv_elsp->submitted = false;
+}
+
+/*
+ * Mark a request as in-flight on the hardware and take a reference that
+ * is held until schedule_out(). @idx is the port index, used for tracing.
+ */
+static struct i915_request *schedule_in(struct i915_request *rq, int idx)
+{
+	trace_i915_request_in(rq, idx);
+
+	/* First request of this context to enter the hardware */
+	if (!rq->hw_context->inflight)
+		rq->hw_context->inflight = rq->engine;
+	intel_context_inflight_inc(rq->hw_context);
+
+	return i915_request_get(rq);
+}
+
+/*
+ * Pull the highest-priority requests off the priority tree, coalescing
+ * consecutive requests of the same context, and submit them to GVT via
+ * the shared page. Only one context is kept in flight at a time; a new
+ * dequeue only happens once the previous submission has retired.
+ */
+static void pv_dequeue(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request **first = execlists->inflight;
+	struct i915_request *last = first[0];
+	struct i915_request **port;
+	bool submit = false;
+	struct rb_node *rb;
+
+	lockdep_assert_held(&engine->active.lock);
+
+	/* Previous submission still in flight; wait for it to complete */
+	if (last)
+		return;
+
+	port = first;
+	while ((rb = rb_first_cached(&execlists->queue))) {
+		struct i915_priolist *p = to_priolist(rb);
+		struct i915_request *rq, *rn;
+		int i;
+
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			/* Only coalesce requests from the same context */
+			if (last && rq->hw_context != last->hw_context)
+				goto done;
+
+			list_del_init(&rq->sched.link);
+			__i915_request_submit(rq);
+			submit = true;
+			last = rq;
+		}
+
+		rb_erase_cached(&p->node, &execlists->queue);
+		i915_priolist_free(p);
+	}
+done:
+	/* Record the priority of the next pending request, if any */
+	execlists->queue_priority_hint =
+		rb ? to_priolist(rb)->priority : INT_MIN;
+	if (submit) {
+		*port = schedule_in(last, port - execlists->inflight);
+		*++port = NULL;
+		pv_submit(engine, first, port);
+	}
+	execlists->active = execlists->inflight;
+}
+
+/*
+ * Undo schedule_in(): drop the in-flight accounting for the request's
+ * context and release the reference taken at submission.
+ */
+static void schedule_out(struct i915_request *rq)
+{
+	trace_i915_request_out(rq);
+
+	intel_context_inflight_dec(rq->hw_context);
+	/* Last request of this context to leave the hardware */
+	if (!intel_context_inflight_count(rq->hw_context))
+		rq->hw_context->inflight = NULL;
+
+	i915_request_put(rq);
+}
+
+
+/*
+ * PV submission tasklet: instead of processing the execlists CSB, retire
+ * any completed in-flight requests and, once the ports drain, hand the
+ * next batch of requests to GVT via pv_dequeue().
+ */
+static void vgpu_pv_submission_tasklet(unsigned long data)
+{
+	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
+	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request **port, *rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&engine->active.lock, flags);
+
+	/* Retire completed requests from the head of the inflight array */
+	for (port = execlists->inflight; (rq = *port); port++) {
+		if (!i915_request_completed(rq))
+			break;
+
+		schedule_out(rq);
+	}
+
+	/* Compact the remaining (incomplete) requests to the front */
+	if (port != execlists->inflight) {
+		int idx = port - execlists->inflight;
+		int rem = ARRAY_SIZE(execlists->inflight) - idx;
+
+		memmove(execlists->inflight, port, rem * sizeof(*port));
+	}
+
+	/* All ports idle: submit the next run of requests, if any */
+	if (!rq)
+		pv_dequeue(engine);
+
+	spin_unlock_irqrestore(&engine->active.lock, flags);
+}
+
+/* Engine idle: release the breadcrumb irq taken on unpark. */
+static void vgpu_pv_submission_park(struct intel_engine_cs *engine)
+{
+	intel_engine_park(engine);
+	intel_engine_unpin_breadcrumbs_irq(engine);
+	engine->flags &= ~I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
+}
+
+/*
+ * Engine waking: keep the breadcrumb irq pinned so request completion
+ * kicks the submission tasklet (there is no CSB interrupt in PV mode).
+ */
+static void vgpu_pv_submission_unpark(struct intel_engine_cs *engine)
+{
+	engine->flags |= I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
+	intel_engine_pin_breadcrumbs_irq(engine);
+}
+
+/*
+ * vgpu_set_pv_submission - switch an engine to the PV submission backend
+ * @engine: engine to override
+ *
+ * Called from intel_execlists_set_default_submission() when running as a
+ * vgpu guest with PV_SUBMISSION negotiated.
+ */
+void vgpu_set_pv_submission(struct intel_engine_cs *engine)
+{
+	/*
+	 * We inherit a bunch of functions from execlists that we'd like
+	 * to keep using:
+	 *
+	 *    engine->submit_request = execlists_submit_request;
+	 *    engine->cancel_requests = execlists_cancel_requests;
+	 *    engine->schedule = execlists_schedule;
+	 *
+	 * But we need to override the actual submission backend in order
+	 * to talk to the GVT with PV notification message.
+	 */
+
+	engine->execlists.tasklet.func = vgpu_pv_submission_tasklet;
+
+	engine->park = vgpu_pv_submission_park;
+	engine->unpark = vgpu_pv_submission_unpark;
+}

Comments

Quoting Xiaolin Zhang (2019-07-08 02:35:18)
> +static void pv_submit(struct intel_engine_cs *engine,
> +              struct i915_request **out,
> +              struct i915_request **end)
> +{
> +       struct intel_engine_execlists * const execlists = &engine->execlists;
> +       struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
> +       struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
> +       struct i915_request *rq;
> +
> +       u64 descs[2];
> +       int n, err;
> +
> +       memset(descs, 0, sizeof(descs));
> +       n = 0;
> +       do {
> +               rq = *out++;
> +               descs[n++] = execlists_update_context(rq);
> +       } while (out != end);
> +
> +       for (n = 0; n < execlists_num_ports(execlists); n++)
> +               pv_elsp->descs[n] = descs[n];

You can polish this a bit, minor nit.

> +       writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
> +
> +#define done (READ_ONCE(pv_elsp->submitted) == true)
> +       err = wait_for_us(done, 1);
> +       if (err)
> +               err = wait_for(done, 1);

Strictly, you need to use wait_for_atomic_us() [under a spinlock here],
and there's no need for a second pass since you are not allowed to sleep
anyway. So just set the timeout to 1000us.

> +#undef done
> +
> +       if (unlikely(err))
> +               DRM_ERROR("PV (%s) workload submission failed\n", engine->name);
> +
> +       pv_elsp->submitted = false;

However, that looks solid wrt to serialisation of this engine with its
pv host, without cross-interference (at least in the comms channel).

If you want to get fancy, you should be able to simply not dequeue until
!pv_elsp->submitted so the wait-for-ack occurs naturally. So long as the
pv host plays nicely, we should always see submitted acked before the
request is signaled. Give or take problems with preemption and the pv
host being a black box that may allow requests to complete and so our
submission be a no-op (and so not generate an interrupt to allow further
submission). Indeed, I would strongly recommend you use the delayed ack
plus jiffie timer to avoid the no-op submission problem.

If you want to prove this in a bunch of mocked up selftests that provide
the pv channel ontop of the local driver....
-Chris
On 07/08/2019 06:41 PM, Chris Wilson wrote:
> Quoting Xiaolin Zhang (2019-07-08 02:35:18)
>> +static void pv_submit(struct intel_engine_cs *engine,
>> +              struct i915_request **out,
>> +              struct i915_request **end)
>> +{
>> +       struct intel_engine_execlists * const execlists = &engine->execlists;
>> +       struct i915_virtual_gpu_pv *pv = engine->i915->vgpu.pv;
>> +       struct pv_submission *pv_elsp = pv->pv_elsp[engine->hw_id];
>> +       struct i915_request *rq;
>> +
>> +       u64 descs[2];
>> +       int n, err;
>> +
>> +       memset(descs, 0, sizeof(descs));
>> +       n = 0;
>> +       do {
>> +               rq = *out++;
>> +               descs[n++] = execlists_update_context(rq);
>> +       } while (out != end);
>> +
>> +       for (n = 0; n < execlists_num_ports(execlists); n++)
>> +               pv_elsp->descs[n] = descs[n];
> You can polish this a bit, minor nit.
Sure.
>
>> +       writel(PV_ACTION_ELSP_SUBMISSION, execlists->submit_reg);
>> +
>> +#define done (READ_ONCE(pv_elsp->submitted) == true)
>> +       err = wait_for_us(done, 1);
>> +       if (err)
>> +               err = wait_for(done, 1);
> Strictly, you need to use wait_for_atomic_us() [under a spinlock here],
> and there's no need for a second pass since you are not allowed to sleep
> anyway. So just set the timeout to 1000us.
Sure.
>> +#undef done
>> +
>> +       if (unlikely(err))
>> +               DRM_ERROR("PV (%s) workload submission failed\n", engine->name);
>> +
>> +       pv_elsp->submitted = false;
> However, that looks solid wrt to serialisation of this engine with its
> pv host, without cross-interference (at least in the comms channel).
>
> If you want to get fancy, you should be able to simply not dequeue until
> !pv_elsp->submitted so the wait-for-ack occurs naturally. So long as the
> pv host plays nicely, we should always see submitted acked before the
> request is signaled. Give or take problems with preemption and the pv
> host being a black box that may allow requests to complete and so our
> submission be a no-op (and so not generate an interrupt to allow further
> submission). Indeed, I would strongly recommend you use the delayed ack
> plus jiffie timer to avoid the no-op submission problem.
I will implement this suggestion. thanks your feedback, Chris.
-BRs, Xiaolin
> If you want to prove this in a bunch of mocked up selftests that provide
> the pv channel ontop of the local driver....
> -Chris
> _______________________________________________
> intel-gvt-dev mailing list
> intel-gvt-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gvt-dev