drm/amdgpu:implement CONTEXT_CONTROL (v5)

Submitted by Liu, Monk on Sept. 8, 2016, 6:32 a.m.

Details

Message ID 1473316377-15377-1-git-send-email-Monk.Liu@amd.com
State New
Headers show
Series "drm/amdgpu:implement CONTEXT_CONTROL (v5)" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Liu, Monk Sept. 8, 2016, 6:32 a.m.
v1:
for gfx8, use CONTEXT_CONTROL package to dynamically
skip preamble CEIB and other load_xxx command in sequence.

v2:
support GFX7 as well, and bump up version.
remove cntxcntl in compute ring funcs because CPC doesn't
support this packet.

v3: fix reduntant judgement in cntxcntl.
v4: some cleanups, don't change cs_submit()
v5: keep old MESA supported & bump up KMS version.

Change-Id: I7b2adc15ea83fd6c4d2521d75662bf39587898d5
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Ack-by: Chunming Zhou <David1.Zhou@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  8 ++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 12 +++++++++++-
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9d39fa8..8c6d8c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -321,6 +321,7 @@  struct amdgpu_ring_funcs {
 	void (*begin_use)(struct amdgpu_ring *ring);
 	void (*end_use)(struct amdgpu_ring *ring);
 	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
+	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
 };
 
 /*
@@ -966,6 +967,7 @@  struct amdgpu_ctx {
 	spinlock_t		ring_lock;
 	struct fence            **fences;
 	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
+	bool preamble_presented;
 };
 
 struct amdgpu_ctx_mgr {
@@ -1231,6 +1233,10 @@  struct amdgpu_cs_parser {
 	struct amdgpu_bo_list_entry	uf_entry;
 };
 
+#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
+#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */
+#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
+
 struct amdgpu_job {
 	struct amd_sched_job    base;
 	struct amdgpu_device	*adev;
@@ -1239,6 +1245,7 @@  struct amdgpu_job {
 	struct amdgpu_sync	sync;
 	struct amdgpu_ib	*ibs;
 	struct fence		*fence; /* the hw fence */
+	uint32_t		preamble_status;
 	uint32_t		num_ibs;
 	void			*owner;
 	uint64_t		fence_ctx; /* the fence_context this job uses */
@@ -2284,6 +2291,7 @@  amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
+#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
 #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 20a1962..2386c7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -851,6 +851,14 @@  static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		if (r)
 			return r;
 
+		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE) {
+			parser->job->preamble_status |= PREAMBLE_IB_PRESENT;
+			if (!parser->ctx->preamble_presented) {
+				parser->job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
+				parser->ctx->preamble_presented = true;
+			}
+		}
+
 		if (parser->job->ring && parser->job->ring != ring)
 			return -EINVAL;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 21b8cd6..2a96ce7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -55,9 +55,10 @@ 
  * - 3.3.0 - Add VM support for UVD on supported hardware.
  * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
  * - 3.5.0 - Add support for new UVD_NO_OP register.
+ * - 3.6.0 - kmd involves use CONTEXT_CONTROL in ring buffer.
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	5
+#define KMS_DRIVER_MINOR	6
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 2fe2686..1af52be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -125,6 +125,7 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
 	uint64_t fence_ctx;
+	uint32_t status = 0;
 
 	unsigned i;
 	int r = 0;
@@ -176,11 +177,20 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 
 	skip_preamble = ring->current_ctx == fence_ctx;
 	need_ctx_switch = ring->current_ctx != fence_ctx;
+	if (job && ring->funcs->emit_cntxcntl) {
+		if (need_ctx_switch)
+			status |= HAVE_CTX_SWITCH;
+		status |= job->preamble_status;
+		amdgpu_ring_emit_cntxcntl(ring, status);
+	}
+
 	for (i = 0; i < num_ibs; ++i) {
 		ib = &ibs[i];
 
 		/* drop preamble IBs if we don't have a context switch */
-		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
+		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) &&
+			skip_preamble &&
+			!(status & PREAMBLE_IB_PRESENT_FIRST))
 			continue;
 
 		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 84ad13e..41b8654 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2096,6 +2096,25 @@  static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 	amdgpu_ring_write(ring, control);
 }
 
+static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
+{
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs */
+		dw2 |= 0x10002;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 /**
  * gfx_v7_0_ring_test_ib - basic ring IB test
  *
@@ -4930,6 +4949,7 @@  static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
+	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 1ac9ef7..1bf8c9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6080,6 +6080,35 @@  static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
+static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
+{
+	uint32_t dw2 = 0;
+
+	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
+	if (flags & HAVE_CTX_SWITCH) {
+		/* set load_global_config & load_global_uconfig */
+		dw2 |= 0x8001;
+		/* set load_cs_sh_regs */
+		dw2 |= 0x01000000;
+		/* set load_per_context_state & load_gfx_sh_regs for GFX */
+		dw2 |= 0x10002;
+
+		/* set load_ce_ram if preamble presented */
+		if (PREAMBLE_IB_PRESENT & flags)
+			dw2 |= 0x10000000;
+	} else {
+		/* still load_ce_ram if this is the first time preamble presented
+		 * although there is no context switch happens.
+		 */
+		if (PREAMBLE_IB_PRESENT_FIRST & flags)
+			dw2 |= 0x10000000;
+	}
+
+	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
+	amdgpu_ring_write(ring, dw2);
+	amdgpu_ring_write(ring, 0);
+}
+
 static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -6262,6 +6291,7 @@  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v8_ring_emit_sb,
+	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {

Comments

Am 08.09.2016 um 08:32 schrieb Monk Liu:
> v1:
> for gfx8, use CONTEXT_CONTROL package to dynamically
> skip preamble CEIB and other load_xxx command in sequence.
>
> v2:
> support GFX7 as well, and bump up version.
> remove cntxcntl in compute ring funcs because CPC doesn't
> support this packet.
>
> v3: fix reduntant judgement in cntxcntl.
> v4: some cleanups, don't change cs_submit()
> v5: keep old MESA supported & bump up KMS version.
>
> Change-Id: I7b2adc15ea83fd6c4d2521d75662bf39587898d5
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> Ack-by: Chunming Zhou <David1.Zhou@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 12 +++++++++++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 20 ++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 30 ++++++++++++++++++++++++++++++
>   6 files changed, 79 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 9d39fa8..8c6d8c6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -321,6 +321,7 @@ struct amdgpu_ring_funcs {
>   	void (*begin_use)(struct amdgpu_ring *ring);
>   	void (*end_use)(struct amdgpu_ring *ring);
>   	void (*emit_switch_buffer) (struct amdgpu_ring *ring);
> +	void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags);
>   };
>   
>   /*
> @@ -966,6 +967,7 @@ struct amdgpu_ctx {
>   	spinlock_t		ring_lock;
>   	struct fence            **fences;
>   	struct amdgpu_ctx_ring	rings[AMDGPU_MAX_RINGS];
> +	bool preamble_presented;
>   };
>   
>   struct amdgpu_ctx_mgr {
> @@ -1231,6 +1233,10 @@ struct amdgpu_cs_parser {
>   	struct amdgpu_bo_list_entry	uf_entry;
>   };
>   
> +#define PREAMBLE_IB_PRESENT 		(1 << 0) /* bit set means command submit involves a preamble IB */
> +#define PREAMBLE_IB_PRESENT_FIRST	(1 << 1) /* bit set means preamble IB is first presented in belonging context */
> +#define HAVE_CTX_SWITCH		(1 << 2) /* bit set means context switch occured */
> +

Please add an AMDGPU_ prefix in the names of those defines.

With that fixed the patch is Reviewed-by: Christian König 
<christian.koenig@amd.com>.

Regards,
Christian.


>   struct amdgpu_job {
>   	struct amd_sched_job    base;
>   	struct amdgpu_device	*adev;
> @@ -1239,6 +1245,7 @@ struct amdgpu_job {
>   	struct amdgpu_sync	sync;
>   	struct amdgpu_ib	*ibs;
>   	struct fence		*fence; /* the hw fence */
> +	uint32_t		preamble_status;
>   	uint32_t		num_ibs;
>   	void			*owner;
>   	uint64_t		fence_ctx; /* the fence_context this job uses */
> @@ -2284,6 +2291,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_hdp_invalidate(r) (r)->funcs->emit_hdp_invalidate((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
> +#define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>   #define amdgpu_ring_patch_cond_exec(r,o) (r)->funcs->patch_cond_exec((r),(o))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 20a1962..2386c7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -851,6 +851,14 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   		if (r)
>   			return r;
>   
> +		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE) {
> +			parser->job->preamble_status |= PREAMBLE_IB_PRESENT;
> +			if (!parser->ctx->preamble_presented) {
> +				parser->job->preamble_status |= PREAMBLE_IB_PRESENT_FIRST;
> +				parser->ctx->preamble_presented = true;
> +			}
> +		}
> +
>   		if (parser->job->ring && parser->job->ring != ring)
>   			return -EINVAL;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 21b8cd6..2a96ce7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -55,9 +55,10 @@
>    * - 3.3.0 - Add VM support for UVD on supported hardware.
>    * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
>    * - 3.5.0 - Add support for new UVD_NO_OP register.
> + * - 3.6.0 - kmd involves use CONTEXT_CONTROL in ring buffer.
>    */
>   #define KMS_DRIVER_MAJOR	3
> -#define KMS_DRIVER_MINOR	5
> +#define KMS_DRIVER_MINOR	6
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 2fe2686..1af52be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -125,6 +125,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
>   	uint64_t fence_ctx;
> +	uint32_t status = 0;
>   
>   	unsigned i;
>   	int r = 0;
> @@ -176,11 +177,20 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   
>   	skip_preamble = ring->current_ctx == fence_ctx;
>   	need_ctx_switch = ring->current_ctx != fence_ctx;
> +	if (job && ring->funcs->emit_cntxcntl) {
> +		if (need_ctx_switch)
> +			status |= HAVE_CTX_SWITCH;
> +		status |= job->preamble_status;
> +		amdgpu_ring_emit_cntxcntl(ring, status);
> +	}
> +
>   	for (i = 0; i < num_ibs; ++i) {
>   		ib = &ibs[i];
>   
>   		/* drop preamble IBs if we don't have a context switch */
> -		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) && skip_preamble)
> +		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) &&
> +			skip_preamble &&
> +			!(status & PREAMBLE_IB_PRESENT_FIRST))
>   			continue;
>   
>   		amdgpu_ring_emit_ib(ring, ib, job ? job->vm_id : 0,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 84ad13e..41b8654 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2096,6 +2096,25 @@ static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   	amdgpu_ring_write(ring, control);
>   }
>   
> +static void gfx_v7_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
> +{
> +	uint32_t dw2 = 0;
> +
> +	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
> +	if (flags & HAVE_CTX_SWITCH) {
> +		/* set load_global_config & load_global_uconfig */
> +		dw2 |= 0x8001;
> +		/* set load_cs_sh_regs */
> +		dw2 |= 0x01000000;
> +		/* set load_per_context_state & load_gfx_sh_regs */
> +		dw2 |= 0x10002;
> +	}
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
> +	amdgpu_ring_write(ring, dw2);
> +	amdgpu_ring_write(ring, 0);
> +}
> +
>   /**
>    * gfx_v7_0_ring_test_ib - basic ring IB test
>    *
> @@ -4930,6 +4949,7 @@ static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>   	.test_ib = gfx_v7_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
> +	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 1ac9ef7..1bf8c9e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6080,6 +6080,35 @@ static void gfx_v8_ring_emit_sb(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, 0);
>   }
>   
> +static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
> +{
> +	uint32_t dw2 = 0;
> +
> +	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
> +	if (flags & HAVE_CTX_SWITCH) {
> +		/* set load_global_config & load_global_uconfig */
> +		dw2 |= 0x8001;
> +		/* set load_cs_sh_regs */
> +		dw2 |= 0x01000000;
> +		/* set load_per_context_state & load_gfx_sh_regs for GFX */
> +		dw2 |= 0x10002;
> +
> +		/* set load_ce_ram if preamble presented */
> +		if (PREAMBLE_IB_PRESENT & flags)
> +			dw2 |= 0x10000000;
> +	} else {
> +		/* still load_ce_ram if this is the first time preamble presented
> +		 * although there is no context switch happens.
> +		 */
> +		if (PREAMBLE_IB_PRESENT_FIRST & flags)
> +			dw2 |= 0x10000000;
> +	}
> +
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
> +	amdgpu_ring_write(ring, dw2);
> +	amdgpu_ring_write(ring, 0);
> +}
> +
>   static void gfx_v8_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>   						 enum amdgpu_interrupt_state state)
>   {
> @@ -6262,6 +6291,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_switch_buffer = gfx_v8_ring_emit_sb,
> +	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {