drm/amdgpu: optionally do a writeback but don't invalidate TC for IB fences

Submitted by Marek Olšák on April 3, 2018, 8:25 p.m.

Details

Message ID 20180403202533.4419-1-maraeo@gmail.com
State New
Headers show
Series "drm/amdgpu: optionally do a writeback but don't invalidate TC for IB fences" ( rev: 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Marek Olšák April 3, 2018, 8:25 p.m.
From: Marek Olšák <marek.olsak@amd.com>

There is a new IB flag that enables this new behavior.
Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
when draw calls from two adjacent gfx IBs run in parallel. This will be
the new default for Mesa.

v2: bump the version

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  8 ++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  4 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c     | 11 +++++++----
 drivers/gpu/drm/amd/amdgpu/soc15d.h       |  1 +
 include/uapi/drm/amdgpu_drm.h             |  4 ++++
 8 files changed, 27 insertions(+), 11 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 43df7d2aebb4..0a45f5cceba7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -68,23 +68,24 @@ 
  * - 3.16.0 - Add reserved vmid support
  * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS.
  * - 3.18.0 - Export gpu always on cu bitmap
  * - 3.19.0 - Add support for UVD MJPEG decode
  * - 3.20.0 - Add support for local BOs
  * - 3.21.0 - Add DRM_AMDGPU_FENCE_TO_HANDLE ioctl
  * - 3.22.0 - Add DRM_AMDGPU_SCHED ioctl
  * - 3.23.0 - Add query for VRAM lost counter
  * - 3.24.0 - Add high priority compute support for gfx9
  * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
+ * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
  */
 #define KMS_DRIVER_MAJOR	3
-#define KMS_DRIVER_MINOR	25
+#define KMS_DRIVER_MINOR	26
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
 int amdgpu_vis_vram_limit = 0;
 int amdgpu_gart_size = -1; /* auto */
 int amdgpu_gtt_size = -1; /* auto */
 int amdgpu_moverate = -1; /* auto */
 int amdgpu_benchmarking = 0;
 int amdgpu_testing = 0;
 int amdgpu_audio = -1;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 97449e06a242..d09fcab2398f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -124,39 +124,40 @@  static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
 
 /**
  * amdgpu_fence_emit - emit a fence on the requested ring
  *
  * @ring: ring the fence is associated with
  * @f: resulting fence object
  *
  * Emits a fence command on the requested ring (all asics).
  * Returns 0 on success, -ENOMEM on failure.
  */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+		      unsigned flags)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_fence *fence;
 	struct dma_fence *old, **ptr;
 	uint32_t seq;
 
 	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
 	if (fence == NULL)
 		return -ENOMEM;
 
 	seq = ++ring->fence_drv.sync_seq;
 	fence->ring = ring;
 	dma_fence_init(&fence->base, &amdgpu_fence_ops,
 		       &ring->fence_drv.lock,
 		       adev->fence_context + ring->idx,
 		       seq);
 	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
-			       seq, AMDGPU_FENCE_FLAG_INT);
+			       seq, flags | AMDGPU_FENCE_FLAG_INT);
 
 	ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
 	/* This function can't be called concurrently anyway, otherwise
 	 * emitting the fence would mess up the hardware ring buffer.
 	 */
 	old = rcu_dereference_protected(*ptr, 1);
 	if (old && !dma_fence_is_signaled(old)) {
 		DRM_INFO("rcu slot is busy\n");
 		dma_fence_wait(old, false);
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 311589e02d17..f70eeed9ed76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -120,20 +120,21 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		       struct dma_fence **f)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_ib *ib = &ibs[0];
 	struct dma_fence *tmp = NULL;
 	bool skip_preamble, need_ctx_switch;
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
 	uint64_t fence_ctx;
 	uint32_t status = 0, alloc_size;
+	unsigned fence_flags = 0;
 
 	unsigned i;
 	int r = 0;
 	bool need_pipe_sync = false;
 
 	if (num_ibs == 0)
 		return -EINVAL;
 
 	/* ring tests don't use a job */
 	if (job) {
@@ -220,36 +221,39 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	}
 
 	if (ring->funcs->emit_tmz)
 		amdgpu_ring_emit_tmz(ring, false);
 
 #ifdef CONFIG_X86_64
 	if (!(adev->flags & AMD_IS_APU))
 #endif
 		amdgpu_asic_invalidate_hdp(adev, ring);
 
-	r = amdgpu_fence_emit(ring, f);
+	if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
+		fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+
+	r = amdgpu_fence_emit(ring, f, fence_flags);
 	if (r) {
 		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
 		if (job && job->vmid)
 			amdgpu_vmid_reset(adev, ring->funcs->vmhub, job->vmid);
 		amdgpu_ring_undo(ring);
 		return r;
 	}
 
 	if (ring->funcs->insert_end)
 		ring->funcs->insert_end(ring);
 
 	/* wrap the last IB with fence */
 	if (job && job->uf_addr) {
 		amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
-				       AMDGPU_FENCE_FLAG_64BIT);
+				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
 	}
 
 	if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
 	ring->current_ctx = fence_ctx;
 	if (vm && ring->funcs->emit_switch_buffer)
 		amdgpu_ring_emit_switch_buffer(ring);
 	amdgpu_ring_commit(ring);
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 1d0d250cbfdf..222052daedd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -33,20 +33,21 @@ 
 #define AMDGPU_MAX_COMPUTE_RINGS	8
 #define AMDGPU_MAX_VCE_RINGS		3
 #define AMDGPU_MAX_UVD_ENC_RINGS	2
 
 /* some special values for the owner field */
 #define AMDGPU_FENCE_OWNER_UNDEFINED	((void*)0ul)
 #define AMDGPU_FENCE_OWNER_VM		((void*)1ul)
 
 #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
+#define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
 
 enum amdgpu_ring_type {
 	AMDGPU_RING_TYPE_GFX,
 	AMDGPU_RING_TYPE_COMPUTE,
 	AMDGPU_RING_TYPE_SDMA,
 	AMDGPU_RING_TYPE_UVD,
 	AMDGPU_RING_TYPE_VCE,
 	AMDGPU_RING_TYPE_KIQ,
 	AMDGPU_RING_TYPE_UVD_ENC,
 	AMDGPU_RING_TYPE_VCN_DEC,
@@ -81,21 +82,22 @@  int amdgpu_fence_driver_init(struct amdgpu_device *adev);
 void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
 void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
 
 int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
 				  unsigned num_hw_submission);
 int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
 				   struct amdgpu_irq_src *irq_src,
 				   unsigned irq_type);
 void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
 void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
+		      unsigned flags);
 int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
 void amdgpu_fence_process(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
 signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
 				      uint32_t wait_seq,
 				      signed long timeout);
 unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
 
 /*
  * Rings.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 24474294c92a..fe05351ea4d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -620,21 +620,21 @@  int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
 
 	if (vm_flush_needed) {
 		trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
 		amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
 	}
 
 	if (pasid_mapping_needed)
 		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
 
 	if (vm_flush_needed || pasid_mapping_needed) {
-		r = amdgpu_fence_emit(ring, &fence);
+		r = amdgpu_fence_emit(ring, &fence, 0);
 		if (r)
 			return r;
 	}
 
 	if (vm_flush_needed) {
 		mutex_lock(&id_mgr->lock);
 		dma_fence_put(id->last_flush);
 		id->last_flush = dma_fence_get(fence);
 		id->current_gpu_reset_count =
 			atomic_read(&adev->gpu_reset_counter);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9d39fd5b1822..5dea0d4c0af4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3767,27 +3767,30 @@  static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
                                 lower_32_bits(ib->gpu_addr));
         amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
         amdgpu_ring_write(ring, control);
 }
 
 static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 				     u64 seq, unsigned flags)
 {
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
+	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
 
 	/* RELEASE_MEM - flush caches, send int */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
-	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
-				 EOP_TC_ACTION_EN |
-				 EOP_TC_WB_ACTION_EN |
-				 EOP_TC_MD_ACTION_EN |
+	amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
+					       EOP_TC_NC_ACTION_EN) :
+					      (EOP_TCL1_ACTION_EN |
+					       EOP_TC_ACTION_EN |
+					       EOP_TC_WB_ACTION_EN |
+					       EOP_TC_MD_ACTION_EN)) |
 				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
 				 EVENT_INDEX(5)));
 	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
 
 	/*
 	 * the address should be Qword aligned if 64bit write, Dword
 	 * aligned if only send 32bit data low (discard data high)
 	 */
 	if (write64bit)
 		BUG_ON(addr & 0x7);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
index 7f408f85fdb6..839a144c1645 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
@@ -152,20 +152,21 @@ 
 		 * 4 - *S_PARTIAL_FLUSH
 		 */
 #define	PACKET3_RELEASE_MEM				0x49
 #define		EVENT_TYPE(x)                           ((x) << 0)
 #define		EVENT_INDEX(x)                          ((x) << 8)
 #define		EOP_TCL1_VOL_ACTION_EN                  (1 << 12)
 #define		EOP_TC_VOL_ACTION_EN                    (1 << 13) /* L2 */
 #define		EOP_TC_WB_ACTION_EN                     (1 << 15) /* L2 */
 #define		EOP_TCL1_ACTION_EN                      (1 << 16)
 #define		EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
+#define		EOP_TC_NC_ACTION_EN			(1 << 19)
 #define		EOP_TC_MD_ACTION_EN			(1 << 21) /* L2 metadata */
 
 #define		DATA_SEL(x)                             ((x) << 29)
 		/* 0 - discard
 		 * 1 - send low 32bit data
 		 * 2 - send 64bit data
 		 * 3 - send 64bit GPU counter value
 		 * 4 - send 64bit sys counter value
 		 */
 #define		INT_SEL(x)                              ((x) << 24)
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 0087799962cf..f5901bd9c7d8 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -516,20 +516,24 @@  union drm_amdgpu_cs {
 
 /* This IB should be submitted to CE */
 #define AMDGPU_IB_FLAG_CE	(1<<0)
 
 /* Preamble flag, which means the IB could be dropped if no context switch */
 #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
 
 /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
 #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
 
+/* The IB fence should do the L2 writeback but not invalidate any shader
+ * caches (L2/vL1/sL1/I$). */
+#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
+
 struct drm_amdgpu_cs_chunk_ib {
 	__u32 _pad;
 	/** AMDGPU_IB_FLAG_* */
 	__u32 flags;
 	/** Virtual address to begin IB execution */
 	__u64 va_start;
 	/** Size of submission */
 	__u32 ib_bytes;
 	/** HW IP to submit to */
 	__u32 ip_type;

Comments

Am 03.04.2018 um 22:25 schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak@amd.com>
>
> There is a new IB flag that enables this new behavior.
> Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
> when draw calls from two adjacent gfx IBs run in parallel. This will be
> the new default for Mesa.
>
> v2: bump the version
>
> Signed-off-by: Marek Olšák <marek.olsak@amd.com>

Looks good to me, but I would split it into two patches: one that
implements all the common-code changes, and a second that implements
the handling in gfx_v9_0.c and bumps the version number.

But that's only a nice-to-have; the patch is Reviewed-by: Christian König
<christian.koenig@amd.com> anyway.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  5 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  8 ++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  4 +++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  2 +-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c     | 11 +++++++----
>   drivers/gpu/drm/amd/amdgpu/soc15d.h       |  1 +
>   include/uapi/drm/amdgpu_drm.h             |  4 ++++
>   8 files changed, 27 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 43df7d2aebb4..0a45f5cceba7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -68,23 +68,24 @@
>    * - 3.16.0 - Add reserved vmid support
>    * - 3.17.0 - Add AMDGPU_NUM_VRAM_CPU_PAGE_FAULTS.
>    * - 3.18.0 - Export gpu always on cu bitmap
>    * - 3.19.0 - Add support for UVD MJPEG decode
>    * - 3.20.0 - Add support for local BOs
>    * - 3.21.0 - Add DRM_AMDGPU_FENCE_TO_HANDLE ioctl
>    * - 3.22.0 - Add DRM_AMDGPU_SCHED ioctl
>    * - 3.23.0 - Add query for VRAM lost counter
>    * - 3.24.0 - Add high priority compute support for gfx9
>    * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk).
> + * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE.
>    */
>   #define KMS_DRIVER_MAJOR	3
> -#define KMS_DRIVER_MINOR	25
> +#define KMS_DRIVER_MINOR	26
>   #define KMS_DRIVER_PATCHLEVEL	0
>   
>   int amdgpu_vram_limit = 0;
>   int amdgpu_vis_vram_limit = 0;
>   int amdgpu_gart_size = -1; /* auto */
>   int amdgpu_gtt_size = -1; /* auto */
>   int amdgpu_moverate = -1; /* auto */
>   int amdgpu_benchmarking = 0;
>   int amdgpu_testing = 0;
>   int amdgpu_audio = -1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 97449e06a242..d09fcab2398f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -124,39 +124,40 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
>   
>   /**
>    * amdgpu_fence_emit - emit a fence on the requested ring
>    *
>    * @ring: ring the fence is associated with
>    * @f: resulting fence object
>    *
>    * Emits a fence command on the requested ring (all asics).
>    * Returns 0 on success, -ENOMEM on failure.
>    */
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
> +		      unsigned flags)
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_fence *fence;
>   	struct dma_fence *old, **ptr;
>   	uint32_t seq;
>   
>   	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
>   	if (fence == NULL)
>   		return -ENOMEM;
>   
>   	seq = ++ring->fence_drv.sync_seq;
>   	fence->ring = ring;
>   	dma_fence_init(&fence->base, &amdgpu_fence_ops,
>   		       &ring->fence_drv.lock,
>   		       adev->fence_context + ring->idx,
>   		       seq);
>   	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> -			       seq, AMDGPU_FENCE_FLAG_INT);
> +			       seq, flags | AMDGPU_FENCE_FLAG_INT);
>   
>   	ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
>   	/* This function can't be called concurrently anyway, otherwise
>   	 * emitting the fence would mess up the hardware ring buffer.
>   	 */
>   	old = rcu_dereference_protected(*ptr, 1);
>   	if (old && !dma_fence_is_signaled(old)) {
>   		DRM_INFO("rcu slot is busy\n");
>   		dma_fence_wait(old, false);
>   	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 311589e02d17..f70eeed9ed76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -120,20 +120,21 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		       struct dma_fence **f)
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
>   	struct dma_fence *tmp = NULL;
>   	bool skip_preamble, need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
>   	uint64_t fence_ctx;
>   	uint32_t status = 0, alloc_size;
> +	unsigned fence_flags = 0;
>   
>   	unsigned i;
>   	int r = 0;
>   	bool need_pipe_sync = false;
>   
>   	if (num_ibs == 0)
>   		return -EINVAL;
>   
>   	/* ring tests don't use a job */
>   	if (job) {
> @@ -220,36 +221,39 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	}
>   
>   	if (ring->funcs->emit_tmz)
>   		amdgpu_ring_emit_tmz(ring, false);
>   
>   #ifdef CONFIG_X86_64
>   	if (!(adev->flags & AMD_IS_APU))
>   #endif
>   		amdgpu_asic_invalidate_hdp(adev, ring);
>   
> -	r = amdgpu_fence_emit(ring, f);
> +	if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
> +		fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
> +
> +	r = amdgpu_fence_emit(ring, f, fence_flags);
>   	if (r) {
>   		dev_err(adev->dev, "failed to emit fence (%d)\n", r);
>   		if (job && job->vmid)
>   			amdgpu_vmid_reset(adev, ring->funcs->vmhub, job->vmid);
>   		amdgpu_ring_undo(ring);
>   		return r;
>   	}
>   
>   	if (ring->funcs->insert_end)
>   		ring->funcs->insert_end(ring);
>   
>   	/* wrap the last IB with fence */
>   	if (job && job->uf_addr) {
>   		amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
> -				       AMDGPU_FENCE_FLAG_64BIT);
> +				       fence_flags | AMDGPU_FENCE_FLAG_64BIT);
>   	}
>   
>   	if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
>   		amdgpu_ring_patch_cond_exec(ring, patch_offset);
>   
>   	ring->current_ctx = fence_ctx;
>   	if (vm && ring->funcs->emit_switch_buffer)
>   		amdgpu_ring_emit_switch_buffer(ring);
>   	amdgpu_ring_commit(ring);
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 1d0d250cbfdf..222052daedd1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -33,20 +33,21 @@
>   #define AMDGPU_MAX_COMPUTE_RINGS	8
>   #define AMDGPU_MAX_VCE_RINGS		3
>   #define AMDGPU_MAX_UVD_ENC_RINGS	2
>   
>   /* some special values for the owner field */
>   #define AMDGPU_FENCE_OWNER_UNDEFINED	((void*)0ul)
>   #define AMDGPU_FENCE_OWNER_VM		((void*)1ul)
>   
>   #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
>   #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
> +#define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
>   
>   enum amdgpu_ring_type {
>   	AMDGPU_RING_TYPE_GFX,
>   	AMDGPU_RING_TYPE_COMPUTE,
>   	AMDGPU_RING_TYPE_SDMA,
>   	AMDGPU_RING_TYPE_UVD,
>   	AMDGPU_RING_TYPE_VCE,
>   	AMDGPU_RING_TYPE_KIQ,
>   	AMDGPU_RING_TYPE_UVD_ENC,
>   	AMDGPU_RING_TYPE_VCN_DEC,
> @@ -81,21 +82,22 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
>   
>   int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>   				  unsigned num_hw_submission);
>   int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
>   				   struct amdgpu_irq_src *irq_src,
>   				   unsigned irq_type);
>   void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
>   void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
> -int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
> +int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
> +		      unsigned flags);
>   int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
>   void amdgpu_fence_process(struct amdgpu_ring *ring);
>   int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
>   signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
>   				      uint32_t wait_seq,
>   				      signed long timeout);
>   unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
>   
>   /*
>    * Rings.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 24474294c92a..fe05351ea4d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -620,21 +620,21 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
>   
>   	if (vm_flush_needed) {
>   		trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
>   		amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
>   	}
>   
>   	if (pasid_mapping_needed)
>   		amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
>   
>   	if (vm_flush_needed || pasid_mapping_needed) {
> -		r = amdgpu_fence_emit(ring, &fence);
> +		r = amdgpu_fence_emit(ring, &fence, 0);
>   		if (r)
>   			return r;
>   	}
>   
>   	if (vm_flush_needed) {
>   		mutex_lock(&id_mgr->lock);
>   		dma_fence_put(id->last_flush);
>   		id->last_flush = dma_fence_get(fence);
>   		id->current_gpu_reset_count =
>   			atomic_read(&adev->gpu_reset_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9d39fd5b1822..5dea0d4c0af4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -3767,27 +3767,30 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>                                   lower_32_bits(ib->gpu_addr));
>           amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
>           amdgpu_ring_write(ring, control);
>   }
>   
>   static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
>   				     u64 seq, unsigned flags)
>   {
>   	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
>   	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
> +	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
>   
>   	/* RELEASE_MEM - flush caches, send int */
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
> -	amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
> -				 EOP_TC_ACTION_EN |
> -				 EOP_TC_WB_ACTION_EN |
> -				 EOP_TC_MD_ACTION_EN |
> +	amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
> +					       EOP_TC_NC_ACTION_EN) :
> +					      (EOP_TCL1_ACTION_EN |
> +					       EOP_TC_ACTION_EN |
> +					       EOP_TC_WB_ACTION_EN |
> +					       EOP_TC_MD_ACTION_EN)) |
>   				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
>   				 EVENT_INDEX(5)));
>   	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>   
>   	/*
>   	 * the address should be Qword aligned if 64bit write, Dword
>   	 * aligned if only send 32bit data low (discard data high)
>   	 */
>   	if (write64bit)
>   		BUG_ON(addr & 0x7);
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> index 7f408f85fdb6..839a144c1645 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
> @@ -152,20 +152,21 @@
>   		 * 4 - *S_PARTIAL_FLUSH
>   		 */
>   #define	PACKET3_RELEASE_MEM				0x49
>   #define		EVENT_TYPE(x)                           ((x) << 0)
>   #define		EVENT_INDEX(x)                          ((x) << 8)
>   #define		EOP_TCL1_VOL_ACTION_EN                  (1 << 12)
>   #define		EOP_TC_VOL_ACTION_EN                    (1 << 13) /* L2 */
>   #define		EOP_TC_WB_ACTION_EN                     (1 << 15) /* L2 */
>   #define		EOP_TCL1_ACTION_EN                      (1 << 16)
>   #define		EOP_TC_ACTION_EN                        (1 << 17) /* L2 */
> +#define		EOP_TC_NC_ACTION_EN			(1 << 19)
>   #define		EOP_TC_MD_ACTION_EN			(1 << 21) /* L2 metadata */
>   
>   #define		DATA_SEL(x)                             ((x) << 29)
>   		/* 0 - discard
>   		 * 1 - send low 32bit data
>   		 * 2 - send 64bit data
>   		 * 3 - send 64bit GPU counter value
>   		 * 4 - send 64bit sys counter value
>   		 */
>   #define		INT_SEL(x)                              ((x) << 24)
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 0087799962cf..f5901bd9c7d8 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -516,20 +516,24 @@ union drm_amdgpu_cs {
>   
>   /* This IB should be submitted to CE */
>   #define AMDGPU_IB_FLAG_CE	(1<<0)
>   
>   /* Preamble flag, which means the IB could be dropped if no context switch */
>   #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>   
>   /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
>   #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>   
> +/* The IB fence should do the L2 writeback but not invalidate any shader
> + * caches (L2/vL1/sL1/I$). */
> +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
> +
>   struct drm_amdgpu_cs_chunk_ib {
>   	__u32 _pad;
>   	/** AMDGPU_IB_FLAG_* */
>   	__u32 flags;
>   	/** Virtual address to begin IB execution */
>   	__u64 va_start;
>   	/** Size of submission */
>   	__u32 ib_bytes;
>   	/** HW IP to submit to */
>   	__u32 ip_type;