[8/8] radeonsi: rename and re-document cache flush flags

Submitted by Marek Olšák on June 20, 2019, 4:19 a.m.

Details

Message ID 20190620041941.14001-8-maraeo@gmail.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Marek Olšák June 20, 2019, 4:19 a.m.
From: Marek Olšák <marek.olsak@amd.com>

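SMEM and VMEM caches are L0 on gfx10, so the "L1" in the old flag names is
no longer accurate. Name the flags after the caches themselves (SCACHE,
VCACHE, L2) and update their documentation accordingly.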
---
 src/gallium/drivers/radeonsi/si_compute.c     |  2 +-
 .../drivers/radeonsi/si_compute_blit.c        | 12 +++---
 src/gallium/drivers/radeonsi/si_descriptors.c |  2 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c      |  8 ++--
 src/gallium/drivers/radeonsi/si_pipe.c        |  8 ++--
 src/gallium/drivers/radeonsi/si_pipe.h        | 34 +++++++++--------
 src/gallium/drivers/radeonsi/si_state.c       | 14 +++----
 src/gallium/drivers/radeonsi/si_state_draw.c  | 38 +++++++++----------
 .../drivers/radeonsi/si_state_streamout.c     |  6 +--
 .../drivers/radeonsi/si_test_dma_perf.c       |  6 +--
 10 files changed, 66 insertions(+), 64 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 7e5259b70a0..63c95ed2604 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -910,21 +910,21 @@  static void si_launch_grid(
 	/* Add buffer sizes for memory checking in need_cs_space. */
 	si_context_add_resource_size(sctx, &program->shader.bo->b.b);
 	/* TODO: add the scratch buffer */
 
 	if (info->indirect) {
 		si_context_add_resource_size(sctx, info->indirect);
 
 		/* Indirect buffers use TC L2 on GFX9, but not older hw. */
 		if (sctx->chip_class <= GFX8 &&
 		    si_resource(info->indirect)->TC_L2_dirty) {
-			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+			sctx->flags |= SI_CONTEXT_WB_L2;
 			si_resource(info->indirect)->TC_L2_dirty = false;
 		}
 	}
 
 	si_need_gfx_cs_space(sctx);
 
 	if (sctx->bo_list_add_all_compute_resources)
 		si_compute_resources_add_all_to_bo_list(sctx);
 
 	if (!sctx->cs_shader_state.initialized) {
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 1cfdc9b62c6..4c5464ac118 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -44,23 +44,23 @@  static enum si_cache_policy get_cache_policy(struct si_context *sctx,
 
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
 			    enum si_cache_policy cache_policy)
 {
 	switch (coher) {
 	default:
 	case SI_COHERENCY_NONE:
 	case SI_COHERENCY_CP:
 		return 0;
 	case SI_COHERENCY_SHADER:
-		return SI_CONTEXT_INV_SMEM_L1 |
-		       SI_CONTEXT_INV_VMEM_L1 |
-		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+		return SI_CONTEXT_INV_SCACHE |
+		       SI_CONTEXT_INV_VCACHE |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
 	case SI_COHERENCY_CB_META:
 		return SI_CONTEXT_FLUSH_AND_INV_CB;
 	}
 }
 
 static void si_compute_internal_begin(struct si_context *sctx)
 {
 	sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
 	sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
 	sctx->render_cond_force_off = true;
@@ -165,21 +165,21 @@  static void si_compute_do_clear_or_copy(struct si_context *sctx,
 							     SI_COMPUTE_CLEAR_DW_PER_THREAD,
 							     shader_dst_stream_policy, false);
 		}
 		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
 	}
 
 	ctx->launch_grid(ctx, &info);
 
 	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
 	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
 
 	if (cache_policy != L2_BYPASS)
 		si_resource(dst)->TC_L2_dirty = true;
 
 	/* Restore states. */
 	ctx->bind_compute_state(ctx, saved_cs);
 	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
 				saved_writable_mask);
 	si_compute_internal_end(sctx);
 }
@@ -411,21 +411,21 @@  void si_compute_copy_image(struct si_context *sctx,
 		info.last_block[1] = height % 8;
 		info.block[2] = 1;
 		info.grid[0] = DIV_ROUND_UP(width, 8);
 		info.grid[1] = DIV_ROUND_UP(height, 8);
 		info.grid[2] = depth;
 	}
 
 	ctx->launch_grid(ctx, &info);
 
 	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
 		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
 	ctx->bind_compute_state(ctx, saved_cs);
 	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
 	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
 	si_compute_internal_end(sctx);
 }
 
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
 {
 	struct pipe_context *ctx = &sctx->b;
@@ -590,17 +590,17 @@  void si_compute_clear_render_target(struct pipe_context *ctx,
 		info.block[1] = 1;
 		info.block[2] = 1;
 		info.grid[0] = DIV_ROUND_UP(width, 64);
 		info.grid[1] = num_layers;
 		info.grid[2] = 1;
 	}
 
 	ctx->launch_grid(ctx, &info);
 
 	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
 		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
 	ctx->bind_compute_state(ctx, saved_cs);
 	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
 	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
 	si_compute_internal_end(sctx);
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 37d92fa7363..2a13ffd32f9 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1874,21 +1874,21 @@  static void si_upload_bindless_descriptors(struct si_context *sctx)
 		unsigned desc_slot = (*img_handle)->desc_slot;
 
 		if (!(*img_handle)->desc_dirty)
 			continue;
 
 		si_upload_bindless_descriptor(sctx, desc_slot, 8);
 		(*img_handle)->desc_dirty = false;
 	}
 
 	/* Invalidate L1 because it doesn't know that L2 changed. */
-	sctx->flags |= SI_CONTEXT_INV_SMEM_L1;
+	sctx->flags |= SI_CONTEXT_INV_SCACHE;
 	si_emit_cache_flush(sctx);
 
 	sctx->bindless_descriptors_dirty = false;
 }
 
 /* Update mutable image descriptor fields of all resident textures. */
 static void si_update_bindless_texture_descriptor(struct si_context *sctx,
 						  struct si_texture_handle *tex_handle)
 {
 	struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index de0909904c8..9386df3a615 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -76,21 +76,21 @@  void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct radeon_winsys *ws = ctx->ws;
 	unsigned wait_flags = 0;
 
 	if (ctx->gfx_flush_in_progress)
 		return;
 
 	if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
 		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			      SI_CONTEXT_CS_PARTIAL_FLUSH |
-			      SI_CONTEXT_INV_GLOBAL_L2;
+			      SI_CONTEXT_INV_L2;
 	} else if (ctx->chip_class == GFX6) {
 		/* The kernel flushes L2 before shaders are finished. */
 		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			      SI_CONTEXT_CS_PARTIAL_FLUSH;
 	} else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
 		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			      SI_CONTEXT_CS_PARTIAL_FLUSH;
 	}
 
 	/* Drop this flush if it's a no-op. */
@@ -297,23 +297,23 @@  void si_begin_new_gfx_cs(struct si_context *ctx)
 	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
 	 * buffers.
 	 *
 	 * Note that the cache flush done by the kernel at the end of GFX IBs
 	 * isn't useful here, because that flush can finish after the following
 	 * IB starts drawing.
 	 *
 	 * TODO: Do we also need to invalidate CB & DB caches?
 	 */
 	ctx->flags |= SI_CONTEXT_INV_ICACHE |
-		      SI_CONTEXT_INV_SMEM_L1 |
-		      SI_CONTEXT_INV_VMEM_L1 |
-		      SI_CONTEXT_INV_GLOBAL_L2 |
+		      SI_CONTEXT_INV_SCACHE |
+		      SI_CONTEXT_INV_VCACHE |
+		      SI_CONTEXT_INV_L2 |
 		      SI_CONTEXT_START_PIPELINE_STATS;
 
 	ctx->cs_shader_state.initialized = false;
 	si_all_descriptors_begin_new_cs(ctx);
 
 	if (!ctx->has_graphics) {
 		ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
 		return;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index d0d04bbb3de..31a9d92461f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1148,25 +1148,25 @@  radeonsi_screen_create_impl(struct radeon_winsys *ws,
 
 	sscreen->dcc_msaa_allowed =
 		!(sscreen->debug_flags & DBG(NO_DCC_MSAA));
 
 	sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= GFX8;
 
 	(void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
 	sscreen->use_monolithic_shaders =
 		(sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
 
-	sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
-					    SI_CONTEXT_INV_VMEM_L1;
+	sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE |
+					  SI_CONTEXT_INV_VCACHE;
 	if (sscreen->info.chip_class <= GFX8) {
-		sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
-		sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+		sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
+		sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
 	}
 
 	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
 		sscreen->debug_flags |= DBG_ALL_SHADERS;
 
 	/* Syntax:
 	 *     EQAA=s,z,c
 	 * Example:
 	 *     EQAA=8,4,2
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 368cb4e473d..11678e1b4cb 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -58,30 +58,32 @@ 
 #define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
 #define SI_COMPUTE_COPY_DW_PER_THREAD	4
 #define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
 
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
 #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(1 << 3)
-/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
-#define SI_CONTEXT_INV_SMEM_L1		(1 << 4)
-/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
-#define SI_CONTEXT_INV_VMEM_L1		(1 << 5)
-/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
-#define SI_CONTEXT_INV_GLOBAL_L2	(1 << 6)
-/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
- * invalidate L2. GFX6-GFX7 can't do it, so they will do complete invalidation. */
-#define SI_CONTEXT_WRITEBACK_GLOBAL_L2	(1 << 7)
-/* Writeback & invalidate the L2 metadata cache. It can only be coupled with
+/* Scalar L1 cache. */
+#define SI_CONTEXT_INV_SCACHE		(1 << 4)
+/* Vector L1 cache. */
+#define SI_CONTEXT_INV_VCACHE		(1 << 5)
+/* L2 cache + L2 metadata cache writeback & invalidate.
+ * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
+#define SI_CONTEXT_INV_L2		(1 << 6)
+/* L2 writeback (write dirty L2 lines to memory for non-L2 clients).
+ * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8.
+ * GFX6-7 will do complete invalidation, because the writeback is unsupported. */
+#define SI_CONTEXT_WB_L2		(1 << 7)
+/* Writeback & invalidate the L2 metadata cache only. It can only be coupled with
  * a CB or DB flush. */
 #define SI_CONTEXT_INV_L2_METADATA	(1 << 8)
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_DB	(1 << 9)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10)
 #define SI_CONTEXT_FLUSH_AND_INV_CB	(1 << 11)
 /* Engine synchronization. */
 #define SI_CONTEXT_VS_PARTIAL_FLUSH	(1 << 12)
 #define SI_CONTEXT_PS_PARTIAL_FLUSH	(1 << 13)
 #define SI_CONTEXT_CS_PARTIAL_FLUSH	(1 << 14)
@@ -1639,57 +1641,57 @@  si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
 		si_destroy_saved_cs(*dst);
 
 	*dst = src;
 }
 
 static inline void
 si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
 			   bool shaders_read_metadata, bool dcc_pipe_aligned)
 {
 	sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
-		       SI_CONTEXT_INV_VMEM_L1;
+		       SI_CONTEXT_INV_VCACHE;
 
 	if (sctx->chip_class >= GFX9) {
 		/* Single-sample color is coherent with shaders on GFX9, but
 		 * L2 metadata must be flushed if shaders read metadata.
 		 * (DCC, CMASK).
 		 */
 		if (num_samples >= 2 ||
 		    (shaders_read_metadata && !dcc_pipe_aligned))
-			sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+			sctx->flags |= SI_CONTEXT_INV_L2;
 		else if (shaders_read_metadata)
 			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
 	} else {
 		/* GFX6-GFX8 */
-		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+		sctx->flags |= SI_CONTEXT_INV_L2;
 	}
 }
 
 static inline void
 si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
 			   bool include_stencil, bool shaders_read_metadata)
 {
 	sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
-		       SI_CONTEXT_INV_VMEM_L1;
+		       SI_CONTEXT_INV_VCACHE;
 
 	if (sctx->chip_class >= GFX9) {
 		/* Single-sample depth (not stencil) is coherent with shaders
 		 * on GFX9, but L2 metadata must be flushed if shaders read
 		 * metadata.
 		 */
 		if (num_samples >= 2 || include_stencil)
-			sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+			sctx->flags |= SI_CONTEXT_INV_L2;
 		else if (shaders_read_metadata)
 			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
 	} else {
 		/* GFX6-GFX8 */
-		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
+		sctx->flags |= SI_CONTEXT_INV_L2;
 	}
 }
 
 static inline bool
 si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
 {
 	return (stencil_sampler && tex->can_sample_s) ||
 	       (!stencil_sampler && tex->can_sample_z);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index af21914a142..b9fc77f7918 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4785,61 +4785,61 @@  static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	if (!(flags & ~PIPE_BARRIER_UPDATE))
 		return;
 
 	/* Subsequent commands must wait for all shader invocations to
 	 * complete. */
 	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-	                 SI_CONTEXT_CS_PARTIAL_FLUSH;
+		       SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
-		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-				 SI_CONTEXT_INV_VMEM_L1;
+		sctx->flags |= SI_CONTEXT_INV_SCACHE |
+			       SI_CONTEXT_INV_VCACHE;
 
 	if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
 		     PIPE_BARRIER_SHADER_BUFFER |
 		     PIPE_BARRIER_TEXTURE |
 		     PIPE_BARRIER_IMAGE |
 		     PIPE_BARRIER_STREAMOUT_BUFFER |
 		     PIPE_BARRIER_GLOBAL_BUFFER)) {
 		/* As far as I can tell, L1 contents are written back to L2
 		 * automatically at end of shader, but the contents of other
 		 * L1 caches might still be stale. */
-		sctx->flags |= SI_CONTEXT_INV_VMEM_L1;
+		sctx->flags |= SI_CONTEXT_INV_VCACHE;
 	}
 
 	if (flags & PIPE_BARRIER_INDEX_BUFFER) {
 		/* Indices are read through TC L2 since GFX8.
 		 * L1 isn't used.
 		 */
 		if (sctx->screen->info.chip_class <= GFX7)
-			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+			sctx->flags |= SI_CONTEXT_WB_L2;
 	}
 
 	/* MSAA color, any depth and any stencil are flushed in
 	 * si_decompress_textures when needed.
 	 */
 	if (flags & PIPE_BARRIER_FRAMEBUFFER &&
 	    sctx->framebuffer.uncompressed_cb_mask) {
 		sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
 
 		if (sctx->chip_class <= GFX8)
-			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+			sctx->flags |= SI_CONTEXT_WB_L2;
 	}
 
 	/* Indirect buffers use TC L2 on GFX9, but not older hw. */
 	if (sctx->screen->info.chip_class <= GFX8 &&
 	    flags & PIPE_BARRIER_INDIRECT_BUFFER)
-		sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+		sctx->flags |= SI_CONTEXT_WB_L2;
 }
 
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
 	struct pipe_blend_state blend;
 
 	memset(&blend, 0, sizeof(blend));
 	blend.independent_blend_enable = true;
 	blend.rt[0].colormask = 0xf;
 	return si_create_blend_state_mode(&sctx->b, &blend, mode);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index d780547659e..a81be533d64 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -952,24 +952,24 @@  void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
 }
 
 void si_emit_cache_flush(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint32_t flags = sctx->flags;
 
 	if (!sctx->has_graphics) {
 		/* Only process compute flags. */
 		flags &= SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_SMEM_L1 |
-			 SI_CONTEXT_INV_VMEM_L1 |
-			 SI_CONTEXT_INV_GLOBAL_L2 |
-			 SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
+			 SI_CONTEXT_INV_SCACHE |
+			 SI_CONTEXT_INV_VCACHE |
+			 SI_CONTEXT_INV_L2 |
+			 SI_CONTEXT_WB_L2 |
 			 SI_CONTEXT_INV_L2_METADATA |
 			 SI_CONTEXT_CS_PARTIAL_FLUSH;
 	}
 
 	uint32_t cp_coher_cntl = 0;
 	const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
 					      SI_CONTEXT_FLUSH_AND_INV_DB);
 	const bool is_barrier = flush_cb_db ||
 				/* INV_ICACHE == beginning of gfx IB. Checking
 				 * INV_ICACHE fixes corruption for DeusExMD with
@@ -989,21 +989,21 @@  void si_emit_cache_flush(struct si_context *sctx)
 	/* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
 	 * bit is set. An alternative way is to write SQC_CACHES, but that
 	 * doesn't seem to work reliably. Since the bug doesn't affect
 	 * correctness (it only does more work than necessary) and
 	 * the performance impact is likely negligible, there is no plan
 	 * to add a workaround for it.
 	 */
 
 	if (flags & SI_CONTEXT_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-	if (flags & SI_CONTEXT_INV_SMEM_L1)
+	if (flags & SI_CONTEXT_INV_SCACHE)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
 	if (sctx->chip_class <= GFX8) {
 		if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
 			cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
 					 S_0085F0_CB0_DEST_BASE_ENA(1) |
 					 S_0085F0_CB1_DEST_BASE_ENA(1) |
 					 S_0085F0_CB2_DEST_BASE_ENA(1) |
 					 S_0085F0_CB3_DEST_BASE_ENA(1) |
 					 S_0085F0_CB4_DEST_BASE_ENA(1) |
@@ -1107,29 +1107,29 @@  void si_emit_cache_flush(struct si_context *sctx)
 		 * TCL1                  = invalidate L1
 		 */
 		tc_flags = 0;
 
 		if (flags & SI_CONTEXT_INV_L2_METADATA) {
 			tc_flags = EVENT_TC_ACTION_ENA |
 				   EVENT_TC_MD_ACTION_ENA;
 		}
 
 		/* Ideally flush TC together with CB/DB. */
-		if (flags & SI_CONTEXT_INV_GLOBAL_L2) {
+		if (flags & SI_CONTEXT_INV_L2) {
 			/* Writeback and invalidate everything in L2 & L1. */
 			tc_flags = EVENT_TC_ACTION_ENA |
 				   EVENT_TC_WB_ACTION_ENA;
 
 			/* Clear the flags. */
-			flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
-				   SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
-				   SI_CONTEXT_INV_VMEM_L1);
+			flags &= ~(SI_CONTEXT_INV_L2 |
+				   SI_CONTEXT_WB_L2 |
+				   SI_CONTEXT_INV_VCACHE);
 			sctx->num_L2_invalidates++;
 		}
 
 		/* Do the flush (enqueue the event and wait for it). */
 		va = sctx->wait_mem_scratch->gpu_address;
 		sctx->wait_mem_number++;
 
 		si_cp_release_mem(sctx, cs, cb_db_event, tc_flags,
 				  EOP_DST_SEL_MEM,
 				  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
@@ -1139,66 +1139,66 @@  void si_emit_cache_flush(struct si_context *sctx)
 		si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff,
 			       WAIT_REG_MEM_EQUAL);
 	}
 
 	/* Make sure ME is idle (it executes most packets) before continuing.
 	 * This prevents read-after-write hazards between PFP and ME.
 	 */
 	if (sctx->has_graphics &&
 	    (cp_coher_cntl ||
 	     (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       SI_CONTEXT_INV_VMEM_L1 |
-		       SI_CONTEXT_INV_GLOBAL_L2 |
-		       SI_CONTEXT_WRITEBACK_GLOBAL_L2)))) {
+		       SI_CONTEXT_INV_VCACHE |
+		       SI_CONTEXT_INV_L2 |
+		       SI_CONTEXT_WB_L2)))) {
 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
 		radeon_emit(cs, 0);
 	}
 
 	/* GFX6-GFX8 only:
 	 *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
 	 *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
 	 *
 	 * cp_coher_cntl should contain all necessary flags except TC flags
 	 * at this point.
 	 *
 	 * GFX6-GFX7 don't support L2 write-back.
 	 */
-	if (flags & SI_CONTEXT_INV_GLOBAL_L2 ||
+	if (flags & SI_CONTEXT_INV_L2 ||
 	    (sctx->chip_class <= GFX7 &&
-	     (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
+	     (flags & SI_CONTEXT_WB_L2))) {
 		/* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
 		 * WB must be set on GFX8+ when TC_ACTION is set.
 		 */
 		si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
 				     S_0085F0_TC_ACTION_ENA(1) |
 				     S_0085F0_TCL1_ACTION_ENA(1) |
 				     S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
 		cp_coher_cntl = 0;
 		sctx->num_L2_invalidates++;
 	} else {
 		/* L1 invalidation and L2 writeback must be done separately,
 		 * because both operations can't be done together.
 		 */
-		if (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) {
+		if (flags & SI_CONTEXT_WB_L2) {
 			/* WB = write-back
 			 * NC = apply to non-coherent MTYPEs
 			 *      (i.e. MTYPE <= 1, which is what we use everywhere)
 			 *
 			 * WB doesn't work without NC.
 			 */
 			si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
 					     S_0301F0_TC_WB_ACTION_ENA(1) |
 					     S_0301F0_TC_NC_ACTION_ENA(1));
 			cp_coher_cntl = 0;
 			sctx->num_L2_writebacks++;
 		}
-		if (flags & SI_CONTEXT_INV_VMEM_L1) {
+		if (flags & SI_CONTEXT_INV_VCACHE) {
 			/* Invalidate per-CU VMEM L1. */
 			si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
 					     S_0085F0_TCL1_ACTION_ENA(1));
 			cp_coher_cntl = 0;
 		}
 	}
 
 	/* If TC flushes haven't cleared this... */
 	if (cp_coher_cntl)
 		si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
@@ -1581,46 +1581,46 @@  static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
 				      &index_offset, &indexbuf);
 			if (!indexbuf)
 				return;
 
 			/* info->start will be added by the drawing code */
 			index_offset -= start_offset;
 		} else if (sctx->chip_class <= GFX7 &&
 			   si_resource(indexbuf)->TC_L2_dirty) {
 			/* GFX8 reads index buffers through TC L2, so it doesn't
 			 * need this. */
-			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+			sctx->flags |= SI_CONTEXT_WB_L2;
 			si_resource(indexbuf)->TC_L2_dirty = false;
 		}
 	}
 
 	bool dispatch_prim_discard_cs = false;
 	bool prim_discard_cs_instancing = false;
 	unsigned original_index_size = index_size;
 	unsigned direct_count = 0;
 
 	if (info->indirect) {
 		struct pipe_draw_indirect_info *indirect = info->indirect;
 
 		/* Add the buffer size for memory checking in need_cs_space. */
 		si_context_add_resource_size(sctx, indirect->buffer);
 
 		/* Indirect buffers use TC L2 on GFX9, but not older hw. */
 		if (sctx->chip_class <= GFX8) {
 			if (si_resource(indirect->buffer)->TC_L2_dirty) {
-				sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+				sctx->flags |= SI_CONTEXT_WB_L2;
 				si_resource(indirect->buffer)->TC_L2_dirty = false;
 			}
 
 			if (indirect->indirect_draw_count &&
 			    si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
-				sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+				sctx->flags |= SI_CONTEXT_WB_L2;
 				si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
 			}
 		}
 	} else {
 		/* Multiply by 3 for strips and fans to get an approximate vertex
 		 * count as triangles. */
 		direct_count = info->count * instance_count *
 			       (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index e7058f19a8a..e3c72ccdf49 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -114,23 +114,23 @@  static void si_set_streamout_targets(struct pipe_context *ctx,
 		/* Invalidate the scalar cache in case a streamout buffer is
 		 * going to be used as a constant buffer.
 		 *
 		 * Invalidate vL1, because streamout bypasses it (done by
 		 * setting GLC=1 in the store instruction), but vL1 in other
 		 * CUs can contain outdated data of streamout buffers.
 		 *
 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
 		 * used as an input immediately.
 		 */
-		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-				 SI_CONTEXT_INV_VMEM_L1 |
-				 SI_CONTEXT_VS_PARTIAL_FLUSH;
+		sctx->flags |= SI_CONTEXT_INV_SCACHE |
+			       SI_CONTEXT_INV_VCACHE |
+			       SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
 
 	/* All readers of the streamout targets need to be finished before we can
 	 * start writing to the targets.
 	 */
 	if (num_targets)
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 		                 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
 	/* Streamout buffers must be bound in 2 places:
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index 0b5a4a38ab7..0a0b9c4a657 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -226,40 +226,40 @@  void si_test_dma_perf(struct si_screen *sscreen)
 						sb[0].buffer_size = size;
 
 						if (is_copy) {
 							sb[1].buffer = src;
 							sb[1].buffer_size = size;
 						} else {
 							for (unsigned i = 0; i < 4; i++)
 								sctx->cs_user_data[i] = clear_value;
 						}
 
-						sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
-							       SI_CONTEXT_INV_SMEM_L1;
+						sctx->flags |= SI_CONTEXT_INV_VCACHE |
+							       SI_CONTEXT_INV_SCACHE;
 
 						ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
 									is_copy ? 2 : 1, sb, 0x1);
 						ctx->bind_compute_state(ctx, cs);
 						sctx->cs_max_waves_per_sh = cs_waves_per_sh;
 
 						ctx->launch_grid(ctx, &info);
 
 						ctx->bind_compute_state(ctx, NULL);
 						ctx->delete_compute_state(ctx, cs);
 						sctx->cs_max_waves_per_sh = 0; /* disable the limit */
 
 						sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
 					}
 
 					/* Flush L2, so that we don't just test L2 cache performance. */
 					if (!test_sdma) {
-						sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
+						sctx->flags |= SI_CONTEXT_WB_L2;
 						si_emit_cache_flush(sctx);
 					}
 
 					ctx->end_query(ctx, q[iter]);
 					ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
 				}
 				pipe_resource_reference(&dst, NULL);
 				pipe_resource_reference(&src, NULL);
 
 				/* Get results. */

Comments

For the series

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>

on Polaris 20

Except for the gfx10-specific parts, which this GPU cannot exercise. ;-)

Dieter

On 20.06.2019 06:19, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
> 
> SMEM and VMEM caches are L0 on gfx10.
> ---
>  src/gallium/drivers/radeonsi/si_compute.c     |  2 +-
>  .../drivers/radeonsi/si_compute_blit.c        | 12 +++---
>  src/gallium/drivers/radeonsi/si_descriptors.c |  2 +-
>  src/gallium/drivers/radeonsi/si_gfx_cs.c      |  8 ++--
>  src/gallium/drivers/radeonsi/si_pipe.c        |  8 ++--
>  src/gallium/drivers/radeonsi/si_pipe.h        | 34 +++++++++--------
>  src/gallium/drivers/radeonsi/si_state.c       | 14 +++----
>  src/gallium/drivers/radeonsi/si_state_draw.c  | 38 +++++++++----------
>  .../drivers/radeonsi/si_state_streamout.c     |  6 +--
>  .../drivers/radeonsi/si_test_dma_perf.c       |  6 +--
>  10 files changed, 66 insertions(+), 64 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 7e5259b70a0..63c95ed2604 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -910,21 +910,21 @@ static void si_launch_grid(
>  	/* Add buffer sizes for memory checking in need_cs_space. */
>  	si_context_add_resource_size(sctx, &program->shader.bo->b.b);
>  	/* TODO: add the scratch buffer */
> 
>  	if (info->indirect) {
>  		si_context_add_resource_size(sctx, info->indirect);
> 
>  		/* Indirect buffers use TC L2 on GFX9, but not older hw. */
>  		if (sctx->chip_class <= GFX8 &&
>  		    si_resource(info->indirect)->TC_L2_dirty) {
> -			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +			sctx->flags |= SI_CONTEXT_WB_L2;
>  			si_resource(info->indirect)->TC_L2_dirty = false;
>  		}
>  	}
> 
>  	si_need_gfx_cs_space(sctx);
> 
>  	if (sctx->bo_list_add_all_compute_resources)
>  		si_compute_resources_add_all_to_bo_list(sctx);
> 
>  	if (!sctx->cs_shader_state.initialized) {
> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
> index 1cfdc9b62c6..4c5464ac118 100644
> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> @@ -44,23 +44,23 @@ static enum si_cache_policy get_cache_policy(struct si_context *sctx,
> 
>  unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
>  			    enum si_cache_policy cache_policy)
>  {
>  	switch (coher) {
>  	default:
>  	case SI_COHERENCY_NONE:
>  	case SI_COHERENCY_CP:
>  		return 0;
>  	case SI_COHERENCY_SHADER:
> -		return SI_CONTEXT_INV_SMEM_L1 |
> -		       SI_CONTEXT_INV_VMEM_L1 |
> -		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
> +		return SI_CONTEXT_INV_SCACHE |
> +		       SI_CONTEXT_INV_VCACHE |
> +		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_L2 : 0);
>  	case SI_COHERENCY_CB_META:
>  		return SI_CONTEXT_FLUSH_AND_INV_CB;
>  	}
>  }
> 
>  static void si_compute_internal_begin(struct si_context *sctx)
>  {
>  	sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
>  	sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
>  	sctx->render_cond_force_off = true;
> @@ -165,21 +165,21 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx,
>  							     SI_COMPUTE_CLEAR_DW_PER_THREAD,
>  							     shader_dst_stream_policy, false);
>  		}
>  		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
>  	}
> 
>  	ctx->launch_grid(ctx, &info);
> 
>  	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
>  	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> -		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
> +		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
> 
>  	if (cache_policy != L2_BYPASS)
>  		si_resource(dst)->TC_L2_dirty = true;
> 
>  	/* Restore states. */
>  	ctx->bind_compute_state(ctx, saved_cs);
>  	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
>  				saved_writable_mask);
>  	si_compute_internal_end(sctx);
>  }
> @@ -411,21 +411,21 @@ void si_compute_copy_image(struct si_context *sctx,
>  		info.last_block[1] = height % 8;
>  		info.block[2] = 1;
>  		info.grid[0] = DIV_ROUND_UP(width, 8);
>  		info.grid[1] = DIV_ROUND_UP(height, 8);
>  		info.grid[2] = depth;
>  	}
> 
>  	ctx->launch_grid(ctx, &info);
> 
>  	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> -		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
> +		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
>  		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
>  	ctx->bind_compute_state(ctx, saved_cs);
>  	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
>  	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
>  	si_compute_internal_end(sctx);
>  }
> 
>  void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
>  {
>  	struct pipe_context *ctx = &sctx->b;
> @@ -590,17 +590,17 @@ void si_compute_clear_render_target(struct pipe_context *ctx,
>  		info.block[1] = 1;
>  		info.block[2] = 1;
>  		info.grid[0] = DIV_ROUND_UP(width, 64);
>  		info.grid[1] = num_layers;
>  		info.grid[2] = 1;
>  	}
> 
>  	ctx->launch_grid(ctx, &info);
> 
>  	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
> -		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
> +		       (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
>  		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
>  	ctx->bind_compute_state(ctx, saved_cs);
>  	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
>  	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
>  	si_compute_internal_end(sctx);
>  }
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 37d92fa7363..2a13ffd32f9 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -1874,21 +1874,21 @@ static void si_upload_bindless_descriptors(struct si_context *sctx)
>  		unsigned desc_slot = (*img_handle)->desc_slot;
> 
>  		if (!(*img_handle)->desc_dirty)
>  			continue;
> 
>  		si_upload_bindless_descriptor(sctx, desc_slot, 8);
>  		(*img_handle)->desc_dirty = false;
>  	}
> 
>  	/* Invalidate L1 because it doesn't know that L2 changed. */
> -	sctx->flags |= SI_CONTEXT_INV_SMEM_L1;
> +	sctx->flags |= SI_CONTEXT_INV_SCACHE;
>  	si_emit_cache_flush(sctx);
> 
>  	sctx->bindless_descriptors_dirty = false;
>  }
> 
>  /* Update mutable image descriptor fields of all resident textures. */
>  static void si_update_bindless_texture_descriptor(struct si_context *sctx,
>  						  struct si_texture_handle *tex_handle)
>  {
>  	struct si_sampler_view *sview = (struct si_sampler_view *)tex_handle->view;
> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> index de0909904c8..9386df3a615 100644
> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> @@ -76,21 +76,21 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
>  	struct radeon_cmdbuf *cs = ctx->gfx_cs;
>  	struct radeon_winsys *ws = ctx->ws;
>  	unsigned wait_flags = 0;
> 
>  	if (ctx->gfx_flush_in_progress)
>  		return;
> 
>  	if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
>  		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>  			      SI_CONTEXT_CS_PARTIAL_FLUSH |
> -			      SI_CONTEXT_INV_GLOBAL_L2;
> +			      SI_CONTEXT_INV_L2;
>  	} else if (ctx->chip_class == GFX6) {
>  		/* The kernel flushes L2 before shaders are finished. */
>  		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>  			      SI_CONTEXT_CS_PARTIAL_FLUSH;
>  	} else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
>  		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>  			      SI_CONTEXT_CS_PARTIAL_FLUSH;
>  	}
> 
>  	/* Drop this flush if it's a no-op. */
> @@ -297,23 +297,23 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
>  	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
>  	 * buffers.
>  	 *
>  	 * Note that the cache flush done by the kernel at the end of GFX IBs
>  	 * isn't useful here, because that flush can finish after the following
>  	 * IB starts drawing.
>  	 *
>  	 * TODO: Do we also need to invalidate CB & DB caches?
>  	 */
>  	ctx->flags |= SI_CONTEXT_INV_ICACHE |
> -		      SI_CONTEXT_INV_SMEM_L1 |
> -		      SI_CONTEXT_INV_VMEM_L1 |
> -		      SI_CONTEXT_INV_GLOBAL_L2 |
> +		      SI_CONTEXT_INV_SCACHE |
> +		      SI_CONTEXT_INV_VCACHE |
> +		      SI_CONTEXT_INV_L2 |
>  		      SI_CONTEXT_START_PIPELINE_STATS;
> 
>  	ctx->cs_shader_state.initialized = false;
>  	si_all_descriptors_begin_new_cs(ctx);
> 
>  	if (!ctx->has_graphics) {
>  		ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
>  		return;
>  	}
> 
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index d0d04bbb3de..31a9d92461f 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -1148,25 +1148,25 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
> 
>  	sscreen->dcc_msaa_allowed =
>  		!(sscreen->debug_flags & DBG(NO_DCC_MSAA));
> 
>  	sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= GFX8;
> 
>  	(void) mtx_init(&sscreen->shader_parts_mutex, mtx_plain);
>  	sscreen->use_monolithic_shaders =
>  		(sscreen->debug_flags & DBG(MONOLITHIC_SHADERS)) != 0;
> 
> -	sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SMEM_L1 |
> -					    SI_CONTEXT_INV_VMEM_L1;
> +	sscreen->barrier_flags.cp_to_L2 = SI_CONTEXT_INV_SCACHE |
> +					  SI_CONTEXT_INV_VCACHE;
>  	if (sscreen->info.chip_class <= GFX8) {
> -		sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_GLOBAL_L2;
> -		sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +		sscreen->barrier_flags.cp_to_L2 |= SI_CONTEXT_INV_L2;
> +		sscreen->barrier_flags.L2_to_cp |= SI_CONTEXT_WB_L2;
>  	}
> 
>  	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
>  		sscreen->debug_flags |= DBG_ALL_SHADERS;
> 
>  	/* Syntax:
>  	 *     EQAA=s,z,c
>  	 * Example:
>  	 *     EQAA=8,4,2
> 
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 368cb4e473d..11678e1b4cb 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -58,30 +58,32 @@
>  #define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
>  #define SI_COMPUTE_COPY_DW_PER_THREAD	4
>  #define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
> 
>  /* Pipeline & streamout query controls. */
>  #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
>  #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
>  #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
>  /* Instruction cache. */
>  #define SI_CONTEXT_INV_ICACHE		(1 << 3)
> -/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
> -#define SI_CONTEXT_INV_SMEM_L1		(1 << 4)
> -/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
> -#define SI_CONTEXT_INV_VMEM_L1		(1 << 5)
> -/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
> -#define SI_CONTEXT_INV_GLOBAL_L2	(1 << 6)
> -/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
> - * invalidate L2. GFX6-GFX7 can't do it, so they will do complete invalidation. */
> -#define SI_CONTEXT_WRITEBACK_GLOBAL_L2	(1 << 7)
> -/* Writeback & invalidate the L2 metadata cache. It can only be coupled with
> +/* Scalar L1 cache. */
> +#define SI_CONTEXT_INV_SCACHE		(1 << 4)
> +/* Vector L1 cache. */
> +#define SI_CONTEXT_INV_VCACHE		(1 << 5)
> +/* L2 cache + L2 metadata cache writeback & invalidate.
> + * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
> +#define SI_CONTEXT_INV_L2		(1 << 6)
> +/* L2 writeback (write dirty L2 lines to memory for non-L2 clients).
> + * Only used for coherency with non-L2 clients like CB, DB, CP on GFX6-8.
> + * GFX6-7 will do complete invalidation, because the writeback is unsupported. */
> +#define SI_CONTEXT_WB_L2		(1 << 7)
> +/* Writeback & invalidate the L2 metadata cache only. It can only be coupled with
>   * a CB or DB flush. */
>  #define SI_CONTEXT_INV_L2_METADATA	(1 << 8)
>  /* Framebuffer caches. */
>  #define SI_CONTEXT_FLUSH_AND_INV_DB	(1 << 9)
>  #define SI_CONTEXT_FLUSH_AND_INV_DB_META (1 << 10)
>  #define SI_CONTEXT_FLUSH_AND_INV_CB	(1 << 11)
>  /* Engine synchronization. */
>  #define SI_CONTEXT_VS_PARTIAL_FLUSH	(1 << 12)
>  #define SI_CONTEXT_PS_PARTIAL_FLUSH	(1 << 13)
>  #define SI_CONTEXT_CS_PARTIAL_FLUSH	(1 << 14)
> @@ -1639,57 +1641,57 @@ si_saved_cs_reference(struct si_saved_cs **dst, struct si_saved_cs *src)
>  		si_destroy_saved_cs(*dst);
> 
>  	*dst = src;
>  }
> 
>  static inline void
>  si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
>  			   bool shaders_read_metadata, bool dcc_pipe_aligned)
>  {
>  	sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
> -		       SI_CONTEXT_INV_VMEM_L1;
> +		       SI_CONTEXT_INV_VCACHE;
> 
>  	if (sctx->chip_class >= GFX9) {
>  		/* Single-sample color is coherent with shaders on GFX9, but
>  		 * L2 metadata must be flushed if shaders read metadata.
>  		 * (DCC, CMASK).
>  		 */
>  		if (num_samples >= 2 ||
>  		    (shaders_read_metadata && !dcc_pipe_aligned))
> -			sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
> +			sctx->flags |= SI_CONTEXT_INV_L2;
>  		else if (shaders_read_metadata)
>  			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
>  	} else {
>  		/* GFX6-GFX8 */
> -		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
> +		sctx->flags |= SI_CONTEXT_INV_L2;
>  	}
>  }
> 
>  static inline void
>  si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
>  			   bool include_stencil, bool shaders_read_metadata)
>  {
>  	sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
> -		       SI_CONTEXT_INV_VMEM_L1;
> +		       SI_CONTEXT_INV_VCACHE;
> 
>  	if (sctx->chip_class >= GFX9) {
>  		/* Single-sample depth (not stencil) is coherent with shaders
>  		 * on GFX9, but L2 metadata must be flushed if shaders read
>  		 * metadata.
>  		 */
>  		if (num_samples >= 2 || include_stencil)
> -			sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
> +			sctx->flags |= SI_CONTEXT_INV_L2;
>  		else if (shaders_read_metadata)
>  			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
>  	} else {
>  		/* GFX6-GFX8 */
> -		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
> +		sctx->flags |= SI_CONTEXT_INV_L2;
>  	}
>  }
> 
>  static inline bool
>  si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
>  {
>  	return (stencil_sampler && tex->can_sample_s) ||
>  	       (!stencil_sampler && tex->can_sample_z);
>  }
> 
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index af21914a142..b9fc77f7918 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -4785,61 +4785,61 @@ static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
>  static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
>  {
>  	struct si_context *sctx = (struct si_context *)ctx;
> 
>  	if (!(flags & ~PIPE_BARRIER_UPDATE))
>  		return;
> 
>  	/* Subsequent commands must wait for all shader invocations to
>  	 * complete. */
>  	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> -	                 SI_CONTEXT_CS_PARTIAL_FLUSH;
> +		       SI_CONTEXT_CS_PARTIAL_FLUSH;
> 
>  	if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
> -		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
> -				 SI_CONTEXT_INV_VMEM_L1;
> +		sctx->flags |= SI_CONTEXT_INV_SCACHE |
> +			       SI_CONTEXT_INV_VCACHE;
> 
>  	if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
>  		     PIPE_BARRIER_SHADER_BUFFER |
>  		     PIPE_BARRIER_TEXTURE |
>  		     PIPE_BARRIER_IMAGE |
>  		     PIPE_BARRIER_STREAMOUT_BUFFER |
>  		     PIPE_BARRIER_GLOBAL_BUFFER)) {
>  		/* As far as I can tell, L1 contents are written back to L2
>  		 * automatically at end of shader, but the contents of other
>  		 * L1 caches might still be stale. */
> -		sctx->flags |= SI_CONTEXT_INV_VMEM_L1;
> +		sctx->flags |= SI_CONTEXT_INV_VCACHE;
>  	}
> 
>  	if (flags & PIPE_BARRIER_INDEX_BUFFER) {
>  		/* Indices are read through TC L2 since GFX8.
>  		 * L1 isn't used.
>  		 */
>  		if (sctx->screen->info.chip_class <= GFX7)
> -			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +			sctx->flags |= SI_CONTEXT_WB_L2;
>  	}
> 
>  	/* MSAA color, any depth and any stencil are flushed in
>  	 * si_decompress_textures when needed.
>  	 */
>  	if (flags & PIPE_BARRIER_FRAMEBUFFER &&
>  	    sctx->framebuffer.uncompressed_cb_mask) {
>  		sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
> 
>  		if (sctx->chip_class <= GFX8)
> -			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +			sctx->flags |= SI_CONTEXT_WB_L2;
>  	}
> 
>  	/* Indirect buffers use TC L2 on GFX9, but not older hw. */
>  	if (sctx->screen->info.chip_class <= GFX8 &&
>  	    flags & PIPE_BARRIER_INDIRECT_BUFFER)
> -		sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +		sctx->flags |= SI_CONTEXT_WB_L2;
>  }
> 
>  static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
>  {
>  	struct pipe_blend_state blend;
> 
>  	memset(&blend, 0, sizeof(blend));
>  	blend.independent_blend_enable = true;
>  	blend.rt[0].colormask = 0xf;
>  	return si_create_blend_state_mode(&sctx->b, &blend, mode);
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index d780547659e..a81be533d64 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -952,24 +952,24 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
>  }
> 
>  void si_emit_cache_flush(struct si_context *sctx)
>  {
>  	struct radeon_cmdbuf *cs = sctx->gfx_cs;
>  	uint32_t flags = sctx->flags;
> 
>  	if (!sctx->has_graphics) {
>  		/* Only process compute flags. */
>  		flags &= SI_CONTEXT_INV_ICACHE |
> -			 SI_CONTEXT_INV_SMEM_L1 |
> -			 SI_CONTEXT_INV_VMEM_L1 |
> -			 SI_CONTEXT_INV_GLOBAL_L2 |
> -			 SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
> +			 SI_CONTEXT_INV_SCACHE |
> +			 SI_CONTEXT_INV_VCACHE |
> +			 SI_CONTEXT_INV_L2 |
> +			 SI_CONTEXT_WB_L2 |
>  			 SI_CONTEXT_INV_L2_METADATA |
>  			 SI_CONTEXT_CS_PARTIAL_FLUSH;
>  	}
> 
>  	uint32_t cp_coher_cntl = 0;
>  	const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
>  					      SI_CONTEXT_FLUSH_AND_INV_DB);
>  	const bool is_barrier = flush_cb_db ||
>  				/* INV_ICACHE == beginning of gfx IB. Checking
>  				 * INV_ICACHE fixes corruption for DeusExMD with
> @@ -989,21 +989,21 @@ void si_emit_cache_flush(struct si_context *sctx)
>  	/* GFX6 has a bug that it always flushes ICACHE and KCACHE if either
>  	 * bit is set. An alternative way is to write SQC_CACHES, but that
>  	 * doesn't seem to work reliably. Since the bug doesn't affect
>  	 * correctness (it only does more work than necessary) and
>  	 * the performance impact is likely negligible, there is no plan
>  	 * to add a workaround for it.
>  	 */
> 
>  	if (flags & SI_CONTEXT_INV_ICACHE)
>  		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
> -	if (flags & SI_CONTEXT_INV_SMEM_L1)
> +	if (flags & SI_CONTEXT_INV_SCACHE)
>  		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
> 
>  	if (sctx->chip_class <= GFX8) {
>  		if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
>  			cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
>  					 S_0085F0_CB0_DEST_BASE_ENA(1) |
>  					 S_0085F0_CB1_DEST_BASE_ENA(1) |
>  					 S_0085F0_CB2_DEST_BASE_ENA(1) |
>  					 S_0085F0_CB3_DEST_BASE_ENA(1) |
>  					 S_0085F0_CB4_DEST_BASE_ENA(1) |
> @@ -1107,29 +1107,29 @@ void si_emit_cache_flush(struct si_context *sctx)
>  		 * TCL1                  = invalidate L1
>  		 */
>  		tc_flags = 0;
> 
>  		if (flags & SI_CONTEXT_INV_L2_METADATA) {
>  			tc_flags = EVENT_TC_ACTION_ENA |
>  				   EVENT_TC_MD_ACTION_ENA;
>  		}
> 
>  		/* Ideally flush TC together with CB/DB. */
> -		if (flags & SI_CONTEXT_INV_GLOBAL_L2) {
> +		if (flags & SI_CONTEXT_INV_L2) {
>  			/* Writeback and invalidate everything in L2 & L1. */
>  			tc_flags = EVENT_TC_ACTION_ENA |
>  				   EVENT_TC_WB_ACTION_ENA;
> 
>  			/* Clear the flags. */
> -			flags &= ~(SI_CONTEXT_INV_GLOBAL_L2 |
> -				   SI_CONTEXT_WRITEBACK_GLOBAL_L2 |
> -				   SI_CONTEXT_INV_VMEM_L1);
> +			flags &= ~(SI_CONTEXT_INV_L2 |
> +				   SI_CONTEXT_WB_L2 |
> +				   SI_CONTEXT_INV_VCACHE);
>  			sctx->num_L2_invalidates++;
>  		}
> 
>  		/* Do the flush (enqueue the event and wait for it). */
>  		va = sctx->wait_mem_scratch->gpu_address;
>  		sctx->wait_mem_number++;
> 
>  		si_cp_release_mem(sctx, cs, cb_db_event, tc_flags,
>  				  EOP_DST_SEL_MEM,
>  				  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
> @@ -1139,66 +1139,66 @@ void si_emit_cache_flush(struct si_context *sctx)
>  		si_cp_wait_mem(sctx, cs, va, sctx->wait_mem_number, 0xffffffff,
>  			       WAIT_REG_MEM_EQUAL);
>  	}
> 
>  	/* Make sure ME is idle (it executes most packets) before continuing.
>  	 * This prevents read-after-write hazards between PFP and ME.
>  	 */
>  	if (sctx->has_graphics &&
>  	    (cp_coher_cntl ||
>  	     (flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
> -		       SI_CONTEXT_INV_VMEM_L1 |
> -		       SI_CONTEXT_INV_GLOBAL_L2 |
> -		       SI_CONTEXT_WRITEBACK_GLOBAL_L2)))) {
> +		       SI_CONTEXT_INV_VCACHE |
> +		       SI_CONTEXT_INV_L2 |
> +		       SI_CONTEXT_WB_L2)))) {
>  		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
>  		radeon_emit(cs, 0);
>  	}
> 
>  	/* GFX6-GFX8 only:
>  	 *   When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
>  	 *   waits for idle, so it should be last. SURFACE_SYNC is done in PFP.
>  	 *
>  	 * cp_coher_cntl should contain all necessary flags except TC flags
>  	 * at this point.
>  	 *
>  	 * GFX6-GFX7 don't support L2 write-back.
>  	 */
> -	if (flags & SI_CONTEXT_INV_GLOBAL_L2 ||
> +	if (flags & SI_CONTEXT_INV_L2 ||
>  	    (sctx->chip_class <= GFX7 &&
> -	     (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
> +	     (flags & SI_CONTEXT_WB_L2))) {
>  		/* Invalidate L1 & L2. (L1 is always invalidated on GFX6)
>  		 * WB must be set on GFX8+ when TC_ACTION is set.
>  		 */
>  		si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
>  				     S_0085F0_TC_ACTION_ENA(1) |
>  				     S_0085F0_TCL1_ACTION_ENA(1) |
>  				     S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8));
>  		cp_coher_cntl = 0;
>  		sctx->num_L2_invalidates++;
>  	} else {
>  		/* L1 invalidation and L2 writeback must be done separately,
>  		 * because both operations can't be done together.
>  		 */
> -		if (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) {
> +		if (flags & SI_CONTEXT_WB_L2) {
>  			/* WB = write-back
>  			 * NC = apply to non-coherent MTYPEs
>  			 *      (i.e. MTYPE <= 1, which is what we use everywhere)
>  			 *
>  			 * WB doesn't work without NC.
>  			 */
>  			si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
>  					     S_0301F0_TC_WB_ACTION_ENA(1) |
>  					     S_0301F0_TC_NC_ACTION_ENA(1));
>  			cp_coher_cntl = 0;
>  			sctx->num_L2_writebacks++;
>  		}
> -		if (flags & SI_CONTEXT_INV_VMEM_L1) {
> +		if (flags & SI_CONTEXT_INV_VCACHE) {
>  			/* Invalidate per-CU VMEM L1. */
>  			si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl |
>  					     S_0085F0_TCL1_ACTION_ENA(1));
>  			cp_coher_cntl = 0;
>  		}
>  	}
> 
>  	/* If TC flushes haven't cleared this... */
>  	if (cp_coher_cntl)
>  		si_emit_surface_sync(sctx, sctx->gfx_cs, cp_coher_cntl);
> @@ -1581,46 +1581,46 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
>  				      &index_offset, &indexbuf);
>  			if (!indexbuf)
>  				return;
> 
>  			/* info->start will be added by the drawing code */
>  			index_offset -= start_offset;
>  		} else if (sctx->chip_class <= GFX7 &&
>  			   si_resource(indexbuf)->TC_L2_dirty) {
>  			/* GFX8 reads index buffers through TC L2, so it doesn't
>  			 * need this. */
> -			sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +			sctx->flags |= SI_CONTEXT_WB_L2;
>  			si_resource(indexbuf)->TC_L2_dirty = false;
>  		}
>  	}
> 
>  	bool dispatch_prim_discard_cs = false;
>  	bool prim_discard_cs_instancing = false;
>  	unsigned original_index_size = index_size;
>  	unsigned direct_count = 0;
> 
>  	if (info->indirect) {
>  		struct pipe_draw_indirect_info *indirect = info->indirect;
> 
>  		/* Add the buffer size for memory checking in need_cs_space. */
>  		si_context_add_resource_size(sctx, indirect->buffer);
> 
>  		/* Indirect buffers use TC L2 on GFX9, but not older hw. */
>  		if (sctx->chip_class <= GFX8) {
>  			if (si_resource(indirect->buffer)->TC_L2_dirty) {
> -				sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +				sctx->flags |= SI_CONTEXT_WB_L2;
>  				si_resource(indirect->buffer)->TC_L2_dirty = false;
>  			}
> 
>  			if (indirect->indirect_draw_count &&
>  			    si_resource(indirect->indirect_draw_count)->TC_L2_dirty) {
> -				sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +				sctx->flags |= SI_CONTEXT_WB_L2;
>  				si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false;
>  			}
>  		}
>  	} else {
>  		/* Multiply by 3 for strips and fans to get an approximate vertex
>  		 * count as triangles. */
>  		direct_count = info->count * instance_count *
>  			       (prim == PIPE_PRIM_TRIANGLES ? 1 : 3);
>  	}
> 
> diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
> index e7058f19a8a..e3c72ccdf49 100644
> --- a/src/gallium/drivers/radeonsi/si_state_streamout.c
> +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
> @@ -114,23 +114,23 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
>  		/* Invalidate the scalar cache in case a streamout buffer is
>  		 * going to be used as a constant buffer.
>  		 *
>  		 * Invalidate vL1, because streamout bypasses it (done by
>  		 * setting GLC=1 in the store instruction), but vL1 in other
>  		 * CUs can contain outdated data of streamout buffers.
>  		 *
>  		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
>  		 * used as an input immediately.
>  		 */
> -		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
> -				 SI_CONTEXT_INV_VMEM_L1 |
> -				 SI_CONTEXT_VS_PARTIAL_FLUSH;
> +		sctx->flags |= SI_CONTEXT_INV_SCACHE |
> +			       SI_CONTEXT_INV_VCACHE |
> +			       SI_CONTEXT_VS_PARTIAL_FLUSH;
>  	}
> 
>  	/* All readers of the streamout targets need to be finished before we can
>  	 * start writing to the targets.
>  	 */
>  	if (num_targets)
>  		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
>  		                 SI_CONTEXT_CS_PARTIAL_FLUSH;
> 
>  	/* Streamout buffers must be bound in 2 places:
> diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
> index 0b5a4a38ab7..0a0b9c4a657 100644
> --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
> +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
> @@ -226,40 +226,40 @@ void si_test_dma_perf(struct si_screen *sscreen)
>  						sb[0].buffer_size = size;
> 
>  						if (is_copy) {
>  							sb[1].buffer = src;
>  							sb[1].buffer_size = size;
>  						} else {
>  							for (unsigned i = 0; i < 4; i++)
>  								sctx->cs_user_data[i] = clear_value;
>  						}
> 
> -						sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
> -							       SI_CONTEXT_INV_SMEM_L1;
> +						sctx->flags |= SI_CONTEXT_INV_VCACHE |
> +							       SI_CONTEXT_INV_SCACHE;
> 
>  						ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
>  									is_copy ? 2 : 1, sb, 0x1);
>  						ctx->bind_compute_state(ctx, cs);
>  						sctx->cs_max_waves_per_sh = cs_waves_per_sh;
> 
>  						ctx->launch_grid(ctx, &info);
> 
>  						ctx->bind_compute_state(ctx, NULL);
>  						ctx->delete_compute_state(ctx, cs);
>  						sctx->cs_max_waves_per_sh = 0; /* disable the limit */
> 
>  						sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
>  					}
> 
>  					/* Flush L2, so that we don't just test L2 cache performance. */
>  					if (!test_sdma) {
> -						sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
> +						sctx->flags |= SI_CONTEXT_WB_L2;
>  						si_emit_cache_flush(sctx);
>  					}
> 
>  					ctx->end_query(ctx, q[iter]);
>  					ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
>  				}
>  				pipe_resource_reference(&dst, NULL);
>  				pipe_resource_reference(&src, NULL);
> 
>  				/* Get results. */