radeonsi: use compute for resource_copy_region when possible

Submitted by Marek Olšák on Jan. 15, 2019, 5:50 p.m.

Details

Message ID: 20190115175040.26801-1-maraeo@gmail.com
Series: "radeonsi: use compute for resource_copy_region when possible" (rev 1) in Mesa

Commit Message

Marek Olšák Jan. 15, 2019, 5:50 p.m.
From: Sonny Jiang <sonny.jiang@amd.com>

v2: marek: fix snorm8 blits

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c        |  12 ++
 .../drivers/radeonsi/si_compute_blit.c        | 108 ++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c        |   4 +
 src/gallium/drivers/radeonsi/si_pipe.h        |  11 ++
 .../drivers/radeonsi/si_shaderlib_tgsi.c      |  77 +++++++++++++
 5 files changed, 212 insertions(+)


diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 16be11247e4..bb8d1cbd12d 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -895,32 +895,44 @@  struct texture_orig_info {
 void si_resource_copy_region(struct pipe_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
 			     unsigned dstx, unsigned dsty, unsigned dstz,
 			     struct pipe_resource *src,
 			     unsigned src_level,
 			     const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_texture *ssrc = (struct si_texture*)src;
+	struct si_texture *sdst = (struct si_texture*)dst;
 	struct pipe_surface *dst_view, dst_templ;
 	struct pipe_sampler_view src_templ, *src_view;
 	unsigned dst_width, dst_height, src_width0, src_height0;
 	unsigned dst_width0, dst_height0, src_force_level = 0;
 	struct pipe_box sbox, dstbox;
 
 	/* Handle buffers first. */
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 		si_copy_buffer(sctx, dst, src, dstx, src_box->x, src_box->width);
 		return;
 	}
 
+	if (!util_format_is_compressed(src->format) &&
+	    !util_format_is_compressed(dst->format) &&
+	    !util_format_is_depth_or_stencil(src->format) &&
+	    src->nr_samples <= 1 &&
+	    !sdst->dcc_offset &&
+	    !(dst->target != src->target &&
+	      (src->target == PIPE_TEXTURE_1D_ARRAY || dst->target == PIPE_TEXTURE_1D_ARRAY))) {
+		si_compute_copy_image(sctx, dst, dst_level, src, src_level, dstx, dsty, dstz, src_box);
+		return;
+	}
+
 	assert(u_max_sample(dst) == u_max_sample(src));
 
 	/* The driver doesn't decompress resources automatically while
 	 * u_blitter is rendering. */
 	si_decompress_subresource(ctx, src, PIPE_MASK_RGBAZS, src_level,
 				  src_box->z, src_box->z + src_box->depth - 1);
 
 	dst_width = u_minify(dst->width0, dst_level);
 	dst_height = u_minify(dst->height0, dst_level);
 	dst_width0 = dst->width0;
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index dfa77a98804..c547d124507 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -17,20 +17,21 @@ 
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  */
 
 #include "si_pipe.h"
+#include "util/u_format.h"
 
 /* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
  * and L2_STREAM for src.
  */
 static enum si_cache_policy get_cache_policy(struct si_context *sctx,
 					     enum si_coherency coher,
 					     uint64_t size)
 {
 	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
 					  coher == SI_COHERENCY_CP)) ||
@@ -285,14 +286,121 @@  void si_copy_buffer(struct si_context *sctx,
 	    size > 32 * 1024 &&
 	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
 		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
 					    size, NULL, 0, coher);
 	} else {
 		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
 				      0, coher, cache_policy);
 	}
 }
 
+void si_compute_copy_image(struct si_context *sctx,
+			   struct pipe_resource *dst,
+			   unsigned dst_level,
+			   struct pipe_resource *src,
+			   unsigned src_level,
+			   unsigned dstx, unsigned dsty, unsigned dstz,
+			   const struct pipe_box *src_box)
+{
+	struct pipe_context *ctx = &sctx->b;
+	unsigned width = src_box->width;
+	unsigned height = src_box->height;
+	unsigned depth = src_box->depth;
+
+	unsigned data[] = {src_box->x, src_box->y, src_box->z, 0, dstx, dsty, dstz, 0};
+
+	if (width == 0 || height == 0)
+		return;
+
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+	si_make_CB_shader_coherent(sctx, dst->nr_samples, true);
+
+	struct pipe_constant_buffer saved_cb = {};
+	si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+
+	struct si_images *images = &sctx->images[PIPE_SHADER_COMPUTE];
+	struct pipe_image_view saved_image[2] = {0};
+	util_copy_image_view(&saved_image[0], &images->views[0]);
+	util_copy_image_view(&saved_image[1], &images->views[1]);
+
+	void *saved_cs = sctx->cs_shader_state.program;
+
+	struct pipe_constant_buffer cb = {};
+	cb.buffer_size = sizeof(data);
+	cb.user_buffer = data;
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &cb);
+
+	struct pipe_image_view image[2] = {0};
+	image[0].resource = src;
+	image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
+	image[0].format = util_format_linear(src->format);
+	image[0].u.tex.level = src_level;
+	image[0].u.tex.first_layer = 0;
+	image[0].u.tex.last_layer =
+		src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, src_level) - 1
+						: (unsigned)(src->array_size - 1);
+	image[1].resource = dst;
+	image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
+	image[1].format = util_format_linear(dst->format);
+	image[1].u.tex.level = dst_level;
+	image[1].u.tex.first_layer = 0;
+	image[1].u.tex.last_layer =
+		dst->target == PIPE_TEXTURE_3D ? u_minify(dst->depth0, dst_level) - 1
+						: (unsigned)(dst->array_size - 1);
+
+	if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
+		image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;
+
+	/* SNORM8 blitting has precision issues on some chips. Use the SINT
+	 * equivalent instead, which doesn't force DCC decompression.
+	 * Note that some chips avoid this issue by using SDMA.
+	 */
+	if (util_format_is_snorm8(dst->format)) {
+		image[0].format = image[1].format =
+			util_format_snorm8_to_sint8(dst->format);
+	}
+
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, image);
+
+	struct pipe_grid_info info = {0};
+
+	if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
+		if (!sctx->cs_copy_image_1d_array)
+			sctx->cs_copy_image_1d_array =
+				si_create_copy_image_compute_shader_1d_array(ctx);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_image_1d_array);
+		info.block[0] = 64;
+		info.last_block[0] = width % 64;
+		info.block[1] = 1;
+		info.block[2] = 1;
+		info.grid[0] = DIV_ROUND_UP(width, 64);
+		info.grid[1] = depth;
+		info.grid[2] = 1;
+	} else {
+		if (!sctx->cs_copy_image)
+			sctx->cs_copy_image = si_create_copy_image_compute_shader(ctx);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_image);
+		info.block[0] = 8;
+		info.last_block[0] = width % 8;
+		info.block[1] = 8;
+		info.last_block[1] = height % 8;
+		info.block[2] = 1;
+		info.grid[0] = DIV_ROUND_UP(width, 8);
+		info.grid[1] = DIV_ROUND_UP(height, 8);
+		info.grid[2] = depth;
+	}
+
+	ctx->launch_grid(ctx, &info);
+
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       (sctx->chip_class <= VI ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0) |
+		       si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
+	ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
 	sctx->b.clear_buffer = si_pipe_clear_buffer;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 6b89a1192d9..0bab41c9a0c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -194,20 +194,24 @@  static void si_destroy_context(struct pipe_context *context)
 	if (sctx->vs_blit_color)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
 	if (sctx->vs_blit_color_layered)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
 	if (sctx->vs_blit_texcoord)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
 	if (sctx->cs_clear_buffer)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
 	if (sctx->cs_copy_buffer)
 		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
+	if (sctx->cs_copy_image)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
+	if (sctx->cs_copy_image_1d_array)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
 
 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);
 
 	/* Release DCC stats. */
 	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
 		assert(!sctx->dcc_stats[i].query_active);
 
 		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
 			if (sctx->dcc_stats[i].ps_stats[j])
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 957629e4633..9943998a707 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -793,20 +793,22 @@  struct si_context {
 	void				*custom_blend_fmask_decompress;
 	void				*custom_blend_eliminate_fastclear;
 	void				*custom_blend_dcc_decompress;
 	void				*vs_blit_pos;
 	void				*vs_blit_pos_layered;
 	void				*vs_blit_color;
 	void				*vs_blit_color_layered;
 	void				*vs_blit_texcoord;
 	void				*cs_clear_buffer;
 	void				*cs_copy_buffer;
+	void				*cs_copy_image;
+	void				*cs_copy_image_1d_array;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
 	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	struct r600_resource		*wait_mem_scratch;
 	unsigned			wait_mem_number;
 	uint16_t			prefetch_L2_mask;
 
 	bool				gfx_flush_in_progress:1;
 	bool				gfx_last_ib_is_busy:1;
@@ -1141,20 +1143,27 @@  void si_init_clear_functions(struct si_context *sctx);
 
 /* si_compute_blit.c */
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, uint32_t *clear_value,
 		     uint32_t clear_value_size, enum si_coherency coher);
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_compute_copy_image(struct si_context *sctx,
+			   struct pipe_resource *dst,
+			   unsigned dst_level,
+			   struct pipe_resource *src,
+			   unsigned src_level,
+			   unsigned dstx, unsigned dsty, unsigned dstz,
+			   const struct pipe_box *src_box);
 void si_init_compute_blit_functions(struct si_context *sctx);
 
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
 #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
 #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and don't wait for PS/CS */
 #define SI_CPDMA_SKIP_BO_LIST_UPDATE	(1 << 4) /* don't update the BO list */
 #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
 			   SI_CPDMA_SKIP_SYNC_AFTER | \
@@ -1251,20 +1260,22 @@  void si_init_query_functions(struct si_context *sctx);
 void si_suspend_queries(struct si_context *sctx);
 void si_resume_queries(struct si_context *sctx);
 
 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 			unsigned num_layers);
 void *si_create_fixed_func_tcs(struct si_context *sctx);
 void *si_create_dma_compute_shader(struct pipe_context *ctx,
 				   unsigned num_dwords_per_thread,
 				   bool dst_stream_cache_policy, bool is_copy);
+void *si_create_copy_image_compute_shader(struct pipe_context *ctx);
+void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx);
 void *si_create_query_result_cs(struct si_context *sctx);
 
 /* si_test_dma.c */
 void si_test_dma(struct si_screen *sscreen);
 
 /* si_test_clearbuffer.c */
 void si_test_dma_perf(struct si_screen *sscreen);
 
 /* si_uvd.c */
 struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index da55c81dd68..55f96b3a25e 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -432,10 +432,87 @@  void *si_create_query_result_cs(struct si_context *sctx)
 	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
 		assert(false);
 		return NULL;
 	}
 
 	state.ir_type = PIPE_SHADER_IR_TGSI;
 	state.prog = tokens;
 
 	return sctx->b.create_compute_state(&sctx->b, &state);
 }
+
+/* Create a compute shader implementing copy_image.
+ * Luckily, this works with all texture targets except 1D_ARRAY.
+ */
+void *si_create_copy_image_compute_shader(struct pipe_context *ctx)
+{
+	static const char text[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL SV[0], THREAD_ID\n"
+		"DCL SV[1], BLOCK_ID\n"
+		"DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+		"DCL TEMP[0..4], LOCAL\n"
+		"IMM[0] UINT32 {8, 1, 0, 0}\n"
+		"MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
+		"UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
+		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
+		"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"MOV TEMP[4].xyz, CONST[0][1].xyzw\n"
+		"UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n"
+		"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {0};
+
+	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return ctx->create_compute_state(ctx, &state);
+}
+
+void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx)
+{
+	static const char text[] =
+		"COMP\n"
+		"PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
+		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
+		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
+		"DCL SV[0], THREAD_ID\n"
+		"DCL SV[1], BLOCK_ID\n"
+		"DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
+		"DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
+		"DCL TEMP[0..4], LOCAL\n"
+		"IMM[0] UINT32 {64, 1, 0, 0}\n"
+		"MOV TEMP[0].xy, CONST[0][0].xzzw\n"
+		"UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
+		"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
+		"LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"MOV TEMP[4].xy, CONST[0][1].xzzw\n"
+		"UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n"
+		"STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
+		"END\n";
+
+	struct tgsi_token tokens[1024];
+	struct pipe_compute_state state = {0};
+
+	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
+		assert(false);
+		return NULL;
+	}
+
+	state.ir_type = PIPE_SHADER_IR_TGSI;
+	state.prog = tokens;
+
+	return ctx->create_compute_state(ctx, &state);
+}
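
For illustration, here is a minimal sketch (not part of the patch; the
function name is hypothetical) of a Gallium-level call that would now take
the new compute path: a copy between two single-sample, uncompressed,
non-depth textures without DCC. u_box_3d() comes from util/u_box.h.

#include "util/u_box.h"

/* Hypothetical caller: copy a 16x16x1 region between two single-sample
 * RGBA8 textures. With this patch, radeonsi services the call through
 * si_compute_copy_image instead of u_blitter. */
static void copy_region_example(struct pipe_context *ctx,
                                struct pipe_resource *dst,
                                struct pipe_resource *src)
{
        struct pipe_box box;

        u_box_3d(0, 0, 0, 16, 16, 1, &box); /* x, y, z, width, height, depth */
        ctx->resource_copy_region(ctx,
                                  dst, 0,  /* dst, dst_level */
                                  0, 0, 0, /* dstx, dsty, dstz */
                                  src, 0,  /* src, src_level */
                                  &box);
}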

Comments

On 15/01/2019 18:50, Marek Olšák wrote:
> [patch quoted in full; snipped]

Hi,

Here is a summary of my understanding of the proposed copy implementation:

. Store the input and output (x, y, z) offsets in a constant buffer
. (8, 8) workgroups
. Each work-item copies the pixel at
(x + get_group_id(0)*8 + get_local_id(0),
 y + get_group_id(1)*8 + get_local_id(1),
 z + get_group_id(2)*8 + get_local_id(2)). The pixel is RGBA.

Some questions:
. What happens when the textures do not have some components? R32F, for
example.
. I'm not familiar with using images in compute shaders, but is it OK to
declare them as RGBA32F even if the input/output data is not float?

Some comments:

. If src_x, dstx, etc. are not multiples of (8, 8), the workgroups won't
be aligned well with the tiling pattern. Fortunately the cache should
mitigate the loss, but if that's an important case to handle, one could
write the shader differently so that all workgroups (except at the border)
are aligned. I guess one could benchmark to see whether that tiling
alignment matters much here.
. Overhead can be reduced by copying several pixels per work-item.
. If the src and dst regions are perfectly aligned with the tiling
pattern, the copy can be reduced to just moving a rectangle of memory
(no tiling). It could be implemented with dma_copy if no conversion is
needed, or with a shader using buffers (no images), which would avoid
using the image sampling hw; I believe that can be more limiting than
sampling a buffer when there are a lot of wavefronts. The data conversion
could be done at no cost in the shader, as the copy should be memory bound.
. (8, 8) is not optimal for linear tiled images (but I guess we don't 
often get to use them with resource_copy_region).


But most likely you already know all that and consider it not worth
complicating the code to speed up corner cases.

Yours,


Axel Davy
On Tue, Jan 15, 2019 at 3:54 PM Axel Davy <davyaxel0@gmail.com> wrote:

> On 15/01/2019 18:50, Marek Olšák wrote:
> > [patch quoted in full; snipped]
>
> Hi,
>
> Here is a summary of my understanding of the proposed copy
> implementation:
>
> . Store the input and output (x, y, z) offsets in a constant buffer
> . (8, 8) workgroups
> . Each work-item copies the pixel at
> (x + get_group_id(0)*8 + get_local_id(0),
>  y + get_group_id(1)*8 + get_local_id(1),
>  z + get_group_id(2)*8 + get_local_id(2)). The pixel is RGBA.
>

The block size in Z is 1.
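
In C terms, the per-thread mapping is roughly (a sketch of what the TGSI
computes, not code from the patch; the helper is hypothetical):

/* Coordinate mapping of the 2D_ARRAY shader: the block size is 8x8x1 and
 * grid = (ceil(width/8), ceil(height/8), depth), so each block handles
 * exactly one layer/slice and the Z thread id is always 0. */
static void texel_coords(const unsigned block_id[3],
                         const unsigned thread_id[3],
                         const unsigned offset[3], /* from the const buffer */
                         unsigned out[3])
{
        out[0] = offset[0] + block_id[0] * 8 + thread_id[0];
        out[1] = offset[1] + block_id[1] * 8 + thread_id[1];
        out[2] = offset[2] + block_id[2]; /* block depth is 1 */
}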


>
> Some questions:
> . What happens when the textures do not have some components? R32F, for
> example.
>

Components that don't exist are not stored.


> . I'm not familiar with using images in compute shaders, but is it OK to
> declare them as RGBA32F even if the input/output data is not float?
>

The driver ignores the format specified by shaders.
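
That is, the formats in the IMAGE declarations are only nominal; the actual
load/store type follows the bound pipe_image_view, which is what lets the
patch retype the copy, e.g.:

        /* Both images are declared RGBA32F in the TGSI, yet for RGB9E5
         * sources the patch binds them as R32_UINT: */
        if (src->format == PIPE_FORMAT_R9G9B9E5_FLOAT)
                image[0].format = image[1].format = PIPE_FORMAT_R32_UINT;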


>
> Some comments:
>
> . If src_x, dstx, etc. are not multiples of (8, 8), the workgroups won't
> be aligned well with the tiling pattern. Fortunately the cache should
> mitigate the loss, but if that's an important case to handle, one could
> write the shader differently so that all workgroups (except at the border)
> are aligned. I guess one could benchmark to see whether that tiling
> alignment matters much here.
>

That would complicate the shaders too much.

> . Overhead can be reduced by copying several pixels per work-item.

Patches welcome. :)

> . If the src and dst regions are perfectly aligned with the tiling
> pattern, the copy can be reduced to just moving a rectangle of memory
> (no tiling). It could be implemented with dma_copy if no conversion is
> needed, or with a shader using buffers (no images), which would avoid
> using the image sampling hw; I believe that can be more limiting than
> sampling a buffer when there are a lot of wavefronts. The data conversion
> could be done at no cost in the shader, as the copy should be memory bound.
>

Too complicated.


> . (8, 8) is not optimal for linear tiled images (but I guess we don't
> often get to use them with resource_copy_region).
>

Yes, linear -> linear copies are slower. Tiled <-> linear copies wouldn't
be improved.

Marek