[3/3] radeonsi: use a fragment shader blit instead of DB->CB copy for ZS CPU mappings

Submitted by Marek Olšák on June 21, 2019, 5:02 p.m.

Details

Message ID 20190621170250.27794-3-maraeo@gmail.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Marek Olšák June 21, 2019, 5:02 p.m.
From: Marek Olšák <marek.olsak@amd.com>

This mainly removes and simplifies code that is no longer needed.

There were some issues with the DB->CB stencil copy on gfx10, so let's
just use a fragment shader blit for all ZS mappings. It's more reliable.
---
 src/gallium/drivers/radeonsi/si_blit.c    |  29 +---
 src/gallium/drivers/radeonsi/si_pipe.h    |   9 +-
 src/gallium/drivers/radeonsi/si_state.c   |   2 +-
 src/gallium/drivers/radeonsi/si_texture.c | 166 +++++++---------------
 4 files changed, 52 insertions(+), 154 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 5806342cca9..638f2ee4d24 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -173,45 +173,20 @@  si_blit_dbcb_copy(struct si_context *sctx,
 	}
 
 	sctx->decompression_enabled = false;
 	sctx->dbcb_depth_copy_enabled = false;
 	sctx->dbcb_stencil_copy_enabled = false;
 	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 	return fully_copied_levels;
 }
 
-void si_blit_decompress_depth(struct pipe_context *ctx,
-			      struct si_texture *texture,
-			      struct si_texture *staging,
-			      unsigned first_level, unsigned last_level,
-			      unsigned first_layer, unsigned last_layer,
-			      unsigned first_sample, unsigned last_sample)
-{
-	const struct util_format_description *desc;
-	unsigned planes = 0;
-
-	assert(staging != NULL && "use si_blit_decompress_zs_in_place instead");
-
-	desc = util_format_description(staging->buffer.b.b.format);
-
-	if (util_format_has_depth(desc))
-		planes |= PIPE_MASK_Z;
-	if (util_format_has_stencil(desc))
-		planes |= PIPE_MASK_S;
-
-	si_blit_dbcb_copy(
-		(struct si_context *)ctx, texture, staging, planes,
-		u_bit_consecutive(first_level, last_level - first_level + 1),
-		first_layer, last_layer, first_sample, last_sample);
-}
-
 /* Helper function for si_blit_decompress_zs_in_place.
  */
 static void
 si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
 				      struct si_texture *texture,
 				      unsigned planes, unsigned level_mask,
 				      unsigned first_layer, unsigned last_layer)
 {
 	struct pipe_surface *zsurf, surf_tmpl = {{0}};
 	unsigned layer, max_layer, checked_last_layer;
@@ -348,21 +323,21 @@  si_decompress_depth(struct si_context *sctx,
 		u_log_printf(sctx->log,
 			     "\n------------------------------------------------\n"
 			     "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 0x%x)\n\n",
 			     first_level, last_level, levels_z, levels_s);
 
 	/* We may have to allocate the flushed texture here when called from
 	 * si_decompress_subresource.
 	 */
 	if (copy_planes &&
 	    (tex->flushed_depth_texture ||
-	     si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b, NULL))) {
+	     si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
 		struct si_texture *dst = tex->flushed_depth_texture;
 		unsigned fully_copied_levels;
 		unsigned levels = 0;
 
 		assert(tex->flushed_depth_texture);
 
 		if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
 			copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
 
 		if (copy_planes & PIPE_MASK_Z) {
@@ -1242,21 +1217,21 @@  static void si_blit(struct pipe_context *ctx,
 	assert(util_blitter_is_blit_supported(sctx->blitter, info));
 
 	/* The driver doesn't decompress resources automatically while
 	 * u_blitter is rendering. */
 	vi_disable_dcc_if_incompatible_format(sctx, info->src.resource,
 					      info->src.level,
 					      info->src.format);
 	vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource,
 					      info->dst.level,
 					      info->dst.format);
-	si_decompress_subresource(ctx, info->src.resource, info->mask,
+	si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS,
 				  info->src.level,
 				  info->src.box.z,
 				  info->src.box.z + info->src.box.depth - 1);
 
 	if (sctx->screen->debug_flags & DBG(FORCE_DMA) &&
 	    util_try_blit_via_copy_region(ctx, info))
 		return;
 
 	si_blitter_begin(sctx, SI_BLIT |
 			 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 11678e1b4cb..8512c27b2cd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1202,26 +1202,20 @@  void si_blitter_end(struct si_context *sctx);
 void si_init_blit_functions(struct si_context *sctx);
 void si_decompress_textures(struct si_context *sctx, unsigned shader_mask);
 void si_resource_copy_region(struct pipe_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
 			     unsigned dstx, unsigned dsty, unsigned dstz,
 			     struct pipe_resource *src,
 			     unsigned src_level,
 			     const struct pipe_box *src_box);
 void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex);
-void si_blit_decompress_depth(struct pipe_context *ctx,
-			      struct si_texture *texture,
-			      struct si_texture *staging,
-			      unsigned first_level, unsigned last_level,
-			      unsigned first_layer, unsigned last_layer,
-			      unsigned first_sample, unsigned last_sample);
 
 /* si_buffer.c */
 bool si_rings_is_buffer_referenced(struct si_context *sctx,
 				   struct pb_buffer *buf,
 				   enum radeon_bo_usage usage);
 void *si_buffer_map_sync_with_rings(struct si_context *sctx,
 				    struct si_resource *resource,
 				    unsigned usage);
 void si_init_resource_fields(struct si_screen *sscreen,
 			     struct si_resource *res,
@@ -1445,22 +1439,21 @@  bool si_prepare_for_dma_blit(struct si_context *sctx,
 			     unsigned dst_level, unsigned dstx,
 			     unsigned dsty, unsigned dstz,
 			     struct si_texture *src,
 			     unsigned src_level,
 			     const struct pipe_box *src_box);
 void si_eliminate_fast_color_clear(struct si_context *sctx,
 				   struct si_texture *tex);
 void si_texture_discard_cmask(struct si_screen *sscreen,
 			      struct si_texture *tex);
 bool si_init_flushed_depth_texture(struct pipe_context *ctx,
-				   struct pipe_resource *texture,
-				   struct si_texture **staging);
+				   struct pipe_resource *texture);
 void si_print_texture_info(struct si_screen *sscreen,
 			   struct si_texture *tex, struct u_log_context *log);
 struct pipe_resource *si_texture_create(struct pipe_screen *screen,
 					const struct pipe_resource *templ);
 bool vi_dcc_formats_compatible(enum pipe_format format1,
 			       enum pipe_format format2);
 bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
 				     unsigned level,
 				     enum pipe_format view_format);
 void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index b9fc77f7918..3996d280470 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4120,21 +4120,21 @@  si_create_sampler_view_custom(struct pipe_context *ctx,
 	    state->target == PIPE_TEXTURE_RECT ||
 	    state->target == PIPE_TEXTURE_CUBE)
 		last_layer = state->u.tex.first_layer;
 
 	/* Texturing with separate depth and stencil. */
 	pipe_format = state->format;
 
 	/* Depth/stencil texturing sometimes needs separate texture. */
 	if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
 		if (!tex->flushed_depth_texture &&
-		    !si_init_flushed_depth_texture(ctx, texture, NULL)) {
+		    !si_init_flushed_depth_texture(ctx, texture)) {
 			pipe_resource_reference(&view->base.texture, NULL);
 			FREE(view);
 			return NULL;
 		}
 
 		assert(tex->flushed_depth_texture);
 
 		/* Override format for the case where the flushed texture
 		 * contains only Z or only S.
 		 */
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index dd383635675..be2562c45b4 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -121,59 +121,58 @@  static void si_copy_region_with_blit(struct pipe_context *pipe,
 	blit.src.box = *src_box;
 	blit.dst.resource = dst;
 	blit.dst.format = dst->format;
 	blit.dst.level = dst_level;
 	blit.dst.box.x = dstx;
 	blit.dst.box.y = dsty;
 	blit.dst.box.z = dstz;
 	blit.dst.box.width = src_box->width;
 	blit.dst.box.height = src_box->height;
 	blit.dst.box.depth = src_box->depth;
-	blit.mask = util_format_get_mask(src->format) &
-		    util_format_get_mask(dst->format);
+	blit.mask = util_format_get_mask(dst->format);
 	blit.filter = PIPE_TEX_FILTER_NEAREST;
 
 	if (blit.mask) {
 		pipe->blit(pipe, &blit);
 	}
 }
 
 /* Copy from a full GPU texture to a transfer's staging one. */
 static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
 	struct pipe_resource *dst = &stransfer->staging->b.b;
 	struct pipe_resource *src = transfer->resource;
 
-	if (src->nr_samples > 1) {
+	if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) {
 		si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
 					   src, transfer->level, &transfer->box);
 		return;
 	}
 
 	sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
 		       &transfer->box);
 }
 
 /* Copy from a transfer's staging texture to a full GPU one. */
 static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
 	struct pipe_resource *dst = transfer->resource;
 	struct pipe_resource *src = &stransfer->staging->b.b;
 	struct pipe_box sbox;
 
 	u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
 
-	if (dst->nr_samples > 1) {
+	if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) {
 		si_copy_region_with_blit(ctx, dst, transfer->level,
 					   transfer->box.x, transfer->box.y, transfer->box.z,
 					   src, 0, &sbox);
 		return;
 	}
 
 	if (util_format_is_compressed(dst->format)) {
 		sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
 		sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
 	}
@@ -1707,80 +1706,71 @@  static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen,
 					      sscreen->info.max_alignment,
 					      &stride, &offset);
 	if (!buf)
 		return NULL;
 
 	return si_texture_from_winsys_buffer(sscreen, templ, buf, stride,
 					     offset, usage, true);
 }
 
 bool si_init_flushed_depth_texture(struct pipe_context *ctx,
-				   struct pipe_resource *texture,
-				   struct si_texture **staging)
+				   struct pipe_resource *texture)
 {
 	struct si_texture *tex = (struct si_texture*)texture;
 	struct pipe_resource resource;
-	struct si_texture **flushed_depth_texture = staging ?
-			staging : &tex->flushed_depth_texture;
 	enum pipe_format pipe_format = texture->format;
 
-	if (!staging) {
-		if (tex->flushed_depth_texture)
-			return true; /* it's ready */
-
-		if (!tex->can_sample_z && tex->can_sample_s) {
-			switch (pipe_format) {
-			case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-				/* Save memory by not allocating the S plane. */
-				pipe_format = PIPE_FORMAT_Z32_FLOAT;
-				break;
-			case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-			case PIPE_FORMAT_S8_UINT_Z24_UNORM:
-				/* Save memory bandwidth by not copying the
-				 * stencil part during flush.
-				 *
-				 * This potentially increases memory bandwidth
-				 * if an application uses both Z and S texturing
-				 * simultaneously (a flushed Z24S8 texture
-				 * would be stored compactly), but how often
-				 * does that really happen?
-				 */
-				pipe_format = PIPE_FORMAT_Z24X8_UNORM;
-				break;
-			default:;
-			}
-		} else if (!tex->can_sample_s && tex->can_sample_z) {
-			assert(util_format_has_stencil(util_format_description(pipe_format)));
-
-			/* DB->CB copies to an 8bpp surface don't work. */
-			pipe_format = PIPE_FORMAT_X24S8_UINT;
+	assert(!tex->flushed_depth_texture);
+
+	if (!tex->can_sample_z && tex->can_sample_s) {
+		switch (pipe_format) {
+		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+			/* Save memory by not allocating the S plane. */
+			pipe_format = PIPE_FORMAT_Z32_FLOAT;
+			break;
+		case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+			/* Save memory bandwidth by not copying the
+			 * stencil part during flush.
+			 *
+			 * This potentially increases memory bandwidth
+			 * if an application uses both Z and S texturing
+			 * simultaneously (a flushed Z24S8 texture
+			 * would be stored compactly), but how often
+			 * does that really happen?
+			 */
+			pipe_format = PIPE_FORMAT_Z24X8_UNORM;
+			break;
+		default:;
 		}
+	} else if (!tex->can_sample_s && tex->can_sample_z) {
+		assert(util_format_has_stencil(util_format_description(pipe_format)));
+
+		/* DB->CB copies to an 8bpp surface don't work. */
+		pipe_format = PIPE_FORMAT_X24S8_UINT;
 	}
 
 	memset(&resource, 0, sizeof(resource));
 	resource.target = texture->target;
 	resource.format = pipe_format;
 	resource.width0 = texture->width0;
 	resource.height0 = texture->height0;
 	resource.depth0 = texture->depth0;
 	resource.array_size = texture->array_size;
 	resource.last_level = texture->last_level;
 	resource.nr_samples = texture->nr_samples;
-	resource.usage = staging ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
+	resource.usage = PIPE_USAGE_DEFAULT;
 	resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
 	resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
 
-	if (staging)
-		resource.flags |= SI_RESOURCE_FLAG_TRANSFER;
-
-	*flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
-	if (*flushed_depth_texture == NULL) {
+	tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+	if (!tex->flushed_depth_texture) {
 		PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
 		return false;
 	}
 	return true;
 }
 
 /**
  * Initialize the pipe_resource descriptor to be of the same size as the box,
  * which is supposed to hold a subregion of the texture "orig" at the given
  * mipmap level.
@@ -1873,22 +1863,24 @@  static void *si_texture_transfer_map(struct pipe_context *ctx,
 	struct si_texture *tex = (struct si_texture*)texture;
 	struct si_transfer *trans;
 	struct si_resource *buf;
 	unsigned offset = 0;
 	char *map;
 	bool use_staging_texture = false;
 
 	assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
 	assert(box->width && box->height && box->depth);
 
-	/* Depth textures use staging unconditionally. */
-	if (!tex->is_depth) {
+	if (tex->is_depth) {
+		/* Depth textures use staging unconditionally. */
+		use_staging_texture = true;
+	} else {
 		/* Degrade the tile mode if we get too many transfers on APUs.
 		 * On dGPUs, the staging texture is always faster.
 		 * Only count uploads that are at least 4x4 pixels large.
 		 */
 		if (!sctx->screen->info.has_dedicated_vram &&
 		    level == 0 &&
 		    box->width >= 4 && box->height >= 4 &&
 		    p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
 			bool can_invalidate =
 				si_can_invalidate_texture(sctx->screen, tex,
@@ -1929,90 +1921,36 @@  static void *si_texture_transfer_map(struct pipe_context *ctx,
 	}
 
 	trans = CALLOC_STRUCT(si_transfer);
 	if (!trans)
 		return NULL;
 	pipe_resource_reference(&trans->b.b.resource, texture);
 	trans->b.b.level = level;
 	trans->b.b.usage = usage;
 	trans->b.b.box = *box;
 
-	if (tex->is_depth) {
-		struct si_texture *staging_depth;
-
-		if (tex->buffer.b.b.nr_samples > 1) {
-			/* MSAA depth buffers need to be converted to single sample buffers.
-			 *
-			 * Mapping MSAA depth buffers can occur if ReadPixels is called
-			 * with a multisample GLX visual.
-			 *
-			 * First downsample the depth buffer to a temporary texture,
-			 * then decompress the temporary one to staging.
-			 *
-			 * Only the region being mapped is transfered.
-			 */
-			struct pipe_resource resource;
-
-			si_init_temp_resource_from_box(&resource, texture, box, level, 0);
-
-			if (!si_init_flushed_depth_texture(ctx, &resource, &staging_depth)) {
-				PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
-				goto fail_trans;
-			}
-
-			if (usage & PIPE_TRANSFER_READ) {
-				struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource);
-				if (!temp) {
-					PRINT_ERR("failed to create a temporary depth texture\n");
-					goto fail_trans;
-				}
-
-				si_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
-				si_blit_decompress_depth(ctx, (struct si_texture*)temp, staging_depth,
-							 0, 0, 0, box->depth, 0, 0);
-				pipe_resource_reference(&temp, NULL);
-			}
-
-			/* Just get the strides. */
-			si_texture_get_offset(sctx->screen, staging_depth, level, NULL,
-						&trans->b.b.stride,
-						&trans->b.b.layer_stride);
-		} else {
-			/* XXX: only readback the rectangle which is being mapped? */
-			/* XXX: when discard is true, no need to read back from depth texture */
-			if (!si_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
-				PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
-				goto fail_trans;
-			}
-
-			si_blit_decompress_depth(ctx, tex, staging_depth,
-						 level, level,
-						 box->z, box->z + box->depth - 1,
-						 0, 0);
-
-			offset = si_texture_get_offset(sctx->screen, staging_depth,
-							 level, box,
-							 &trans->b.b.stride,
-							 &trans->b.b.layer_stride);
-		}
-
-		trans->staging = &staging_depth->buffer;
-		buf = trans->staging;
-	} else if (use_staging_texture) {
+	if (use_staging_texture) {
 		struct pipe_resource resource;
 		struct si_texture *staging;
 
 		si_init_temp_resource_from_box(&resource, texture, box, level,
 						 SI_RESOURCE_FLAG_TRANSFER);
 		resource.usage = (usage & PIPE_TRANSFER_READ) ?
 			PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
 
+		/* Since depth-stencil textures don't support linear tiling,
+		 * blit from ZS to color and vice versa. u_blitter will do
+		 * the packing for these formats.
+		 */
+		if (tex->is_depth)
+			resource.format = util_blitter_get_color_format_for_zs(resource.format);
+
 		/* Create the temporary texture. */
 		staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
 		if (!staging) {
 			PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
 			goto fail_trans;
 		}
 		trans->staging = &staging->buffer;
 
 		/* Just get the strides. */
 		si_texture_get_offset(sctx->screen, staging, 0, NULL,
@@ -2063,30 +2001,22 @@  static void si_texture_transfer_unmap(struct pipe_context *ctx,
 	/* Always unmap texture CPU mappings on 32-bit architectures, so that
 	 * we don't run out of the CPU address space.
 	 */
 	if (sizeof(void*) == 4) {
 		struct si_resource *buf =
 			stransfer->staging ? stransfer->staging : &tex->buffer;
 
 		sctx->ws->buffer_unmap(buf->buf);
 	}
 
-	if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) {
-		if (tex->is_depth && tex->buffer.b.b.nr_samples <= 1) {
-			ctx->resource_copy_region(ctx, texture, transfer->level,
-						  transfer->box.x, transfer->box.y, transfer->box.z,
-						  &stransfer->staging->b.b, transfer->level,
-						  &transfer->box);
-		} else {
-			si_copy_from_staging_texture(ctx, stransfer);
-		}
-	}
+	if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
+		si_copy_from_staging_texture(ctx, stransfer);
 
 	if (stransfer->staging) {
 		sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
 		si_resource_reference(&stransfer->staging, NULL);
 	}
 
 	/* Heuristic for {upload, draw, upload, draw, ..}:
 	 *
 	 * Flush the gfx IB if we've allocated too much texture storage.
 	 *

Comments

For the series:

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>

Tested on Polaris 20, openSUSE Tumbleweed, KDE Plasma 5.

Dieter

On 21.06.2019 at 19:02, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
> 
> This mainly removes and simplifies code that is no longer needed.
> 
> There were some issues with the DB->CB stencil copy on gfx10, so let's
> just use a fragment shader blit for all ZS mappings. It's more 
> reliable.
> ---
>  src/gallium/drivers/radeonsi/si_blit.c    |  29 +---
>  src/gallium/drivers/radeonsi/si_pipe.h    |   9 +-
>  src/gallium/drivers/radeonsi/si_state.c   |   2 +-
>  src/gallium/drivers/radeonsi/si_texture.c | 166 +++++++---------------
>  4 files changed, 52 insertions(+), 154 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c
> b/src/gallium/drivers/radeonsi/si_blit.c
> index 5806342cca9..638f2ee4d24 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -173,45 +173,20 @@ si_blit_dbcb_copy(struct si_context *sctx,
>  	}
> 
>  	sctx->decompression_enabled = false;
>  	sctx->dbcb_depth_copy_enabled = false;
>  	sctx->dbcb_stencil_copy_enabled = false;
>  	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
> 
>  	return fully_copied_levels;
>  }
> 
> -void si_blit_decompress_depth(struct pipe_context *ctx,
> -			      struct si_texture *texture,
> -			      struct si_texture *staging,
> -			      unsigned first_level, unsigned last_level,
> -			      unsigned first_layer, unsigned last_layer,
> -			      unsigned first_sample, unsigned last_sample)
> -{
> -	const struct util_format_description *desc;
> -	unsigned planes = 0;
> -
> -	assert(staging != NULL && "use si_blit_decompress_zs_in_place 
> instead");
> -
> -	desc = util_format_description(staging->buffer.b.b.format);
> -
> -	if (util_format_has_depth(desc))
> -		planes |= PIPE_MASK_Z;
> -	if (util_format_has_stencil(desc))
> -		planes |= PIPE_MASK_S;
> -
> -	si_blit_dbcb_copy(
> -		(struct si_context *)ctx, texture, staging, planes,
> -		u_bit_consecutive(first_level, last_level - first_level + 1),
> -		first_layer, last_layer, first_sample, last_sample);
> -}
> -
>  /* Helper function for si_blit_decompress_zs_in_place.
>   */
>  static void
>  si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
>  				      struct si_texture *texture,
>  				      unsigned planes, unsigned level_mask,
>  				      unsigned first_layer, unsigned last_layer)
>  {
>  	struct pipe_surface *zsurf, surf_tmpl = {{0}};
>  	unsigned layer, max_layer, checked_last_layer;
> @@ -348,21 +323,21 @@ si_decompress_depth(struct si_context *sctx,
>  		u_log_printf(sctx->log,
>  			     "\n------------------------------------------------\n"
>  			     "Decompress Depth (levels %u - %u, levels Z: 0x%x S: 
> 0x%x)\n\n",
>  			     first_level, last_level, levels_z, levels_s);
> 
>  	/* We may have to allocate the flushed texture here when called from
>  	 * si_decompress_subresource.
>  	 */
>  	if (copy_planes &&
>  	    (tex->flushed_depth_texture ||
> -	     si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b, 
> NULL))) {
> +	     si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b))) {
>  		struct si_texture *dst = tex->flushed_depth_texture;
>  		unsigned fully_copied_levels;
>  		unsigned levels = 0;
> 
>  		assert(tex->flushed_depth_texture);
> 
>  		if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
>  			copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
> 
>  		if (copy_planes & PIPE_MASK_Z) {
> @@ -1242,21 +1217,21 @@ static void si_blit(struct pipe_context *ctx,
>  	assert(util_blitter_is_blit_supported(sctx->blitter, info));
> 
>  	/* The driver doesn't decompress resources automatically while
>  	 * u_blitter is rendering. */
>  	vi_disable_dcc_if_incompatible_format(sctx, info->src.resource,
>  					      info->src.level,
>  					      info->src.format);
>  	vi_disable_dcc_if_incompatible_format(sctx, info->dst.resource,
>  					      info->dst.level,
>  					      info->dst.format);
> -	si_decompress_subresource(ctx, info->src.resource, info->mask,
> +	si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS,
>  				  info->src.level,
>  				  info->src.box.z,
>  				  info->src.box.z + info->src.box.depth - 1);
> 
>  	if (sctx->screen->debug_flags & DBG(FORCE_DMA) &&
>  	    util_try_blit_via_copy_region(ctx, info))
>  		return;
> 
>  	si_blitter_begin(sctx, SI_BLIT |
>  			 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/drivers/radeonsi/si_pipe.h
> index 11678e1b4cb..8512c27b2cd 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -1202,26 +1202,20 @@ void si_blitter_end(struct si_context *sctx);
>  void si_init_blit_functions(struct si_context *sctx);
>  void si_decompress_textures(struct si_context *sctx, unsigned 
> shader_mask);
>  void si_resource_copy_region(struct pipe_context *ctx,
>  			     struct pipe_resource *dst,
>  			     unsigned dst_level,
>  			     unsigned dstx, unsigned dsty, unsigned dstz,
>  			     struct pipe_resource *src,
>  			     unsigned src_level,
>  			     const struct pipe_box *src_box);
>  void si_decompress_dcc(struct si_context *sctx, struct si_texture 
> *tex);
> -void si_blit_decompress_depth(struct pipe_context *ctx,
> -			      struct si_texture *texture,
> -			      struct si_texture *staging,
> -			      unsigned first_level, unsigned last_level,
> -			      unsigned first_layer, unsigned last_layer,
> -			      unsigned first_sample, unsigned last_sample);
> 
>  /* si_buffer.c */
>  bool si_rings_is_buffer_referenced(struct si_context *sctx,
>  				   struct pb_buffer *buf,
>  				   enum radeon_bo_usage usage);
>  void *si_buffer_map_sync_with_rings(struct si_context *sctx,
>  				    struct si_resource *resource,
>  				    unsigned usage);
>  void si_init_resource_fields(struct si_screen *sscreen,
>  			     struct si_resource *res,
> @@ -1445,22 +1439,21 @@ bool si_prepare_for_dma_blit(struct si_context 
> *sctx,
>  			     unsigned dst_level, unsigned dstx,
>  			     unsigned dsty, unsigned dstz,
>  			     struct si_texture *src,
>  			     unsigned src_level,
>  			     const struct pipe_box *src_box);
>  void si_eliminate_fast_color_clear(struct si_context *sctx,
>  				   struct si_texture *tex);
>  void si_texture_discard_cmask(struct si_screen *sscreen,
>  			      struct si_texture *tex);
>  bool si_init_flushed_depth_texture(struct pipe_context *ctx,
> -				   struct pipe_resource *texture,
> -				   struct si_texture **staging);
> +				   struct pipe_resource *texture);
>  void si_print_texture_info(struct si_screen *sscreen,
>  			   struct si_texture *tex, struct u_log_context *log);
>  struct pipe_resource *si_texture_create(struct pipe_screen *screen,
>  					const struct pipe_resource *templ);
>  bool vi_dcc_formats_compatible(enum pipe_format format1,
>  			       enum pipe_format format2);
>  bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
>  				     unsigned level,
>  				     enum pipe_format view_format);
>  void vi_disable_dcc_if_incompatible_format(struct si_context *sctx,
> diff --git a/src/gallium/drivers/radeonsi/si_state.c
> b/src/gallium/drivers/radeonsi/si_state.c
> index b9fc77f7918..3996d280470 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -4120,21 +4120,21 @@ si_create_sampler_view_custom(struct 
> pipe_context *ctx,
>  	    state->target == PIPE_TEXTURE_RECT ||
>  	    state->target == PIPE_TEXTURE_CUBE)
>  		last_layer = state->u.tex.first_layer;
> 
>  	/* Texturing with separate depth and stencil. */
>  	pipe_format = state->format;
> 
>  	/* Depth/stencil texturing sometimes needs separate texture. */
>  	if (tex->is_depth && !si_can_sample_zs(tex, 
> view->is_stencil_sampler)) {
>  		if (!tex->flushed_depth_texture &&
> -		    !si_init_flushed_depth_texture(ctx, texture, NULL)) {
> +		    !si_init_flushed_depth_texture(ctx, texture)) {
>  			pipe_resource_reference(&view->base.texture, NULL);
>  			FREE(view);
>  			return NULL;
>  		}
> 
>  		assert(tex->flushed_depth_texture);
> 
>  		/* Override format for the case where the flushed texture
>  		 * contains only Z or only S.
>  		 */
> diff --git a/src/gallium/drivers/radeonsi/si_texture.c
> b/src/gallium/drivers/radeonsi/si_texture.c
> index dd383635675..be2562c45b4 100644
> --- a/src/gallium/drivers/radeonsi/si_texture.c
> +++ b/src/gallium/drivers/radeonsi/si_texture.c
> @@ -121,59 +121,58 @@ static void si_copy_region_with_blit(struct
> pipe_context *pipe,
>  	blit.src.box = *src_box;
>  	blit.dst.resource = dst;
>  	blit.dst.format = dst->format;
>  	blit.dst.level = dst_level;
>  	blit.dst.box.x = dstx;
>  	blit.dst.box.y = dsty;
>  	blit.dst.box.z = dstz;
>  	blit.dst.box.width = src_box->width;
>  	blit.dst.box.height = src_box->height;
>  	blit.dst.box.depth = src_box->depth;
> -	blit.mask = util_format_get_mask(src->format) &
> -		    util_format_get_mask(dst->format);
> +	blit.mask = util_format_get_mask(dst->format);
>  	blit.filter = PIPE_TEX_FILTER_NEAREST;
> 
>  	if (blit.mask) {
>  		pipe->blit(pipe, &blit);
>  	}
>  }
> 
>  /* Copy from a full GPU texture to a transfer's staging one. */
>  static void si_copy_to_staging_texture(struct pipe_context *ctx,
> struct si_transfer *stransfer)
>  {
>  	struct si_context *sctx = (struct si_context*)ctx;
>  	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
>  	struct pipe_resource *dst = &stransfer->staging->b.b;
>  	struct pipe_resource *src = transfer->resource;
> 
> -	if (src->nr_samples > 1) {
> +	if (src->nr_samples > 1 || ((struct si_texture*)src)->is_depth) {
>  		si_copy_region_with_blit(ctx, dst, 0, 0, 0, 0,
>  					   src, transfer->level, &transfer->box);
>  		return;
>  	}
> 
>  	sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level,
>  		       &transfer->box);
>  }
> 
>  /* Copy from a transfer's staging texture to a full GPU one. */
>  static void si_copy_from_staging_texture(struct pipe_context *ctx,
> struct si_transfer *stransfer)
>  {
>  	struct si_context *sctx = (struct si_context*)ctx;
>  	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
>  	struct pipe_resource *dst = transfer->resource;
>  	struct pipe_resource *src = &stransfer->staging->b.b;
>  	struct pipe_box sbox;
> 
>  	u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height,
> transfer->box.depth, &sbox);
> 
> -	if (dst->nr_samples > 1) {
> +	if (dst->nr_samples > 1 || ((struct si_texture*)dst)->is_depth) {
>  		si_copy_region_with_blit(ctx, dst, transfer->level,
>  					   transfer->box.x, transfer->box.y, transfer->box.z,
>  					   src, 0, &sbox);
>  		return;
>  	}
> 
>  	if (util_format_is_compressed(dst->format)) {
>  		sbox.width = util_format_get_nblocksx(dst->format, sbox.width);
>  		sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
>  	}
> @@ -1707,80 +1706,71 @@ static struct pipe_resource
> *si_texture_from_handle(struct pipe_screen *screen,
>  					      sscreen->info.max_alignment,
>  					      &stride, &offset);
>  	if (!buf)
>  		return NULL;
> 
>  	return si_texture_from_winsys_buffer(sscreen, templ, buf, stride,
>  					     offset, usage, true);
>  }
> 
>  bool si_init_flushed_depth_texture(struct pipe_context *ctx,
> -				   struct pipe_resource *texture,
> -				   struct si_texture **staging)
> +				   struct pipe_resource *texture)
>  {
>  	struct si_texture *tex = (struct si_texture*)texture;
>  	struct pipe_resource resource;
> -	struct si_texture **flushed_depth_texture = staging ?
> -			staging : &tex->flushed_depth_texture;
>  	enum pipe_format pipe_format = texture->format;
> 
> -	if (!staging) {
> -		if (tex->flushed_depth_texture)
> -			return true; /* it's ready */
> -
> -		if (!tex->can_sample_z && tex->can_sample_s) {
> -			switch (pipe_format) {
> -			case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
> -				/* Save memory by not allocating the S plane. */
> -				pipe_format = PIPE_FORMAT_Z32_FLOAT;
> -				break;
> -			case PIPE_FORMAT_Z24_UNORM_S8_UINT:
> -			case PIPE_FORMAT_S8_UINT_Z24_UNORM:
> -				/* Save memory bandwidth by not copying the
> -				 * stencil part during flush.
> -				 *
> -				 * This potentially increases memory bandwidth
> -				 * if an application uses both Z and S texturing
> -				 * simultaneously (a flushed Z24S8 texture
> -				 * would be stored compactly), but how often
> -				 * does that really happen?
> -				 */
> -				pipe_format = PIPE_FORMAT_Z24X8_UNORM;
> -				break;
> -			default:;
> -			}
> -		} else if (!tex->can_sample_s && tex->can_sample_z) {
> -			assert(util_format_has_stencil(util_format_description(pipe_format)));
> -
> -			/* DB->CB copies to an 8bpp surface don't work. */
> -			pipe_format = PIPE_FORMAT_X24S8_UINT;
> +	assert(!tex->flushed_depth_texture);
> +
> +	if (!tex->can_sample_z && tex->can_sample_s) {
> +		switch (pipe_format) {
> +		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
> +			/* Save memory by not allocating the S plane. */
> +			pipe_format = PIPE_FORMAT_Z32_FLOAT;
> +			break;
> +		case PIPE_FORMAT_Z24_UNORM_S8_UINT:
> +		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
> +			/* Save memory bandwidth by not copying the
> +			 * stencil part during flush.
> +			 *
> +			 * This potentially increases memory bandwidth
> +			 * if an application uses both Z and S texturing
> +			 * simultaneously (a flushed Z24S8 texture
> +			 * would be stored compactly), but how often
> +			 * does that really happen?
> +			 */
> +			pipe_format = PIPE_FORMAT_Z24X8_UNORM;
> +			break;
> +		default:;
>  		}
> +	} else if (!tex->can_sample_s && tex->can_sample_z) {
> +		assert(util_format_has_stencil(util_format_description(pipe_format)));
> +
> +		/* DB->CB copies to an 8bpp surface don't work. */
> +		pipe_format = PIPE_FORMAT_X24S8_UINT;
>  	}
> 
>  	memset(&resource, 0, sizeof(resource));
>  	resource.target = texture->target;
>  	resource.format = pipe_format;
>  	resource.width0 = texture->width0;
>  	resource.height0 = texture->height0;
>  	resource.depth0 = texture->depth0;
>  	resource.array_size = texture->array_size;
>  	resource.last_level = texture->last_level;
>  	resource.nr_samples = texture->nr_samples;
> -	resource.usage = staging ? PIPE_USAGE_STAGING : PIPE_USAGE_DEFAULT;
> +	resource.usage = PIPE_USAGE_DEFAULT;
>  	resource.bind = texture->bind & ~PIPE_BIND_DEPTH_STENCIL;
>  	resource.flags = texture->flags | SI_RESOURCE_FLAG_FLUSHED_DEPTH;
> 
> -	if (staging)
> -		resource.flags |= SI_RESOURCE_FLAG_TRANSFER;
> -
> -	*flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
> -	if (*flushed_depth_texture == NULL) {
> +	tex->flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
> +	if (!tex->flushed_depth_texture) {
>  		PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
>  		return false;
>  	}
>  	return true;
>  }
> 
>  /**
>   * Initialize the pipe_resource descriptor to be of the same size as the box,
>   * which is supposed to hold a subregion of the texture "orig" at the given
>   * mipmap level.
> @@ -1873,22 +1863,24 @@ static void *si_texture_transfer_map(struct pipe_context *ctx,
>  	struct si_texture *tex = (struct si_texture*)texture;
>  	struct si_transfer *trans;
>  	struct si_resource *buf;
>  	unsigned offset = 0;
>  	char *map;
>  	bool use_staging_texture = false;
> 
>  	assert(!(texture->flags & SI_RESOURCE_FLAG_TRANSFER));
>  	assert(box->width && box->height && box->depth);
> 
> -	/* Depth textures use staging unconditionally. */
> -	if (!tex->is_depth) {
> +	if (tex->is_depth) {
> +		/* Depth textures use staging unconditionally. */
> +		use_staging_texture = true;
> +	} else {
>  		/* Degrade the tile mode if we get too many transfers on APUs.
>  		 * On dGPUs, the staging texture is always faster.
>  		 * Only count uploads that are at least 4x4 pixels large.
>  		 */
>  		if (!sctx->screen->info.has_dedicated_vram &&
>  		    level == 0 &&
>  		    box->width >= 4 && box->height >= 4 &&
>  		    p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
>  			bool can_invalidate =
>  				si_can_invalidate_texture(sctx->screen, tex,
> @@ -1929,90 +1921,36 @@ static void *si_texture_transfer_map(struct pipe_context *ctx,
>  	}
> 
>  	trans = CALLOC_STRUCT(si_transfer);
>  	if (!trans)
>  		return NULL;
>  	pipe_resource_reference(&trans->b.b.resource, texture);
>  	trans->b.b.level = level;
>  	trans->b.b.usage = usage;
>  	trans->b.b.box = *box;
> 
> -	if (tex->is_depth) {
> -		struct si_texture *staging_depth;
> -
> -		if (tex->buffer.b.b.nr_samples > 1) {
> -			/* MSAA depth buffers need to be converted to single sample buffers.
> -			 *
> -			 * Mapping MSAA depth buffers can occur if ReadPixels is called
> -			 * with a multisample GLX visual.
> -			 *
> -			 * First downsample the depth buffer to a temporary texture,
> -			 * then decompress the temporary one to staging.
> -			 *
> -			 * Only the region being mapped is transfered.
> -			 */
> -			struct pipe_resource resource;
> -
> -			si_init_temp_resource_from_box(&resource, texture, box, level, 0);
> -
> -			if (!si_init_flushed_depth_texture(ctx, &resource, &staging_depth)) {
> -				PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
> -				goto fail_trans;
> -			}
> -
> -			if (usage & PIPE_TRANSFER_READ) {
> -				struct pipe_resource *temp = ctx->screen->resource_create(ctx->screen, &resource);
> -				if (!temp) {
> -					PRINT_ERR("failed to create a temporary depth texture\n");
> -					goto fail_trans;
> -				}
> -
> -				si_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
> -				si_blit_decompress_depth(ctx, (struct si_texture*)temp, staging_depth,
> -							 0, 0, 0, box->depth, 0, 0);
> -				pipe_resource_reference(&temp, NULL);
> -			}
> -
> -			/* Just get the strides. */
> -			si_texture_get_offset(sctx->screen, staging_depth, level, NULL,
> -						&trans->b.b.stride,
> -						&trans->b.b.layer_stride);
> -		} else {
> -			/* XXX: only readback the rectangle which is being mapped? */
> -			/* XXX: when discard is true, no need to read back from depth texture */
> -			if (!si_init_flushed_depth_texture(ctx, texture, &staging_depth)) {
> -				PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
> -				goto fail_trans;
> -			}
> -
> -			si_blit_decompress_depth(ctx, tex, staging_depth,
> -						 level, level,
> -						 box->z, box->z + box->depth - 1,
> -						 0, 0);
> -
> -			offset = si_texture_get_offset(sctx->screen, staging_depth,
> -							 level, box,
> -							 &trans->b.b.stride,
> -							 &trans->b.b.layer_stride);
> -		}
> -
> -		trans->staging = &staging_depth->buffer;
> -		buf = trans->staging;
> -	} else if (use_staging_texture) {
> +	if (use_staging_texture) {
>  		struct pipe_resource resource;
>  		struct si_texture *staging;
> 
>  		si_init_temp_resource_from_box(&resource, texture, box, level,
>  						 SI_RESOURCE_FLAG_TRANSFER);
>  		resource.usage = (usage & PIPE_TRANSFER_READ) ?
>  			PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
> 
> +		/* Since depth-stencil textures don't support linear tiling,
> +		 * blit from ZS to color and vice versa. u_blitter will do
> +		 * the packing for these formats.
> +		 */
> +		if (tex->is_depth)
> +			resource.format = util_blitter_get_color_format_for_zs(resource.format);
> +
>  		/* Create the temporary texture. */
>  		staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
>  		if (!staging) {
>  			PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
>  			goto fail_trans;
>  		}
>  		trans->staging = &staging->buffer;
> 
>  		/* Just get the strides. */
>  		si_texture_get_offset(sctx->screen, staging, 0, NULL,
> @@ -2063,30 +2001,22 @@ static void si_texture_transfer_unmap(struct pipe_context *ctx,
>  	/* Always unmap texture CPU mappings on 32-bit architectures, so that
>  	 * we don't run out of the CPU address space.
>  	 */
>  	if (sizeof(void*) == 4) {
>  		struct si_resource *buf =
>  			stransfer->staging ? stransfer->staging : &tex->buffer;
> 
>  		sctx->ws->buffer_unmap(buf->buf);
>  	}
> 
> -	if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) {
> -		if (tex->is_depth && tex->buffer.b.b.nr_samples <= 1) {
> -			ctx->resource_copy_region(ctx, texture, transfer->level,
> -						  transfer->box.x, transfer->box.y, transfer->box.z,
> -						  &stransfer->staging->b.b, transfer->level,
> -						  &transfer->box);
> -		} else {
> -			si_copy_from_staging_texture(ctx, stransfer);
> -		}
> -	}
> +	if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging)
> +		si_copy_from_staging_texture(ctx, stransfer);
> 
>  	if (stransfer->staging) {
>  		sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
>  		si_resource_reference(&stransfer->staging, NULL);
>  	}
> 
>  	/* Heuristic for {upload, draw, upload, draw, ..}:
>  	 *
>  	 * Flush the gfx IB if we've allocated too much texture storage.
>  	 *