[Mesa-dev] RadeonSI 10.6 candidates

Submitted by Marek Olšák on July 31, 2015, 10:39 p.m.

Details

Message ID CAAxE2A46ZioToqkJFuf7_koZZn1g8GZBECKq_-K7wksku_otqA@mail.gmail.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Marek Olšák July 31, 2015, 10:39 p.m.
Hi Emil,

I'm nominating the attached patches for 10.6. They are quite big, but
they fix a lot of graphical corruption on SI cards, especially in
games from Valve (Portal, CS:GO, etc). I've checked with piglit that
there are no regressions.

There was a comment on Phoronix forums that this might fix about a
half of radeonsi bugs in the bugzilla. (I sincerely hope that's true,
but I don't know)

Marek

Patch hide | download patch | download mbox

From cc86439146dd158f511458b7972260ffa7b7a7cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 25 Jul 2015 00:53:16 +0200
Subject: [PATCH 2/2] radeonsi: completely rework updating descriptors without
 CP DMA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For 10.6: This fixes graphical corruption occuring on most Southern Islands
Radeon GPUs. This will allow closing a lot of bugs in the bugzilla.

The patch has a better explanation. Just a summary here:
- The CPU always uploads a whole descriptor array to previously-unused memory.
- CP DMA isn't used.
- No caches need to be flushed.
- All descriptors are always up-to-date in memory even after a hang, because
  CP DMA doesn't serve as a middle man to update them.

This should bring:
- better hang recovery (descriptors are always up-to-date)
- better GPU performance (no KCACHE and TC flushes)
- worse CPU performance for partial updates (only whole arrays are uploaded)
- less used IB space (no CP_DMA and WRITE_DATA packets)
- simpler code
- corruption issues are fixed on SI cards

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
(cherry picked from commit b0528118dfb1af00e7d08cdb637191b80c14c2ba)
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 354 +++++++++-----------------
 src/gallium/drivers/radeonsi/si_pipe.h        |   6 -
 src/gallium/drivers/radeonsi/si_state.h       |  32 +--
 src/gallium/drivers/radeonsi/si_state_draw.c  |   7 +-
 4 files changed, 128 insertions(+), 271 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index f31cccb..f6d2b2c 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -24,14 +24,23 @@ 
  *      Marek Olšák <marek.olsak@amd.com>
  */
 
-/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
- * live in memory on SI.
+/* Resource binding slots and sampler states (each described with 8 or
+ * 4 dwords) are stored in lists in memory which is accessed by shaders
+ * using scalar load instructions.
  *
- * This file is responsible for managing lists of resources and sampler states
- * in memory and binding them, which means updating those structures in memory.
+ * This file is responsible for managing such lists. It keeps a copy of all
+ * descriptors in CPU memory and re-uploads a whole list if some slots have
+ * been changed.
  *
- * There is also code for updating shader pointers to resources and sampler
- * states. CP DMA functions are here too.
+ * This code is also reponsible for updating shader pointers to those lists.
+ *
+ * Note that CP DMA can't be used for updating the lists, because a GPU hang
+ * could leave the list in a mid-IB state and the next IB would get wrong
+ * descriptors and the whole context would be unusable at that point.
+ * (Note: The register shadowing can't be used due to the same reason)
+ *
+ * Also, uploading descriptors to newly allocated memory doesn't require
+ * a KCACHE flush.
  */
 
 #include "radeon/r600_cs.h"
@@ -42,7 +51,6 @@ 
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 
-#define SI_NUM_CONTEXTS 16
 
 /* NULL image and buffer descriptor.
  *
@@ -139,159 +147,62 @@  static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 	}
 }
 
-static void si_init_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
+static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements,
-				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
+				unsigned num_elements)
 {
+	int i;
+
 	assert(num_elements <= sizeof(desc->enabled_mask)*8);
-	assert(num_elements <= sizeof(desc->dirty_mask)*8);
 
-	desc->atom.emit = (void*)emit_func;
-	desc->shader_userdata_offset = shader_userdata_index * 4;
+	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
-	desc->context_size = num_elements * element_dw_size * 4;
-
-	desc->buffer = (struct r600_resource*)
-		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-				   PIPE_USAGE_DEFAULT,
-				   SI_NUM_CONTEXTS * desc->context_size);
-
-	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
-			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
+	desc->list_dirty = true; /* upload the list before the next draw */
+	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	/* We don't check for CS space here, because this should be called
-	 * only once at context initialization. */
-	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
-				    desc->buffer->b.b.width0, 0,
-				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
+	/* Initialize the array to NULL descriptors if the element size is 8. */
+	if (element_dw_size == 8)
+		for (i = 0; i < num_elements; i++)
+			memcpy(desc->list + i*element_dw_size, null_descriptor,
+			       sizeof(null_descriptor));
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
 	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
+	FREE(desc->list);
 }
 
-static void si_update_descriptors(struct si_context *sctx,
+static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc)
 {
-	if (desc->dirty_mask) {
-		desc->atom.num_dw =
-			7 + /* copy */
-			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
-
-		desc->atom.dirty = true;
-		desc->pointer_dirty = true;
-		sctx->shader_userdata.atom.dirty = true;
-
-		/* TODO: Investigate if these flushes can be removed after
-		 * adding CE support. */
-
-		/* The descriptors are read with the K cache. */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
-
-		/* Since SI uses uncached CP DMA to update descriptors,
-		 * we have to flush TC L2, which is used to fetch constants
-		 * along with KCACHE. */
-		if (sctx->b.chip_class == SI)
-			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
-	} else {
-		desc->atom.dirty = false;
-	}
-}
+	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	void *ptr;
 
-static void si_emit_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
-				uint32_t **descriptors)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va_base;
-	int packet_start = 0;
-	int packet_size = 0;
-	int last_index = desc->num_elements; /* point to a non-existing element */
-	uint64_t dirty_mask = desc->dirty_mask;
-	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
-
-	assert(dirty_mask);
-
-	va_base = desc->buffer->gpu_address;
-
-	/* Copy the descriptors to a new context slot. */
-	si_emit_cp_dma_copy_buffer(sctx,
-				   va_base + new_context_id * desc->context_size,
-				   va_base + desc->current_context_id * desc->context_size,
-				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
-
-	va_base += new_context_id * desc->context_size;
-
-	/* Update the descriptors.
-	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
-	 *
-	 * XXX When unbinding lots of resources, consider clearing the memory
-	 *     with CP DMA instead of emitting zeros.
-	 */
-	while (dirty_mask) {
-		int i = u_bit_scan64(&dirty_mask);
-
-		assert(i < desc->num_elements);
+	if (!desc->list_dirty)
+		return true;
 
-		if (last_index+1 == i && packet_size) {
-			/* Append new data at the end of the last packet. */
-			packet_size += desc->element_dw_size;
-			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
-		} else {
-			/* Start a new packet. */
-			uint64_t va = va_base + i * desc->element_dw_size * 4;
-
-			packet_start = cs->cdw;
-			packet_size = 2 + desc->element_dw_size;
-
-			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
-			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
-						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
-						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
-					     PKT3_WRITE_DATA_WR_CONFIRM |
-					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
-			radeon_emit(cs, va & 0xFFFFFFFFUL);
-			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
-		}
+	u_upload_alloc(sctx->b.uploader, 0, list_size,
+		       &desc->buffer_offset,
+		       (struct pipe_resource**)&desc->buffer, &ptr);
+	if (!desc->buffer)
+		return false; /* skip the draw call */
 
-		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
+	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-		last_index = i;
-	}
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-	desc->dirty_mask = 0;
-	desc->current_context_id = new_context_id;
+	desc->list_dirty = false;
+	desc->pointer_dirty = true;
+	sctx->shader_userdata.atom.dirty = true;
+	return true;
 }
 
 /* SAMPLER VIEWS */
 
-static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_views *views = (struct si_sampler_views*)atom;
-
-	si_emit_descriptors(sctx, &views->desc, views->desc_data);
-}
-
-static void si_init_sampler_views(struct si_context *sctx,
-				  struct si_sampler_views *views)
-{
-	int i;
-
-	si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
-			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
-
-	for (i = 0; i < views->desc.num_elements; i++) {
-		views->desc_data[i] = null_descriptor;
-		views->desc.dirty_mask |= 1llu << i;
-	}
-	si_update_descriptors(sctx, &views->desc);
-}
-
 static void si_release_sampler_views(struct si_sampler_views *views)
 {
 	int i;
@@ -332,6 +243,8 @@  static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 				      si_get_resource_ro_priority(rview->resource));
 	}
 
+	if (!views->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -354,17 +267,16 @@  static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 				rview->resource, RADEON_USAGE_READ,
 				si_get_resource_ro_priority(rview->resource));
 
-
 		pipe_sampler_view_reference(&views->views[slot], view);
-		views->desc_data[slot] = view_desc;
+		memcpy(views->desc.list + slot*8, view_desc, 8*4);
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		views->desc_data[slot] = null_descriptor;
+		memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	views->desc.dirty_mask |= 1llu << slot;
+	views->desc.list_dirty = true;
 }
 
 static void si_set_sampler_views(struct pipe_context *ctx,
@@ -423,22 +335,15 @@  static void si_set_sampler_views(struct pipe_context *ctx,
 					    NULL, NULL);
 		}
 	}
-
-	si_update_descriptors(sctx, &samplers->views.desc);
 }
 
 /* SAMPLER STATES */
 
-static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_states *states = (struct si_sampler_states*)atom;
-
-	si_emit_descriptors(sctx, &states->desc, states->desc_data);
-}
-
 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 					   struct si_sampler_states *states)
 {
+	if (!states->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -460,64 +365,39 @@  void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 	for (i = 0; i < count; i++) {
 		unsigned slot = start + i;
 
-		if (!sstates[i]) {
-			samplers->desc.dirty_mask &= ~(1llu << slot);
+		if (!sstates[i])
 			continue;
-		}
 
-		samplers->desc_data[slot] = sstates[i]->val;
-		samplers->desc.dirty_mask |= 1llu << slot;
+		memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
+		samplers->desc.list_dirty = true;
 	}
-
-	si_update_descriptors(sctx, &samplers->desc);
 }
 
 /* BUFFER RESOURCES */
 
-static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;
-
-	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
-}
-
-static void si_init_buffer_resources(struct si_context *sctx,
-				     struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 				     unsigned num_buffers,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_priority priority)
 {
-	int i;
-
-	buffers->num_buffers = num_buffers;
 	buffers->shader_usage = shader_usage;
 	buffers->priority = priority;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
-	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);
 
-	/* si_emit_descriptors only accepts an array of arrays.
-	 * This adds such an array. */
-	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
-	for (i = 0; i < num_buffers; i++) {
-		buffers->desc_data[i] = &buffers->desc_storage[i*4];
-	}
-
-	si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
-			    num_buffers, si_emit_buffer_resources);
+	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
+			    num_buffers);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
 {
 	int i;
 
-	for (i = 0; i < buffers->num_buffers; i++) {
+	for (i = 0; i < buffers->desc.num_elements; i++) {
 		pipe_resource_reference(&buffers->buffers[i], NULL);
 	}
 
 	FREE(buffers->buffers);
-	FREE(buffers->desc_storage);
-	FREE(buffers->desc_data);
 	si_release_descriptors(&buffers->desc);
 }
 
@@ -535,6 +415,8 @@  static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 				      buffers->shader_usage, buffers->priority);
 	}
 
+	if (!buffers->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
@@ -560,12 +442,15 @@  static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
 	}
+
+	if (!desc->buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 }
 
-void si_update_vertex_buffers(struct si_context *sctx)
+static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
 	struct si_descriptors *desc = &sctx->vertex_buffers;
 	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
@@ -573,8 +458,10 @@  void si_update_vertex_buffers(struct si_context *sctx)
 	uint64_t va;
 	uint32_t *ptr;
 
+	if (!sctx->vertex_buffers_dirty)
+		return true;
 	if (!count || !sctx->vertex_elements)
-		return;
+		return true;
 
 	/* Vertex buffer descriptors are the only ones which are uploaded
 	 * directly through a staging buffer and don't go through
@@ -582,13 +469,14 @@  void si_update_vertex_buffers(struct si_context *sctx)
 	 */
 	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+	if (!desc->buffer)
+		return false;
 
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 
 	assert(count <= SI_NUM_VERTEX_BUFFERS);
-	assert(desc->current_context_id == 0);
 
 	for (i = 0; i < count; i++) {
 		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
@@ -640,6 +528,8 @@  void si_update_vertex_buffers(struct si_context *sctx)
 	 * cache is needed. */
 	desc->pointer_dirty = true;
 	sctx->shader_userdata.atom.dirty = true;
+	sctx->vertex_buffers_dirty = false;
+	return true;
 }
 
 
@@ -664,7 +554,7 @@  static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	if (shader >= SI_NUM_SHADERS)
 		return;
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
@@ -691,7 +581,7 @@  static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(0);
@@ -710,12 +600,11 @@  static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* RING BUFFERS */
@@ -735,7 +624,7 @@  void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 	/* The stride field in the resource descriptor has 14 bits */
 	assert(stride < (1 << 14));
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	if (buffer) {
@@ -780,7 +669,7 @@  void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(stride) |
@@ -803,12 +692,11 @@  void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* STREAMOUT BUFFERS */
@@ -870,7 +758,7 @@  static void si_set_streamout_targets(struct pipe_context *ctx,
 			uint64_t va = r600_resource(buffer)->gpu_address;
 
 			/* Set the descriptor. */
-			uint32_t *desc = buffers->desc_data[bufidx];
+			uint32_t *desc = buffers->desc.list + bufidx*4;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
 			desc[2] = 0xffffffff;
@@ -888,24 +776,22 @@  static void si_set_streamout_targets(struct pipe_context *ctx,
 			buffers->desc.enabled_mask |= 1llu << bufidx;
 		} else {
 			/* Clear the descriptor and unset the resource. */
-			memset(buffers->desc_data[bufidx], 0,
+			memset(buffers->desc.list + bufidx*4, 0,
 			       sizeof(uint32_t) * 4);
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						NULL);
 			buffers->desc.enabled_mask &= ~(1llu << bufidx);
 		}
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 	for (; i < old_num_targets; i++) {
 		bufidx = SI_SO_BUF_OFFSET + i;
 		/* Clear the descriptor and unset the resource. */
-		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
 		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
 		buffers->desc.enabled_mask &= ~(1llu << bufidx);
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
@@ -974,22 +860,19 @@  static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Read/Write buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
-
 				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
 					/* Update the streamout state. */
 					if (sctx->b.streamout.begin_emitted) {
@@ -1001,34 +884,25 @@  static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 				}
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Constant buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
-
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1040,23 +914,20 @@  static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Texture buffers - update bindings. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_sampler_views *views = &sctx->samplers[shader].views;
-		bool found = false;
 		uint64_t mask = views->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (views->views[i]->texture == buf) {
+				si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
+							    old_va, buf);
+				views->desc.list_dirty = true;
+
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, RADEON_USAGE_READ,
 						      RADEON_PRIO_SHADER_BUFFER_RO);
-
-				views->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &views->desc);
-		}
 	}
 }
 
@@ -1282,11 +1153,10 @@  static void si_emit_shader_pointer(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	uint64_t va;
 
-	if (!desc->pointer_dirty)
+	if (!desc->pointer_dirty || !desc->buffer)
 		return;
 
 	va = desc->buffer->gpu_address +
-	     desc->current_context_id * desc->context_size +
 	     desc->buffer_offset;
 
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
@@ -1323,34 +1193,28 @@  static void si_emit_shader_userdata(struct si_context *sctx,
 	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
 
-/* INIT/DEINIT */
+/* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
+		si_init_buffer_resources(&sctx->const_buffers[i],
 					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
 					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
-		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
+		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-		si_init_sampler_views(sctx, &sctx->samplers[i].views);
-
-		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
-				    si_emit_sampler_states);
-
-		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
-		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
-		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
-		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
+		si_init_descriptors(&sctx->samplers[i].views.desc,
+				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+		si_init_descriptors(&sctx->samplers[i].states.desc,
+				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL);
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1372,6 +1236,20 @@  void si_init_all_descriptors(struct si_context *sctx)
 	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
+bool si_upload_shader_descriptors(struct si_context *sctx)
+{
+	int i;
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
+			return false;
+	}
+	return si_upload_vertex_buffer_descriptors(sctx);
+}
+
 void si_release_all_descriptors(struct si_context *sctx)
 {
 	int i;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 3f00c96..4a9f9b7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -139,12 +139,6 @@  struct si_context {
 	union {
 		struct {
 			/* The order matters. */
-			struct r600_atom *const_buffers[SI_NUM_SHADERS];
-			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
-			struct r600_atom *sampler_views[SI_NUM_SHADERS];
-			struct r600_atom *sampler_states[SI_NUM_SHADERS];
-			/* Caches must be flushed after resource descriptors are
-			 * updated in memory. */
 			struct r600_atom *cache_flush;
 			struct r600_atom *streamout_begin;
 			struct r600_atom *streamout_enable; /* must be after streamout_begin */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index a2a007e..b549913 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -151,60 +151,48 @@  struct si_shader_data {
 #define SI_NUM_VERTEX_BUFFERS	16
 
 
-/* This represents resource descriptors in memory, such as buffer resources,
+/* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
 struct si_descriptors {
-	struct r600_atom atom;
-
-	/* The size of one resource descriptor. */
+	/* The list of descriptors in malloc'd memory. */
+	uint32_t *list;
+	/* The size of one descriptor. */
 	unsigned element_dw_size;
-	/* The maximum number of resource descriptors. */
+	/* The maximum number of descriptors. */
 	unsigned num_elements;
+	/* Whether the list has been changed and should be re-uploaded. */
+	bool list_dirty;
 
-	/* The buffer where resource descriptors are stored. */
+	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
 	unsigned buffer_offset;
 
-	/* The i-th bit is set if that element is dirty (changed but not emitted). */
-	uint64_t dirty_mask;
 	/* The i-th bit is set if that element is enabled (non-NULL resource). */
 	uint64_t enabled_mask;
 
-	/* We can't update descriptors directly because the GPU might be
-	 * reading them at the same time, so we have to update them
-	 * in a copy-on-write manner. Each such copy is called a context,
-	 * which is just another array descriptors in the same buffer. */
-	unsigned current_context_id;
-	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
-	unsigned context_size;
-
 	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
 	unsigned shader_userdata_offset;
+	/* Whether the pointer should be re-emitted. */
 	bool pointer_dirty;
 };
 
 struct si_sampler_views {
 	struct si_descriptors		desc;
 	struct pipe_sampler_view	*views[SI_NUM_SAMPLER_VIEWS];
-	uint32_t			*desc_data[SI_NUM_SAMPLER_VIEWS];
 };
 
 struct si_sampler_states {
 	struct si_descriptors		desc;
-	uint32_t			*desc_data[SI_NUM_SAMPLER_STATES];
 	void				*saved_states[2]; /* saved for u_blitter */
 };
 
 struct si_buffer_resources {
 	struct si_descriptors		desc;
-	unsigned			num_buffers;
 	enum radeon_bo_usage		shader_usage; /* READ, WRITE, or READWRITE */
 	enum radeon_bo_priority		priority;
 	struct pipe_resource		**buffers; /* this has num_buffers elements */
-	uint32_t			*desc_storage; /* this has num_buffers*4 elements */
-	uint32_t			**desc_data; /* an array of pointers pointing to desc_storage */
 };
 
 #define si_pm4_block_idx(member) \
@@ -240,13 +228,13 @@  struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 				unsigned start, unsigned count, void **states);
-void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
 			unsigned element_size, unsigned index_stride);
 void si_init_all_descriptors(struct si_context *sctx);
+bool si_upload_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
 void si_copy_buffer(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index abfae1c..b22b266 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -539,11 +539,8 @@  void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		sctx->current_rast_prim = info->mode;
 
 	si_update_shaders(sctx);
-
-	if (sctx->vertex_buffers_dirty) {
-		si_update_vertex_buffers(sctx);
-		sctx->vertex_buffers_dirty = false;
-	}
+	if (!si_upload_shader_descriptors(sctx))
+		return;
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
-- 
2.1.4