[v2] radv: Support VK_EXT_inline_uniform_block.

Submitted by Samuel Pitoiset on April 16, 2019, 3:29 p.m.

Details

Message ID 20190416152912.14620-1-samuel.pitoiset@gmail.com
State Accepted
Headers show
Series "Series without cover letter" ( rev: 2 ) in Mesa

Not browsing as part of any series.

Commit Message

Samuel Pitoiset April 16, 2019, 3:29 p.m.
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

Basically just reserve the memory in the descriptor sets.

On the shader side we construct a buffer descriptor, since
AFAIU VGPR indexing on 32-bit pointers in LLVM is still broken.

This fully supports update after bind and variable descriptor set
sizes. However, the limits are somewhat arbitrary and are mostly
about finding a reasonable division of a 2 GiB max memory size over
the set.

v2: - rebased on top of master (Samuel)
    - remove the loading resources rework (Samuel)
    - only load UBO descriptors if it's a pointer (Samuel)
    - use LLVMBuildPtrToInt to avoid IR failures (Samuel)
---
 src/amd/vulkan/radv_descriptor_set.c | 83 ++++++++++++++++++++++++----
 src/amd/vulkan/radv_device.c         | 22 +++++++-
 src/amd/vulkan/radv_extensions.py    |  1 +
 src/amd/vulkan/radv_nir_to_llvm.c    | 31 ++++++++++-
 src/amd/vulkan/radv_private.h        |  2 +
 5 files changed, 124 insertions(+), 15 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c
index 68171b5d244..6c6b88a4553 100644
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -127,6 +127,7 @@  VkResult radv_CreateDescriptorSetLayout(
 		uint32_t b = binding->binding;
 		uint32_t alignment;
 		unsigned binding_buffer_count = 0;
+		uint32_t descriptor_count = binding->descriptorCount;
 
 		switch (binding->descriptorType) {
 		case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -164,6 +165,11 @@  VkResult radv_CreateDescriptorSetLayout(
 			set_layout->binding[b].size = 16;
 			alignment = 16;
 			break;
+		case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+			alignment = 16;
+			set_layout->binding[b].size = descriptor_count;
+			descriptor_count = 1;
+			break;
 		default:
 			unreachable("unknown descriptor type\n");
 			break;
@@ -171,7 +177,7 @@  VkResult radv_CreateDescriptorSetLayout(
 
 		set_layout->size = align(set_layout->size, alignment);
 		set_layout->binding[b].type = binding->descriptorType;
-		set_layout->binding[b].array_size = binding->descriptorCount;
+		set_layout->binding[b].array_size = descriptor_count;
 		set_layout->binding[b].offset = set_layout->size;
 		set_layout->binding[b].buffer_offset = buffer_count;
 		set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
@@ -207,9 +213,9 @@  VkResult radv_CreateDescriptorSetLayout(
 			samplers_offset += 4 * sizeof(uint32_t) * binding->descriptorCount;
 		}
 
-		set_layout->size += binding->descriptorCount * set_layout->binding[b].size;
-		buffer_count += binding->descriptorCount * binding_buffer_count;
-		dynamic_offset_count += binding->descriptorCount *
+		set_layout->size += descriptor_count * set_layout->binding[b].size;
+		buffer_count += descriptor_count * binding_buffer_count;
+		dynamic_offset_count += descriptor_count *
 			set_layout->binding[b].dynamic_offset_count;
 		set_layout->shader_stages |= binding->stageFlags;
 	}
@@ -264,6 +270,7 @@  void radv_GetDescriptorSetLayoutSupport(VkDevice device,
 
 		uint64_t descriptor_size = 0;
 		uint64_t descriptor_alignment = 1;
+		uint32_t descriptor_count = binding->descriptorCount;
 		switch (binding->descriptorType) {
 		case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
 		case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
@@ -282,7 +289,7 @@  void radv_GetDescriptorSetLayoutSupport(VkDevice device,
 			descriptor_alignment = 32;
 			break;
 		case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-			if (!has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount)) {
+			if (!has_equal_immutable_samplers(binding->pImmutableSamplers, descriptor_count)) {
 				descriptor_size = 64;
 			} else {
 				descriptor_size = 96;
@@ -290,11 +297,16 @@  void radv_GetDescriptorSetLayoutSupport(VkDevice device,
 			descriptor_alignment = 32;
 			break;
 		case VK_DESCRIPTOR_TYPE_SAMPLER:
-			if (!has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount)) {
+			if (!has_equal_immutable_samplers(binding->pImmutableSamplers, descriptor_count)) {
 				descriptor_size = 16;
 				descriptor_alignment = 16;
 			}
 			break;
+		case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+			descriptor_alignment = 16;
+			descriptor_size = descriptor_count;
+			descriptor_count = 1;
+			break;
 		default:
 			unreachable("unknown descriptor type\n");
 			break;
@@ -305,18 +317,20 @@  void radv_GetDescriptorSetLayoutSupport(VkDevice device,
 		}
 		size = align_u64(size, descriptor_alignment);
 
-		uint64_t max_count = UINT64_MAX;
-		if (descriptor_size)
-			max_count = (UINT64_MAX - size) / descriptor_size;
+		uint64_t max_count = INT32_MAX;
+		if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+			max_count = INT32_MAX - size;
+		else if (descriptor_size)
+			max_count = (INT32_MAX - size) / descriptor_size;
 
-		if (max_count < binding->descriptorCount) {
+		if (max_count < descriptor_count) {
 			supported = false;
 		}
 		if (variable_flags && binding->binding <variable_flags->bindingCount && variable_count &&
 		    (variable_flags->pBindingFlags[binding->binding] & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) {
 			variable_count->maxVariableDescriptorCount = MIN2(UINT32_MAX, max_count);
 		}
-		size += binding->descriptorCount * descriptor_size;
+		size += descriptor_count * descriptor_size;
 	}
 
 	free(bindings);
@@ -543,6 +557,21 @@  VkResult radv_CreateDescriptorPool(
 	uint64_t size = sizeof(struct radv_descriptor_pool);
 	uint64_t bo_size = 0, bo_count = 0, range_count = 0;
 
+	vk_foreach_struct(ext, pCreateInfo->pNext) {
+		switch (ext->sType) {
+		case VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO_EXT: {
+			const struct VkDescriptorPoolInlineUniformBlockCreateInfoEXT *info =
+				(const struct VkDescriptorPoolInlineUniformBlockCreateInfoEXT*)ext;
+			/* the sizes are 4 aligned, and we need to align to at
+			 * most 32, which needs at most 28 bytes extra per
+			 * binding. */
+			bo_size += 28llu * info->maxInlineUniformBlockBindings;
+			break;
+		}
+		default:
+			break;
+		}
+	}
 
 	for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
 		if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER)
@@ -569,6 +598,9 @@  VkResult radv_CreateDescriptorPool(
 		case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
 			bo_size += 96 * pCreateInfo->pPoolSizes[i].descriptorCount;
 			break;
+		case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
+			bo_size += pCreateInfo->pPoolSizes[i].descriptorCount;
+			break;
 		default:
 			unreachable("unknown descriptor type\n");
 			break;
@@ -764,6 +796,17 @@  static void write_buffer_descriptor(struct radv_device *device,
 		*buffer_list = buffer->bo;
 }
 
+static void write_block_descriptor(struct radv_device *device,
+                                   struct radv_cmd_buffer *cmd_buffer,
+                                   void *dst,
+                                   const VkWriteDescriptorSet *writeset)
+{
+	const VkWriteDescriptorSetInlineUniformBlockEXT *inline_ub =
+		vk_find_struct_const(writeset->pNext, WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT);
+
+	memcpy(dst, inline_ub->pData, inline_ub->dataSize);
+}
+
 static void write_dynamic_buffer_descriptor(struct radv_device *device,
                                             struct radv_descriptor_range *range,
                                             struct radeon_winsys_bo **buffer_list,
@@ -862,6 +905,12 @@  void radv_update_descriptor_sets(
 		const uint32_t *samplers = radv_immutable_samplers(set->layout, binding_layout);
 
 		ptr += binding_layout->offset / 4;
+
+		if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+			write_block_descriptor(device, cmd_buffer, (uint8_t*)ptr + writeset->dstArrayElement, writeset);
+			continue;
+		}
+
 		ptr += binding_layout->size * writeset->dstArrayElement / 4;
 		buffer_list += binding_layout->buffer_offset;
 		buffer_list += writeset->dstArrayElement;
@@ -1042,7 +1091,12 @@  VkResult radv_CreateDescriptorUpdateTemplate(VkDevice _device,
 			default:
 				break;
 			}
-			dst_offset = binding_layout->offset / 4 + binding_layout->size * entry->dstArrayElement / 4;
+			dst_offset = binding_layout->offset / 4;
+			if (entry->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+				dst_offset += entry->dstArrayElement / 4;
+			else
+				dst_offset += binding_layout->size * entry->dstArrayElement / 4;
+
 			dst_stride = binding_layout->size / 4;
 			break;
 		}
@@ -1092,6 +1146,11 @@  void radv_update_descriptor_set_with_template(struct radv_device *device,
 		const uint8_t *pSrc = ((const uint8_t *) pData) + templ->entry[i].src_offset;
 		uint32_t j;
 
+		if (templ->entry[i].descriptor_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+			memcpy((uint8_t*)pDst, pSrc, templ->entry[i].descriptor_count);
+			continue;
+		}
+
 		for (j = 0; j < templ->entry[i].descriptor_count; ++j) {
 			switch (templ->entry[i].descriptor_type) {
 			case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 1f77dcadb17..db42d5d49b3 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -906,6 +906,14 @@  void radv_GetPhysicalDeviceFeatures2(
 			features->shaderInt8 = true;
 			break;
 		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: {
+			VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features =
+				(VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext;
+
+			features->inlineUniformBlock = true;
+			features->descriptorBindingInlineUniformBlockUpdateAfterBind = true;
+			break;
+		}
 		default:
 			break;
 		}
@@ -1200,7 +1208,8 @@  void radv_GetPhysicalDeviceProperties2(
 			properties->robustBufferAccessUpdateAfterBind = false;
 			properties->quadDivergentImplicitLod = false;
 
-			size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
+			size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS -
+				MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
 			          (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
 			           32 /* storage buffer, 32 due to potential space wasted on alignment */ +
 			           32 /* sampler, largest when combined with image */ +
@@ -1288,6 +1297,17 @@  void radv_GetPhysicalDeviceProperties2(
 			properties->transformFeedbackDraw = true;
 			break;
 		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: {
+			VkPhysicalDeviceInlineUniformBlockPropertiesEXT *props =
+				(VkPhysicalDeviceInlineUniformBlockPropertiesEXT *)ext;
+
+			props->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
+			props->maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_SETS;
+			props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_SETS;
+			props->maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT;
+			props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT;
+			break;
+		}
 		default:
 			break;
 		}
diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
index 13fe391e623..034d55df7c5 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -111,6 +111,7 @@  EXTENSIONS = [
     Extension('VK_EXT_external_memory_host',              1, 'device->rad_info.has_userptr'),
     Extension('VK_EXT_global_priority',                   1, 'device->rad_info.has_ctx_priority'),
     Extension('VK_EXT_host_query_reset',                  1, True),
+    Extension('VK_EXT_inline_uniform_block',              1, True),
     Extension('VK_EXT_memory_budget',                     1, True),
     Extension('VK_EXT_memory_priority',                   1, True),
     Extension('VK_EXT_pci_bus_info',                      2, True),
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 58a3cf18fe1..5bc88298ee6 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1305,13 +1305,35 @@  radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
 	} else
 		stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
 
-	offset = ac_build_imad(&ctx->ac, index, stride,
-			       LLVMConstInt(ctx->ac.i32, base_offset, false));
+	offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
+
+	if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+		offset = ac_build_imad(&ctx->ac, index, stride, offset);
+	}
 
 	desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset);
 	desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32);
 	LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
 
+	if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
+		uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+			S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+			S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+			S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+			S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+			S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+		LLVMValueRef desc_components[4] = {
+			LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
+			LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false),
+			/* High limit to support variable sizes. */
+			LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
+			LLVMConstInt(ctx->ac.i32, desc_type, false),
+		};
+
+		return ac_build_gather_values(&ctx->ac, desc_components, 4);
+	}
+
 	return desc_ptr;
 }
 
@@ -1910,6 +1932,11 @@  static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer
 	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
 	LLVMValueRef result;
 
+	if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) {
+		/* Do not load the descriptor for inlined uniform blocks. */
+		return buffer_ptr;
+	}
+
 	LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
 
 	result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 82ab4eff8ca..cd3af7e614d 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -102,6 +102,8 @@  typedef uint32_t xcb_window_t;
 #define MAX_SO_STREAMS 4
 #define MAX_SO_BUFFERS 4
 #define MAX_SO_OUTPUTS 64
+#define MAX_INLINE_UNIFORM_BLOCK_SIZE (4ull * 1024 * 1024)
+#define MAX_INLINE_UNIFORM_BLOCK_COUNT 64
 
 #define NUM_DEPTH_CLEAR_PIPELINES 3
 

Comments

So I have trouble making sense of what you changed, but on its own
the patch looks good to me. r-b

On Tue, Apr 16, 2019 at 5:26 PM Samuel Pitoiset
<samuel.pitoiset@gmail.com> wrote:
>
> From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
>
> Basically just reserve the memory in the descriptor sets.
>
> On the shader side we construct a buffer descriptor, since
> AFAIU VGPR indexing on 32-bit pointers in LLVM is still broken.
>
> This fully supports update after bind and variable descriptor set
> sizes. However, the limits are somewhat arbitrary and are mostly
> about finding a reasonable division of a 2 GiB max memory size over
> the set.
>
> v2: - rebased on top of master (Samuel)
>     - remove the loading resources rework (Samuel)
>     - only load UBO descriptors if it's a pointer (Samuel)
>     - use LLVMBuildPtrToInt to avoid IR failures (Samuel)
> ---
>  src/amd/vulkan/radv_descriptor_set.c | 83 ++++++++++++++++++++++++----
>  src/amd/vulkan/radv_device.c         | 22 +++++++-
>  src/amd/vulkan/radv_extensions.py    |  1 +
>  src/amd/vulkan/radv_nir_to_llvm.c    | 31 ++++++++++-
>  src/amd/vulkan/radv_private.h        |  2 +
>  5 files changed, 124 insertions(+), 15 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c
> index 68171b5d244..6c6b88a4553 100644
> --- a/src/amd/vulkan/radv_descriptor_set.c
> +++ b/src/amd/vulkan/radv_descriptor_set.c
> @@ -127,6 +127,7 @@ VkResult radv_CreateDescriptorSetLayout(
>                 uint32_t b = binding->binding;
>                 uint32_t alignment;
>                 unsigned binding_buffer_count = 0;
> +               uint32_t descriptor_count = binding->descriptorCount;
>
>                 switch (binding->descriptorType) {
>                 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
> @@ -164,6 +165,11 @@ VkResult radv_CreateDescriptorSetLayout(
>                         set_layout->binding[b].size = 16;
>                         alignment = 16;
>                         break;
> +               case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
> +                       alignment = 16;
> +                       set_layout->binding[b].size = descriptor_count;
> +                       descriptor_count = 1;
> +                       break;
>                 default:
>                         unreachable("unknown descriptor type\n");
>                         break;
> @@ -171,7 +177,7 @@ VkResult radv_CreateDescriptorSetLayout(
>
>                 set_layout->size = align(set_layout->size, alignment);
>                 set_layout->binding[b].type = binding->descriptorType;
> -               set_layout->binding[b].array_size = binding->descriptorCount;
> +               set_layout->binding[b].array_size = descriptor_count;
>                 set_layout->binding[b].offset = set_layout->size;
>                 set_layout->binding[b].buffer_offset = buffer_count;
>                 set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
> @@ -207,9 +213,9 @@ VkResult radv_CreateDescriptorSetLayout(
>                         samplers_offset += 4 * sizeof(uint32_t) * binding->descriptorCount;
>                 }
>
> -               set_layout->size += binding->descriptorCount * set_layout->binding[b].size;
> -               buffer_count += binding->descriptorCount * binding_buffer_count;
> -               dynamic_offset_count += binding->descriptorCount *
> +               set_layout->size += descriptor_count * set_layout->binding[b].size;
> +               buffer_count += descriptor_count * binding_buffer_count;
> +               dynamic_offset_count += descriptor_count *
>                         set_layout->binding[b].dynamic_offset_count;
>                 set_layout->shader_stages |= binding->stageFlags;
>         }
> @@ -264,6 +270,7 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device,
>
>                 uint64_t descriptor_size = 0;
>                 uint64_t descriptor_alignment = 1;
> +               uint32_t descriptor_count = binding->descriptorCount;
>                 switch (binding->descriptorType) {
>                 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
>                 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
> @@ -282,7 +289,7 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device,
>                         descriptor_alignment = 32;
>                         break;
>                 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
> -                       if (!has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount)) {
> +                       if (!has_equal_immutable_samplers(binding->pImmutableSamplers, descriptor_count)) {
>                                 descriptor_size = 64;
>                         } else {
>                                 descriptor_size = 96;
> @@ -290,11 +297,16 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device,
>                         descriptor_alignment = 32;
>                         break;
>                 case VK_DESCRIPTOR_TYPE_SAMPLER:
> -                       if (!has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount)) {
> +                       if (!has_equal_immutable_samplers(binding->pImmutableSamplers, descriptor_count)) {
>                                 descriptor_size = 16;
>                                 descriptor_alignment = 16;
>                         }
>                         break;
> +               case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
> +                       descriptor_alignment = 16;
> +                       descriptor_size = descriptor_count;
> +                       descriptor_count = 1;
> +                       break;
>                 default:
>                         unreachable("unknown descriptor type\n");
>                         break;
> @@ -305,18 +317,20 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device,
>                 }
>                 size = align_u64(size, descriptor_alignment);
>
> -               uint64_t max_count = UINT64_MAX;
> -               if (descriptor_size)
> -                       max_count = (UINT64_MAX - size) / descriptor_size;
> +               uint64_t max_count = INT32_MAX;
> +               if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
> +                       max_count = INT32_MAX - size;
> +               else if (descriptor_size)
> +                       max_count = (INT32_MAX - size) / descriptor_size;
>
> -               if (max_count < binding->descriptorCount) {
> +               if (max_count < descriptor_count) {
>                         supported = false;
>                 }
>                 if (variable_flags && binding->binding <variable_flags->bindingCount && variable_count &&
>                     (variable_flags->pBindingFlags[binding->binding] & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) {
>                         variable_count->maxVariableDescriptorCount = MIN2(UINT32_MAX, max_count);
>                 }
> -               size += binding->descriptorCount * descriptor_size;
> +               size += descriptor_count * descriptor_size;
>         }
>
>         free(bindings);
> @@ -543,6 +557,21 @@ VkResult radv_CreateDescriptorPool(
>         uint64_t size = sizeof(struct radv_descriptor_pool);
>         uint64_t bo_size = 0, bo_count = 0, range_count = 0;
>
> +       vk_foreach_struct(ext, pCreateInfo->pNext) {
> +               switch (ext->sType) {
> +               case VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO_EXT: {
> +                       const struct VkDescriptorPoolInlineUniformBlockCreateInfoEXT *info =
> +                               (const struct VkDescriptorPoolInlineUniformBlockCreateInfoEXT*)ext;
> +                       /* the sizes are 4 aligned, and we need to align to at
> +                        * most 32, which needs at most 28 bytes extra per
> +                        * binding. */
> +                       bo_size += 28llu * info->maxInlineUniformBlockBindings;
> +                       break;
> +               }
> +               default:
> +                       break;
> +               }
> +       }
>
>         for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
>                 if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER)
> @@ -569,6 +598,9 @@ VkResult radv_CreateDescriptorPool(
>                 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
>                         bo_size += 96 * pCreateInfo->pPoolSizes[i].descriptorCount;
>                         break;
> +               case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT:
> +                       bo_size += pCreateInfo->pPoolSizes[i].descriptorCount;
> +                       break;
>                 default:
>                         unreachable("unknown descriptor type\n");
>                         break;
> @@ -764,6 +796,17 @@ static void write_buffer_descriptor(struct radv_device *device,
>                 *buffer_list = buffer->bo;
>  }
>
> +static void write_block_descriptor(struct radv_device *device,
> +                                   struct radv_cmd_buffer *cmd_buffer,
> +                                   void *dst,
> +                                   const VkWriteDescriptorSet *writeset)
> +{
> +       const VkWriteDescriptorSetInlineUniformBlockEXT *inline_ub =
> +               vk_find_struct_const(writeset->pNext, WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT);
> +
> +       memcpy(dst, inline_ub->pData, inline_ub->dataSize);
> +}
> +
>  static void write_dynamic_buffer_descriptor(struct radv_device *device,
>                                              struct radv_descriptor_range *range,
>                                              struct radeon_winsys_bo **buffer_list,
> @@ -862,6 +905,12 @@ void radv_update_descriptor_sets(
>                 const uint32_t *samplers = radv_immutable_samplers(set->layout, binding_layout);
>
>                 ptr += binding_layout->offset / 4;
> +
> +               if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
> +                       write_block_descriptor(device, cmd_buffer, (uint8_t*)ptr + writeset->dstArrayElement, writeset);
> +                       continue;
> +               }
> +
>                 ptr += binding_layout->size * writeset->dstArrayElement / 4;
>                 buffer_list += binding_layout->buffer_offset;
>                 buffer_list += writeset->dstArrayElement;
> @@ -1042,7 +1091,12 @@ VkResult radv_CreateDescriptorUpdateTemplate(VkDevice _device,
>                         default:
>                                 break;
>                         }
> -                       dst_offset = binding_layout->offset / 4 + binding_layout->size * entry->dstArrayElement / 4;
> +                       dst_offset = binding_layout->offset / 4;
> +                       if (entry->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
> +                               dst_offset += entry->dstArrayElement / 4;
> +                       else
> +                               dst_offset += binding_layout->size * entry->dstArrayElement / 4;
> +
>                         dst_stride = binding_layout->size / 4;
>                         break;
>                 }
> @@ -1092,6 +1146,11 @@ void radv_update_descriptor_set_with_template(struct radv_device *device,
>                 const uint8_t *pSrc = ((const uint8_t *) pData) + templ->entry[i].src_offset;
>                 uint32_t j;
>
> +               if (templ->entry[i].descriptor_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
> +                       memcpy((uint8_t*)pDst, pSrc, templ->entry[i].descriptor_count);
> +                       continue;
> +               }
> +
>                 for (j = 0; j < templ->entry[i].descriptor_count; ++j) {
>                         switch (templ->entry[i].descriptor_type) {
>                         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 1f77dcadb17..db42d5d49b3 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -906,6 +906,14 @@ void radv_GetPhysicalDeviceFeatures2(
>                         features->shaderInt8 = true;
>                         break;
>                 }
> +               case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: {
> +                       VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features =
> +                               (VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext;
> +
> +                       features->inlineUniformBlock = true;
> +                       features->descriptorBindingInlineUniformBlockUpdateAfterBind = true;
> +                       break;
> +               }
>                 default:
>                         break;
>                 }
> @@ -1200,7 +1208,8 @@ void radv_GetPhysicalDeviceProperties2(
>                         properties->robustBufferAccessUpdateAfterBind = false;
>                         properties->quadDivergentImplicitLod = false;
>
> -                       size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
> +                       size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS -
> +                               MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) /
>                                   (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
>                                    32 /* storage buffer, 32 due to potential space wasted on alignment */ +
>                                    32 /* sampler, largest when combined with image */ +
> @@ -1288,6 +1297,17 @@ void radv_GetPhysicalDeviceProperties2(
>                         properties->transformFeedbackDraw = true;
>                         break;
>                 }
> +               case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: {
> +                       VkPhysicalDeviceInlineUniformBlockPropertiesEXT *props =
> +                               (VkPhysicalDeviceInlineUniformBlockPropertiesEXT *)ext;
> +
> +                       props->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE;
> +                       props->maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_SETS;
> +                       props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_SETS;
> +                       props->maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT;
> +                       props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT;
> +                       break;
> +               }
>                 default:
>                         break;
>                 }
> diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
> index 13fe391e623..034d55df7c5 100644
> --- a/src/amd/vulkan/radv_extensions.py
> +++ b/src/amd/vulkan/radv_extensions.py
> @@ -111,6 +111,7 @@ EXTENSIONS = [
>      Extension('VK_EXT_external_memory_host',              1, 'device->rad_info.has_userptr'),
>      Extension('VK_EXT_global_priority',                   1, 'device->rad_info.has_ctx_priority'),
>      Extension('VK_EXT_host_query_reset',                  1, True),
> +    Extension('VK_EXT_inline_uniform_block',              1, True),
>      Extension('VK_EXT_memory_budget',                     1, True),
>      Extension('VK_EXT_memory_priority',                   1, True),
>      Extension('VK_EXT_pci_bus_info',                      2, True),
> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
> index 58a3cf18fe1..5bc88298ee6 100644
> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> @@ -1305,13 +1305,35 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index,
>         } else
>                 stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
>
> -       offset = ac_build_imad(&ctx->ac, index, stride,
> -                              LLVMConstInt(ctx->ac.i32, base_offset, false));
> +       offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
> +
> +       if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
> +               offset = ac_build_imad(&ctx->ac, index, stride, offset);
> +       }
>
>         desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset);
>         desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32);
>         LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
>
> +       if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
> +               uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +                       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +                       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +                       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
> +                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
> +
> +               LLVMValueRef desc_components[4] = {
> +                       LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""),
> +                       LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false),
> +                       /* High limit to support variable sizes. */
> +                       LLVMConstInt(ctx->ac.i32, 0xffffffff, false),
> +                       LLVMConstInt(ctx->ac.i32, desc_type, false),
> +               };
> +
> +               return ac_build_gather_values(&ctx->ac, desc_components, 4);
> +       }
> +
>         return desc_ptr;
>  }
>
> @@ -1910,6 +1932,11 @@ static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer
>         struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
>         LLVMValueRef result;
>
> +       if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) {
> +               /* Do not load the descriptor for inlined uniform blocks. */
> +               return buffer_ptr;
> +       }
> +
>         LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
>
>         result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, "");
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 82ab4eff8ca..cd3af7e614d 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -102,6 +102,8 @@ typedef uint32_t xcb_window_t;
>  #define MAX_SO_STREAMS 4
>  #define MAX_SO_BUFFERS 4
>  #define MAX_SO_OUTPUTS 64
> +#define MAX_INLINE_UNIFORM_BLOCK_SIZE (4ull * 1024 * 1024)
> +#define MAX_INLINE_UNIFORM_BLOCK_COUNT 64
>
>  #define NUM_DEPTH_CLEAR_PIPELINES 3
>
> --
> 2.21.0
>