[1/3] anv: Implement VK_KHR_draw_indirect_count for gen 7.5+

Submitted by Danylo Piliaiev on Oct. 17, 2018, 11:58 a.m.

Details

Message ID 20181017115902.25843-2-danylo.piliaiev@globallogic.com
State New
Headers show
Series "anv: Implement VK_KHR_draw_indirect_count and VK_EXT_conditional_rendering" ( rev: 2 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Danylo Piliaiev Oct. 17, 2018, 11:58 a.m.
Signed-off-by: Danylo Piliaiev <danylo.piliaiev@globallogic.com>
---
 src/intel/vulkan/anv_extensions.py |   1 +
 src/intel/vulkan/genX_cmd_buffer.c | 155 +++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index d4915c9501..7f44da6648 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -113,6 +113,7 @@  EXTENSIONS = [
     Extension('VK_KHR_xlib_surface',                      6, 'VK_USE_PLATFORM_XLIB_KHR'),
     Extension('VK_KHR_multiview',                         1, True),
     Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_KHR_draw_indirect_count',               1, 'device->info.gen >= 8 || device->info.is_haswell'),
     Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
     Extension('VK_EXT_debug_report',                      8, True),
     Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 43a02f2256..d7b94efd19 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2982,6 +2982,161 @@  void genX(CmdDrawIndexedIndirect)(
    }
 }
 
+#if GEN_IS_HASWELL || GEN_GEN >= 8
+static void
+emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
+                          struct anv_address count_address,
+                          uint32_t draw_index)
+{
+   /* Copy the draw count from CS_GPR(MI_ALU_REG14) to MI_PREDICATE_SRC0.
+    * NOTE(review): SRC0 + 4 is not written and count_address is unused.
+    */
+   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(MI_ALU_REG14));
+
+   /* Load the index of the current draw into both halves of SRC1. */
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, draw_index);
+   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);
+
+   if (draw_index == 0) {
+       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+          mip.LoadOperation    = LOAD_LOADINV;
+          mip.CombineOperation = COMBINE_SET;
+          mip.CompareOperation = COMPARE_SRCS_EQUAL;
+       }
+   } else {
+       /* While draw_index < draw_count the predicate's result will be
+        *  (draw_index == draw_count) ^ TRUE = TRUE
+        * When draw_index == draw_count the result is
+        *  (TRUE) ^ TRUE = FALSE
+        * After this all results will be:
+        *  (FALSE) ^ FALSE = FALSE
+        */
+       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+          mip.LoadOperation    = LOAD_LOAD;
+          mip.CombineOperation = COMBINE_XOR;
+          mip.CompareOperation = COMPARE_SRCS_EQUAL;
+       }
+   }
+}
+
+void genX(CmdDrawIndirectCountKHR)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    VkBuffer                                    _countBuffer,
+    VkDeviceSize                                countBufferOffset,
+    uint32_t                                    maxDrawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   struct anv_address count_address =
+      anv_address_add(count_buffer->address, countBufferOffset);
+
+   /* Flush so that the MI_LOAD_REGISTER_MEM below, which reads the draw
+    * count from count_address into CS_GPR(MI_ALU_REG14), sees coherent memory.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+     pc.PipeControlFlushEnable = true;
+   }
+
+   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);
+   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);
+
+   for (uint32_t i = 0; i < maxDrawCount; i++) {
+      struct anv_address draw = anv_address_add(buffer->address, offset);
+
+      emit_draw_count_predicate(cmd_buffer, count_address, i);
+
+      if (vs_prog_data->uses_firstvertex ||
+          vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);
+
+      load_indirect_parameters(cmd_buffer, draw, false);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable  = true;
+         prim.PredicateEnable          = true;
+         prim.VertexAccessType         = SEQUENTIAL;
+         prim.PrimitiveTopologyType    = pipeline->topology;
+      }
+
+      offset += stride;
+   }
+}
+
+void genX(CmdDrawIndexedIndirectCountKHR)(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    _buffer,
+    VkDeviceSize                                offset,
+    VkBuffer                                    _countBuffer,
+    VkDeviceSize                                countBufferOffset,
+    uint32_t                                    maxDrawCount,
+    uint32_t                                    stride)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
+   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
+   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   genX(cmd_buffer_flush_state)(cmd_buffer);
+
+   struct anv_address count_address =
+      anv_address_add(count_buffer->address, countBufferOffset);
+
+   /* Flush so that the MI_LOAD_REGISTER_MEM below, which reads the draw
+    * count from count_address into CS_GPR(MI_ALU_REG14), sees coherent memory.
+    */
+   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+     pc.PipeControlFlushEnable = true;
+   }
+
+   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);
+   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);
+
+   for (uint32_t i = 0; i < maxDrawCount; i++) {
+      struct anv_address draw = anv_address_add(buffer->address, offset);
+
+      emit_draw_count_predicate(cmd_buffer, count_address, i);
+
+      /* TODO: We need to stomp base vertex to 0 somehow */
+      if (vs_prog_data->uses_firstvertex ||
+          vs_prog_data->uses_baseinstance)
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
+      if (vs_prog_data->uses_drawid)
+         emit_draw_index(cmd_buffer, i);
+
+      load_indirect_parameters(cmd_buffer, draw, true);
+
+      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+         prim.IndirectParameterEnable  = true;
+         prim.PredicateEnable          = true;
+         prim.VertexAccessType         = RANDOM;
+         prim.PrimitiveTopologyType    = pipeline->topology;
+      }
+
+      offset += stride;
+   }
+}
+#endif
+
 static VkResult
 flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
 {

Comments

On Wed, Oct 17, 2018 at 6:59 AM Danylo Piliaiev <danylo.piliaiev@gmail.com>
wrote:

> Signed-off-by: Danylo Piliaiev <danylo.piliaiev@globallogic.com>
> ---
>  src/intel/vulkan/anv_extensions.py |   1 +
>  src/intel/vulkan/genX_cmd_buffer.c | 155 +++++++++++++++++++++++++++++
>  2 files changed, 156 insertions(+)
>
> diff --git a/src/intel/vulkan/anv_extensions.py
> b/src/intel/vulkan/anv_extensions.py
> index d4915c9501..7f44da6648 100644
> --- a/src/intel/vulkan/anv_extensions.py
> +++ b/src/intel/vulkan/anv_extensions.py
> @@ -113,6 +113,7 @@ EXTENSIONS = [
>      Extension('VK_KHR_xlib_surface',                      6,
> 'VK_USE_PLATFORM_XLIB_KHR'),
>      Extension('VK_KHR_multiview',                         1, True),
>      Extension('VK_KHR_display',                          23,
> 'VK_USE_PLATFORM_DISPLAY_KHR'),
> +    Extension('VK_KHR_draw_indirect_count',               1,
> 'device->info.gen >= 8 || device->info.is_haswell'),
>      Extension('VK_EXT_acquire_xlib_display',              1,
> 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
>      Extension('VK_EXT_debug_report',                      8, True),
>      Extension('VK_EXT_direct_mode_display',               1,
> 'VK_USE_PLATFORM_DISPLAY_KHR'),
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c
> b/src/intel/vulkan/genX_cmd_buffer.c
> index 43a02f2256..d7b94efd19 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -2982,6 +2982,161 @@ void genX(CmdDrawIndexedIndirect)(
>     }
>  }
>
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> +static void
> +emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
> +                          struct anv_address count_address,
> +                          uint32_t draw_index)
> +{
> +   /* Upload the current draw count from the draw parameters buffer to
> +    * MI_PREDICATE_SRC0.
> +    */
> +   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(MI_ALU_REG14));
>

Do we also need to set MI_PREDICATE_SRC0 + 4 to 0?  I suspect we do.

Also, we can likely save some batch space if we have a "prepare" function
which sets MI_PREDICATE_SRC0, SRC0 + 4, and SRC1 + 4 and only emit one
LOAD_REGISTER_IMM and the MI_PREDICATE per-draw.  For lots of primitives,
those extra three MI_LOAD_REGISTER_* calls will add up.


> +
> +   /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
> +   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, draw_index);
> +   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);
> +
> +   if (draw_index == 0) {
> +       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
> +          mip.LoadOperation    = LOAD_LOADINV;
> +          mip.CombineOperation = COMBINE_SET;
> +          mip.CompareOperation = COMPARE_SRCS_EQUAL;
> +       }
> +   } else {
> +       /* While draw_index < draw_count the predicate's result will be
> +        *  (draw_index == draw_count) ^ TRUE = TRUE
> +        * When draw_index == draw_count the result is
> +        *  (TRUE) ^ TRUE = FALSE
> +        * After this all results will be:
> +        *  (FALSE) ^ FALSE = FALSE
> +        */
> +       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
> +          mip.LoadOperation    = LOAD_LOAD;
> +          mip.CombineOperation = COMBINE_XOR;
> +          mip.CompareOperation = COMPARE_SRCS_EQUAL;
> +       }
> +   }
> +}
> +
> +void genX(CmdDrawIndirectCountKHR)(
> +    VkCommandBuffer                             commandBuffer,
> +    VkBuffer                                    _buffer,
> +    VkDeviceSize                                offset,
> +    VkBuffer                                    _countBuffer,
> +    VkDeviceSize                                countBufferOffset,
> +    uint32_t                                    maxDrawCount,
> +    uint32_t                                    stride)
> +{
> +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
> +   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
> +   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
> +   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
> +   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
> +   const struct brw_vs_prog_data *vs_prog_data =
> get_vs_prog_data(pipeline);
> +
> +   if (anv_batch_has_error(&cmd_buffer->batch))
> +      return;
> +
> +   genX(cmd_buffer_flush_state)(cmd_buffer);
> +
> +   struct anv_address count_address =
> +      anv_address_add(count_buffer->address, countBufferOffset);
> +
> +   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
> +    * command when loading the values into the predicate source registers.
> +    */
> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +     pc.PipeControlFlushEnable = true;
> +   }
>

Have you seen this be an actual problem?  If not, why is the flush needed?
A documentation citation would be nice.


> +
> +   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);
> +   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);
> +
> +   for (uint32_t i = 0; i < maxDrawCount; i++) {
> +      struct anv_address draw = anv_address_add(buffer->address, offset);
> +
> +      emit_draw_count_predicate(cmd_buffer, count_address, i);
> +
> +      if (vs_prog_data->uses_firstvertex ||
> +          vs_prog_data->uses_baseinstance)
> +         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw,
> 8));
> +      if (vs_prog_data->uses_drawid)
> +         emit_draw_index(cmd_buffer, i);
> +
> +      load_indirect_parameters(cmd_buffer, draw, false);
> +
> +      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> +         prim.IndirectParameterEnable  = true;
> +         prim.PredicateEnable          = true;
> +         prim.VertexAccessType         = SEQUENTIAL;
> +         prim.PrimitiveTopologyType    = pipeline->topology;
> +      }
> +
> +      offset += stride;
> +   }
> +}
> +
> +void genX(CmdDrawIndexedIndirectCountKHR)(
> +    VkCommandBuffer                             commandBuffer,
> +    VkBuffer                                    _buffer,
> +    VkDeviceSize                                offset,
> +    VkBuffer                                    _countBuffer,
> +    VkDeviceSize                                countBufferOffset,
> +    uint32_t                                    maxDrawCount,
> +    uint32_t                                    stride)
> +{
> +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
> +   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
> +   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
> +   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
> +   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
> +   const struct brw_vs_prog_data *vs_prog_data =
> get_vs_prog_data(pipeline);
> +
> +   if (anv_batch_has_error(&cmd_buffer->batch))
> +      return;
> +
> +   genX(cmd_buffer_flush_state)(cmd_buffer);
> +
> +   struct anv_address count_address =
> +      anv_address_add(count_buffer->address, countBufferOffset);
> +
> +   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
> +    * command when loading the values into the predicate source registers.
> +    */
> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +     pc.PipeControlFlushEnable = true;
> +   }
> +
> +   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);
> +   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);
> +
> +   for (uint32_t i = 0; i < maxDrawCount; i++) {
> +      struct anv_address draw = anv_address_add(buffer->address, offset);
> +
> +      emit_draw_count_predicate(cmd_buffer, count_address, i);
> +
> +      /* TODO: We need to stomp base vertex to 0 somehow */
> +      if (vs_prog_data->uses_firstvertex ||
> +          vs_prog_data->uses_baseinstance)
> +         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw,
> 12));
> +      if (vs_prog_data->uses_drawid)
> +         emit_draw_index(cmd_buffer, i);
> +
> +      load_indirect_parameters(cmd_buffer, draw, true);
> +
> +      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> +         prim.IndirectParameterEnable  = true;
> +         prim.PredicateEnable          = true;
> +         prim.VertexAccessType         = RANDOM;
> +         prim.PrimitiveTopologyType    = pipeline->topology;
> +      }
> +
> +      offset += stride;
> +   }
> +}
> +#endif
> +
>  static VkResult
>  flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
>  {
> --
> 2.18.0
>
>
On 11/6/18 12:39 AM, Jason Ekstrand wrote:
> On Wed, Oct 17, 2018 at 6:59 AM Danylo Piliaiev <danylo.piliaiev@gmail.com>
> wrote:
>
>> Signed-off-by: Danylo Piliaiev <danylo.piliaiev@globallogic.com>
>> ---
>>   src/intel/vulkan/anv_extensions.py |   1 +
>>   src/intel/vulkan/genX_cmd_buffer.c | 155 +++++++++++++++++++++++++++++
>>   2 files changed, 156 insertions(+)
>>
>> diff --git a/src/intel/vulkan/anv_extensions.py
>> b/src/intel/vulkan/anv_extensions.py
>> index d4915c9501..7f44da6648 100644
>> --- a/src/intel/vulkan/anv_extensions.py
>> +++ b/src/intel/vulkan/anv_extensions.py
>> @@ -113,6 +113,7 @@ EXTENSIONS = [
>>       Extension('VK_KHR_xlib_surface',                      6,
>> 'VK_USE_PLATFORM_XLIB_KHR'),
>>       Extension('VK_KHR_multiview',                         1, True),
>>       Extension('VK_KHR_display',                          23,
>> 'VK_USE_PLATFORM_DISPLAY_KHR'),
>> +    Extension('VK_KHR_draw_indirect_count',               1,
>> 'device->info.gen >= 8 || device->info.is_haswell'),
>>       Extension('VK_EXT_acquire_xlib_display',              1,
>> 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
>>       Extension('VK_EXT_debug_report',                      8, True),
>>       Extension('VK_EXT_direct_mode_display',               1,
>> 'VK_USE_PLATFORM_DISPLAY_KHR'),
>> diff --git a/src/intel/vulkan/genX_cmd_buffer.c
>> b/src/intel/vulkan/genX_cmd_buffer.c
>> index 43a02f2256..d7b94efd19 100644
>> --- a/src/intel/vulkan/genX_cmd_buffer.c
>> +++ b/src/intel/vulkan/genX_cmd_buffer.c
>> @@ -2982,6 +2982,161 @@ void genX(CmdDrawIndexedIndirect)(
>>      }
>>   }
>>
>> +#if GEN_IS_HASWELL || GEN_GEN >= 8
>> +static void
>> +emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
>> +                          struct anv_address count_address,
>> +                          uint32_t draw_index)
>> +{
>> +   /* Upload the current draw count from the draw parameters buffer to
>> +    * MI_PREDICATE_SRC0.
>> +    */
>> +   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(MI_ALU_REG14));
>>
> Do we also need to set MI_PREDICATE_SRC0 + 4 to 0?  I suspect we do.
Yes.
I'll also recheck other places.
>
> Also, we can likely save some batch space if we have a "prepare" function
> which sets MI_PREDICATE_SRC0, SRC0 + 4, and SRC1 + 4 and only emit one
> LOAD_REGISTER_IMM and the MI_PREDICATE per-draw.  For lots of primitives,
> those extra three MI_LOAD_REGISTER_* calls will add up.
>
Makes sense
>> +
>> +   /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
>> +   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, draw_index);
>> +   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);
>> +
>> +   if (draw_index == 0) {
>> +       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
>> +          mip.LoadOperation    = LOAD_LOADINV;
>> +          mip.CombineOperation = COMBINE_SET;
>> +          mip.CompareOperation = COMPARE_SRCS_EQUAL;
>> +       }
>> +   } else {
>> +       /* While draw_index < draw_count the predicate's result will be
>> +        *  (draw_index == draw_count) ^ TRUE = TRUE
>> +        * When draw_index == draw_count the result is
>> +        *  (TRUE) ^ TRUE = FALSE
>> +        * After this all results will be:
>> +        *  (FALSE) ^ FALSE = FALSE
>> +        */
>> +       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
>> +          mip.LoadOperation    = LOAD_LOAD;
>> +          mip.CombineOperation = COMBINE_XOR;
>> +          mip.CompareOperation = COMPARE_SRCS_EQUAL;
>> +       }
>> +   }
>> +}
>> +
>> +void genX(CmdDrawIndirectCountKHR)(
>> +    VkCommandBuffer                             commandBuffer,
>> +    VkBuffer                                    _buffer,
>> +    VkDeviceSize                                offset,
>> +    VkBuffer                                    _countBuffer,
>> +    VkDeviceSize                                countBufferOffset,
>> +    uint32_t                                    maxDrawCount,
>> +    uint32_t                                    stride)
>> +{
>> +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
>> +   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
>> +   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
>> +   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
>> +   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
>> +   const struct brw_vs_prog_data *vs_prog_data =
>> get_vs_prog_data(pipeline);
>> +
>> +   if (anv_batch_has_error(&cmd_buffer->batch))
>> +      return;
>> +
>> +   genX(cmd_buffer_flush_state)(cmd_buffer);
>> +
>> +   struct anv_address count_address =
>> +      anv_address_add(count_buffer->address, countBufferOffset);
>> +
>> +   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
>> +    * command when loading the values into the predicate source registers.
>> +    */
>> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
>> +     pc.PipeControlFlushEnable = true;
>> +   }
>>
> Have you seen this be an actual problem?  If not, why?  A documentation
> citation would be  nice.
>
You are right - citation is needed:

Volume 7: 3D-Media-GPGPU (Skylake), MI_PREDICATE:

    MI_LOAD_REGISTER_MEM commands can be used to load the MItemp0, MItemp1,
    and PredicateData registers prior to MI_PREDICATE.
    To ensure the memory sources of the MI_LOAD_REGISTER_MEM commands are
    coherent with previous 3D_PIPECONTROL store-DWord operations, software
    can use the new Pipe Control Flush Enable bit in the PIPE_CONTROL command.

It looks like the memory may not be coherent here unless coherency is
enforced elsewhere.
>> +
>> +   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);
>> +   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);
>> +
>> +   for (uint32_t i = 0; i < maxDrawCount; i++) {
>> +      struct anv_address draw = anv_address_add(buffer->address, offset);
>> +
>> +      emit_draw_count_predicate(cmd_buffer, count_address, i);
>> +
>> +      if (vs_prog_data->uses_firstvertex ||
>> +          vs_prog_data->uses_baseinstance)
>> +         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw,
>> 8));
>> +      if (vs_prog_data->uses_drawid)
>> +         emit_draw_index(cmd_buffer, i);
>> +
>> +      load_indirect_parameters(cmd_buffer, draw, false);
>> +
>> +      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
>> +         prim.IndirectParameterEnable  = true;
>> +         prim.PredicateEnable          = true;
>> +         prim.VertexAccessType         = SEQUENTIAL;
>> +         prim.PrimitiveTopologyType    = pipeline->topology;
>> +      }
>> +
>> +      offset += stride;
>> +   }
>> +}
>> +
>> +void genX(CmdDrawIndexedIndirectCountKHR)(
>> +    VkCommandBuffer                             commandBuffer,
>> +    VkBuffer                                    _buffer,
>> +    VkDeviceSize                                offset,
>> +    VkBuffer                                    _countBuffer,
>> +    VkDeviceSize                                countBufferOffset,
>> +    uint32_t                                    maxDrawCount,
>> +    uint32_t                                    stride)
>> +{
>> +   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
>> +   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
>> +   ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
>> +   struct anv_cmd_state *cmd_state = &cmd_buffer->state;
>> +   struct anv_pipeline *pipeline = cmd_state->gfx.base.pipeline;
>> +   const struct brw_vs_prog_data *vs_prog_data =
>> get_vs_prog_data(pipeline);
>> +
>> +   if (anv_batch_has_error(&cmd_buffer->batch))
>> +      return;
>> +
>> +   genX(cmd_buffer_flush_state)(cmd_buffer);
>> +
>> +   struct anv_address count_address =
>> +      anv_address_add(count_buffer->address, countBufferOffset);
>> +
>> +   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
>> +    * command when loading the values into the predicate source registers.
>> +    */
>> +   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
>> +     pc.PipeControlFlushEnable = true;
>> +   }
>> +
>> +   emit_lrm(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14), count_address);
>> +   emit_lri(&cmd_buffer->batch, CS_GPR(MI_ALU_REG14) + 4, 0);
>> +
>> +   for (uint32_t i = 0; i < maxDrawCount; i++) {
>> +      struct anv_address draw = anv_address_add(buffer->address, offset);
>> +
>> +      emit_draw_count_predicate(cmd_buffer, count_address, i);
>> +
>> +      /* TODO: We need to stomp base vertex to 0 somehow */
>> +      if (vs_prog_data->uses_firstvertex ||
>> +          vs_prog_data->uses_baseinstance)
>> +         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw,
>> 12));
>> +      if (vs_prog_data->uses_drawid)
>> +         emit_draw_index(cmd_buffer, i);
>> +
>> +      load_indirect_parameters(cmd_buffer, draw, true);
>> +
>> +      anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
>> +         prim.IndirectParameterEnable  = true;
>> +         prim.PredicateEnable          = true;
>> +         prim.VertexAccessType         = RANDOM;
>> +         prim.PrimitiveTopologyType    = pipeline->topology;
>> +      }
>> +
>> +      offset += stride;
>> +   }
>> +}
>> +#endif
>> +
>>   static VkResult
>>   flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer)
>>   {
>> --
>> 2.18.0
>>
>>