[3/3] anv: Implement VK_EXT_conditional_rendering for gen 7.5+

Submitted by Danylo Piliaiev on Oct. 17, 2018, 11:58 a.m.

Details

Message ID: 20181017115902.25843-4-danylo.piliaiev@globallogic.com
State: New
Series: "anv: Implement VK_KHR_draw_indirect_count and VK_EXT_conditional_rendering" (rev 2) in Mesa

Commit Message

Danylo Piliaiev Oct. 17, 2018, 11:58 a.m.
Conditional rendering affects the following functions:
- vkCmdDraw, vkCmdDrawIndexed, vkCmdDrawIndirect, vkCmdDrawIndexedIndirect
- vkCmdDrawIndirectCountKHR, vkCmdDrawIndexedIndirectCountKHR
- vkCmdDispatch, vkCmdDispatchIndirect, vkCmdDispatchBase
- vkCmdClearAttachments

To reduce memory reads, the result of the condition is calculated once
and stored in the designated register MI_ALU_REG15.

In the current implementation the affected functions expect
MI_PREDICATE_RESULT to be set before they are called, so any code that
changes the predicate should restore it with
restore_conditional_render_predicate. An alternative would be to
restore MI_PREDICATE_RESULT at the beginning of every affected
function.

Signed-off-by: Danylo Piliaiev <danylo.piliaiev@globallogic.com>
---
 src/intel/vulkan/anv_blorp.c       |   7 +-
 src/intel/vulkan/anv_device.c      |  12 ++
 src/intel/vulkan/anv_extensions.py |   1 +
 src/intel/vulkan/anv_private.h     |   2 +
 src/intel/vulkan/genX_cmd_buffer.c | 192 ++++++++++++++++++++++++++++-
 5 files changed, 209 insertions(+), 5 deletions(-)

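In outline, the predicate bookkeeping described above comes down to two
register-to-register moves (a condensed sketch in terms of the emit_lrr
helper and the register #defines from the diff below, not a verbatim
excerpt):

   /* vkCmdBeginConditionalRenderingEXT: evaluate the condition once with
    * MI_PREDICATE, then stash the resulting predicate bit in a GPR the
    * driver reserves for this purpose.
    */
   emit_lrr(&cmd_buffer->batch, CS_GPR(MI_ALU_REG15), MI_PREDICATE_RESULT);

   /* Any code that later clobbers MI_PREDICATE_RESULT (e.g. the
    * draw-indirect-count predicate) puts the stashed bit back:
    */
   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));

Draws and dispatches then simply set their PredicateEnable bit and let
the command streamer consult the (restored) predicate state.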

diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 478b8e7a3d..157875d16f 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -1144,8 +1144,11 @@  void anv_CmdClearAttachments(
     * trash our depth and stencil buffers.
     */
    struct blorp_batch batch;
-   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
-                    BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
+   enum blorp_batch_flags flags = BLORP_BATCH_NO_EMIT_DEPTH_STENCIL;
+   if (cmd_buffer->state.conditional_render_enabled) {
+       flags |= BLORP_BATCH_PREDICATE_ENABLE;
+   }
+   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, flags);
 
    for (uint32_t a = 0; a < attachmentCount; ++a) {
       if (pAttachments[a].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index a2551452eb..930a192c25 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -957,6 +957,18 @@  void anv_GetPhysicalDeviceFeatures2(
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
+         VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
+            (VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext;
+         ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+         features->conditionalRendering = pdevice->info.gen >= 8 ||
+                                          pdevice->info.is_haswell;
+         features->inheritedConditionalRendering = pdevice->info.gen >= 8 ||
+                                                   pdevice->info.is_haswell;
+         break;
+      }
+
       default:
          anv_debug_ignored_stype(ext->sType);
          break;
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index c13ce531ee..2ef7a52d01 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -127,6 +127,7 @@  EXTENSIONS = [
     Extension('VK_EXT_vertex_attribute_divisor',          3, True),
     Extension('VK_EXT_post_depth_coverage',               1, 'device->info.gen >= 9'),
     Extension('VK_EXT_sampler_filter_minmax',             1, 'device->info.gen >= 9'),
+    Extension('VK_EXT_conditional_rendering',             1, 'device->info.gen >= 8 || device->info.is_haswell'),
 ]
 
 class VkVersion:
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 599b903f25..108da51a59 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2032,6 +2032,8 @@  struct anv_cmd_state {
     */
    bool                                         hiz_enabled;
 
+   bool                                         conditional_render_enabled;
+
    /**
     * Array length is anv_cmd_state::pass::attachment_count. Array content is
     * valid only when recording a render pass instance.
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index f07a6aa7c9..87abc443b6 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -479,8 +479,9 @@  transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        0, 0, 1, hiz_op);
 }
 
-#define MI_PREDICATE_SRC0  0x2400
-#define MI_PREDICATE_SRC1  0x2408
+#define MI_PREDICATE_SRC0    0x2400
+#define MI_PREDICATE_SRC1    0x2408
+#define MI_PREDICATE_RESULT  0x2418
 
 static void
 set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
@@ -545,6 +546,14 @@  mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
 
 #define CS_GPR(n) (0x2600 + (n) * 8)
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+static void
+restore_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
+{
+   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));
+}
+#endif
+
 /* This is only really practical on haswell and above because it requires
  * MI math in order to get it correct.
  */
@@ -1144,6 +1153,12 @@  transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
       }
    }
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+   if (cmd_buffer->state.conditional_render_enabled) {
+      restore_conditional_render_predicate(cmd_buffer);
+   }
+#endif
+
    cmd_buffer->state.pending_pipe_bits |=
       ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT | ANV_PIPE_CS_STALL_BIT;
 }
@@ -1397,6 +1412,26 @@  genX(BeginCommandBuffer)(
       cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
    }
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+      vk_foreach_struct_const(s, pBeginInfo->pInheritanceInfo->pNext) {
+         switch (s->sType) {
+         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
+            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
+               (const VkCommandBufferInheritanceConditionalRenderingInfoEXT *) s;
+            /* We should emit commands as if conditional render is enabled. */
+            cmd_buffer->state.conditional_render_enabled =
+               conditional_rendering_info->conditionalRenderingEnable;
+            break;
+         }
+         default:
+            anv_debug_ignored_stype(s->sType);
+            break;
+         }
+      }
+   }
+#endif
+
    return result;
 }
 
@@ -1501,6 +1536,20 @@  genX(CmdExecuteCommands)(
       assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
       assert(!anv_batch_has_error(&secondary->batch));
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      if (secondary->state.conditional_render_enabled) {
+          /* Secondary buffer is constructed as if it will be executed
+           * with conditional rendering, so we should satisfy this
+           * dependency regardless of conditional rendering in the primary.
+           */
+          if (!primary->state.conditional_render_enabled) {
+             emit_lri(&primary->batch, CS_GPR(MI_ALU_REG15), 1);
+             emit_lri(&primary->batch, CS_GPR(MI_ALU_REG15) + 4, 0);
+             emit_lrr(&primary->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));
+          }
+      }
+#endif
+
       if (secondary->usage_flags &
           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
          /* If we're continuing a render pass from the primary, we need to
@@ -2761,6 +2810,7 @@  void genX(CmdDraw)(
    instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
 
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
       prim.VertexAccessType         = SEQUENTIAL;
       prim.PrimitiveTopologyType    = pipeline->topology;
       prim.VertexCountPerInstance   = vertexCount;
@@ -2800,6 +2850,7 @@  void genX(CmdDrawIndexed)(
    instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
 
    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
       prim.VertexAccessType         = RANDOM;
       prim.PrimitiveTopologyType    = pipeline->topology;
       prim.VertexCountPerInstance   = indexCount;
@@ -2935,6 +2986,7 @@  void genX(CmdDrawIndirect)(
 
       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
          prim.IndirectParameterEnable  = true;
+         prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
          prim.VertexAccessType         = SEQUENTIAL;
          prim.PrimitiveTopologyType    = pipeline->topology;
       }
@@ -2974,6 +3026,7 @@  void genX(CmdDrawIndexedIndirect)(
 
       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
          prim.IndirectParameterEnable  = true;
+         prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
          prim.VertexAccessType         = RANDOM;
          prim.PrimitiveTopologyType    = pipeline->topology;
       }
@@ -3024,6 +3077,42 @@  emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
    }
 }
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+static void
+emit_draw_count_predicate_with_conditional_render(
+                          struct anv_cmd_buffer *cmd_buffer,
+                          struct anv_address count_address,
+                          uint32_t draw_index)
+{
+   const int draw_index_reg = MI_ALU_REG0;
+   const int draw_count_reg = MI_ALU_REG14;
+   const int condition_reg = MI_ALU_REG15;
+   const int tmp_result_reg = MI_ALU_REG1;
+
+   emit_lri(&cmd_buffer->batch, CS_GPR(draw_index_reg), draw_index);
+   emit_lri(&cmd_buffer->batch, CS_GPR(draw_index_reg) + 4, 0);
+
+   uint32_t *dw;
+   /* Compute (draw_index < draw_count).
+    * We do this by subtracting and storing the carry bit.
+    */
+   dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
+   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, draw_index_reg);
+   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, draw_count_reg);
+   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
+   dw[4] = mi_alu(MI_ALU_STORE, tmp_result_reg, MI_ALU_CF);
+
+   /* & condition */
+   dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
+   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, tmp_result_reg);
+   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, condition_reg);
+   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
+   dw[4] = mi_alu(MI_ALU_STORE, tmp_result_reg, MI_ALU_ACCU);
+
+   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(tmp_result_reg));
+}
+#endif
+
 void genX(CmdDrawIndirectCountKHR)(
     VkCommandBuffer                             commandBuffer,
     VkBuffer                                    _buffer,
@@ -3063,7 +3152,15 @@  void genX(CmdDrawIndirectCountKHR)(
    for (uint32_t i = 0; i < maxDrawCount; i++) {
       struct anv_address draw = anv_address_add(buffer->address, offset);
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      if (cmd_state->conditional_render_enabled) {
+         emit_draw_count_predicate_with_conditional_render(cmd_buffer, count_address, i);
+      } else {
+         emit_draw_count_predicate(cmd_buffer, count_address, i);
+      }
+#else
       emit_draw_count_predicate(cmd_buffer, count_address, i);
+#endif
 
       if (vs_prog_data->uses_firstvertex ||
           vs_prog_data->uses_baseinstance)
@@ -3082,6 +3179,12 @@  void genX(CmdDrawIndirectCountKHR)(
 
       offset += stride;
    }
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+   if (cmd_state->conditional_render_enabled) {
+      restore_conditional_render_predicate(cmd_buffer);
+   }
+#endif
 }
 
 void genX(CmdDrawIndexedIndirectCountKHR)(
@@ -3123,7 +3226,15 @@  void genX(CmdDrawIndexedIndirectCountKHR)(
    for (uint32_t i = 0; i < maxDrawCount; i++) {
       struct anv_address draw = anv_address_add(buffer->address, offset);
 
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      if (cmd_state->conditional_render_enabled) {
+         emit_draw_count_predicate_with_conditional_render(cmd_buffer, count_address, i);
+      } else {
+         emit_draw_count_predicate(cmd_buffer, count_address, i);
+      }
+#else
       emit_draw_count_predicate(cmd_buffer, count_address, i);
+#endif
 
       /* TODO: We need to stomp base vertex to 0 somehow */
       if (vs_prog_data->uses_firstvertex ||
@@ -3143,6 +3254,12 @@  void genX(CmdDrawIndexedIndirectCountKHR)(
 
       offset += stride;
    }
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+   if (cmd_state->conditional_render_enabled) {
+      restore_conditional_render_predicate(cmd_buffer);
+   }
+#endif
 }
 
 static VkResult
@@ -3351,6 +3468,7 @@  void genX(CmdDispatchBase)(
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.PredicateEnable              = cmd_buffer->state.conditional_render_enabled;
       ggw.SIMDSize                     = prog_data->simd_size / 16;
       ggw.ThreadDepthCounterMaximum    = 0;
       ggw.ThreadHeightCounterMaximum   = 0;
@@ -3448,7 +3566,8 @@  void genX(CmdDispatchIndirect)(
 
    anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
       ggw.IndirectParameterEnable      = true;
-      ggw.PredicateEnable              = GEN_GEN <= 7;
+      ggw.PredicateEnable              = GEN_GEN <= 7 ||
+                                         cmd_buffer->state.conditional_render_enabled;
       ggw.SIMDSize                     = prog_data->simd_size / 16;
       ggw.ThreadDepthCounterMaximum    = 0;
       ggw.ThreadHeightCounterMaximum   = 0;
@@ -4158,3 +4277,70 @@  void genX(CmdEndRenderPass2KHR)(
 {
    genX(CmdEndRenderPass)(commandBuffer);
 }
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+void genX(CmdBeginConditionalRenderingEXT)(
+	VkCommandBuffer                             commandBuffer,
+	const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
+{
+    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
+    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+    struct anv_address value_address =
+       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
+
+    const bool inverted = pConditionalRenderingBegin->flags &
+                          VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
+
+    cmd_state->conditional_render_enabled = true;
+
+    /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+     * command when loading the values into the predicate source registers.
+     */
+    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+      pc.PipeControlFlushEnable = true;
+    }
+
+    /* Section 19.4 of the Vulkan 1.1.85 spec says:
+     *
+     *    If the value of the predicate in buffer memory changes
+     *    while conditional rendering is active, the rendering commands
+     *    may be discarded in an implementation-dependent way.
+     *    Some implementations may latch the value of the predicate
+     *    upon beginning conditional rendering while others
+     *    may read it before every rendering command.
+     *
+     * So it's perfectly fine to read a value from the buffer once.
+     */
+
+    emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0, value_address);
+    /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
+    emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0);
+    emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, 0);
+    emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);
+
+    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+        mip.LoadOperation    = inverted ? LOAD_LOAD : LOAD_LOADINV;
+        mip.CombineOperation = COMBINE_SET;
+        mip.CompareOperation = COMPARE_SRCS_EQUAL;
+    }
+
+    /* Calculate predicate result once and store it in MI_ALU_REG15
+     * to prevent recalculating it when interacting with
+     * VK_KHR_draw_indirect_count which also uses predicates.
+     * It is also the only way to support conditional render of
+     * secondary buffers because they are formed before we
+     * know whether conditional render is enabled.
+     */
+    emit_lrr(&cmd_buffer->batch, CS_GPR(MI_ALU_REG15), MI_PREDICATE_RESULT);
+}
+
+void genX(CmdEndConditionalRenderingEXT)(
+	VkCommandBuffer                             commandBuffer)
+{
+    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+
+    cmd_state->conditional_render_enabled = false;
+}
+#endif
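
One step in the MI_MATH sequence above is worth spelling out:
(draw_index < draw_count) is read off the carry flag (MI_ALU_CF) of an
unsigned subtraction, because an unsigned SUB borrows exactly when its
first operand is smaller. A standalone illustration of the identity
(hypothetical host-side C, not driver code):

   #include <assert.h>
   #include <stdint.h>

   /* Unsigned subtraction wraps around exactly when it borrows, i.e. when
    * a < b -- the same bit MI_ALU_CF captures after the MI_MATH SUB.
    */
   static int borrow_after_sub(uint64_t a, uint64_t b)
   {
      return (a - b) > a;
   }

   int main(void)
   {
      assert(borrow_after_sub(3, 5) == 1);   /* draw_index < draw_count */
      assert(borrow_after_sub(5, 5) == 0);   /* equal: no borrow */
      assert(borrow_after_sub(7, 5) == 0);
      return 0;
   }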

Comments

On Wed, Oct 17, 2018 at 6:59 AM Danylo Piliaiev <danylo.piliaiev@gmail.com> wrote:

> +#if GEN_GEN >= 8 || GEN_IS_HASWELL
> +static void
> +restore_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
> +{
> +   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));
>

Does this work?  Is it sufficient to just set MI_PREDICATE_RESULT or do we
actually need to use an MI_PREDICATE?  I genuinely don't know and this
strikes me as odd.


> +#if GEN_GEN >= 8 || GEN_IS_HASWELL
> +   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
> +      vk_foreach_struct_const(s, pBeginInfo->pInheritanceInfo->pNext) {
> +         switch (s->sType) {
> +         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
> +            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
> +               (const VkCommandBufferInheritanceConditionalRenderingInfoEXT *) s;
> +            /* We should emit commands as if conditional render is enabled. */
> +            cmd_buffer->state.conditional_render_enabled =
> +               conditional_rendering_info->conditionalRenderingEnable;
>

Might be easier to just use vk_find_struct_const() instead of the loop.
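
Something like this, perhaps (untested; vk_find_struct_const is the pNext
helper from src/vulkan/util/vk_util.h and takes the chain head plus the
sType suffix):

   const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
      vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext,
                           COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);

   /* A missing struct on the pNext chain simply means "not inherited". */
   if (conditional_rendering_info) {
      cmd_buffer->state.conditional_render_enabled =
         conditional_rendering_info->conditionalRenderingEnable;
   }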


> +   /* & condition */
> +   dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
> +   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, tmp_result_reg);
> +   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, condition_reg);
> +   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
> +   dw[4] = mi_alu(MI_ALU_STORE, tmp_result_reg, MI_ALU_ACCU);
> +
> +   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(tmp_result_reg));
>

Again, is this sufficient?  Maybe I'm missing something.


On 11/6/18 1:05 AM, Jason Ekstrand wrote:
> On Wed, Oct 17, 2018 at 6:59 AM Danylo Piliaiev <danylo.piliaiev@gmail.com> wrote:
>
>> +#if GEN_GEN >= 8 || GEN_IS_HASWELL
>> +static void
>> +restore_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
>> +{
>> +   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));
>>
> Does this work?  Is it sufficient to just set MI_PREDICATE_RESULT or do we
> actually need to use an MI_PREDICATE?  I genuinely don't know and this
> strikes me as odd.
>
It does work. However, I didn't see that being _explicitly_ said in the docs.

- It is explicitly stated that MI_PREDICATE uses the MI_PREDICATE_DATA
register for calculations and MI_PREDICATE_RESULT to store the predicate bit.

- We are not forbidden from writing to it.

- The description of the "Predicate Enable" bit says:
      If set, this command is executed (or not) depending on the current
      value of the MI Predicate internal state bit.

So the docs indirectly tell us that it should work.

I would agree that this may require some explicit confirmation.
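
Concretely, the write under discussion is the single instruction below;
the documentation points above are why the direct register write looks
legal (an annotated copy of the restore helper's body from the patch):

   /* MI_PREDICATE computes into MI_PREDICATE_DATA and deposits the
    * predicate bit in MI_PREDICATE_RESULT (0x2418). Nothing in the docs
    * forbids writing that register directly, and "Predicate Enable"
    * consumes whatever currently sits in the internal state bit.
    */
   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));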

>> +#if GEN_GEN >= 8 || GEN_IS_HASWELL
>> +   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
>> +      vk_foreach_struct_const(s, pBeginInfo->pInheritanceInfo->pNext) {
>>
> Might be easier to just use vk_find_struct_const() instead of the loop.
>
Missed vk_find_struct_const, thanks!