[4/4] i965: Reimplement all the PIPE_CONTROL rules.

Submitted by Kenneth Graunke on Nov. 2, 2018, 3:04 a.m.

Details

Message ID 20181102030421.13894-4-kenneth@whitecape.org
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Kenneth Graunke Nov. 2, 2018, 3:04 a.m.
This implements virtually all documented PIPE_CONTROL restrictions
in a centralized helper.  You now simply ask for the operations you
want, and the pipe control "brain" will figure out exactly what pipe
controls to emit to make that happen without tanking your system.

The hope is that this will fix some intermittent flushing issues as
well as GPU hangs.  However, it also has a high risk of causing GPU
hangs and other regressions, as this is a particularly sensitive
area and poking the bear isn't always advisable.
---
 src/mesa/drivers/dri/i965/genX_pipe_control.c | 563 +++++++++++++-----
 1 file changed, 428 insertions(+), 135 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/mesa/drivers/dri/i965/genX_pipe_control.c b/src/mesa/drivers/dri/i965/genX_pipe_control.c
index 8eb37444253..503e674661b 100644
--- a/src/mesa/drivers/dri/i965/genX_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/genX_pipe_control.c
@@ -25,172 +25,465 @@ 
 #include "brw_defines.h"
 #include "brw_state.h"
 
+static unsigned
+flags_to_post_sync_op(uint32_t flags)
+{
+   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
+      return WriteImmediateData;
+
+   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
+      return WritePSDepthCount;
+
+   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
+      return WriteTimestamp;
+
+   return 0;
+}
+
 /**
- * According to the latest documentation, any PIPE_CONTROL with the
- * "Command Streamer Stall" bit set must also have another bit set,
- * with five different options:
- *
- *  - Render Target Cache Flush
- *  - Depth Cache Flush
- *  - Stall at Pixel Scoreboard
- *  - Post-Sync Operation
- *  - Depth Stall
- *  - DC Flush Enable
- *
- * I chose "Stall at Pixel Scoreboard" since we've used it effectively
- * in the past, but the choice is fairly arbitrary.
+ * Do the given flags have a Post Sync or LRI Post Sync operation?
  */
-static void
-gen8_add_cs_stall_workaround_bits(uint32_t *flags)
+static enum pipe_control_flags
+get_post_sync_flags(enum pipe_control_flags flags)
 {
-   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                      PIPE_CONTROL_WRITE_IMMEDIATE |
-                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
-                      PIPE_CONTROL_WRITE_TIMESTAMP |
-                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
-                      PIPE_CONTROL_DEPTH_STALL |
-                      PIPE_CONTROL_DATA_CACHE_FLUSH;
-
-   /* If we're doing a CS stall, and don't already have one of the
-    * workaround bits set, add "Stall at Pixel Scoreboard."
+   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
+            PIPE_CONTROL_WRITE_DEPTH_COUNT |
+            PIPE_CONTROL_WRITE_TIMESTAMP |
+            PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
+    * "LRI Post Sync Operation".  So more than one bit set would be illegal.
     */
-   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
-      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+   assert(util_bitcount(flags) <= 1);
+
+   return flags;
 }
 
-/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+#define IS_COMPUTE_PIPELINE(brw) \
+   (GEN_GEN >= 7 && brw->last_pipeline == BRW_COMPUTE_PIPELINE)
+
+/* Closed interval - GEN_GEN \in [x, y] */
+#define IS_GEN_BETWEEN(x, y) (GEN_GEN >= x && GEN_GEN <= y)
+#define IS_GENx10_BETWEEN(x, y) \
+   (GEN_VERSIONx10 >= x && GEN_VERSIONx10 <= y)
+
+/**
+ * Emit a series of PIPE_CONTROL commands, taking into account any
+ * workarounds necessary to actually accomplish the caller's request.
+ *
+ * Unless otherwise noted, spec quotations in this function come from:
  *
- * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
- *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+ * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
+ * Restrictions for PIPE_CONTROL.
  *
- * Note that the kernel does CS stalls between batches, so we only need
- * to count them within a batch.
+ * You should not use this function directly.  Use the helpers in
+ * brw_pipe_control.c instead, which may split the pipe control further.
  */
-static uint32_t
-gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
+void
+genX(emit_raw_pipe_control)(struct brw_context *brw, uint32_t flags,
+                            struct brw_bo *bo, uint32_t offset, uint64_t imm)
 {
-   if (GEN_GEN == 7 && !GEN_IS_HASWELL) {
-      if (flags & PIPE_CONTROL_CS_STALL) {
-         /* If we're doing a CS stall, reset the counter and carry on. */
-         brw->pipe_controls_since_last_cs_stall = 0;
-         return 0;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
+   enum pipe_control_flags non_lri_post_sync_flags =
+      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
+
+   /* Recursive PIPE_CONTROL workarounds --------------------------------
+    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
+    *
+    * We do these first because we want to look at the original operation,
+    * rather than any workarounds we set.
+    */
+   if (GEN_GEN == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
+      /* Hardware workaround: SNB B-Spec says:
+       *
+       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
+       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
+       *     required."
+       */
+      brw_emit_post_sync_nonzero_flush(brw);
+   }
+
+   if (GEN_GEN == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
+      /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
+       * lists several workarounds:
+       *
+       *    "Project: SKL, KBL, BXT
+       *
+       *     If the VF Cache Invalidation Enable is set to a 1 in a
+       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
+       *     sets to 0, with the VF Cache Invalidation Enable set to 0
+       *     needs to be sent prior to the PIPE_CONTROL with VF Cache
+       *     Invalidation Enable set to a 1."
+       */
+      genX(emit_raw_pipe_control)(brw, 0, NULL, 0, 0);
+   }
+
+   if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(brw) && post_sync_flags) {
+      /* Project: SKL / Argument: LRI Post Sync Operation [23]
+       *
+       * "PIPECONTROL command with “Command Streamer Stall Enable” must be
+       *  programmed prior to programming a PIPECONTROL command with "LRI
+       *  Post Sync Operation" in GPGPU mode of operation (i.e when
+       *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
+       *
+       * The same text exists a few rows below for Post Sync Op.
+       */
+      genX(emit_raw_pipe_control)(brw, PIPE_CONTROL_CS_STALL, bo, offset, imm);
+   }
+
+   if (GEN_GEN == 10 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
+      /* Cannonlake:
+       * "Before sending a PIPE_CONTROL command with bit 12 set, SW must issue
+       *  another PIPE_CONTROL with Render Target Cache Flush Enable (bit 12)
+       *  = 0 and Pipe Control Flush Enable (bit 7) = 1"
+       */
+      genX(emit_raw_pipe_control)(brw, PIPE_CONTROL_FLUSH_ENABLE,
+                                  bo, offset, imm);
+   }
+
+   /* "Flush Types" workarounds ---------------------------------------------
+    * We do these now because they may add post-sync operations or CS stalls.
+    */
+
+   if (IS_GEN_BETWEEN(8, 10) && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
+      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
+       *
+       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
+       *  'Write PS Depth Count' or 'Write Timestamp'."
+       */
+      if (!bo) {
+         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
+         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
+         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
+         bo = brw->workaround_bo;
       }
+   }
 
-      /* If this is the fourth pipe control without a CS stall, do one now. */
-      if (++brw->pipe_controls_since_last_cs_stall == 4) {
-         brw->pipe_controls_since_last_cs_stall = 0;
-         return PIPE_CONTROL_CS_STALL;
+   /* #1130 from Gen10 workarounds page:
+    *
+    *    "Enable Depth Stall on every Post Sync Op if Render target Cache
+    *     Flush is not enabled in same PIPE CONTROL and Enable Pixel score
+    *     board stall if Render target cache flush is enabled."
+    *
+    * Applicable to CNL B0 and C0 steppings only.
+    *
+    * The wording here is unclear, and this workaround doesn't look anything
+    * like the internal bug report recommendations, but leave it be for now...
+    */
+   if (GEN_GEN == 10) {
+      if (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) {
+         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+      } else if (flags & non_lri_post_sync_flags) {
+         flags |= PIPE_CONTROL_DEPTH_STALL;
       }
    }
-   return 0;
-}
 
-/* #1130 from gen10 workarounds page in h/w specs:
- * "Enable Depth Stall on every Post Sync Op if Render target Cache Flush is
- *  not enabled in same PIPE CONTROL and Enable Pixel score board stall if
- *  Render target cache flush is enabled."
- *
- * Applicable to CNL B0 and C0 steppings only.
- */
-static void
-gen10_add_rcpfe_workaround_bits(uint32_t *flags)
-{
-   if (*flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) {
-      *flags = *flags | PIPE_CONTROL_STALL_AT_SCOREBOARD;
-   } else if (*flags &
-             (PIPE_CONTROL_WRITE_IMMEDIATE |
-              PIPE_CONTROL_WRITE_DEPTH_COUNT |
-              PIPE_CONTROL_WRITE_TIMESTAMP)) {
-      *flags = *flags | PIPE_CONTROL_DEPTH_STALL;
+   if (GEN_VERSIONx10 < 75 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
+      /* Project: PRE-HSW / Argument: Depth Stall
+       *
+       * "The following bits must be clear:
+       *  - Render Target Cache Flush Enable ([12] of DW1)
+       *  - Depth Cache Flush Enable ([0] of DW1)"
+       */
+      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
    }
-}
 
-static unsigned
-flags_to_post_sync_op(uint32_t flags)
-{
-   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
-            PIPE_CONTROL_WRITE_DEPTH_COUNT |
-            PIPE_CONTROL_WRITE_TIMESTAMP;
+   if (GEN_GEN >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
+      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
+       *
+       *    "This bit must be DISABLED for operations other than writing
+       *     PS_DEPTH_COUNT."
+       *
+       * This seems like nonsense.  An Ivybridge workaround requires us to
+       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
+       * operation.  Gen8+ requires us to emit depth stalls and depth cache
+       * flushes together.  So, it's hard to imagine this means anything other
+       * than "we originally intended this to be used for PS_DEPTH_COUNT".
+       *
+       * We ignore the supposed restriction and do nothing.
+       */
+   }
 
-   assert(util_bitcount(flags) <= 1);
+   if (GEN_VERSIONx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
+      /* Project: PRE-HSW / Argument: Depth Cache Flush
+       *
+       * "Depth Stall must be clear ([13] of DW1)."
+       */
+      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
+   }
 
-   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
-      return WriteImmediateData;
+   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
+      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
+       *
+       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
+       *     PS_DEPTH_COUNT or TIMESTAMP queries."
+       *
+       * TODO: Implement end-of-pipe checking.
+       */
+      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
+   }
 
-   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
-      return WritePSDepthCount;
+   if (GEN_GEN < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
+      /* From the PIPE_CONTROL instruction table, bit 1:
+       *
+       *    "This bit is ignored if Depth Stall Enable is set.
+       *     Further, the render cache is not flushed even if Write Cache
+       *     Flush Enable bit is set."
+       *
+       * We assert that the caller doesn't do this combination, to try and
+       * prevent mistakes.  It shouldn't hurt the GPU, though.
+       *
+       * We skip this check on Gen11+ as the "Stall and Pixel Scoreboard"
+       * and "Render Target Flush" combo is explicitly required for BTI
+       * update workarounds.
+       */
+      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
+                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
+   }
 
-   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
-      return WriteTimestamp;
+   /* PIPE_CONTROL page workarounds ------------------------------------- */
 
-   return 0;
-}
+   if (IS_GEN_BETWEEN(7, 8) && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
+      /* From the PIPE_CONTROL page itself:
+       *
+       *    "IVB, HSW, BDW
+       *     Restriction: Pipe_control with CS-stall bit set must be issued
+       *     before a pipe-control command that has the State Cache
+       *     Invalidate bit set."
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
 
-void
-genX(emit_raw_pipe_control)(struct brw_context *brw, uint32_t flags,
-                            struct brw_bo *bo, uint32_t offset, uint64_t imm)
-{
-   if (GEN_GEN >= 8) {
-      if (GEN_GEN == 8)
-         gen8_add_cs_stall_workaround_bits(&flags);
-
-      if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
-         if (GEN_GEN == 9) {
-            /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
-             * lists several workarounds:
-             *
-             *    "Project: SKL, KBL, BXT
-             *
-             *     If the VF Cache Invalidation Enable is set to a 1 in a
-             *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
-             *     sets to 0, with the VF Cache Invalidation Enable set to 0
-             *     needs to be sent prior to the PIPE_CONTROL with VF Cache
-             *     Invalidation Enable set to a 1."
-             */
-            brw_emit_pipe_control_flush(brw, 0);
-         }
-
-         if (GEN_GEN >= 9) {
-            /* THE PIPE_CONTROL "VF Cache Invalidation Enable" docs continue:
-             *
-             *    "Project: BDW+
-             *
-             *     When VF Cache Invalidate is set “Post Sync Operation” must
-             *     be enabled to “Write Immediate Data” or “Write PS Depth
-             *     Count” or “Write Timestamp”."
-             *
-             * If there's a BO, we're already doing some kind of write.
-             * If not, add a write to the workaround BO.
-             *
-             * XXX: This causes GPU hangs on Broadwell, so restrict it to
-             *      Gen9+ for now...see this bug for more information:
-             *      https://bugs.freedesktop.org/show_bug.cgi?id=103787
-             */
-            if (!bo) {
-               flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
-               bo = brw->workaround_bo;
-            }
-         }
+   if (GEN_IS_HASWELL) {
+      /* From the PIPE_CONTROL page itself:
+       *
+       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
+       *     Prior to programming a PIPECONTROL command with any of the RO
+       *     cache invalidation bit set, program a PIPECONTROL flush command
+       *     with “CS stall” bit and “HDC Flush” bit set."
+       *
+       * TODO: Actually implement this.  What's an HDC Flush?
+       */
+   }
+
+   if (flags & PIPE_CONTROL_FLUSH_LLC) {
+      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
+       *
+       *    "Project: ALL
+       *     SW must always program Post-Sync Operation to "Write Immediate
+       *     Data" when Flush LLC is set."
+       *
+       * For now, we just require the caller to do it.
+       */
+      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
+   }
+
+   /* "Post-Sync Operation" workarounds -------------------------------- */
+
+   /* Project: All / Argument: Global Snapshot Count Reset [19]
+    *
+    * "This bit must not be exercised on any product.
+    *  Requires stall bit ([20] of DW1) set."
+    *
+    * We don't use this, so we just assert that it isn't used.  The
+    * PIPE_CONTROL instruction page indicates that they intended this
+    * as a debug feature and don't think it is useful in production,
+    * but it may actually be usable, should we ever want to.
+    */
+   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
+
+   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
+                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
+      /* Project: All / Arguments:
+       *
+       * - Generic Media State Clear [16]
+       * - Indirect State Pointers Disable [16]
+       *
+       *    "Requires stall bit ([20] of DW1) set."
+       *
+       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
+       * State Clear) says:
+       *
+       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
+       *     programmed prior to programming a PIPECONTROL command with "Media
+       *     State Clear" set in GPGPU mode of operation"
+       *
+       * This is a subset of the earlier rule, so there's nothing to do.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
+      /* Project: All / Argument: Store Data Index
+       *
+       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+       *  than '0'."
+       *
+       * For now, we just assert that the caller does this.  We might want to
+       * automatically add a write to the workaround BO...
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (flags & PIPE_CONTROL_SYNC_GFDT) {
+      /* Project: All / Argument: Sync GFDT
+       *
+       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
+       *  than '0' or 0x2520[13] must be set."
+       *
+       * For now, we just assert that the caller does this.
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (IS_GENx10_BETWEEN(60, 75) && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
+      /* Project: SNB, IVB, HSW / Argument: TLB inv
+       *
+       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
+       *  must be set to something other than '0'."
+       *
+       * For now, we just assert that the caller does this.
+       */
+      assert(non_lri_post_sync_flags != 0);
+   }
+
+   if (GEN_GEN >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
+      /* Project: IVB+ / Argument: TLB inv
+       *
+       *    "Requires stall bit ([20] of DW1) set."
+       *
+       * Also, from the PIPE_CONTROL instruction table:
+       *
+       *    "Project: SKL+
+       *     Post Sync Operation or CS stall must be set to ensure a TLB
+       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
+       *     cache to invalidate."
+       *
+       * This is not a subset of the earlier rule, so there's nothing to do.
+       */
+      flags |= PIPE_CONTROL_CS_STALL;
+   }
+
+   if (GEN_GEN == 9 && devinfo->gt == 4) {
+      /* TODO: The big Skylake GT4 post sync op workaround */
+   }
+
+   /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
+
+   if (IS_COMPUTE_PIPELINE(brw)) {
+      if (GEN_GEN >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
+         /* Project: SKL+ / Argument: Tex Invalidate
+          * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
+          */
+         flags |= PIPE_CONTROL_CS_STALL;
       }
 
-      if (GEN_GEN == 10)
-         gen10_add_rcpfe_workaround_bits(&flags);
-   } else if (GEN_GEN >= 6) {
-      if (GEN_GEN == 6 &&
-          (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
-         /* Hardware workaround: SNB B-Spec says:
+      if (GEN_GEN == 8 && (post_sync_flags ||
+                           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
+                                     PIPE_CONTROL_DEPTH_STALL |
+                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                     PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
+         /* Project: BDW / Arguments:
           *
-          *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
-          *   Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
-          *   required.
+          * - LRI Post Sync Operation   [23]
+          * - Post Sync Op              [15:14]
+          * - Notify En                 [8]
+          * - Depth Stall               [13]
+          * - Render Target Cache Flush [12]
+          * - Depth Cache Flush         [0]
+          * - DC Flush Enable           [5]
+          *
+          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
+          *     Workloads."
           */
-         brw_emit_post_sync_nonzero_flush(brw);
+         flags |= PIPE_CONTROL_CS_STALL;
+
+         /* Also, from the PIPE_CONTROL instruction table, bit 20:
+          *
+          *    "Project: BDW
+          *     This bit must be always set when PIPE_CONTROL command is
+          *     programmed by GPGPU and MEDIA workloads, except for the cases
+          *     when only Read Only Cache Invalidation bits are set (State
+          *     Cache Invalidation Enable, Instruction cache Invalidation
+          *     Enable, Texture Cache Invalidation Enable, Constant Cache
+          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
+          *     need not implemented when FF_DOP_CG is disable via "Fixed
+          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
+          *
+          * It sounds like we could avoid CS stalls in some cases, but we
+          * don't currently bother.  This list isn't exactly the list above,
+          * either...
+          */
+      }
+   }
+
+   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
+    *
+    * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
+    *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
+    *
+    * Note that the kernel does CS stalls between batches, so we only need
+    * to count them within a batch.  We currently naively count every 4, and
+    * don't skip the ones with only read-cache-invalidate bits set.  This
+    * may or may not be a problem...
+    */
+   if (GEN_GEN == 7 && !GEN_IS_HASWELL) {
+      if (flags & PIPE_CONTROL_CS_STALL) {
+         /* If we're doing a CS stall, reset the counter and carry on. */
+         brw->pipe_controls_since_last_cs_stall = 0;
       }
 
-      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
+      /* If this is the fourth pipe control without a CS stall, do one now. */
+      if (++brw->pipe_controls_since_last_cs_stall == 4) {
+         brw->pipe_controls_since_last_cs_stall = 0;
+         flags |= PIPE_CONTROL_CS_STALL;
+      }
    }
 
+   /* "Stall" workarounds ----------------------------------------------
+    * These have to come after the earlier ones because we may have added
+    * some additional CS stalls above.
+    */
+
+   if (GEN_GEN < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
+      /* Project: PRE-SKL, VLV, CHV
+       *
+       * "[All Stepping][All SKUs]:
+       *
+       *  One of the following must also be set:
+       *
+       *  - Render Target Cache Flush Enable ([12] of DW1)
+       *  - Depth Cache Flush Enable ([0] of DW1)
+       *  - Stall at Pixel Scoreboard ([1] of DW1)
+       *  - Depth Stall ([13] of DW1)
+       *  - Post-Sync Operation ([13] of DW1)
+       *  - DC Flush Enable ([5] of DW1)"
+       *
+       * If we don't already have one of those bits set, we choose to add
+       * "Stall at Pixel Scoreboard".  Some of the other bits require a
+       * CS stall as a workaround (see above), which would send us into
+       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
+       * appears to be safe, so we choose that.
+       */
+      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                               PIPE_CONTROL_WRITE_IMMEDIATE |
+                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                               PIPE_CONTROL_WRITE_TIMESTAMP |
+                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
+                               PIPE_CONTROL_DEPTH_STALL |
+                               PIPE_CONTROL_DATA_CACHE_FLUSH;
+      if (!(flags & wa_bits))
+         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
+   }
+
+   /* Emit --------------------------------------------------------------- */
+
    brw_batch_emit(brw, GENX(PIPE_CONTROL), pc) {
    #if GEN_GEN >= 9
       pc.FlushLLC = 0;

Comments

On Thu, Nov 01, 2018 at 08:04:21PM -0700, Kenneth Graunke wrote:
> This implements virtually all documented PIPE_CONTROL restrictions
> in a centralized helper.  You now simply ask for the operations you
> want, and the pipe control "brain" will figure out exactly what pipe
> controls to emit to make that happen without tanking your system.
> 
> The hope is that this will fix some intermittent flushing issues as
> well as GPU hangs.  However, it also has a high risk of causing GPU
> hangs and other regressions, as this is a particularly sensitive
> area and poking the bear isn't always advisable.

First I checked I could find all the things in bspec. There was one that I
couldn't, noted further down.

Second I checked that all the rules earlier were implemented. Found one
exception, noted further down as well.

I didn't check if the rules still miss something in bspec. That would be
another exercise...

> ---
>  src/mesa/drivers/dri/i965/genX_pipe_control.c | 563 +++++++++++++-----
>  1 file changed, 428 insertions(+), 135 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/genX_pipe_control.c b/src/mesa/drivers/dri/i965/genX_pipe_control.c
> index 8eb37444253..503e674661b 100644
> --- a/src/mesa/drivers/dri/i965/genX_pipe_control.c
> +++ b/src/mesa/drivers/dri/i965/genX_pipe_control.c
> @@ -25,172 +25,465 @@
>  #include "brw_defines.h"
>  #include "brw_state.h"
>  
> +static unsigned
> +flags_to_post_sync_op(uint32_t flags)
> +{
> +   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
> +      return WriteImmediateData;
> +
> +   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
> +      return WritePSDepthCount;
> +
> +   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
> +      return WriteTimestamp;
> +
> +   return 0;
> +}
> +
>  /**
> - * According to the latest documentation, any PIPE_CONTROL with the
> - * "Command Streamer Stall" bit set must also have another bit set,
> - * with five different options:
> - *
> - *  - Render Target Cache Flush
> - *  - Depth Cache Flush
> - *  - Stall at Pixel Scoreboard
> - *  - Post-Sync Operation
> - *  - Depth Stall
> - *  - DC Flush Enable
> - *
> - * I chose "Stall at Pixel Scoreboard" since we've used it effectively
> - * in the past, but the choice is fairly arbitrary.
> + * Do the given flags have a Post Sync or LRI Post Sync operation?
>   */
> -static void
> -gen8_add_cs_stall_workaround_bits(uint32_t *flags)
> +static enum pipe_control_flags
> +get_post_sync_flags(enum pipe_control_flags flags)
>  {
> -   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
> -                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> -                      PIPE_CONTROL_WRITE_IMMEDIATE |
> -                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
> -                      PIPE_CONTROL_WRITE_TIMESTAMP |
> -                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
> -                      PIPE_CONTROL_DEPTH_STALL |
> -                      PIPE_CONTROL_DATA_CACHE_FLUSH;
> -
> -   /* If we're doing a CS stall, and don't already have one of the
> -    * workaround bits set, add "Stall at Pixel Scoreboard."
> +   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
> +            PIPE_CONTROL_WRITE_DEPTH_COUNT |
> +            PIPE_CONTROL_WRITE_TIMESTAMP |
> +            PIPE_CONTROL_LRI_POST_SYNC_OP;
> +
> +   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
> +    * "LRI Post Sync Operation".  So more than one bit set would be illegal.
>      */
> -   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
> -      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
> +   assert(util_bitcount(flags) <= 1);
> +
> +   return flags;
>  }
>  
> -/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
> +#define IS_COMPUTE_PIPELINE(brw) \
> +   (GEN_GEN >= 7 && brw->last_pipeline == BRW_COMPUTE_PIPELINE)
> +
> +/* Closed interval - GEN_GEN \in [x, y] */
> +#define IS_GEN_BETWEEN(x, y) (GEN_GEN >= x && GEN_GEN <= y)
> +#define IS_GENx10_BETWEEN(x, y) \
> +   (GEN_VERSIONx10 >= x && GEN_VERSIONx10 <= y)
> +
> +/**
> + * Emit a series of PIPE_CONTROL commands, taking into account any
> + * workarounds necessary to actually accomplish the caller's request.
> + *
> + * Unless otherwise noted, spec quotations in this function come from:
>   *
> - * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
> - *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
> + * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
> + * Restrictions for PIPE_CONTROL.
>   *
> - * Note that the kernel does CS stalls between batches, so we only need
> - * to count them within a batch.
> + * You should not use this function directly.  Use the helpers in
> + * brw_pipe_control.c instead, which may split the pipe control further.
>   */
> -static uint32_t
> -gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
> +void
> +genX(emit_raw_pipe_control)(struct brw_context *brw, uint32_t flags,
> +                            struct brw_bo *bo, uint32_t offset, uint64_t imm)
>  {
> -   if (GEN_GEN == 7 && !GEN_IS_HASWELL) {
> -      if (flags & PIPE_CONTROL_CS_STALL) {
> -         /* If we're doing a CS stall, reset the counter and carry on. */
> -         brw->pipe_controls_since_last_cs_stall = 0;
> -         return 0;
> +   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
> +   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
> +   enum pipe_control_flags non_lri_post_sync_flags =
> +      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
> +
> +   /* Recursive PIPE_CONTROL workarounds --------------------------------
> +    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
> +    *
> +    * We do these first because we want to look at the original operation,
> +    * rather than any workarounds we set.
> +    */
> +   if (GEN_GEN == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> +      /* Hardware workaround: SNB B-Spec says:
> +       *
> +       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
> +       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
> +       *     required."
> +       */
> +      brw_emit_post_sync_nonzero_flush(brw);
> +   }
> +
> +   if (GEN_GEN == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
> +      /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
> +       * lists several workarounds:
> +       *
> +       *    "Project: SKL, KBL, BXT
> +       *
> +       *     If the VF Cache Invalidation Enable is set to a 1 in a
> +       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
> +       *     sets to 0, with the VF Cache Invalidation Enable set to 0
> +       *     needs to be sent prior to the PIPE_CONTROL with VF Cache
> +       *     Invalidation Enable set to a 1."
> +       */
> +      genX(emit_raw_pipe_control)(brw, 0, NULL, 0, 0);
> +   }
> +
> +   if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(brw) && post_sync_flags) {
> +      /* Project: SKL / Argument: LRI Post Sync Operation [23]
> +       *
> +       * "PIPECONTROL command with “Command Streamer Stall Enable” must be
> +       *  programmed prior to programming a PIPECONTROL command with "LRI
> +       *  Post Sync Operation" in GPGPU mode of operation (i.e when
> +       *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
> +       *
> +       * The same text exists a few rows below for Post Sync Op.
> +       */
> +      genX(emit_raw_pipe_control)(brw, PIPE_CONTROL_CS_STALL, bo, offset, imm);

Are bo, offset, imm needed here as well?

> +   }
> +
> +   if (GEN_GEN == 10 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> +      /* Cannonlake:
> +       * "Before sending a PIPE_CONTROL command with bit 12 set, SW must issue
> +       *  another PIPE_CONTROL with Render Target Cache Flush Enable (bit 12)
> +       *  = 0 and Pipe Control Flush Enable (bit 7) = 1"
> +       */
> +      genX(emit_raw_pipe_control)(brw, PIPE_CONTROL_FLUSH_ENABLE,
> +                                  bo, offset, imm);
> +   }
> +
> +   /* "Flush Types" workarounds ---------------------------------------------
> +    * We do these now because they may add post-sync operations or CS stalls.
> +    */
> +
> +   if (IS_GEN_BETWEEN(8, 10) && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
> +      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
> +       *
> +       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
> +       *  'Write PS Depth Count' or 'Write Timestamp'."
> +       */
> +      if (!bo) {
> +         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
> +         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
> +         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
> +         bo = brw->workaround_bo;
>        }
> +   }
>  
> -      /* If this is the fourth pipe control without a CS stall, do one now. */
> -      if (++brw->pipe_controls_since_last_cs_stall == 4) {
> -         brw->pipe_controls_since_last_cs_stall = 0;
> -         return PIPE_CONTROL_CS_STALL;
> +   /* #1130 from Gen10 workarounds page:
> +    *
> +    *    "Enable Depth Stall on every Post Sync Op if Render target Cache
> +    *     Flush is not enabled in same PIPE CONTROL and Enable Pixel score
> +    *     board stall if Render target cache flush is enabled."
> +    *
> +    * Applicable to CNL B0 and C0 steppings only.
> +    *
> +    * The wording here is unclear, and this workaround doesn't look anything
> +    * like the internal bug report recommendations, but leave it be for now...
> +    */
> +   if (GEN_GEN == 10) {
> +      if (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) {
> +         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
> +      } else if (flags & non_lri_post_sync_flags) {
> +         flags |= PIPE_CONTROL_DEPTH_STALL;
>        }
>     }
> -   return 0;
> -}
>  
> -/* #1130 from gen10 workarounds page in h/w specs:
> - * "Enable Depth Stall on every Post Sync Op if Render target Cache Flush is
> - *  not enabled in same PIPE CONTROL and Enable Pixel score board stall if
> - *  Render target cache flush is enabled."
> - *
> - * Applicable to CNL B0 and C0 steppings only.
> - */
> -static void
> -gen10_add_rcpfe_workaround_bits(uint32_t *flags)
> -{
> -   if (*flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) {
> -      *flags = *flags | PIPE_CONTROL_STALL_AT_SCOREBOARD;
> -   } else if (*flags &
> -             (PIPE_CONTROL_WRITE_IMMEDIATE |
> -              PIPE_CONTROL_WRITE_DEPTH_COUNT |
> -              PIPE_CONTROL_WRITE_TIMESTAMP)) {
> -      *flags = *flags | PIPE_CONTROL_DEPTH_STALL;
> +   if (GEN_VERSIONx10 < 75 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
> +      /* Project: PRE-HSW / Argument: Depth Stall
> +       *
> +       * "The following bits must be clear:
> +       *  - Render Target Cache Flush Enable ([12] of DW1)
> +       *  - Depth Cache Flush Enable ([0] of DW1)"
> +       */
> +      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
> +                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
>     }
> -}
>  
> -static unsigned
> -flags_to_post_sync_op(uint32_t flags)
> -{
> -   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
> -            PIPE_CONTROL_WRITE_DEPTH_COUNT |
> -            PIPE_CONTROL_WRITE_TIMESTAMP;
> +   if (GEN_GEN >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
> +      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
> +       *
> +       *    "This bit must be DISABLED for operations other than writing
> +       *     PS_DEPTH_COUNT."
> +       *
> +       * This seems like nonsense.  An Ivybridge workaround requires us to
> +       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
> +       * operation.  Gen8+ requires us to emit depth stalls and depth cache
> +       * flushes together.  So, it's hard to imagine this means anything other
> +       * than "we originally intended this to be used for PS_DEPTH_COUNT".
> +       *
> +       * We ignore the supposed restriction and do nothing.
> +       */
> +   }
>  
> -   assert(util_bitcount(flags) <= 1);
> +   if (GEN_VERSIONx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
> +      /* Project: PRE-HSW / Argument: Depth Cache Flush
> +       *
> +       * "Depth Stall must be clear ([13] of DW1)."
> +       */
> +      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
> +   }
>  
> -   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
> -      return WriteImmediateData;
> +   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
> +                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
> +      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
> +       *
> +       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
> +       *     PS_DEPTH_COUNT or TIMESTAMP queries."
> +       *
> +       * TODO: Implement end-of-pipe checking.
> +       */
> +      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
> +                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
> +   }
>  
> -   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
> -      return WritePSDepthCount;
> +   if (GEN_GEN < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
> +      /* From the PIPE_CONTROL instruction table, bit 1:
> +       *
> +       *    "This bit is ignored if Depth Stall Enable is set.
> +       *     Further, the render cache is not flushed even if Write Cache
> +       *     Flush Enable bit is set."
> +       *
> +       * We assert that the caller doesn't do this combination, to try and
> +       * prevent mistakes.  It shouldn't hurt the GPU, though.
> +       *
> +       * We skip this check on Gen11+ as the "Stall and Pixel Scoreboard"
> +       * and "Render Target Flush" combo is explicitly required for BTI
> +       * update workarounds.
> +       */
> +      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
> +                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
> +   }
>  
> -   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
> -      return WriteTimestamp;
> +   /* PIPE_CONTROL page workarounds ------------------------------------- */
>  
> -   return 0;
> -}
> +   if (IS_GEN_BETWEEN(7, 8) && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
> +      /* From the PIPE_CONTROL page itself:
> +       *
> +       *    "IVB, HSW, BDW
> +       *     Restriction: Pipe_control with CS-stall bit set must be issued
> +       *     before a pipe-control command that has the State Cache
> +       *     Invalidate bit set."
> +       */
> +      flags |= PIPE_CONTROL_CS_STALL;
> +   }
>  
> -void
> -genX(emit_raw_pipe_control)(struct brw_context *brw, uint32_t flags,
> -                            struct brw_bo *bo, uint32_t offset, uint64_t imm)
> -{
> -   if (GEN_GEN >= 8) {
> -      if (GEN_GEN == 8)
> -         gen8_add_cs_stall_workaround_bits(&flags);
> -
> -      if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
> -         if (GEN_GEN == 9) {
> -            /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
> -             * lists several workarounds:
> -             *
> -             *    "Project: SKL, KBL, BXT
> -             *
> -             *     If the VF Cache Invalidation Enable is set to a 1 in a
> -             *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
> -             *     sets to 0, with the VF Cache Invalidation Enable set to 0
> -             *     needs to be sent prior to the PIPE_CONTROL with VF Cache
> -             *     Invalidation Enable set to a 1."
> -             */
> -            brw_emit_pipe_control_flush(brw, 0);
> -         }
> -
> -         if (GEN_GEN >= 9) {
> -            /* THE PIPE_CONTROL "VF Cache Invalidation Enable" docs continue:
> -             *
> -             *    "Project: BDW+
> -             *
> -             *     When VF Cache Invalidate is set “Post Sync Operation” must
> -             *     be enabled to “Write Immediate Data” or “Write PS Depth
> -             *     Count” or “Write Timestamp”."
> -             *
> -             * If there's a BO, we're already doing some kind of write.
> -             * If not, add a write to the workaround BO.
> -             *
> -             * XXX: This causes GPU hangs on Broadwell, so restrict it to
> -             *      Gen9+ for now...see this bug for more information:
> -             *      https://bugs.freedesktop.org/show_bug.cgi?id=103787

In "Flush Types" workarounds later on you apply this for gen8 as well.

> -             */
> -            if (!bo) {
> -               flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
> -               bo = brw->workaround_bo;
> -            }
> -         }
> +   if (GEN_IS_HASWELL) {
> +      /* From the PIPE_CONTROL page itself:
> +       *
> +       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
> +       *     Prior to programming a PIPECONTROL command with any of the RO
> +       *     cache invalidation bit set, program a PIPECONTROL flush command
> +       *     with “CS stall” bit and “HDC Flush” bit set."
> +       *
> +       * TODO: Actually implement this.  What's an HDC Flush?

There is bit 9 HDC Flush but that is defined for ICL, for HSW I couldn't find
anything either.

> +       */
> +   }
> +
> +   if (flags & PIPE_CONTROL_FLUSH_LLC) {
> +      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
> +       *
> +       *    "Project: ALL
> +       *     SW must always program Post-Sync Operation to "Write Immediate
> +       *     Data" when Flush LLC is set."
> +       *
> +       * For now, we just require the caller to do it.
> +       */
> +      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
> +   }
> +
> +   /* "Post-Sync Operation" workarounds -------------------------------- */
> +
> +   /* Project: All / Argument: Global Snapshot Count Reset [19]
> +    *
> +    * "This bit must not be exercised on any product.
> +    *  Requires stall bit ([20] of DW1) set."
> +    *
> +    * We don't use this, so we just assert that it isn't used.  The
> +    * PIPE_CONTROL instruction page indicates that they intended this
> +    * as a debug feature and don't think it is useful in production,
> +    * but it may actually be usable, should we ever want to.
> +    */
> +   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
> +
> +   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
> +                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
> +      /* Project: All / Arguments:
> +       *
> +       * - Generic Media State Clear [16]
> +       * - Indirect State Pointers Disable [16]
> +       *
> +       *    "Requires stall bit ([20] of DW1) set."
> +       *
> +       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
> +       * State Clear) says:
> +       *
> +       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
> +       *     programmed prior to programming a PIPECONTROL command with "Media
> +       *     State Clear" set in GPGPU mode of operation"
> +       *
> +       * This is a subset of the earlier rule, so there's nothing to do.
> +       */
> +      flags |= PIPE_CONTROL_CS_STALL;
> +   }
> +
> +   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
> +      /* Project: All / Argument: Store Data Index
> +       *
> +       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
> +       *  than '0'."
> +       *
> +       * For now, we just assert that the caller does this.  We might want to
> +       * automatically add a write to the workaround BO...
> +       */
> +      assert(non_lri_post_sync_flags != 0);
> +   }
> +
> +   if (flags & PIPE_CONTROL_SYNC_GFDT) {
> +      /* Project: All / Argument: Sync GFDT
> +       *
> +       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
> +       *  than '0' or 0x2520[13] must be set."
> +       *
> +       * For now, we just assert that the caller does this.
> +       */
> +      assert(non_lri_post_sync_flags != 0);
> +   }
> +
> +   if (IS_GENx10_BETWEEN(60, 75) && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
> +      /* Project: SNB, IVB, HSW / Argument: TLB inv
> +       *
> +       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
> +       *  must be set to something other than '0'."
> +       *
> +       * For now, we just assert that the caller does this.
> +       */
> +      assert(non_lri_post_sync_flags != 0);
> +   }
> +
> +   if (GEN_GEN >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
> +      /* Project: IVB+ / Argument: TLB inv
> +       *
> +       *    "Requires stall bit ([20] of DW1) set."
> +       *
> +       * Also, from the PIPE_CONTROL instruction table:
> +       *
> +       *    "Project: SKL+
> +       *     Post Sync Operation or CS stall must be set to ensure a TLB
> +       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
> +       *     cache to invalidate."
> +       *
> +       * This is not a subset of the earlier rule, so there's nothing to do.
> +       */
> +      flags |= PIPE_CONTROL_CS_STALL;
> +   }
> +
> +   if (GEN_GEN == 9 && devinfo->gt == 4) {
> +      /* TODO: The big Skylake GT4 post sync op workaround */
> +   }
> +
> +   /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
> +
> +   if (IS_COMPUTE_PIPELINE(brw)) {
> +      if (GEN_GEN >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
> +         /* Project: SKL+ / Argument: Tex Invalidate
> +          * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
> +          */
> +         flags |= PIPE_CONTROL_CS_STALL;
>        }
>  
> -      if (GEN_GEN == 10)
> -         gen10_add_rcpfe_workaround_bits(&flags);
> -   } else if (GEN_GEN >= 6) {
> -      if (GEN_GEN == 6 &&
> -          (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> -         /* Hardware workaround: SNB B-Spec says:
> +      if (GEN_GEN == 8 && (post_sync_flags ||
> +                           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
> +                                     PIPE_CONTROL_DEPTH_STALL |
> +                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
> +                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +                                     PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
> +         /* Project: BDW / Arguments:
>            *
> -          *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
> -          *   Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
> -          *   required.
> +          * - LRI Post Sync Operation   [23]
> +          * - Post Sync Op              [15:14]
> +          * - Notify En                 [8]
> +          * - Depth Stall               [13]
> +          * - Render Target Cache Flush [12]
> +          * - Depth Cache Flush         [0]
> +          * - DC Flush Enable           [5]
> +          *
> +          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
> +          *     Workloads."

This I couldn't find.

>            */
> -         brw_emit_post_sync_nonzero_flush(brw);
> +         flags |= PIPE_CONTROL_CS_STALL;
> +
> +         /* Also, from the PIPE_CONTROL instruction table, bit 20:
> +          *
> +          *    "Project: BDW
> +          *     This bit must be always set when PIPE_CONTROL command is
> +          *     programmed by GPGPU and MEDIA workloads, except for the cases
> +          *     when only Read Only Cache Invalidation bits are set (State
> +          *     Cache Invalidation Enable, Instruction cache Invalidation
> +          *     Enable, Texture Cache Invalidation Enable, Constant Cache
> +          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
> +          *     need not implemented when FF_DOP_CG is disable via "Fixed
> +          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
> +          *
> +          * It sounds like we could avoid CS stalls in some cases, but we
> +          * don't currently bother.  This list isn't exactly the list above,
> +          * either...
> +          */
> +      }
> +   }
> +
> +   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
> +    *
> +    * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
> +    *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
> +    *
> +    * Note that the kernel does CS stalls between batches, so we only need
> +    * to count them within a batch.  We currently naively count every 4, and
> +    * don't skip the ones with only read-cache-invalidate bits set.  This
> +    * may or may not be a problem...
> +    */
> +   if (GEN_GEN == 7 && !GEN_IS_HASWELL) {
> +      if (flags & PIPE_CONTROL_CS_STALL) {
> +         /* If we're doing a CS stall, reset the counter and carry on. */
> +         brw->pipe_controls_since_last_cs_stall = 0;
>        }
>  
> -      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);
> +      /* If this is the fourth pipe control without a CS stall, do one now. */
> +      if (++brw->pipe_controls_since_last_cs_stall == 4) {
> +         brw->pipe_controls_since_last_cs_stall = 0;
> +         flags |= PIPE_CONTROL_CS_STALL;
> +      }
>     }
>  
> +   /* "Stall" workarounds ----------------------------------------------
> +    * These have to come after the earlier ones because we may have added
> +    * some additional CS stalls above.
> +    */
> +
> +   if (GEN_GEN < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
> +      /* Project: PRE-SKL, VLV, CHV
> +       *
> +       * "[All Stepping][All SKUs]:
> +       *
> +       *  One of the following must also be set:
> +       *
> +       *  - Render Target Cache Flush Enable ([12] of DW1)
> +       *  - Depth Cache Flush Enable ([0] of DW1)
> +       *  - Stall at Pixel Scoreboard ([1] of DW1)
> +       *  - Depth Stall ([13] of DW1)
> +       *  - Post-Sync Operation ([13] of DW1)
> +       *  - DC Flush Enable ([5] of DW1)"
> +       *
> +       * If we don't already have one of those bits set, we choose to add
> +       * "Stall at Pixel Scoreboard".  Some of the other bits require a
> +       * CS stall as a workaround (see above), which would send us into
> +       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
> +       * appears to be safe, so we choose that.
> +       */
> +      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
> +                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +                               PIPE_CONTROL_WRITE_IMMEDIATE |
> +                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
> +                               PIPE_CONTROL_WRITE_TIMESTAMP |
> +                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
> +                               PIPE_CONTROL_DEPTH_STALL |
> +                               PIPE_CONTROL_DATA_CACHE_FLUSH;
> +      if (!(flags & wa_bits))
> +         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
> +   }
> +
> +   /* Emit --------------------------------------------------------------- */
> +
>     brw_batch_emit(brw, GENX(PIPE_CONTROL), pc) {
>     #if GEN_GEN >= 9
>        pc.FlushLLC = 0;
> -- 
> 2.19.1
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Monday, February 25, 2019 6:33:11 AM PST Pohjolainen, Topi wrote:
> On Thu, Nov 01, 2018 at 08:04:21PM -0700, Kenneth Graunke wrote:
> > This implements virtually all documented PIPE_CONTROL restrictions
> > in a centralized helper.  You now simply ask for the operations you
> > want, and the pipe control "brain" will figure out exactly what pipe
> > controls to emit to make that happen without tanking your system.
> > 
> > The hope is that this will fix some intermittent flushing issues as
> > well as GPU hangs.  However, it also has a high risk of causing GPU
> > hangs and other regressions, as this is a particularly sensitive
> > area and poking the bear isn't always advisable.
> 
> First I checked I could find all the things in bspec. There was one that I
> couldn't, noted further down.
> 
> Second I checked that all the rules earlier were implemented. Found one
> exception, noted further down as well.
> 
> I didn't check if the rules still miss something in bspec. That would be
> another exercise...

[snip]
> > +   /* Recursive PIPE_CONTROL workarounds --------------------------------
> > +    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
> > +    *
> > +    * We do these first because we want to look at the original operation,
> > +    * rather than any workarounds we set.
> > +    */
> > +   if (GEN_GEN == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> > +      /* Hardware workaround: SNB B-Spec says:
> > +       *
> > +       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
> > +       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
> > +       *     required."
> > +       */
> > +      brw_emit_post_sync_nonzero_flush(brw);
> > +   }
> > +
> > +   if (GEN_GEN == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
> > +      /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
> > +       * lists several workarounds:
> > +       *
> > +       *    "Project: SKL, KBL, BXT
> > +       *
> > +       *     If the VF Cache Invalidation Enable is set to a 1 in a
> > +       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
> > +       *     sets to 0, with the VF Cache Invalidation Enable set to 0
> > +       *     needs to be sent prior to the PIPE_CONTROL with VF Cache
> > +       *     Invalidation Enable set to a 1."
> > +       */
> > +      genX(emit_raw_pipe_control)(brw, 0, NULL, 0, 0);
> > +   }
> > +
> > +   if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(brw) && post_sync_flags) {
> > +      /* Project: SKL / Argument: LRI Post Sync Operation [23]
> > +       *
> > +       * "PIPECONTROL command with “Command Streamer Stall Enable” must be
> > +       *  programmed prior to programming a PIPECONTROL command with "LRI
> > +       *  Post Sync Operation" in GPGPU mode of operation (i.e when
> > +       *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
> > +       *
> > +       * The same text exists a few rows below for Post Sync Op.
> > +       */
> > +      genX(emit_raw_pipe_control)(brw, PIPE_CONTROL_CS_STALL, bo, offset, imm);
> 
> Are bo, offset, imm needed here as well?

No, I don't think so.  We should pass NULL, 0, 0 here - we're just doing
a plain CS stall separately.  We'll use them for the actual write later.

Good catch!

[snip]
> > -         if (GEN_GEN >= 9) {
> > -            /* THE PIPE_CONTROL "VF Cache Invalidation Enable" docs continue:
> > -             *
> > -             *    "Project: BDW+
> > -             *
> > -             *     When VF Cache Invalidate is set “Post Sync Operation” must
> > -             *     be enabled to “Write Immediate Data” or “Write PS Depth
> > -             *     Count” or “Write Timestamp”."
> > -             *
> > -             * If there's a BO, we're already doing some kind of write.
> > -             * If not, add a write to the workaround BO.
> > -             *
> > -             * XXX: This causes GPU hangs on Broadwell, so restrict it to
> > -             *      Gen9+ for now...see this bug for more information:
> > -             *      https://bugs.freedesktop.org/show_bug.cgi?id=103787
> 
> In "Flush Types" workarounds later on you apply this for gen8 as well.

Yes, that's intentional - we're supposed to according to the docs.
I re-tested the Piglit test from bug 103787 on my Broadwell, and it
works fine - no GPU hangs.  I think we were just doing it wrong before.

Trying to figure out an ordering for the workarounds is awful... :(

> > -             */
> > -            if (!bo) {
> > -               flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
> > -               bo = brw->workaround_bo;
> > -            }
> > -         }
> > +   if (GEN_IS_HASWELL) {
> > +      /* From the PIPE_CONTROL page itself:
> > +       *
> > +       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
> > +       *     Prior to programming a PIPECONTROL command with any of the RO
> > +       *     cache invalidation bit set, program a PIPECONTROL flush command
> > +       *     with “CS stall” bit and “HDC Flush” bit set."
> > +       *
> > +       * TODO: Actually implement this.  What's an HDC Flush?
> 
> There is bit 9 HDC Flush but that is defined for ICL, for HSW I couldn't find
> anything either.

Yeah...I figured I'd call it out, but...I don't think we're doing
anything today either, so this is at least no worse.

> > +       */
> > +   }
> > +
> > +   if (flags & PIPE_CONTROL_FLUSH_LLC) {
> > +      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
> > +       *
> > +       *    "Project: ALL
> > +       *     SW must always program Post-Sync Operation to "Write Immediate
> > +       *     Data" when Flush LLC is set."
> > +       *
> > +       * For now, we just require the caller to do it.
> > +       */
> > +      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
> > +   }
> > +
> > +   /* "Post-Sync Operation" workarounds -------------------------------- */
> > +
> > +   /* Project: All / Argument: Global Snapshot Count Reset [19]
> > +    *
> > +    * "This bit must not be exercised on any product.
> > +    *  Requires stall bit ([20] of DW1) set."
> > +    *
> > +    * We don't use this, so we just assert that it isn't used.  The
> > +    * PIPE_CONTROL instruction page indicates that they intended this
> > +    * as a debug feature and don't think it is useful in production,
> > +    * but it may actually be usable, should we ever want to.
> > +    */
> > +   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
> > +
> > +   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
> > +                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
> > +      /* Project: All / Arguments:
> > +       *
> > +       * - Generic Media State Clear [16]
> > +       * - Indirect State Pointers Disable [16]
> > +       *
> > +       *    "Requires stall bit ([20] of DW1) set."
> > +       *
> > +       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
> > +       * State Clear) says:
> > +       *
> > +       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
> > +       *     programmed prior to programming a PIPECONTROL command with "Media
> > +       *     State Clear" set in GPGPU mode of operation"
> > +       *
> > +       * This is a subset of the earlier rule, so there's nothing to do.
> > +       */
> > +      flags |= PIPE_CONTROL_CS_STALL;
> > +   }
> > +
> > +   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
> > +      /* Project: All / Argument: Store Data Index
> > +       *
> > +       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
> > +       *  than '0'."
> > +       *
> > +       * For now, we just assert that the caller does this.  We might want to
> > +       * automatically add a write to the workaround BO...
> > +       */
> > +      assert(non_lri_post_sync_flags != 0);
> > +   }
> > +
> > +   if (flags & PIPE_CONTROL_SYNC_GFDT) {
> > +      /* Project: All / Argument: Sync GFDT
> > +       *
> > +       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
> > +       *  than '0' or 0x2520[13] must be set."
> > +       *
> > +       * For now, we just assert that the caller does this.
> > +       */
> > +      assert(non_lri_post_sync_flags != 0);
> > +   }
> > +
> > +   if (IS_GENx10_BETWEEN(60, 75) && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
> > +      /* Project: SNB, IVB, HSW / Argument: TLB inv
> > +       *
> > +       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
> > +       *  must be set to something other than '0'."
> > +       *
> > +       * For now, we just assert that the caller does this.
> > +       */
> > +      assert(non_lri_post_sync_flags != 0);
> > +   }
> > +
> > +   if (GEN_GEN >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
> > +      /* Project: IVB+ / Argument: TLB inv
> > +       *
> > +       *    "Requires stall bit ([20] of DW1) set."
> > +       *
> > +       * Also, from the PIPE_CONTROL instruction table:
> > +       *
> > +       *    "Project: SKL+
> > +       *     Post Sync Operation or CS stall must be set to ensure a TLB
> > +       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
> > +       *     cache to invalidate."
> > +       *
> > +       * This is not a subset of the earlier rule, so there's nothing to do.
> > +       */
> > +      flags |= PIPE_CONTROL_CS_STALL;
> > +   }
> > +
> > +   if (GEN_GEN == 9 && devinfo->gt == 4) {
> > +      /* TODO: The big Skylake GT4 post sync op workaround */
> > +   }
> > +
> > +   /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
> > +
> > +   if (IS_COMPUTE_PIPELINE(brw)) {
> > +      if (GEN_GEN >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
> > +         /* Project: SKL+ / Argument: Tex Invalidate
> > +          * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
> > +          */
> > +         flags |= PIPE_CONTROL_CS_STALL;
> >        }
> >  
> > -      if (GEN_GEN == 10)
> > -         gen10_add_rcpfe_workaround_bits(&flags);
> > -   } else if (GEN_GEN >= 6) {
> > -      if (GEN_GEN == 6 &&
> > -          (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> > -         /* Hardware workaround: SNB B-Spec says:
> > +      if (GEN_GEN == 8 && (post_sync_flags ||
> > +                           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
> > +                                     PIPE_CONTROL_DEPTH_STALL |
> > +                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
> > +                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > +                                     PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
> > +         /* Project: BDW / Arguments:
> >            *
> > -          *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
> > -          *   Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
> > -          *   required.
> > +          * - LRI Post Sync Operation   [23]
> > +          * - Post Sync Op              [15:14]
> > +          * - Notify En                 [8]
> > +          * - Depth Stall               [13]
> > +          * - Render Target Cache Flush [12]
> > +          * - Depth Cache Flush         [0]
> > +          * - DC Flush Enable           [5]
> > +          *
> > +          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
> > +          *     Workloads."
> 
> This I couldn't find.

Ah, sorry, this could probably be clearer.

On the "Programming Restrictions for PIPE_CONTROL" subpages ("Post-Sync
Operation", "Flush Types"), there are separate table rows for each bit,
with the same workaround text:

 +--------------------------------------------------------------------+
 | LRI Post  | 23 | Requires stall bit ([20] of DW) set for all | BDW |
 | Sync      |    | GPGPU and Media Workloads.                  |     |
 | Operation |    |                                             |     |
 +--------------------------------------------------------------------+

I just grouped them together because they're all the same workaround...
just documented in 8 different table rows.

I'm definitely open to suggestions on how to improve the comment :)

Thanks so much for looking at this!  I owe you!

--Ken
On Mon, Feb 25, 2019 at 10:32:27AM -0800, Kenneth Graunke wrote:
> On Monday, February 25, 2019 6:33:11 AM PST Pohjolainen, Topi wrote:
> > On Thu, Nov 01, 2018 at 08:04:21PM -0700, Kenneth Graunke wrote:
> > > This implements virtually all documented PIPE_CONTROL restrictions
> > > in a centralized helper.  You now simply ask for the operations you
> > > want, and the pipe control "brain" will figure out exactly what pipe
> > > controls to emit to make that happen without tanking your system.
> > > 
> > > The hope is that this will fix some intermittent flushing issues as
> > > well as GPU hangs.  However, it also has a high risk of causing GPU
> > > hangs and other regressions, as this is a particularly sensitive
> > > area and poking the bear isn't always advisable.
> > 
> > First I checked I could find all the things in bspec. There was one that I
> > couldn't, noted further down.
> > 
> > Second I checked that all the rules earlier were implemented. Found one
> > exception, noted further down as well.
> > 
> > I didn't check if the rules still miss something in bspec. That would be
> > another exercise...
> 
> [snip]
> > > +   /* Recursive PIPE_CONTROL workarounds --------------------------------
> > > +    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
> > > +    *
> > > +    * We do these first because we want to look at the original operation,
> > > +    * rather than any workarounds we set.
> > > +    */
> > > +   if (GEN_GEN == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> > > +      /* Hardware workaround: SNB B-Spec says:
> > > +       *
> > > +       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
> > > +       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
> > > +       *     required."
> > > +       */
> > > +      brw_emit_post_sync_nonzero_flush(brw);
> > > +   }
> > > +
> > > +   if (GEN_GEN == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
> > > +      /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
> > > +       * lists several workarounds:
> > > +       *
> > > +       *    "Project: SKL, KBL, BXT
> > > +       *
> > > +       *     If the VF Cache Invalidation Enable is set to a 1 in a
> > > +       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
> > > +       *     sets to 0, with the VF Cache Invalidation Enable set to 0
> > > +       *     needs to be sent prior to the PIPE_CONTROL with VF Cache
> > > +       *     Invalidation Enable set to a 1."
> > > +       */
> > > +      genX(emit_raw_pipe_control)(brw, 0, NULL, 0, 0);
> > > +   }
> > > +
> > > +   if (GEN_GEN == 9 && IS_COMPUTE_PIPELINE(brw) && post_sync_flags) {
> > > +      /* Project: SKL / Argument: LRI Post Sync Operation [23]
> > > +       *
> > > +       * "PIPECONTROL command with “Command Streamer Stall Enable” must be
> > > +       *  programmed prior to programming a PIPECONTROL command with "LRI
> > > +       *  Post Sync Operation" in GPGPU mode of operation (i.e when
> > > +       *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
> > > +       *
> > > +       * The same text exists a few rows below for Post Sync Op.
> > > +       */
> > > +      genX(emit_raw_pipe_control)(brw, PIPE_CONTROL_CS_STALL, bo, offset, imm);
> > 
> > Are bo, offset, imm needed here as well?
> 
> No, I don't think so.  We should pass NULL, 0, 0 here - we're just doing
> a plain CS stall separately.  We'll use them for the actual write later.
> 
> Good catch!
> 
> [snip]
> > > -         if (GEN_GEN >= 9) {
> > > -            /* THE PIPE_CONTROL "VF Cache Invalidation Enable" docs continue:
> > > -             *
> > > -             *    "Project: BDW+
> > > -             *
> > > -             *     When VF Cache Invalidate is set “Post Sync Operation” must
> > > -             *     be enabled to “Write Immediate Data” or “Write PS Depth
> > > -             *     Count” or “Write Timestamp”."
> > > -             *
> > > -             * If there's a BO, we're already doing some kind of write.
> > > -             * If not, add a write to the workaround BO.
> > > -             *
> > > -             * XXX: This causes GPU hangs on Broadwell, so restrict it to
> > > -             *      Gen9+ for now...see this bug for more information:
> > > -             *      https://bugs.freedesktop.org/show_bug.cgi?id=103787
> > 
> > In "Flush Types" workarounds later on you apply this for gen8 as well.
> 
> Yes, that's intentional - we're supposed to according to the docs.
> I re-tested the Piglit test from bug 103787 on my Broadwell, and it
> works fine - no GPU hangs.  I think we were just doing it wrong before.
> 
> Trying to figure out an ordering for the workarounds is awful... :(

What would you think about another patch just before this to enable that for
gen8? Just in case it causes problems it would bisect to much smaller patch.

> 
> > > -             */
> > > -            if (!bo) {
> > > -               flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
> > > -               bo = brw->workaround_bo;
> > > -            }
> > > -         }
> > > +   if (GEN_IS_HASWELL) {
> > > +      /* From the PIPE_CONTROL page itself:
> > > +       *
> > > +       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
> > > +       *     Prior to programming a PIPECONTROL command with any of the RO
> > > +       *     cache invalidation bit set, program a PIPECONTROL flush command
> > > +       *     with “CS stall” bit and “HDC Flush” bit set."
> > > +       *
> > > +       * TODO: Actually implement this.  What's an HDC Flush?
> > 
> > There is bit 9 HDC Flush but that is defined for ICL, for HSW I couldn't find
> > anything either.
> 
> Yeah...I figured I'd call it out, but...I don't think we're doing
> anything today either, so this is at least no worse.
> 
> > > +       */
> > > +   }
> > > +
> > > +   if (flags & PIPE_CONTROL_FLUSH_LLC) {
> > > +      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
> > > +       *
> > > +       *    "Project: ALL
> > > +       *     SW must always program Post-Sync Operation to "Write Immediate
> > > +       *     Data" when Flush LLC is set."
> > > +       *
> > > +       * For now, we just require the caller to do it.
> > > +       */
> > > +      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
> > > +   }
> > > +
> > > +   /* "Post-Sync Operation" workarounds -------------------------------- */
> > > +
> > > +   /* Project: All / Argument: Global Snapshot Count Reset [19]
> > > +    *
> > > +    * "This bit must not be exercised on any product.
> > > +    *  Requires stall bit ([20] of DW1) set."
> > > +    *
> > > +    * We don't use this, so we just assert that it isn't used.  The
> > > +    * PIPE_CONTROL instruction page indicates that they intended this
> > > +    * as a debug feature and don't think it is useful in production,
> > > +    * but it may actually be usable, should we ever want to.
> > > +    */
> > > +   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
> > > +
> > > +   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
> > > +                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
> > > +      /* Project: All / Arguments:
> > > +       *
> > > +       * - Generic Media State Clear [16]
> > > +       * - Indirect State Pointers Disable [16]
> > > +       *
> > > +       *    "Requires stall bit ([20] of DW1) set."
> > > +       *
> > > +       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
> > > +       * State Clear) says:
> > > +       *
> > > +       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
> > > +       *     programmed prior to programming a PIPECONTROL command with "Media
> > > +       *     State Clear" set in GPGPU mode of operation"
> > > +       *
> > > +       * This is a subset of the earlier rule, so there's nothing to do.
> > > +       */
> > > +      flags |= PIPE_CONTROL_CS_STALL;
> > > +   }
> > > +
> > > +   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
> > > +      /* Project: All / Argument: Store Data Index
> > > +       *
> > > +       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
> > > +       *  than '0'."
> > > +       *
> > > +       * For now, we just assert that the caller does this.  We might want to
> > > +       * automatically add a write to the workaround BO...
> > > +       */
> > > +      assert(non_lri_post_sync_flags != 0);
> > > +   }
> > > +
> > > +   if (flags & PIPE_CONTROL_SYNC_GFDT) {
> > > +      /* Project: All / Argument: Sync GFDT
> > > +       *
> > > +       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
> > > +       *  than '0' or 0x2520[13] must be set."
> > > +       *
> > > +       * For now, we just assert that the caller does this.
> > > +       */
> > > +      assert(non_lri_post_sync_flags != 0);
> > > +   }
> > > +
> > > +   if (IS_GENx10_BETWEEN(60, 75) && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
> > > +      /* Project: SNB, IVB, HSW / Argument: TLB inv
> > > +       *
> > > +       * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
> > > +       *  must be set to something other than '0'."
> > > +       *
> > > +       * For now, we just assert that the caller does this.
> > > +       */
> > > +      assert(non_lri_post_sync_flags != 0);
> > > +   }
> > > +
> > > +   if (GEN_GEN >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
> > > +      /* Project: IVB+ / Argument: TLB inv
> > > +       *
> > > +       *    "Requires stall bit ([20] of DW1) set."
> > > +       *
> > > +       * Also, from the PIPE_CONTROL instruction table:
> > > +       *
> > > +       *    "Project: SKL+
> > > +       *     Post Sync Operation or CS stall must be set to ensure a TLB
> > > +       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
> > > +       *     cache to invalidate."
> > > +       *
> > > +       * This is not a subset of the earlier rule, so there's nothing to do.
> > > +       */
> > > +      flags |= PIPE_CONTROL_CS_STALL;
> > > +   }
> > > +
> > > +   if (GEN_GEN == 9 && devinfo->gt == 4) {
> > > +      /* TODO: The big Skylake GT4 post sync op workaround */
> > > +   }
> > > +
> > > +   /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
> > > +
> > > +   if (IS_COMPUTE_PIPELINE(brw)) {
> > > +      if (GEN_GEN >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
> > > +         /* Project: SKL+ / Argument: Tex Invalidate
> > > +          * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
> > > +          */
> > > +         flags |= PIPE_CONTROL_CS_STALL;
> > >        }
> > >  
> > > -      if (GEN_GEN == 10)
> > > -         gen10_add_rcpfe_workaround_bits(&flags);
> > > -   } else if (GEN_GEN >= 6) {
> > > -      if (GEN_GEN == 6 &&
> > > -          (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
> > > -         /* Hardware workaround: SNB B-Spec says:
> > > +      if (GEN_GEN == 8 && (post_sync_flags ||
> > > +                           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
> > > +                                     PIPE_CONTROL_DEPTH_STALL |
> > > +                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
> > > +                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> > > +                                     PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
> > > +         /* Project: BDW / Arguments:
> > >            *
> > > -          *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
> > > -          *   Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
> > > -          *   required.
> > > +          * - LRI Post Sync Operation   [23]
> > > +          * - Post Sync Op              [15:14]
> > > +          * - Notify En                 [8]
> > > +          * - Depth Stall               [13]
> > > +          * - Render Target Cache Flush [12]
> > > +          * - Depth Cache Flush         [0]
> > > +          * - DC Flush Enable           [5]
> > > +          *
> > > +          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
> > > +          *     Workloads."
> > 
> > This I couldn't find.
> 
> Ah, sorry, this could probably be clearer.
> 
> On the "Programming Restrictions for PIPE_CONTROL" subpages ("Post-Sync
> Operation", "Flush Types"), there are separate table rows for each bit,
> with the same workaround text:
> 
>  +--------------------------------------------------------------------+
>  | LRI Post  | 23 | Requires stall bit ([20] of DW) set for all | BDW |
>  | Sync      |    | GPGPU and Media Workloads.                  |     |
>  | Operation |    |                                             |     |
>  +--------------------------------------------------------------------+
> 
> I just grouped them together because they're all the same workaround...
> just documented in 8 different table rows.
> 
> I'm definitely open to suggestions on how to improve the comment :)

Ah, now I got it. Makes sense. Perhaps just that it combines the rules for
BDW from the two tables.

> 
> Thanks so much for looking at this!  I owe you!

No problem, writing this patch in the first place must have not been that
easy :)

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
On Monday, February 25, 2019 11:01:33 AM PST Pohjolainen, Topi wrote:
> On Mon, Feb 25, 2019 at 10:32:27AM -0800, Kenneth Graunke wrote:
> > On Monday, February 25, 2019 6:33:11 AM PST Pohjolainen, Topi wrote:
> > > On Thu, Nov 01, 2018 at 08:04:21PM -0700, Kenneth Graunke wrote:
> > > > -         if (GEN_GEN >= 9) {
> > > > -            /* THE PIPE_CONTROL "VF Cache Invalidation Enable" docs continue:
> > > > -             *
> > > > -             *    "Project: BDW+
> > > > -             *
> > > > -             *     When VF Cache Invalidate is set “Post Sync Operation” must
> > > > -             *     be enabled to “Write Immediate Data” or “Write PS Depth
> > > > -             *     Count” or “Write Timestamp”."
> > > > -             *
> > > > -             * If there's a BO, we're already doing some kind of write.
> > > > -             * If not, add a write to the workaround BO.
> > > > -             *
> > > > -             * XXX: This causes GPU hangs on Broadwell, so restrict it to
> > > > -             *      Gen9+ for now...see this bug for more information:
> > > > -             *      https://bugs.freedesktop.org/show_bug.cgi?id=103787
> > > 
> > > In "Flush Types" workarounds later on you apply this for gen8 as well.
> > 
> > Yes, that's intentional - we're supposed to according to the docs.
> > I re-tested the Piglit test from bug 103787 on my Broadwell, and it
> > works fine - no GPU hangs.  I think we were just doing it wrong before.
> > 
> > Trying to figure out an ordering for the workarounds is awful... :(
> 
> What would you think about another patch just before this to enable that for
> gen8? Just in case it causes problems it would bisect to much smaller patch.

It isn't simply enabling it though...in the old code, we had:

      if (devinfo->gen == 8)
         gen8_add_cs_stall_workaround_bits(&flags);
         
      if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
         if (devinfo->gen >= 9) {
            ...
            if (!bo) {
               flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
               bo = brw->workaround_bo;
            }
         }
      }

Which adds a WRITE_IMMEDIATE to the current PIPE_CONTROL, but does so
after the call to gen8_add_cs_stall_workaround_bits - and that function
would have added a CS_STALL had it seen the WRITE_IMMEDIATE.  I suspect
this bad ordering is why we saw hangs on Broadwell - we missed a stall.

The new code performs these in the opposite order, correctly adding
the necessary CS_STALL.

I could probably write a patch to swap those and enable it on Gen8+.

--Ken
On Mon, Feb 25, 2019 at 02:18:52PM -0800, Kenneth Graunke wrote:
> On Monday, February 25, 2019 11:01:33 AM PST Pohjolainen, Topi wrote:
> > On Mon, Feb 25, 2019 at 10:32:27AM -0800, Kenneth Graunke wrote:
> > > On Monday, February 25, 2019 6:33:11 AM PST Pohjolainen, Topi wrote:
> > > > On Thu, Nov 01, 2018 at 08:04:21PM -0700, Kenneth Graunke wrote:
> > > > > -         if (GEN_GEN >= 9) {
> > > > > -            /* THE PIPE_CONTROL "VF Cache Invalidation Enable" docs continue:
> > > > > -             *
> > > > > -             *    "Project: BDW+
> > > > > -             *
> > > > > -             *     When VF Cache Invalidate is set “Post Sync Operation” must
> > > > > -             *     be enabled to “Write Immediate Data” or “Write PS Depth
> > > > > -             *     Count” or “Write Timestamp”."
> > > > > -             *
> > > > > -             * If there's a BO, we're already doing some kind of write.
> > > > > -             * If not, add a write to the workaround BO.
> > > > > -             *
> > > > > -             * XXX: This causes GPU hangs on Broadwell, so restrict it to
> > > > > -             *      Gen9+ for now...see this bug for more information:
> > > > > -             *      https://bugs.freedesktop.org/show_bug.cgi?id=103787
> > > > 
> > > > In "Flush Types" workarounds later on you apply this for gen8 as well.
> > > 
> > > Yes, that's intentional - we're supposed to according to the docs.
> > > I re-tested the Piglit test from bug 103787 on my Broadwell, and it
> > > works fine - no GPU hangs.  I think we were just doing it wrong before.
> > > 
> > > Trying to figure out an ordering for the workarounds is awful... :(
> > 
> > What would you think about another patch just before this to enable that for
> > gen8? Just in case it causes problems it would bisect to much smaller patch.
> 
> It isn't simply enabling it though...in the old code, we had:
> 
>       if (devinfo->gen == 8)
>          gen8_add_cs_stall_workaround_bits(&flags);
>          
>       if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
>          if (devinfo->gen >= 9) {
>             ...
>             if (!bo) {
>                flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
>                bo = brw->workaround_bo;
>             }
>          }
>       }
> 
> Which adds a WRITE_IMMEDIATE to the current PIPE_CONTROL, but does so
> after the call to gen8_add_cs_stall_workaround_bits - and that function
> would have added a CS_STALL had it seen the WRITE_IMMEDIATE.  I suspect
> this bad ordering is why we saw hangs on Broadwell - we missed a stall.
> 
> The new code performs these in the opposite order, correctly adding
> the necessary CS_STALL.
> 
> I could probably write a patch to swap those and enable it on Gen8+.

Ah, okay. Well, I think mentioning this in the commit would be sufficient
as well. But it is your call, I'm fine either way.