gallivm: fix broken 8-wide s3tc decoding

Submitted by sroland@vmware.com on May 7, 2019, 12:12 a.m.

Details

Message ID 20190507001253.30909-1-sroland@vmware.com
State New
Headers show
Series "gallivm: fix broken 8-wide s3tc decoding" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

sroland@vmware.com May 7, 2019, 12:12 a.m.
From: Roland Scheidegger <sroland@vmware.com>

Brian noticed there was an uninitialized var for the 8-wide case and 128
bit blocks, which made it always crash. Likewise, the 64bit block case
had another crash bug due to type mismatch.
Color decode (used for all s3tc formats) also had a bogus shuffle for
this case, leading to decode artifacts.
Fix these all up, which makes the code actually work 8-wide. Note that
it's still not used - I've verified it works, and the generated assembly
does look quite a bit simpler actually (20-30% less instructions for the
s3tc decode part with avx2), however in practice it still seems to be
sligthly slower for some unknown reason (tested with openarena) on my
haswell box, so for now continue to split things into 4-wide vectors
before decoding.
---
 .../auxiliary/gallivm/lp_bld_format_s3tc.c    | 33 +++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
index 9561c349dad..8f6e9bec18a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
@@ -77,24 +77,17 @@  lp_build_uninterleave2_half(struct gallivm_state *gallivm,
                             unsigned lo_hi)
 {
    LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i, j;
+   unsigned i;
 
    assert(type.length <= LP_MAX_VECTOR_LENGTH);
    assert(lo_hi < 2);
 
    if (type.length * type.width == 256) {
-      assert(type.length >= 4);
-      for (i = 0, j = 0; i < type.length; ++i) {
-         if (i == type.length / 4) {
-            j = type.length;
-         } else if (i == type.length / 2) {
-            j = type.length / 2;
-         } else if (i == 3 * type.length / 4) {
-            j = 3 * type.length / 4;
-         } else {
-            j += 2;
-         }
-         elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
+      assert(type.length == 8);
+      assert(type.width == 32);
+      const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
+      for (i = 0; i < type.length; ++i) {
+         elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
       }
    } else {
       for (i = 0; i < type.length; ++i) {
@@ -277,7 +270,7 @@  lp_build_gather_s3tc(struct gallivm_state *gallivm,
    }
    else {
       LLVMValueRef tmp[4], cc01, cc23;
-      struct lp_type lp_type32, lp_type64, lp_type32dxt;
+      struct lp_type lp_type32, lp_type64;
       memset(&lp_type32, 0, sizeof lp_type32);
       lp_type32.width = 32;
       lp_type32.length = length;
@@ -309,10 +302,14 @@  lp_build_gather_s3tc(struct gallivm_state *gallivm,
                                               lp_build_const_extend_shuffle(gallivm, 2, 4), "");
          }
          if (length == 8) {
+            struct lp_type lp_type32_4;
+            memset(&lp_type32_4, 0, sizeof lp_type32_4);
+            lp_type32_4.width = 32;
+            lp_type32_4.length = 4;
             for (i = 0; i < 4; ++i) {
                tmp[0] = elems[i];
                tmp[1] = elems[i+4];
-               elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
+               elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
             }
          }
          cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
@@ -811,7 +808,7 @@  s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
    tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
    bit_pos = LLVMBuildAnd(builder, bit_pos,
                           lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
-   /* Warning: slow shift with per element count */
+   /* Warning: slow shift with per element count (without avx2) */
    /*
     * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
     * to select the right byte with pshufb. Then for the remaining one bit
@@ -1640,7 +1637,6 @@  s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
                           lp_build_const_int_vec(gallivm, type16, 8), "");
    alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
    shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
-   /* XXX this shuffle broken with LLVM 2.8 */
    alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
    alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
 
@@ -2176,6 +2172,9 @@  lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
       return rgba;
    }
 
+   /*
+    * Could use n > 8 here with avx2, but doesn't seem faster.
+    */
    if (n > 4) {
       unsigned count;
       LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);

Comments

LGTM.  Just two little nits below.

Reviewed-by: Brian Paul <brianp@vmware.com>

Perhaps you could review the 5-patch series of clean-ups I posted on 
Saturday?


On 05/06/2019 06:12 PM, sroland@vmware.com wrote:
> From: Roland Scheidegger <sroland@vmware.com>
> 
> Brian noticed there was an uninitialized var for the 8-wide case and 128
> bit blocks, which made it always crash. Likewise, the 64bit block case
> had another crash bug due to type mismatch.
> Color decode (used for all s3tc formats) also had a bogus shuffle for
> this case, leading to decode artifacts.
> Fix these all up, which makes the code actually work 8-wide. Note that
> it's still not used - I've verified it works, and the generated assembly
> does look quite a bit simpler actually (20-30% less instructions for the
> s3tc decode part with avx2), however in practice it still seems to be
> sligthly slower for some unknown reason (tested with openarena) on my
> haswell box, so for now continue to split things into 4-wide vectors
> before decoding.
> ---
>   .../auxiliary/gallivm/lp_bld_format_s3tc.c    | 33 +++++++++----------
>   1 file changed, 16 insertions(+), 17 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> index 9561c349dad..8f6e9bec18a 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c
> @@ -77,24 +77,17 @@ lp_build_uninterleave2_half(struct gallivm_state *gallivm,
>                               unsigned lo_hi)
>   {
>      LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
> -   unsigned i, j;
> +   unsigned i;
>   
>      assert(type.length <= LP_MAX_VECTOR_LENGTH);
>      assert(lo_hi < 2);
>   
>      if (type.length * type.width == 256) {
> -      assert(type.length >= 4);
> -      for (i = 0, j = 0; i < type.length; ++i) {
> -         if (i == type.length / 4) {
> -            j = type.length;
> -         } else if (i == type.length / 2) {
> -            j = type.length / 2;
> -         } else if (i == 3 * type.length / 4) {
> -            j = 3 * type.length / 4;
> -         } else {
> -            j += 2;
> -         }
> -         elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
> +      assert(type.length == 8);
> +      assert(type.width == 32);
> +      const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};

could be static


> +      for (i = 0; i < type.length; ++i) {
> +         elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
>         }
>      } else {
>         for (i = 0; i < type.length; ++i) {
> @@ -277,7 +270,7 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,
>      }
>      else {
>         LLVMValueRef tmp[4], cc01, cc23;
> -      struct lp_type lp_type32, lp_type64, lp_type32dxt;
> +      struct lp_type lp_type32, lp_type64;
>         memset(&lp_type32, 0, sizeof lp_type32);
>         lp_type32.width = 32;
>         lp_type32.length = length;
> @@ -309,10 +302,14 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,
>                                                 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
>            }
>            if (length == 8) {
> +            struct lp_type lp_type32_4;
> +            memset(&lp_type32_4, 0, sizeof lp_type32_4);

Maybe we could move toward using initializers in these case?  struct 
lp_type lp_type32_4 = {0}; ?


> +            lp_type32_4.width = 32;
> +            lp_type32_4.length = 4;
>               for (i = 0; i < 4; ++i) {
>                  tmp[0] = elems[i];
>                  tmp[1] = elems[i+4];
> -               elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
> +               elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
>               }
>            }
>            cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
> @@ -811,7 +808,7 @@ s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
>      tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
>      bit_pos = LLVMBuildAnd(builder, bit_pos,
>                             lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
> -   /* Warning: slow shift with per element count */
> +   /* Warning: slow shift with per element count (without avx2) */
>      /*
>       * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
>       * to select the right byte with pshufb. Then for the remaining one bit
> @@ -1640,7 +1637,6 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
>                             lp_build_const_int_vec(gallivm, type16, 8), "");
>      alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
>      shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
> -   /* XXX this shuffle broken with LLVM 2.8 */
>      alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
>      alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
>   
> @@ -2176,6 +2172,9 @@ lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
>         return rgba;
>      }
>   
> +   /*
> +    * Could use n > 8 here with avx2, but doesn't seem faster.
> +    */
>      if (n > 4) {
>         unsigned count;
>         LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
>