avx2: Add fast path for over_reverse_n_8888

Submitted by Matt Turner on Jan. 22, 2019, 3:43 a.m.

Details

Message ID 20190122034324.790-1-mattst88@gmail.com
State New
Series "avx2: Add fast path for over_reverse_n_8888"
Headers show

Commit Message

Matt Turner Jan. 22, 2019, 3:43 a.m.
lowlevel-blt-bench, over_reverse_n_8888, 100 iterations:

       Before          After
      Mean StdDev     Mean StdDev   Confidence   Change
L1  2372.6   2.50   4387.6   8.00    100.00%     +84.9%
L2  2490.3   5.29   4326.5  20.79    100.00%     +73.7%
M   2418.3  10.43   3718.0  38.55    100.00%     +53.7%
HT  1555.8  13.35   2112.9  23.85    100.00%     +35.8%
VT  1120.1   9.58   1403.7  15.43    100.00%     +25.3%
R    958.5  17.66   1176.9  20.87    100.00%     +22.8%
RT   407.3   6.79    450.1   7.22    100.00%     +10.5%

At most 18 outliers rejected per test per set.

cairo-perf-trace with trimmed traces, 30 iterations:

            Before          After
           Mean StdDev     Mean StdDev   Confidence   Change
poppler   0.516  0.003    0.478  0.002   100.000%      +8.1%

Cairo perf reports the running time, but the change is computed for
operations per second instead (the inverse of the running time), so a
decrease in mean time — 0.516 s to 0.478 s for poppler — shows up as a
positive change.
---
 pixman/pixman-avx2.c | 94 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

Patch hide | download patch | download mbox

diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
index faef552..6a67515 100644
--- a/pixman/pixman-avx2.c
+++ b/pixman/pixman-avx2.c
@@ -28,6 +28,18 @@  negate_2x256 (__m256i  data_lo,
     *neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2);
 }
 
+/* Unpack one packed 32-bit pixel to 16-bits-per-channel form.  The pixel
+ * is first broadcast to every dword of the 256-bit register, so after the
+ * byte interleave with zero each 128-bit lane holds two copies of the
+ * zero-extended A0R0G0B0 pixel.
+ */
+static force_inline __m256i
+unpack_32_1x256 (uint32_t data)
+{
+    return _mm256_unpacklo_epi8 (_mm256_broadcastd_epi32 (_mm_cvtsi32_si128 (data)), _mm256_setzero_si256 ());
+}
+
+/* Replicate one 32-bit pixel into all four unpacked (16-bit-per-channel)
+ * pixel slots of a 256-bit register: the shuffle duplicates the low 64-bit
+ * unpacked pixel within each 128-bit lane.  NOTE(review): presumably the
+ * AVX2 counterpart of sse2's expand_pixel_32_1x128 — confirm.
+ */
+static force_inline __m256i
+expand_pixel_32_1x256 (uint32_t data)
+{
+    return _mm256_shuffle_epi32 (unpack_32_1x256 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
 static force_inline __m256i
 pack_2x256_256 (__m256i lo, __m256i hi)
 {
@@ -100,6 +112,13 @@  save_256_aligned (__m256i* dst,
     _mm256_store_si256 (dst, data);
 }
 
+/* Store 32 bytes to dst with no alignment requirement
+ * (unaligned counterpart of save_256_aligned above). */
+static force_inline void
+save_256_unaligned (__m256i* dst,
+		    __m256i  data)
+{
+    _mm256_storeu_si256 (dst, data);
+}
+
 static force_inline int
 is_opaque_256 (__m256i x)
 {
@@ -429,12 +448,87 @@  avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
 	src += src_stride;
     }
 }
+
+/* Composite a solid source onto an a8r8g8b8/a8b8g8r8 destination with
+ * PIXMAN_OP_OVER_REVERSE:  result = dst + src * (1 - dst.alpha), i.e. the
+ * operands of ordinary OVER are swapped so the existing destination wins
+ * where it is opaque.  The main loop handles 8 pixels (32 bytes) per
+ * iteration with AVX2; the remaining tail pixels fall back to the
+ * 128-bit helpers.
+ */
+static void
+avx2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+				    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    __m256i ymm_src;
+    __m256i ymm_dst, ymm_dst_lo, ymm_dst_hi;
+    __m256i ymm_dsta_hi, ymm_dsta_lo;
+    int dst_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    /* With src == 0, dst + 0 * (1 - dst.alpha) == dst: nothing to do. */
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    /* The solid source is loop-invariant; expand it once. */
+    ymm_src = expand_pixel_32_1x256 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+
+	dst_line += dst_stride;
+	w = width;
+
+	while (w >= 8)
+	{
+	    __m256i tmp_lo, tmp_hi;
+
+	    ymm_dst = load_256_unaligned ((__m256i*)dst);
+
+	    unpack_256_2x256 (ymm_dst, &ymm_dst_lo, &ymm_dst_hi);
+	    expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi, &ymm_dsta_lo, &ymm_dsta_hi);
+
+	    /* over_2x256 writes its result into the last operand, so copy
+	     * the invariant source into scratch registers first. */
+	    tmp_lo = ymm_src;
+	    tmp_hi = ymm_src;
+
+	    /* NOTE(review): assumes over_2x256 computes d = s + d*(1 - alpha)
+	     * per channel.  Passing dst as the "source" operand and the solid
+	     * src as the "destination" therefore yields
+	     * tmp = dst + src*(1 - dst.alpha), which is OVER_REVERSE —
+	     * confirm against the sse2 over_2x256 helper. */
+	    over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
+			&ymm_dsta_lo, &ymm_dsta_hi,
+			&tmp_lo, &tmp_hi);
+
+	    save_256_unaligned (
+		(__m256i*)dst, pack_2x256_256 (tmp_lo, tmp_hi));
+
+	    w -= 8;
+	    dst += 8;
+	}
+
+	/* Scalar tail: up to 7 pixels via the 128-bit helpers.  The low
+	 * lane of ymm_src already holds the expanded solid pixel, so a
+	 * plain cast (no extract instruction) suffices. */
+	while (w)
+	{
+	    __m128i vd;
+
+	    vd = unpack_32_1x128 (*dst);
+
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      _mm256_castsi256_si128 (ymm_src)));
+	    w--;
+	    dst++;
+	}
+
+    }
+
+}
+
static const pixman_fast_path_t avx2_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, avx2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, avx2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, avx2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, avx2_composite_over_8888_8888),
+
+    /* PIXMAN_OP_OVER_REVERSE: solid source composited under an ARGB/ABGR
+     * destination (both channel orders share one implementation). */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, avx2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, avx2_composite_over_reverse_n_8888),
+
    { PIXMAN_OP_NONE },
};