[3/3] Rev2 of patch: AVX2 versions of OVER and ROVER operators.

Submitted by Devulapalli, Raghuveer on Jan. 17, 2019, 1 a.m.

Details

Message ID 20190117010000.21632-3-raghuveer.devulapalli@intel.com
State New
Series "Series without cover letter"

Commit Message

Devulapalli, Raghuveer Jan. 17, 2019, 1 a.m.
From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>

These were found to be up to 1.8 times faster (depending on the array
size) than the corresponding SSE2 versions. Both were benchmarked on an
Intel(R) Core(TM) i5-6260U CPU @ 1.80GHz by measuring how many TSC
cycles each of the avx2_combine_over_u and sse2_combine_over_u
functions took to run for various array sizes. For the purpose of
benchmarking, turbo was disabled and the intel_pstate governor was set
to performance to avoid variance in CPU frequency across multiple runs.

| Array size | #cycles SSE2 | #cycles AVX2 |
--------------------------------------------
| 400        | 53966        | 32800        |
| 800        | 107595       | 62595        |
| 1600       | 214810       | 122482       |
| 3200       | 429748       | 241971       |
| 6400       | 859070       | 481076       |
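
The cycle counts above can be collected with a small TSC harness along
these lines (a minimal sketch, not the actual driver used for the numbers
in the table; the bound function-pointer signature is an assumption made
for illustration):

    #include <stdint.h>
    #include <x86intrin.h>    /* __rdtsc () */

    /* 'combine_fn' stands in for avx2_combine_over_u /
     * sse2_combine_over_u with the implementation, operator and mask
     * arguments already bound; dst/src are the pixel arrays and w is
     * the array size in pixels. */
    typedef void (*combine_fn) (uint32_t *dst, const uint32_t *src, int w);

    static uint64_t
    time_combine (combine_fn combine, uint32_t *dst, const uint32_t *src, int w)
    {
        uint64_t start = __rdtsc ();

        combine (dst, src, w);

        return __rdtsc () - start;
    }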

Running lowlevel-blt-bench for the OVER_8888_8888 operation also shows a
1.55x-1.79x improvement over SSE2. Here are the details:

AVX2: OVER_8888_8888 =  L1:2136.35  L2:2109.46  M:1751.99 ( 60.90%)
SSE2: OVER_8888_8888 =  L1:1188.91  L2:1190.63  M:1128.32 ( 40.31%)

The AVX2 implementation uses the SSE2 per-pixel helpers for pixels that
are not 32-byte aligned; the helper functions from pixman-sse2.h are
re-used for this purpose.
---
 pixman/pixman-avx2.c | 431 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 430 insertions(+), 1 deletion(-)


diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
index d860d67..faef552 100644
--- a/pixman/pixman-avx2.c
+++ b/pixman/pixman-avx2.c
@@ -6,13 +6,439 @@ 
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-inlines.h"
+#include "pixman-sse2.h"
 
+#define MASK_0080_AVX2 _mm256_set1_epi16(0x0080)
+#define MASK_00FF_AVX2 _mm256_set1_epi16(0x00ff)
+#define MASK_0101_AVX2 _mm256_set1_epi16(0x0101)
+
+static force_inline __m256i
+load_256_aligned (__m256i* src)
+{
+    return _mm256_load_si256(src);
+}
+
+static force_inline void
+negate_2x256 (__m256i  data_lo,
+	      __m256i  data_hi,
+	      __m256i* neg_lo,
+	      __m256i* neg_hi)
+{
+    *neg_lo = _mm256_xor_si256 (data_lo, MASK_00FF_AVX2);
+    *neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2);
+}
+
+static force_inline __m256i
+pack_2x256_256 (__m256i lo, __m256i hi)
+{
+    return _mm256_packus_epi16 (lo, hi);
+}
+ 
+static force_inline void
+pix_multiply_2x256 (__m256i* data_lo,
+		    __m256i* data_hi,
+		    __m256i* alpha_lo,
+		    __m256i* alpha_hi,
+		    __m256i* ret_lo,
+		    __m256i* ret_hi)
+{
+    __m256i lo, hi;
+
+    lo = _mm256_mullo_epi16 (*data_lo, *alpha_lo);
+    hi = _mm256_mullo_epi16 (*data_hi, *alpha_hi);
+    lo = _mm256_adds_epu16 (lo, MASK_0080_AVX2);
+    hi = _mm256_adds_epu16 (hi, MASK_0080_AVX2);
+    *ret_lo = _mm256_mulhi_epu16 (lo, MASK_0101_AVX2);
+    *ret_hi = _mm256_mulhi_epu16 (hi, MASK_0101_AVX2);
+}
+ 
+static force_inline void
+over_2x256 (__m256i* src_lo,
+	    __m256i* src_hi,
+	    __m256i* alpha_lo,
+	    __m256i* alpha_hi,
+	    __m256i* dst_lo,
+	    __m256i* dst_hi)
+{
+    __m256i t1, t2;
+
+    negate_2x256 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+    pix_multiply_2x256 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+    *dst_lo = _mm256_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm256_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+expand_alpha_2x256 (__m256i  data_lo,
+		    __m256i  data_hi,
+		    __m256i* alpha_lo,
+		    __m256i* alpha_hi)
+{
+    __m256i lo, hi;
+
+    lo = _mm256_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm256_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm256_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm256_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline  void
+unpack_256_2x256 (__m256i data, __m256i* data_lo, __m256i* data_hi)
+{
+    *data_lo = _mm256_unpacklo_epi8 (data, _mm256_setzero_si256 ());
+    *data_hi = _mm256_unpackhi_epi8 (data, _mm256_setzero_si256 ());
+}
+
+/* save 8 pixels on a 32-byte boundary aligned address */
+static force_inline void
+save_256_aligned (__m256i* dst,
+		  __m256i  data)
+{
+    _mm256_store_si256 (dst, data);
+}
+
+static force_inline int
+is_opaque_256 (__m256i x)
+{
+    __m256i ffs = _mm256_cmpeq_epi8 (x, x);
+
+    return (_mm256_movemask_epi8
+	    (_mm256_cmpeq_epi8 (x, ffs)) & 0x88888888) == 0x88888888;
+}
+
+static force_inline int
+is_zero_256 (__m256i x)
+{
+    return _mm256_movemask_epi8 (
+	_mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) == 0xffffffff;
+}
+
+static force_inline int
+is_transparent_256 (__m256i x)
+{
+    return (_mm256_movemask_epi8 (
+		_mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) & 0x88888888)
+                == 0x88888888;
+}
+
+
+/* load 8 pixels from an unaligned address */
+static force_inline __m256i
+load_256_unaligned (const __m256i* src)
+{
+    return _mm256_loadu_si256 (src);
+}
+
+static force_inline __m256i
+combine8 (const __m256i *ps, const __m256i *pm)
+{
+    __m256i ymm_src_lo, ymm_src_hi;
+    __m256i ymm_msk_lo, ymm_msk_hi;
+    __m256i s;
+
+    if (pm)
+    {
+	ymm_msk_lo = load_256_unaligned (pm);
+
+	if (is_transparent_256 (ymm_msk_lo))
+	    return _mm256_setzero_si256 ();
+    }
+
+    s = load_256_unaligned (ps);
+
+    if (pm)
+    {
+	unpack_256_2x256 (s, &ymm_src_lo, &ymm_src_hi);
+	unpack_256_2x256 (ymm_msk_lo, &ymm_msk_lo, &ymm_msk_hi);
+
+	expand_alpha_2x256 (ymm_msk_lo, ymm_msk_hi, &ymm_msk_lo, &ymm_msk_hi);
+
+	pix_multiply_2x256 (&ymm_src_lo, &ymm_src_hi,
+			    &ymm_msk_lo, &ymm_msk_hi,
+			    &ymm_src_lo, &ymm_src_hi);
+
+	s = pack_2x256_256 (ymm_src_lo, ymm_src_hi);
+    }
+
+    return s;
+}
+
+static force_inline void
+core_combine_over_u_avx2_mask (uint32_t *	  pd,
+			       const uint32_t*	  ps,
+			       const uint32_t*	  pm,
+			       int		  w)
+{
+    uint32_t s, d;
+    while (w && ((uintptr_t)pd & 31))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+	w--;
+    }
+
+    /*
+     * dst is 32-byte aligned, and w >= 8 means the next 256 bits
+     * contain relevant data
+     */
+
+    while (w >= 8)
+    {
+	__m256i mask = load_256_unaligned ((__m256i *)pm);
+
+	if (!is_zero_256 (mask))
+	{
+	    __m256i src;
+	    __m256i src_hi, src_lo;
+	    __m256i mask_hi, mask_lo;
+	    __m256i alpha_hi, alpha_lo;
+
+	    src = load_256_unaligned ((__m256i *)ps);
+
+	    if (is_opaque_256 (_mm256_and_si256 (src, mask)))
+	    {
+	        save_256_aligned ((__m256i *)pd, src);
+	    }
+	    else
+	    {
+	        __m256i dst = load_256_aligned ((__m256i *)pd);
+	        __m256i dst_hi, dst_lo;
+
+	        unpack_256_2x256 (mask, &mask_lo, &mask_hi);
+	        unpack_256_2x256 (src, &src_lo, &src_hi);
+
+	        expand_alpha_2x256 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+	        pix_multiply_2x256 (&src_lo, &src_hi,
+				    &mask_lo, &mask_hi,
+				    &src_lo, &src_hi);
+		
+		unpack_256_2x256 (dst, &dst_lo, &dst_hi);
+		expand_alpha_2x256 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+
+		over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_256_aligned (
+		    (__m256i *)pd,
+		    pack_2x256_256 (dst_lo, dst_hi));
+	    }
+	}
+	pm += 8;
+	ps += 8;
+	pd += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+	w--;
+    }
+}
+
+static force_inline void
+core_combine_over_u_avx2_no_mask (uint32_t *	     pd,
+				  const uint32_t*    ps,
+				  int		     w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 32-byte boundary */
+    while (w && ((uintptr_t)pd & 31))
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	w--;
+    }
+
+    while (w >= 8)
+    {
+	__m256i src;
+	__m256i src_hi, src_lo, dst_hi, dst_lo;
+	__m256i alpha_hi, alpha_lo;
+
+	src = load_256_unaligned ((__m256i *)ps);
+
+	if (!is_zero_256 (src))
+	{
+	    if (is_opaque_256 (src))
+	    {
+		save_256_aligned ((__m256i *)pd, src);
+	    }
+	    else
+	    {
+		__m256i dst = load_256_aligned ((__m256i *)pd);
+
+		unpack_256_2x256 (src, &src_lo, &src_hi);
+		unpack_256_2x256 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x256 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+		over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_256_aligned (
+		    (__m256i *)pd,
+		    pack_2x256_256 (dst_lo, dst_hi));
+	    }
+	}
+
+	ps += 8;
+	pd += 8;
+	w -= 8;
+    }
+    while (w)
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	w--;
+    }
+}
+
+static force_inline void
+avx2_combine_over_u (pixman_implementation_t *imp,
+		     pixman_op_t	      op,
+		     uint32_t *		      pd,
+		     const uint32_t *	      ps,
+		     const uint32_t *	      pm,
+		     int		      w)
+{
+    if (pm)
+	core_combine_over_u_avx2_mask (pd, ps, pm, w);
+    else
+	core_combine_over_u_avx2_no_mask (pd, ps, w);
+}
+
+static void
+avx2_combine_over_reverse_u (pixman_implementation_t *imp,
+			     pixman_op_t	      op,
+			     uint32_t *		      pd,
+			     const uint32_t *	      ps,
+			     const uint32_t *	      pm,
+			     int		      w)
+{
+    uint32_t s, d;
+
+    __m256i ymm_dst_lo, ymm_dst_hi;
+    __m256i ymm_src_lo, ymm_src_hi;
+    __m256i ymm_alpha_lo, ymm_alpha_hi;
+
+    /* Align dst on a 32-byte boundary */
+    while (w &&
+	   ((uintptr_t)pd & 31))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 8)
+    {
+	ymm_src_hi = combine8 ((__m256i*)ps, (__m256i*)pm);
+	ymm_dst_hi = load_256_aligned ((__m256i*) pd);
+
+	unpack_256_2x256 (ymm_src_hi, &ymm_src_lo, &ymm_src_hi);
+	unpack_256_2x256 (ymm_dst_hi, &ymm_dst_lo, &ymm_dst_hi);
+
+	expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi,
+			    &ymm_alpha_lo, &ymm_alpha_hi);
+
+	over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
+		    &ymm_alpha_lo, &ymm_alpha_hi,
+		    &ymm_src_lo, &ymm_src_hi);
+
+	/* rebuild the 8 pixel data and save */
+	save_256_aligned ((__m256i*)pd,
+			  pack_2x256_256 (ymm_src_lo, ymm_src_hi));
+
+	w -= 8;
+	ps += 8;
+	pd += 8;
+
+	if (pm)
+	    pm += 8;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+}
+
+static void
+avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    dst = dst_line;
+    src = src_line;
+
+    while (height--)
+    {
+	avx2_combine_over_u (imp, op, dst, src, NULL, width);
+
+	dst += dst_stride;
+	src += src_stride;
+    }
+}
 static const pixman_fast_path_t avx2_fast_paths[] =
 {
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, avx2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, avx2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, avx2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, avx2_composite_over_8888_8888),
     { PIXMAN_OP_NONE },
 };
 
-static const pixman_iter_info_t avx2_iters[] = 
+static const pixman_iter_info_t avx2_iters[] =
 {
     { PIXMAN_null },
 };
@@ -26,6 +452,9 @@  _pixman_implementation_create_avx2 (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, avx2_fast_paths);
 
     /* Set up function pointers */
+    imp->combine_32[PIXMAN_OP_OVER] = avx2_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = avx2_combine_over_reverse_u;
+    
     imp->iter_info = avx2_iters;
 
     return imp;

Comments

Chris Wilson Jan. 17, 2019, 8:09 a.m.
Quoting Raghuveer Devulapalli (2019-01-17 01:00:00)
> +static force_inline void
> +core_combine_over_u_avx2_no_mask (uint32_t *        pd,
> +                                 const uint32_t*    ps,
> +                                 int                w)
> +{
> +    uint32_t s, d;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w && ((uintptr_t)pd & 31))

32-byte.

You can reuse the comment from _mask.

/* We operate on 256-bit chunks, aligned to a 32-byte boundary. */

Explains both the alignment phase and main loop.
-Chris
Matt Turner Jan. 22, 2019, 1:45 a.m.
On Wed, Jan 16, 2019 at 4:57 PM Raghuveer Devulapalli
<raghuveer.devulapalli@intel.com> wrote:
>
> From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
>
> These were found to be upto 1.8 times faster (depending on the array
> size) than the corresponding SSE2 version. The AVX2 and SSE2 were
> benchmarked on a Intel(R) Core(TM) i5-6260U CPU @ 1.80GHz. The AVX2 and
> SSE versions were benchmarked by measuring how many TSC cycles each of
> the avx2_combine_over_u and sse2_combine_over_u functions took to run
> for various array sizes. For the purpose of benchmarking, turbo was
> disabled and intel_pstate governor was set to performance to avoid
> variance in CPU frequencies across multiple runs.
>
> | Array size | #cycles SSE2 | #cycles AVX2 |
> --------------------------------------------
> | 400        | 53966        | 32800        |
> | 800        | 107595       | 62595        |
> | 1600       | 214810       | 122482       |
> | 3200       | 429748       | 241971       |
> | 6400       | 859070       | 481076       |
>
> Also ran lowlevel-blt-bench for OVER_8888_8888 operation and that
> also shows a 1.55x-1.79x improvement over SSE2. Here are the details:
>
> AVX2: OVER_8888_8888 =  L1:2136.35  L2:2109.46  M:1751.99 ( 60.90%)
> SSE2: OVER_8888_8888 =  L1:1188.91  L2:1190.63  M:1128.32 ( 40.31%)
>
> The AVX2 implementation uses the SSE2 version for manipulating pixels
> that are not 32 byte aligned. The helper functions from pixman-sse2.h
> are re-used for this purpose.

I still cannot measure any performance improvement with cairo-traces.
If we're not improving performance in any real world application, then
I don't think it's worth adding a significant amount of code.

As I told you in person and in private mail, I suspect that you're
more likely to see real performance improvements in operations that
are more compute-heavy, like bilinear filtering. You could probably
use AVX2's gather instructions in the bilinear code as well. Filling
out the avx2_iters array would also be a good place to start, since
those functions execute when we do not have a specific fast-path for
an operation (which will be the case for AVX2).
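
For illustration only, fetching 8 texels whose 32-bit pixel indices are
held in a vector could look roughly like the helper below
(gather_8_pixels is a hypothetical name, not something in this patch or
in pixman today; it needs <immintrin.h> and -mavx2):

    /* Gather 8 ARGB32 pixels; 'offs' holds 8 pixel indices into 'bits',
     * and the scale of 4 converts indices to byte offsets. */
    static inline __m256i
    gather_8_pixels (const uint32_t *bits, __m256i offs)
    {
        return _mm256_i32gather_epi32 ((const int *)bits, offs, 4);
    }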

I sense that you want to check this off your todo list and move on. If
that's the case, we can include the avx2_composite_over_reverse_n_8888
function I wrote (and will send as a separate patch) to confirm that
using AVX2 is capable of giving a performance improvement in some
cairo traces.

> ---
>  pixman/pixman-avx2.c | 431 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 430 insertions(+), 1 deletion(-)
>
> diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
> index d860d67..faef552 100644
> --- a/pixman/pixman-avx2.c
> +++ b/pixman/pixman-avx2.c
> @@ -6,13 +6,439 @@
>  #include "pixman-private.h"
>  #include "pixman-combine32.h"
>  #include "pixman-inlines.h"
> +#include "pixman-sse2.h"
>
> +#define MASK_0080_AVX2 _mm256_set1_epi16(0x0080)
> +#define MASK_00FF_AVX2 _mm256_set1_epi16(0x00ff)
> +#define MASK_0101_AVX2 _mm256_set1_epi16(0x0101)
> +
> +static force_inline __m256i
> +load_256_aligned (__m256i* src)
> +{
> +    return _mm256_load_si256(src);
> +}
> +
> +static force_inline void
> +negate_2x256 (__m256i  data_lo,
> +             __m256i  data_hi,
> +             __m256i* neg_lo,
> +             __m256i* neg_hi)
> +{
> +    *neg_lo = _mm256_xor_si256 (data_lo, MASK_00FF_AVX2);
> +    *neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2);
> +}
> +
> +static force_inline __m256i
> +pack_2x256_256 (__m256i lo, __m256i hi)
> +{
> +    return _mm256_packus_epi16 (lo, hi);
> +}
> +

Stray space

> +static force_inline void
> +pix_multiply_2x256 (__m256i* data_lo,
> +                   __m256i* data_hi,
> +                   __m256i* alpha_lo,
> +                   __m256i* alpha_hi,
> +                   __m256i* ret_lo,
> +                   __m256i* ret_hi)
> +{
> +    __m256i lo, hi;
> +
> +    lo = _mm256_mullo_epi16 (*data_lo, *alpha_lo);
> +    hi = _mm256_mullo_epi16 (*data_hi, *alpha_hi);
> +    lo = _mm256_adds_epu16 (lo, MASK_0080_AVX2);
> +    hi = _mm256_adds_epu16 (hi, MASK_0080_AVX2);
> +    *ret_lo = _mm256_mulhi_epu16 (lo, MASK_0101_AVX2);
> +    *ret_hi = _mm256_mulhi_epu16 (hi, MASK_0101_AVX2);
> +}
> +

Stray space

> +static force_inline void
> +over_2x256 (__m256i* src_lo,
> +           __m256i* src_hi,
> +           __m256i* alpha_lo,
> +           __m256i* alpha_hi,
> +           __m256i* dst_lo,
> +           __m256i* dst_hi)
> +{
> +    __m256i t1, t2;
> +
> +    negate_2x256 (*alpha_lo, *alpha_hi, &t1, &t2);
> +
> +    pix_multiply_2x256 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
> +
> +    *dst_lo = _mm256_adds_epu8 (*src_lo, *dst_lo);
> +    *dst_hi = _mm256_adds_epu8 (*src_hi, *dst_hi);
> +}
> +
> +static force_inline void
> +expand_alpha_2x256 (__m256i  data_lo,
> +                   __m256i  data_hi,
> +                   __m256i* alpha_lo,
> +                   __m256i* alpha_hi)
> +{
> +    __m256i lo, hi;
> +
> +    lo = _mm256_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
> +    hi = _mm256_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
> +
> +    *alpha_lo = _mm256_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
> +    *alpha_hi = _mm256_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
> +}
> +
> +static force_inline  void
> +unpack_256_2x256 (__m256i data, __m256i* data_lo, __m256i* data_hi)
> +{
> +    *data_lo = _mm256_unpacklo_epi8 (data, _mm256_setzero_si256 ());
> +    *data_hi = _mm256_unpackhi_epi8 (data, _mm256_setzero_si256 ());
> +}
> +
> +/* save 4 pixels on a 16-byte boundary aligned address */
> +static force_inline void
> +save_256_aligned (__m256i* dst,
> +                 __m256i  data)
> +{
> +    _mm256_store_si256 (dst, data);
> +}
> +
> +static force_inline int
> +is_opaque_256 (__m256i x)
> +{
> +    __m256i ffs = _mm256_cmpeq_epi8 (x, x);
> +
> +    return (_mm256_movemask_epi8
> +           (_mm256_cmpeq_epi8 (x, ffs)) & 0x88888888) == 0x88888888;
> +}
> +
> +static force_inline int
> +is_zero_256 (__m256i x)
> +{
> +    return _mm256_movemask_epi8 (
> +       _mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) == 0xffffffff;
> +}
> +
> +static force_inline int
> +is_transparent_256 (__m256i x)
> +{
> +    return (_mm256_movemask_epi8 (
> +               _mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) & 0x88888888)
> +                == 0x88888888;
> +}
> +
> +

Extra newline

> +/* load 4 pixels from a unaligned address */
> +static force_inline __m256i
> +load_256_unaligned (const __m256i* src)
> +{
> +    return _mm256_loadu_si256 (src);
> +}
> +
> +static force_inline __m256i
> +combine8 (const __m256i *ps, const __m256i *pm)
> +{
> +    __m256i ymm_src_lo, ymm_src_hi;
> +    __m256i ymm_msk_lo, ymm_msk_hi;
> +    __m256i s;
> +
> +    if (pm)
> +    {
> +       ymm_msk_lo = load_256_unaligned (pm);
> +
> +       if (is_transparent_256 (ymm_msk_lo))
> +           return _mm256_setzero_si256 ();
> +    }
> +
> +    s = load_256_unaligned (ps);
> +
> +    if (pm)
> +    {
> +       unpack_256_2x256 (s, &ymm_src_lo, &ymm_src_hi);
> +       unpack_256_2x256 (ymm_msk_lo, &ymm_msk_lo, &ymm_msk_hi);
> +
> +       expand_alpha_2x256 (ymm_msk_lo, ymm_msk_hi, &ymm_msk_lo, &ymm_msk_hi);
> +
> +       pix_multiply_2x256 (&ymm_src_lo, &ymm_src_hi,
> +                           &ymm_msk_lo, &ymm_msk_hi,
> +                           &ymm_src_lo, &ymm_src_hi);
> +
> +       s = pack_2x256_256 (ymm_src_lo, ymm_src_hi);
> +    }
> +
> +    return s;
> +}
> +
> +static force_inline void
> +core_combine_over_u_avx2_mask (uint32_t *        pd,
> +                              const uint32_t*    ps,
> +                              const uint32_t*    pm,
> +                              int                w)
> +{
> +    uint32_t s, d;
> +    while (w && ((uintptr_t)pd & 31))
> +    {
> +       d = *pd;
> +       s = combine1 (ps, pm);
> +
> +       if (s)
> +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> +       pd++;
> +       ps++;
> +       pm++;
> +       w--;
> +    }
> +

Is the alignment loop here actually important for performance? As far
as I know unaligned loads are cheap on all CPUs with AVX2. We might be
able to do unaligned loads/stores in the loop below and get into the
vectorized code sooner.
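
A minimal sketch of that variant for the no-mask main loop, reusing the
helpers defined in the patch above and simply switching to unaligned
loads/stores (untested, for illustration only):

    while (w >= 8)
    {
        __m256i src = load_256_unaligned ((__m256i *)ps);

        if (!is_zero_256 (src))
        {
            if (is_opaque_256 (src))
            {
                _mm256_storeu_si256 ((__m256i *)pd, src);
            }
            else
            {
                __m256i dst = _mm256_loadu_si256 ((__m256i *)pd);
                __m256i src_lo, src_hi, dst_lo, dst_hi;
                __m256i alpha_lo, alpha_hi;

                unpack_256_2x256 (src, &src_lo, &src_hi);
                unpack_256_2x256 (dst, &dst_lo, &dst_hi);
                expand_alpha_2x256 (src_lo, src_hi, &alpha_lo, &alpha_hi);
                over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);
                _mm256_storeu_si256 ((__m256i *)pd,
                                     pack_2x256_256 (dst_lo, dst_hi));
            }
        }

        ps += 8;
        pd += 8;
        w -= 8;
    }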

> +    /*
> +     * dst is 32 byte aligned, and w >=8 means the next 256 bits
> +     * contain relevant data
> +    */
> +

Stray whitespace, and */ is not aligned

> +    while (w >= 8)
> +    {
> +       __m256i mask = load_256_unaligned ((__m256i *)pm);
> +
> +       if (!is_zero_256 (mask))
> +       {
> +           __m256i src;
> +           __m256i src_hi, src_lo;
> +           __m256i mask_hi, mask_lo;
> +           __m256i alpha_hi, alpha_lo;
> +
> +           src = load_256_unaligned ((__m256i *)ps);
> +
> +           if (is_opaque_256 (_mm256_and_si256 (src, mask)))
> +           {
> +               save_256_aligned ((__m256i *)pd, src);
> +           }
> +           else
> +           {
> +               __m256i dst = load_256_aligned ((__m256i *)pd);
> +               __m256i dst_hi, dst_lo;
> +
> +               unpack_256_2x256 (mask, &mask_lo, &mask_hi);
> +               unpack_256_2x256 (src, &src_lo, &src_hi);
> +
> +               expand_alpha_2x256 (mask_lo, mask_hi, &mask_lo, &mask_hi);
> +               pix_multiply_2x256 (&src_lo, &src_hi,
> +                                   &mask_lo, &mask_hi,
> +                                   &src_lo, &src_hi);
> +

Stray spaces again

> +               unpack_256_2x256 (dst, &dst_lo, &dst_hi);
> +               expand_alpha_2x256 (src_lo, src_hi,
> +                                   &alpha_lo, &alpha_hi);
> +
> +               over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
> +                           &dst_lo, &dst_hi);
> +
> +               save_256_aligned (
> +                   (__m256i *)pd,
> +                   pack_2x256_256 (dst_lo, dst_hi));
> +           }
> +       }
> +       pm += 8;
> +       ps += 8;
> +       pd += 8;
> +       w -= 8;
> +    }
> +
> +    while (w)
> +    {
> +       d = *pd;
> +       s = combine1 (ps, pm);
> +
> +       if (s)
> +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> +       pd++;
> +       ps++;
> +       pm++;
> +       w--;
> +    }
> +}
> +
> +static force_inline void
> +core_combine_over_u_avx2_no_mask (uint32_t *        pd,
> +                                 const uint32_t*    ps,
> +                                 int                w)
> +{
> +    uint32_t s, d;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w && ((uintptr_t)pd & 31))
> +    {
> +       d = *pd;
> +       s = *ps;
> +
> +       if (s)
> +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> +       pd++;
> +       ps++;
> +       w--;
> +    }
> +
> +    while (w >= 8)
> +    {
> +       __m256i src;
> +       __m256i src_hi, src_lo, dst_hi, dst_lo;
> +       __m256i alpha_hi, alpha_lo;
> +
> +       src = load_256_unaligned ((__m256i *)ps);
> +
> +       if (!is_zero_256 (src))
> +       {
> +           if (is_opaque_256 (src))
> +           {
> +               save_256_aligned ((__m256i *)pd, src);
> +           }
> +           else
> +           {
> +               __m256i dst = load_256_aligned ((__m256i *)pd);
> +
> +               unpack_256_2x256 (src, &src_lo, &src_hi);
> +               unpack_256_2x256 (dst, &dst_lo, &dst_hi);
> +
> +               expand_alpha_2x256 (src_lo, src_hi,
> +                                   &alpha_lo, &alpha_hi);
> +               over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
> +                           &dst_lo, &dst_hi);
> +
> +               save_256_aligned (
> +                   (__m256i *)pd,
> +                   pack_2x256_256 (dst_lo, dst_hi));
> +           }
> +       }
> +
> +       ps += 8;
> +       pd += 8;
> +       w -= 8;
> +    }

Should have a blank line here

> +    while (w)
> +    {
> +       d = *pd;
> +       s = *ps;
> +
> +       if (s)
> +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> +       pd++;
> +       ps++;
> +       w--;
> +    }
> +}
> +
> +static force_inline void
> +avx2_combine_over_u (pixman_implementation_t *imp,
> +                    pixman_op_t              op,
> +                    uint32_t *               pd,
> +                    const uint32_t *         ps,
> +                    const uint32_t *         pm,
> +                    int                      w)
> +{
> +    if (pm)
> +       core_combine_over_u_avx2_mask (pd, ps, pm, w);
> +    else
> +       core_combine_over_u_avx2_no_mask (pd, ps, w);
> +}
> +
> +static void
> +avx2_combine_over_reverse_u (pixman_implementation_t *imp,
> +                            pixman_op_t              op,
> +                            uint32_t *               pd,
> +                            const uint32_t *         ps,
> +                            const uint32_t *         pm,
> +                            int                      w)
> +{
> +    uint32_t s, d;
> +
> +    __m256i ymm_dst_lo, ymm_dst_hi;
> +    __m256i ymm_src_lo, ymm_src_hi;
> +    __m256i ymm_alpha_lo, ymm_alpha_hi;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w &&
> +          ((uintptr_t)pd & 31))
> +    {
> +       d = *pd;
> +       s = combine1 (ps, pm);
> +
> +       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
> +       w--;
> +       ps++;
> +       if (pm)
> +           pm++;
> +    }
> +
> +    while (w >= 8)
> +    {
> +       ymm_src_hi = combine8 ((__m256i*)ps, (__m256i*)pm);
> +       ymm_dst_hi = load_256_aligned ((__m256i*) pd);
> +
> +       unpack_256_2x256 (ymm_src_hi, &ymm_src_lo, &ymm_src_hi);
> +       unpack_256_2x256 (ymm_dst_hi, &ymm_dst_lo, &ymm_dst_hi);
> +
> +       expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi,
> +                           &ymm_alpha_lo, &ymm_alpha_hi);
> +
> +       over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
> +                   &ymm_alpha_lo, &ymm_alpha_hi,
> +                   &ymm_src_lo, &ymm_src_hi);
> +
> +       /* rebuid the 4 pixel data and save*/
> +       save_256_aligned ((__m256i*)pd,
> +                         pack_2x256_256 (ymm_src_lo, ymm_src_hi));
> +
> +       w -= 8;
> +       ps += 8;
> +       pd += 8;
> +
> +       if (pm)
> +           pm += 8;
> +    }
> +
> +    while (w)
> +    {
> +       d = *pd;
> +       s = combine1 (ps, pm);
> +
> +       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
> +       ps++;
> +       w--;
> +       if (pm)
> +           pm++;
> +    }
> +}
> +
> +static void
> +avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
> +                               pixman_composite_info_t *info)
> +{
> +    PIXMAN_COMPOSITE_ARGS (info);
> +    int dst_stride, src_stride;
> +    uint32_t    *dst_line, *dst;
> +    uint32_t    *src_line, *src;
> +
> +    PIXMAN_IMAGE_GET_LINE (
> +       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
> +    PIXMAN_IMAGE_GET_LINE (
> +       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
> +
> +    dst = dst_line;
> +    src = src_line;
> +
> +    while (height--)
> +    {
> +       avx2_combine_over_u (imp, op, dst, src, NULL, width);
> +
> +       dst += dst_stride;
> +       src += src_stride;
> +    }
> +}

Should have a blank line here

>  static const pixman_fast_path_t avx2_fast_paths[] =
>  {
> +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, avx2_composite_over_8888_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, avx2_composite_over_8888_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, avx2_composite_over_8888_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, avx2_composite_over_8888_8888),
>      { PIXMAN_OP_NONE },
>  };
>
> -static const pixman_iter_info_t avx2_iters[] =
> +static const pixman_iter_info_t avx2_iters[] =

There was stray whitespace added in the earlier commit here, and now
removed. Please just remove it from the original commit.

>  {
>      { PIXMAN_null },
>  };
> @@ -26,6 +452,9 @@ _pixman_implementation_create_avx2 (pixman_implementation_t *fallback)
>      pixman_implementation_t *imp = _pixman_implementation_create (fallback, avx2_fast_paths);
>
>      /* Set up function pointers */
> +    imp->combine_32[PIXMAN_OP_OVER] = avx2_combine_over_u;
> +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = avx2_combine_over_reverse_u;
> +

More stray spaces


>      imp->iter_info = avx2_iters;
>
>      return imp;
> --
> 2.17.1
>
> _______________________________________________
> Pixman mailing list
> Pixman@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/pixman
Devulapalli, Raghuveer Jan. 23, 2019, 2:40 p.m.
Hmm, I wonder why none of the cairo perf traces show any improvement when the low-level benchmark in pixman shows a 1.8x improvement. I would like to investigate that further to understand what's happening (I might need your help in doing so). Please do include the avx2_composite_over_reverse_n_8888 patch. If you know of specific functions where AVX2 is likely to help, I am happy to work with you to add AVX2 versions of those.

For the ones I did submit, would you like me to fix the patch which adds the AVX2 infrastructure and resubmit it? And do you still want the patch which splits the SSE2 helper functions into a separate file?

Thanks for your help. 

Petr Kobalíček Jan. 24, 2019, 11:45 a.m.
To identify a performance problem I would look at it from the following
perspective.

SSE implementation:
  1. Alignment loop [alignment to 16 byte boundary] (max 3 pixels)
  2. Inner loop (4+ pixels)
  3. Trailing loop (max 3 pixels).

  Worst case: 6 pixels avoiding inner loop completely.
  Minimum number of pixels to enter inner loop: 4.

AVX2 implementation:
  1. Alignment loop [alignment to 32 byte boundary] (max 7 pixels)
  2. Inner loop (8+ pixels)
  3. Trailing loop (max 7 pixels).

  Worst case: 14 pixels avoiding inner loop completely.
  Minimum number of pixels to enter inner loop: 8.

Based on these observations I would suggest doing this smarter and
introducing a 4-pixel case that could speed up spans that would normally
never reach the inner loop at all, or that would spend too much time in
the single-pixel loops. The reason is simple - a single iteration of the
per-pixel loop probably costs about as much as an iteration of the inner
loop, because of the tight dependency chain and the impossibility of
executing anything in parallel in the single-pixel case.
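
A minimal sketch of such a 4-pixel step for the no-mask path, placed
between the 8-pixel inner loop and the per-pixel tail (this assumes the
128-bit helpers load_128_unaligned, is_zero, unpack_128_2x128,
expand_alpha_2x128, over_2x128, pack_2x128_128 and save_128_unaligned
are exported by pixman-sse2.h; untested, for illustration only):

    if (w >= 4)
    {
        __m128i src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            __m128i dst = load_128_unaligned ((__m128i *)pd);
            __m128i src_lo, src_hi, dst_lo, dst_hi;
            __m128i alpha_lo, alpha_hi;

            unpack_128_2x128 (src, &src_lo, &src_hi);
            unpack_128_2x128 (dst, &dst_lo, &dst_hi);
            expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
            over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                        &dst_lo, &dst_hi);
            save_128_unaligned ((__m128i *)pd,
                                pack_2x128_128 (dst_lo, dst_hi));
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }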

Gradually decreasing the array size in the initial benchmark down to some
small width like 4 would reveal the problems with the current
implementation. I think a minimum value of 400 pixels is just too high to
draw any reasonable conclusion about the performance of such an
optimization, because 2D work usually consists of smaller fills/blits most
of the time.

- Petr


On Wed, Jan 23, 2019 at 3:40 PM Devulapalli, Raghuveer <
raghuveer.devulapalli@intel.com> wrote:

> Hmm, I wonder why none of the cairo perf traces don’t show any improvement
> when the low level benchmark in PIXMAN shows 1.8x improvement. I would like
> to investigate that further to know what's happening (might need your help
> in doing so). Please do include the avx2_composite_over_reverse_n_8888
> patch. If you know specific functions where avx2 is likely to benefit, I am
> happy to work with you to add those functionalities in avx2.
>
> For the ones I did submit, would you like me to fix the patch which adds
> the avx2 infrastructure and submit that again? And do you still want the
> patch which splits sse2 helpers functions into a separate file?
>
> Thanks for your help.
>
> -----Original Message-----
> From: Matt Turner [mailto:mattst88@gmail.com]
> Sent: Monday, January 21, 2019 5:46 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: pixman@lists.freedesktop.org
> Subject: Re: [Pixman] [PATCH 3/3] Rev2 of patch: AVX2 versions of OVER and
> ROVER operators.
>
> On Wed, Jan 16, 2019 at 4:57 PM Raghuveer Devulapalli <
> raghuveer.devulapalli@intel.com> wrote:
> >
> > From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
> >
> > These were found to be upto 1.8 times faster (depending on the array
> > size) than the corresponding SSE2 version. The AVX2 and SSE2 were
> > benchmarked on a Intel(R) Core(TM) i5-6260U CPU @ 1.80GHz. The AVX2
> > and SSE versions were benchmarked by measuring how many TSC cycles
> > each of the avx2_combine_over_u and sse2_combine_over_u functions took
> > to run for various array sizes. For the purpose of benchmarking, turbo
> > was disabled and intel_pstate governor was set to performance to avoid
> > variance in CPU frequencies across multiple runs.
> >
> > | Array size | #cycles SSE2 | #cycles AVX2 |
> > --------------------------------------------
> > | 400        | 53966        | 32800        |
> > | 800        | 107595       | 62595        |
> > | 1600       | 214810       | 122482       |
> > | 3200       | 429748       | 241971       |
> > | 6400       | 859070       | 481076       |
> >
> > Also ran lowlevel-blt-bench for OVER_8888_8888 operation and that also
> > shows a 1.55x-1.79x improvement over SSE2. Here are the details:
> >
> > AVX2: OVER_8888_8888 =  L1:2136.35  L2:2109.46  M:1751.99 ( 60.90%)
> > SSE2: OVER_8888_8888 =  L1:1188.91  L2:1190.63  M:1128.32 ( 40.31%)
> >
> > The AVX2 implementation uses the SSE2 version for manipulating pixels
> > that are not 32 byte aligned. The helper functions from pixman-sse2.h
> > are re-used for this purpose.
>
> I still cannot measure any performance improvement with cairo-traces.
> If we're not improving performance in any real world application, then I
> don't think it's worth adding a significant amount of code.
>
> As I told you in person and in private mail, I suspect that you're more
> likely to see real performance improvements in operations that are more
> compute-heavy, like bilinear filtering. You could probably use AVX2's
> gather instructions in the bilinear code as well. Filling out the
> avx2_iters array would also be a good place to start, since those functions
> execute when we do not have a specific fast-path for an operation (which
> will be the case for AVX2).
>
> I sense that you want to check this off your todo list and move on. If
> that's the case, we can include the avx2_composite_over_reverse_n_8888
> function I wrote (and will send as a separate patch) to confirm that using
> AVX2 is capable of giving a performance improvement in some cairo traces.
>
> > ---
> >  pixman/pixman-avx2.c | 431
> > ++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 430 insertions(+), 1 deletion(-)
> >
> > diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c index
> > d860d67..faef552 100644
> > --- a/pixman/pixman-avx2.c
> > +++ b/pixman/pixman-avx2.c
> > @@ -6,13 +6,439 @@
> >  #include "pixman-private.h"
> >  #include "pixman-combine32.h"
> >  #include "pixman-inlines.h"
> > +#include "pixman-sse2.h"
> >
> > +#define MASK_0080_AVX2 _mm256_set1_epi16(0x0080) #define
> > +MASK_00FF_AVX2 _mm256_set1_epi16(0x00ff) #define MASK_0101_AVX2
> > +_mm256_set1_epi16(0x0101)
> > +
> > +static force_inline __m256i
> > +load_256_aligned (__m256i* src)
> > +{
> > +    return _mm256_load_si256(src);
> > +}
> > +
> > +static force_inline void
> > +negate_2x256 (__m256i  data_lo,
> > +             __m256i  data_hi,
> > +             __m256i* neg_lo,
> > +             __m256i* neg_hi)
> > +{
> > +    *neg_lo = _mm256_xor_si256 (data_lo, MASK_00FF_AVX2);
> > +    *neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2); }
> > +
> > +static force_inline __m256i
> > +pack_2x256_256 (__m256i lo, __m256i hi) {
> > +    return _mm256_packus_epi16 (lo, hi); }
> > +
>
> Stray space
>
> > +static force_inline void
> > +pix_multiply_2x256 (__m256i* data_lo,
> > +                   __m256i* data_hi,
> > +                   __m256i* alpha_lo,
> > +                   __m256i* alpha_hi,
> > +                   __m256i* ret_lo,
> > +                   __m256i* ret_hi)
> > +{
> > +    __m256i lo, hi;
> > +
> > +    lo = _mm256_mullo_epi16 (*data_lo, *alpha_lo);
> > +    hi = _mm256_mullo_epi16 (*data_hi, *alpha_hi);
> > +    lo = _mm256_adds_epu16 (lo, MASK_0080_AVX2);
> > +    hi = _mm256_adds_epu16 (hi, MASK_0080_AVX2);
> > +    *ret_lo = _mm256_mulhi_epu16 (lo, MASK_0101_AVX2);
> > +    *ret_hi = _mm256_mulhi_epu16 (hi, MASK_0101_AVX2); }
> > +
>
> Stray space
>
> > +static force_inline void
> > +over_2x256 (__m256i* src_lo,
> > +           __m256i* src_hi,
> > +           __m256i* alpha_lo,
> > +           __m256i* alpha_hi,
> > +           __m256i* dst_lo,
> > +           __m256i* dst_hi)
> > +{
> > +    __m256i t1, t2;
> > +
> > +    negate_2x256 (*alpha_lo, *alpha_hi, &t1, &t2);
> > +
> > +    pix_multiply_2x256 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
> > +
> > +    *dst_lo = _mm256_adds_epu8 (*src_lo, *dst_lo);
> > +    *dst_hi = _mm256_adds_epu8 (*src_hi, *dst_hi); }
> > +
> > +static force_inline void
> > +expand_alpha_2x256 (__m256i  data_lo,
> > +                   __m256i  data_hi,
> > +                   __m256i* alpha_lo,
> > +                   __m256i* alpha_hi) {
> > +    __m256i lo, hi;
> > +
> > +    lo = _mm256_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
> > +    hi = _mm256_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
> > +
> > +    *alpha_lo = _mm256_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
> > +    *alpha_hi = _mm256_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3,
> > +3)); }
> > +
> > +static force_inline  void
> > +unpack_256_2x256 (__m256i data, __m256i* data_lo, __m256i* data_hi) {
> > +    *data_lo = _mm256_unpacklo_epi8 (data, _mm256_setzero_si256 ());
> > +    *data_hi = _mm256_unpackhi_epi8 (data, _mm256_setzero_si256 ());
> > +}
> > +
> > +/* save 4 pixels on a 16-byte boundary aligned address */ static
> > +force_inline void save_256_aligned (__m256i* dst,
> > +                 __m256i  data)
> > +{
> > +    _mm256_store_si256 (dst, data);
> > +}
> > +
> > +static force_inline int
> > +is_opaque_256 (__m256i x)
> > +{
> > +    __m256i ffs = _mm256_cmpeq_epi8 (x, x);
> > +
> > +    return (_mm256_movemask_epi8
> > +           (_mm256_cmpeq_epi8 (x, ffs)) & 0x88888888) == 0x88888888;
> > +}
> > +
> > +static force_inline int
> > +is_zero_256 (__m256i x)
> > +{
> > +    return _mm256_movemask_epi8 (
> > +       _mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) == 0xffffffff;
> > +}
> > +
> > +static force_inline int
> > +is_transparent_256 (__m256i x)
> > +{
> > +    return (_mm256_movemask_epi8 (
> > +               _mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) & 0x88888888)
> > +                == 0x88888888;
> > +}
> > +
> > +
>
> Extra newline
>
> > +/* load 4 pixels from a unaligned address */
> > +static force_inline __m256i
> > +load_256_unaligned (const __m256i* src)
> > +{
> > +    return _mm256_loadu_si256 (src);
> > +}
> > +
> > +static force_inline __m256i
> > +combine8 (const __m256i *ps, const __m256i *pm)
> > +{
> > +    __m256i ymm_src_lo, ymm_src_hi;
> > +    __m256i ymm_msk_lo, ymm_msk_hi;
> > +    __m256i s;
> > +
> > +    if (pm)
> > +    {
> > +       ymm_msk_lo = load_256_unaligned (pm);
> > +
> > +       if (is_transparent_256 (ymm_msk_lo))
> > +           return _mm256_setzero_si256 ();
> > +    }
> > +
> > +    s = load_256_unaligned (ps);
> > +
> > +    if (pm)
> > +    {
> > +       unpack_256_2x256 (s, &ymm_src_lo, &ymm_src_hi);
> > +       unpack_256_2x256 (ymm_msk_lo, &ymm_msk_lo, &ymm_msk_hi);
> > +
> > +       expand_alpha_2x256 (ymm_msk_lo, ymm_msk_hi, &ymm_msk_lo, &ymm_msk_hi);
> > +
> > +       pix_multiply_2x256 (&ymm_src_lo, &ymm_src_hi,
> > +                           &ymm_msk_lo, &ymm_msk_hi,
> > +                           &ymm_src_lo, &ymm_src_hi);
> > +
> > +       s = pack_2x256_256 (ymm_src_lo, ymm_src_hi);
> > +    }
> > +
> > +    return s;
> > +}
> > +
> > +static force_inline void
> > +core_combine_over_u_avx2_mask (uint32_t *        pd,
> > +                              const uint32_t*    ps,
> > +                              const uint32_t*    pm,
> > +                              int                w)
> > +{
> > +    uint32_t s, d;
> > +    while (w && ((uintptr_t)pd & 31))
> > +    {
> > +       d = *pd;
> > +       s = combine1 (ps, pm);
> > +
> > +       if (s)
> > +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> > +       pd++;
> > +       ps++;
> > +       pm++;
> > +       w--;
> > +    }
> > +
>
> Is the alignment loop here actually important for performance? As far as I
> know unaligned loads are cheap on all CPUs with AVX2. We might be able to
> do unaligned loads/stores in the loop below and get into the vectorized
> code sooner.
>
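A rough, untested sketch of that suggestion for the masked loop is below. It
assumes a new save_256_unaligned helper (not part of this patch) that simply
wraps _mm256_storeu_si256; everything else reuses the helpers already
introduced above. The scalar alignment prologue goes away and the existing
tail loop still picks up the last (w % 8) pixels:

/* hypothetical helper, not in this patch: unaligned 256-bit store */
static force_inline void
save_256_unaligned (__m256i* dst, __m256i data)
{
    _mm256_storeu_si256 (dst, data);
}

    /* vectorized loop without the scalar alignment prologue:
     * all accesses go through unaligned loads/stores */
    while (w >= 8)
    {
        __m256i mask = load_256_unaligned ((__m256i *)pm);

        if (!is_zero_256 (mask))
        {
            __m256i src = load_256_unaligned ((__m256i *)ps);
            __m256i src_lo, src_hi, mask_lo, mask_hi;
            __m256i alpha_lo, alpha_hi;

            if (is_opaque_256 (_mm256_and_si256 (src, mask)))
            {
                save_256_unaligned ((__m256i *)pd, src);
            }
            else
            {
                __m256i dst = load_256_unaligned ((__m256i *)pd);
                __m256i dst_lo, dst_hi;

                unpack_256_2x256 (mask, &mask_lo, &mask_hi);
                unpack_256_2x256 (src, &src_lo, &src_hi);
                expand_alpha_2x256 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x256 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_256_2x256 (dst, &dst_lo, &dst_hi);
                expand_alpha_2x256 (src_lo, src_hi, &alpha_lo, &alpha_hi);
                over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_256_unaligned ((__m256i *)pd,
                                    pack_2x256_256 (dst_lo, dst_hi));
            }
        }
        pm += 8;
        ps += 8;
        pd += 8;
        w -= 8;
    }

Whether that is actually a win probably depends on how often the rows end up
straddling cache lines, so it would need to be re-benchmarked against the
aligned variant.
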
> > +    /*
> > +     * dst is 32 byte aligned, and w >=8 means the next 256 bits
> > +     * contain relevant data
> > +    */
> > +
>
> Stray whitespace, and */ is not aligned
>
> > +    while (w >= 8)
> > +    {
> > +       __m256i mask = load_256_unaligned ((__m256i *)pm);
> > +
> > +       if (!is_zero_256 (mask))
> > +       {
> > +           __m256i src;
> > +           __m256i src_hi, src_lo;
> > +           __m256i mask_hi, mask_lo;
> > +           __m256i alpha_hi, alpha_lo;
> > +
> > +           src = load_256_unaligned ((__m256i *)ps);
> > +
> > +           if (is_opaque_256 (_mm256_and_si256 (src, mask)))
> > +           {
> > +               save_256_aligned ((__m256i *)pd, src);
> > +           }
> > +           else
> > +           {
> > +               __m256i dst = load_256_aligned ((__m256i *)pd);
> > +               __m256i dst_hi, dst_lo;
> > +
> > +               unpack_256_2x256 (mask, &mask_lo, &mask_hi);
> > +               unpack_256_2x256 (src, &src_lo, &src_hi);
> > +
> > +               expand_alpha_2x256 (mask_lo, mask_hi, &mask_lo, &mask_hi);
> > +               pix_multiply_2x256 (&src_lo, &src_hi,
> > +                                   &mask_lo, &mask_hi,
> > +                                   &src_lo, &src_hi);
> > +
>
> Stray spaces again
>
> > +               unpack_256_2x256 (dst, &dst_lo, &dst_hi);
> > +               expand_alpha_2x256 (src_lo, src_hi,
> > +                                   &alpha_lo, &alpha_hi);
> > +
> > +               over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
> > +                           &dst_lo, &dst_hi);
> > +
> > +               save_256_aligned (
> > +                   (__m256i *)pd,
> > +                   pack_2x256_256 (dst_lo, dst_hi));
> > +           }
> > +       }
> > +       pm += 8;
> > +       ps += 8;
> > +       pd += 8;
> > +       w -= 8;
> > +    }
> > +
> > +    while (w)
> > +    {
> > +       d = *pd;
> > +       s = combine1 (ps, pm);
> > +
> > +       if (s)
> > +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> > +       pd++;
> > +       ps++;
> > +       pm++;
> > +       w--;
> > +    }
> > +}
> > +
> > +static force_inline void
> > +core_combine_over_u_avx2_no_mask (uint32_t *        pd,
> > +                                 const uint32_t*    ps,
> > +                                 int                w)
> > +{
> > +    uint32_t s, d;
> > +
> > +    /* Align dst on a 16-byte boundary */
> > +    while (w && ((uintptr_t)pd & 31))
> > +    {
> > +       d = *pd;
> > +       s = *ps;
> > +
> > +       if (s)
> > +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> > +       pd++;
> > +       ps++;
> > +       w--;
> > +    }
> > +
> > +    while (w >= 8)
> > +    {
> > +       __m256i src;
> > +       __m256i src_hi, src_lo, dst_hi, dst_lo;
> > +       __m256i alpha_hi, alpha_lo;
> > +
> > +       src = load_256_unaligned ((__m256i *)ps);
> > +
> > +       if (!is_zero_256 (src))
> > +       {
> > +           if (is_opaque_256 (src))
> > +           {
> > +               save_256_aligned ((__m256i *)pd, src);
> > +           }
> > +           else
> > +           {
> > +               __m256i dst = load_256_aligned ((__m256i *)pd);
> > +
> > +               unpack_256_2x256 (src, &src_lo, &src_hi);
> > +               unpack_256_2x256 (dst, &dst_lo, &dst_hi);
> > +
> > +               expand_alpha_2x256 (src_lo, src_hi,
> > +                                   &alpha_lo, &alpha_hi);
> > +               over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
> > +                           &dst_lo, &dst_hi);
> > +
> > +               save_256_aligned (
> > +                   (__m256i *)pd,
> > +                   pack_2x256_256 (dst_lo, dst_hi));
> > +           }
> > +       }
> > +
> > +       ps += 8;
> > +       pd += 8;
> > +       w -= 8;
> > +    }
>
> Should have a blank line here
>
> > +    while (w)
> > +    {
> > +       d = *pd;
> > +       s = *ps;
> > +
> > +       if (s)
> > +           *pd = core_combine_over_u_pixel_sse2 (s, d);
> > +       pd++;
> > +       ps++;
> > +       w--;
> > +    }
> > +}
> > +
> > +static force_inline void
> > +avx2_combine_over_u (pixman_implementation_t *imp,
> > +                    pixman_op_t              op,
> > +                    uint32_t *               pd,
> > +                    const uint32_t *         ps,
> > +                    const uint32_t *         pm,
> > +                    int                      w)
> > +{
> > +    if (pm)
> > +       core_combine_over_u_avx2_mask (pd, ps, pm, w);
> > +    else
> > +       core_combine_over_u_avx2_no_mask (pd, ps, w);
> > +}
> > +
> > +static void
> > +avx2_combine_over_reverse_u (pixman_implementation_t *imp,
> > +                            pixman_op_t              op,
> > +                            uint32_t *               pd,
> > +                            const uint32_t *         ps,
> > +                            const uint32_t *         pm,
> > +                            int                      w)
> > +{
> > +    uint32_t s, d;
> > +
> > +    __m256i ymm_dst_lo, ymm_dst_hi;
> > +    __m256i ymm_src_lo, ymm_src_hi;
> > +    __m256i ymm_alpha_lo, ymm_alpha_hi;
> > +
> > +    /* Align dst on a 16-byte boundary */
> > +    while (w &&
> > +          ((uintptr_t)pd & 31))
> > +    {
> > +       d = *pd;
> > +       s = combine1 (ps, pm);
> > +
> > +       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
> > +       w--;
> > +       ps++;
> > +       if (pm)
> > +           pm++;
> > +    }
> > +
> > +    while (w >= 8)
> > +    {
> > +       ymm_src_hi = combine8 ((__m256i*)ps, (__m256i*)pm);
> > +       ymm_dst_hi = load_256_aligned ((__m256i*) pd);
> > +
> > +       unpack_256_2x256 (ymm_src_hi, &ymm_src_lo, &ymm_src_hi);
> > +       unpack_256_2x256 (ymm_dst_hi, &ymm_dst_lo, &ymm_dst_hi);
> > +
> > +       expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi,
> > +                           &ymm_alpha_lo, &ymm_alpha_hi);
> > +
> > +       over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
> > +                   &ymm_alpha_lo, &ymm_alpha_hi,
> > +                   &ymm_src_lo, &ymm_src_hi);
> > +
> > +       /* rebuid the 4 pixel data and save*/
> > +       save_256_aligned ((__m256i*)pd,
> > +                         pack_2x256_256 (ymm_src_lo, ymm_src_hi));
> > +
> > +       w -= 8;
> > +       ps += 8;
> > +       pd += 8;
> > +
> > +       if (pm)
> > +           pm += 8;
> > +    }
> > +
> > +    while (w)
> > +    {
> > +       d = *pd;
> > +       s = combine1 (ps, pm);
> > +
> > +       *pd++ = core_combine_over_u_pixel_sse2 (d, s);
> > +       ps++;
> > +       w--;
> > +       if (pm)
> > +           pm++;
> > +    }
> > +}
> > +
> > +static void
> > +avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
> > +                               pixman_composite_info_t *info)
> > +{
> > +    PIXMAN_COMPOSITE_ARGS (info);
> > +    int dst_stride, src_stride;
> > +    uint32_t    *dst_line, *dst;
> > +    uint32_t    *src_line, *src;
> > +
> > +    PIXMAN_IMAGE_GET_LINE (
> > +       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
> > +    PIXMAN_IMAGE_GET_LINE (
> > +       src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
> > +
> > +    dst = dst_line;
> > +    src = src_line;
> > +
> > +    while (height--)
> > +    {
> > +       avx2_combine_over_u (imp, op, dst, src, NULL, width);
> > +
> > +       dst += dst_stride;
> > +       src += src_stride;
> > +    }
> > +}
>
> Should have a blank line here
>
> >  static const pixman_fast_path_t avx2_fast_paths[] =
> >  {
> > +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, avx2_composite_over_8888_8888),
> > +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, avx2_composite_over_8888_8888),
> > +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, avx2_composite_over_8888_8888),
> > +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, avx2_composite_over_8888_8888),
> >      { PIXMAN_OP_NONE },
> >  };
> >
> > -static const pixman_iter_info_t avx2_iters[] =
> > +static const pixman_iter_info_t avx2_iters[] =
>
> There was stray whitespace added in the earlier commit here, and now
> removed. Please just remove it from the original commit.
>
> >  {
> >      { PIXMAN_null },
> >  };
> > @@ -26,6 +452,9 @@ _pixman_implementation_create_avx2 (pixman_implementation_t *fallback)
> >      pixman_implementation_t *imp = _pixman_implementation_create (fallback, avx2_fast_paths);
> >
> >      /* Set up function pointers */
> > +    imp->combine_32[PIXMAN_OP_OVER] = avx2_combine_over_u;
> > +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = avx2_combine_over_reverse_u;
> > +
>
> More stray spaces
>
>
> >      imp->iter_info = avx2_iters;
> >
> >      return imp;
> > --
> > 2.17.1
> >
> _______________________________________________
> Pixman mailing list
> Pixman@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/pixman
>