[2/3] Moving helper functions in pixman-sse2.c to pixman-sse2.h.

Submitted by Devulapalli, Raghuveer on Jan. 17, 2019, 12:59 a.m.

Details

Message ID: 20190117010000.21632-2-raghuveer.devulapalli@intel.com
State: New
Series: "Series without cover letter" (rev 1) in Pixman


Commit Message

Devulapalli, Raghuveer Jan. 17, 2019, 12:59 a.m.
From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>

These helper functions will be reused in pixman-avx2.c implementations in
the future.
---
 pixman/pixman-sse2.c | 504 +------------------------------------------
 pixman/pixman-sse2.h | 502 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 503 insertions(+), 503 deletions(-)
 create mode 100644 pixman/pixman-sse2.h
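
For context, the intended reuse is presumably just an include from a future
pixman-avx2.c (hypothetical; that file is not part of this patch):

    /* pixman-avx2.c (future): reuse the shared helper inlines */
    #include "pixman-sse2.h"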


diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 8955103..8dea0c2 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -32,509 +32,7 @@ 
 
 /* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
 #define PSHUFD_IS_FAST 0
-
-#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
-#include <emmintrin.h> /* for SSE2 intrinsics */
-#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-inlines.h"
-
-static __m128i mask_0080;
-static __m128i mask_00ff;
-static __m128i mask_0101;
-static __m128i mask_ffff;
-static __m128i mask_ff000000;
-static __m128i mask_alpha;
-
-static __m128i mask_565_r;
-static __m128i mask_565_g1, mask_565_g2;
-static __m128i mask_565_b;
-static __m128i mask_red;
-static __m128i mask_green;
-static __m128i mask_blue;
-
-static __m128i mask_565_fix_rb;
-static __m128i mask_565_fix_g;
-
-static __m128i mask_565_rb;
-static __m128i mask_565_pack_multiplier;
-
-static force_inline __m128i
-unpack_32_1x128 (uint32_t data)
-{
-    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
-}
-
-static force_inline void
-unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
-{
-    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
-    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
-}
-
-static force_inline __m128i
-unpack_565_to_8888 (__m128i lo)
-{
-    __m128i r, g, b, rb, t;
-
-    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
-    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
-    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
-
-    rb = _mm_or_si128 (r, b);
-    t  = _mm_and_si128 (rb, mask_565_fix_rb);
-    t  = _mm_srli_epi32 (t, 5);
-    rb = _mm_or_si128 (rb, t);
-
-    t  = _mm_and_si128 (g, mask_565_fix_g);
-    t  = _mm_srli_epi32 (t, 6);
-    g  = _mm_or_si128 (g, t);
-
-    return _mm_or_si128 (rb, g);
-}
-
-static force_inline void
-unpack_565_128_4x128 (__m128i  data,
-                      __m128i* data0,
-                      __m128i* data1,
-                      __m128i* data2,
-                      __m128i* data3)
-{
-    __m128i lo, hi;
-
-    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
-    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
-
-    lo = unpack_565_to_8888 (lo);
-    hi = unpack_565_to_8888 (hi);
-
-    unpack_128_2x128 (lo, data0, data1);
-    unpack_128_2x128 (hi, data2, data3);
-}
-
-static force_inline uint16_t
-pack_565_32_16 (uint32_t pixel)
-{
-    return (uint16_t) (((pixel >> 8) & 0xf800) |
-		       ((pixel >> 5) & 0x07e0) |
-		       ((pixel >> 3) & 0x001f));
-}
-
-static force_inline __m128i
-pack_2x128_128 (__m128i lo, __m128i hi)
-{
-    return _mm_packus_epi16 (lo, hi);
-}
-
-static force_inline __m128i
-pack_565_2packedx128_128 (__m128i lo, __m128i hi)
-{
-    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
-    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
-
-    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
-    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
-
-    __m128i g0 = _mm_and_si128 (lo, mask_green);
-    __m128i g1 = _mm_and_si128 (hi, mask_green);
-
-    t0 = _mm_or_si128 (t0, g0);
-    t1 = _mm_or_si128 (t1, g1);
-
-    /* Simulates _mm_packus_epi32 */
-    t0 = _mm_slli_epi32 (t0, 16 - 5);
-    t1 = _mm_slli_epi32 (t1, 16 - 5);
-    t0 = _mm_srai_epi32 (t0, 16);
-    t1 = _mm_srai_epi32 (t1, 16);
-    return _mm_packs_epi32 (t0, t1);
-}
-
-static force_inline __m128i
-pack_565_2x128_128 (__m128i lo, __m128i hi)
-{
-    __m128i data;
-    __m128i r, g1, g2, b;
-
-    data = pack_2x128_128 (lo, hi);
-
-    r  = _mm_and_si128 (data, mask_565_r);
-    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
-    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
-    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
-
-    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
-}
-
-static force_inline __m128i
-pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
-{
-    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
-			     pack_565_2x128_128 (*xmm2, *xmm3));
-}
-
-static force_inline int
-is_opaque (__m128i x)
-{
-    __m128i ffs = _mm_cmpeq_epi8 (x, x);
-
-    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
-}
-
-static force_inline int
-is_zero (__m128i x)
-{
-    return _mm_movemask_epi8 (
-	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
-}
-
-static force_inline int
-is_transparent (__m128i x)
-{
-    return (_mm_movemask_epi8 (
-		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
-}
-
-static force_inline __m128i
-expand_pixel_32_1x128 (uint32_t data)
-{
-    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
-}
-
-static force_inline __m128i
-expand_alpha_1x128 (__m128i data)
-{
-    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
-						     _MM_SHUFFLE (3, 3, 3, 3)),
-				_MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_2x128 (__m128i  data_lo,
-                    __m128i  data_hi,
-                    __m128i* alpha_lo,
-                    __m128i* alpha_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
-
-    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
-    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
-}
-
-static force_inline void
-expand_alpha_rev_2x128 (__m128i  data_lo,
-                        __m128i  data_hi,
-                        __m128i* alpha_lo,
-                        __m128i* alpha_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
-    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
-    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline void
-pix_multiply_2x128 (__m128i* data_lo,
-                    __m128i* data_hi,
-                    __m128i* alpha_lo,
-                    __m128i* alpha_hi,
-                    __m128i* ret_lo,
-                    __m128i* ret_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
-    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
-    lo = _mm_adds_epu16 (lo, mask_0080);
-    hi = _mm_adds_epu16 (hi, mask_0080);
-    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
-    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
-}
-
-static force_inline void
-pix_add_multiply_2x128 (__m128i* src_lo,
-                        __m128i* src_hi,
-                        __m128i* alpha_dst_lo,
-                        __m128i* alpha_dst_hi,
-                        __m128i* dst_lo,
-                        __m128i* dst_hi,
-                        __m128i* alpha_src_lo,
-                        __m128i* alpha_src_hi,
-                        __m128i* ret_lo,
-                        __m128i* ret_hi)
-{
-    __m128i t1_lo, t1_hi;
-    __m128i t2_lo, t2_hi;
-
-    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
-    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
-
-    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
-    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
-}
-
-static force_inline void
-negate_2x128 (__m128i  data_lo,
-              __m128i  data_hi,
-              __m128i* neg_lo,
-              __m128i* neg_hi)
-{
-    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
-    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
-}
-
-static force_inline void
-invert_colors_2x128 (__m128i  data_lo,
-                     __m128i  data_hi,
-                     __m128i* inv_lo,
-                     __m128i* inv_hi)
-{
-    __m128i lo, hi;
-
-    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
-    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
-    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
-    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline void
-over_2x128 (__m128i* src_lo,
-            __m128i* src_hi,
-            __m128i* alpha_lo,
-            __m128i* alpha_hi,
-            __m128i* dst_lo,
-            __m128i* dst_hi)
-{
-    __m128i t1, t2;
-
-    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
-
-    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
-
-    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
-    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
-}
-
-static force_inline void
-over_rev_non_pre_2x128 (__m128i  src_lo,
-                        __m128i  src_hi,
-                        __m128i* dst_lo,
-                        __m128i* dst_hi)
-{
-    __m128i lo, hi;
-    __m128i alpha_lo, alpha_hi;
-
-    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
-
-    lo = _mm_or_si128 (alpha_lo, mask_alpha);
-    hi = _mm_or_si128 (alpha_hi, mask_alpha);
-
-    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
-
-    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
-
-    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
-}
-
-static force_inline void
-in_over_2x128 (__m128i* src_lo,
-               __m128i* src_hi,
-               __m128i* alpha_lo,
-               __m128i* alpha_hi,
-               __m128i* mask_lo,
-               __m128i* mask_hi,
-               __m128i* dst_lo,
-               __m128i* dst_hi)
-{
-    __m128i s_lo, s_hi;
-    __m128i a_lo, a_hi;
-
-    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
-    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-
-    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
-}
-
-/* load 4 pixels from a 16-byte boundary aligned address */
-static force_inline __m128i
-load_128_aligned (__m128i* src)
-{
-    return _mm_load_si128 (src);
-}
-
-/* load 4 pixels from a unaligned address */
-static force_inline __m128i
-load_128_unaligned (const __m128i* src)
-{
-    return _mm_loadu_si128 (src);
-}
-
-/* save 4 pixels using Write Combining memory on a 16-byte
- * boundary aligned address
- */
-static force_inline void
-save_128_write_combining (__m128i* dst,
-                          __m128i  data)
-{
-    _mm_stream_si128 (dst, data);
-}
-
-/* save 4 pixels on a 16-byte boundary aligned address */
-static force_inline void
-save_128_aligned (__m128i* dst,
-                  __m128i  data)
-{
-    _mm_store_si128 (dst, data);
-}
-
-/* save 4 pixels on a unaligned address */
-static force_inline void
-save_128_unaligned (__m128i* dst,
-                    __m128i  data)
-{
-    _mm_storeu_si128 (dst, data);
-}
-
-static force_inline __m128i
-load_32_1x128 (uint32_t data)
-{
-    return _mm_cvtsi32_si128 (data);
-}
-
-static force_inline __m128i
-expand_alpha_rev_1x128 (__m128i data)
-{
-    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m128i
-expand_pixel_8_1x128 (uint8_t data)
-{
-    return _mm_shufflelo_epi16 (
-	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
-}
-
-static force_inline __m128i
-pix_multiply_1x128 (__m128i data,
-		    __m128i alpha)
-{
-    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
-					    mask_0080),
-			    mask_0101);
-}
-
-static force_inline __m128i
-pix_add_multiply_1x128 (__m128i* src,
-			__m128i* alpha_dst,
-			__m128i* dst,
-			__m128i* alpha_src)
-{
-    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
-    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
-
-    return _mm_adds_epu8 (t1, t2);
-}
-
-static force_inline __m128i
-negate_1x128 (__m128i data)
-{
-    return _mm_xor_si128 (data, mask_00ff);
-}
-
-static force_inline __m128i
-invert_colors_1x128 (__m128i data)
-{
-    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
-}
-
-static force_inline __m128i
-over_1x128 (__m128i src, __m128i alpha, __m128i dst)
-{
-    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
-}
-
-static force_inline __m128i
-in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
-{
-    return over_1x128 (pix_multiply_1x128 (*src, *mask),
-		       pix_multiply_1x128 (*alpha, *mask),
-		       *dst);
-}
-
-static force_inline __m128i
-over_rev_non_pre_1x128 (__m128i src, __m128i dst)
-{
-    __m128i alpha = expand_alpha_1x128 (src);
-
-    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
-					   _mm_or_si128 (alpha, mask_alpha)),
-		       alpha,
-		       dst);
-}
-
-static force_inline uint32_t
-pack_1x128_32 (__m128i data)
-{
-    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
-}
-
-static force_inline __m128i
-expand565_16_1x128 (uint16_t pixel)
-{
-    __m128i m = _mm_cvtsi32_si128 (pixel);
-
-    m = unpack_565_to_8888 (m);
-
-    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
-}
-
-static force_inline uint32_t
-core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
-{
-    uint8_t a;
-    __m128i xmms;
-
-    a = src >> 24;
-
-    if (a == 0xff)
-    {
-	return src;
-    }
-    else if (src)
-    {
-	xmms = unpack_32_1x128 (src);
-	return pack_1x128_32 (
-	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
-			unpack_32_1x128 (dst)));
-    }
-
-    return dst;
-}
-
-static force_inline uint32_t
-combine1 (const uint32_t *ps, const uint32_t *pm)
-{
-    uint32_t s = *ps;
-
-    if (pm)
-    {
-	__m128i ms, mm;
-
-	mm = unpack_32_1x128 (*pm);
-	mm = expand_alpha_1x128 (mm);
-
-	ms = unpack_32_1x128 (s);
-	ms = pix_multiply_1x128 (ms, mm);
-
-	s = pack_1x128_32 (ms);
-    }
-
-    return s;
-}
+#include "pixman-sse2.h"
 
 static force_inline __m128i
 combine4 (const __m128i *ps, const __m128i *pm)
diff --git a/pixman/pixman-sse2.h b/pixman/pixman-sse2.h
new file mode 100644
index 0000000..37105e4
--- /dev/null
+++ b/pixman/pixman-sse2.h
@@ -0,0 +1,502 @@ 
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+__m128i mask_0080;
+__m128i mask_00ff;
+__m128i mask_0101;
+__m128i mask_ffff;
+__m128i mask_ff000000;
+__m128i mask_alpha;
+
+__m128i mask_565_r;
+__m128i mask_565_g1, mask_565_g2;
+__m128i mask_565_b;
+__m128i mask_red;
+__m128i mask_green;
+__m128i mask_blue;
+
+__m128i mask_565_fix_rb;
+__m128i mask_565_fix_g;
+
+__m128i mask_565_rb;
+__m128i mask_565_pack_multiplier;
+
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
+}
+
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+    __m128i r, g, b, rb, t;
+
+    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+
+    rb = _mm_or_si128 (r, b);
+    t  = _mm_and_si128 (rb, mask_565_fix_rb);
+    t  = _mm_srli_epi32 (t, 5);
+    rb = _mm_or_si128 (rb, t);
+
+    t  = _mm_and_si128 (g, mask_565_fix_g);
+    t  = _mm_srli_epi32 (t, 6);
+    g  = _mm_or_si128 (g, t);
+
+    return _mm_or_si128 (rb, g);
+}
+
+static force_inline void
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
+{
+    __m128i lo, hi;
+
+    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+    lo = unpack_565_to_8888 (lo);
+    hi = unpack_565_to_8888 (hi);
+
+    unpack_128_2x128 (lo, data0, data1);
+    unpack_128_2x128 (hi, data2, data3);
+}
+
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+    return (uint16_t) (((pixel >> 8) & 0xf800) |
+		       ((pixel >> 5) & 0x07e0) |
+		       ((pixel >> 3) & 0x001f));
+}
+
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+    return _mm_packus_epi16 (lo, hi);
+}
+
+static force_inline __m128i
+pack_565_2packedx128_128 (__m128i lo, __m128i hi)
+{
+    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
+    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
+
+    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
+    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
+
+    __m128i g0 = _mm_and_si128 (lo, mask_green);
+    __m128i g1 = _mm_and_si128 (hi, mask_green);
+
+    t0 = _mm_or_si128 (t0, g0);
+    t1 = _mm_or_si128 (t1, g1);
+
+    /* Simulates _mm_packus_epi32 */
+    t0 = _mm_slli_epi32 (t0, 16 - 5);
+    t1 = _mm_slli_epi32 (t1, 16 - 5);
+    t0 = _mm_srai_epi32 (t0, 16);
+    t1 = _mm_srai_epi32 (t1, 16);
+    return _mm_packs_epi32 (t0, t1);
+}
+
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i data;
+    __m128i r, g1, g2, b;
+
+    data = pack_2x128_128 (lo, hi);
+
+    r  = _mm_and_si128 (data, mask_565_r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
+
+    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+			     pack_565_2x128_128 (*xmm2, *xmm3));
+}
+
+static force_inline int
+is_opaque (__m128i x)
+{
+    __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (__m128i x)
+{
+    return _mm_movemask_epi8 (
+	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
+}
+
+static force_inline int
+is_transparent (__m128i x)
+{
+    return (_mm_movemask_epi8 (
+		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
+}
+
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+						     _MM_SHUFFLE (3, 3, 3, 3)),
+				_MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+    lo = _mm_adds_epu16 (lo, mask_0080);
+    hi = _mm_adds_epu16 (hi, mask_0080);
+    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
+{
+    __m128i t1_lo, t1_hi;
+    __m128i t2_lo, t2_hi;
+
+    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
+}
+
+static force_inline void
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
+{
+    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
+}
+
+static force_inline void
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline void
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
+{
+    __m128i t1, t2;
+
+    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
+{
+    __m128i lo, hi;
+    __m128i alpha_lo, alpha_hi;
+
+    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+    lo = _mm_or_si128 (alpha_lo, mask_alpha);
+    hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
+
+    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
+{
+    __m128i s_lo, s_hi;
+    __m128i a_lo, a_hi;
+
+    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline __m128i
+load_128_aligned (__m128i* src)
+{
+    return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from a unaligned address */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+    return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+                          __m128i  data)
+{
+    _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (__m128i* dst,
+                  __m128i  data)
+{
+    _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels on a unaligned address */
+static force_inline void
+save_128_unaligned (__m128i* dst,
+                    __m128i  data)
+{
+    _mm_storeu_si128 (dst, data);
+}
+
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+		    __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+					    mask_0080),
+			    mask_0101);
+}
+
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+			__m128i* alpha_dst,
+			__m128i* dst,
+			__m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+		       pix_multiply_1x128 (*alpha, *mask),
+		       *dst);
+}
+
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+					   _mm_or_si128 (alpha, mask_alpha)),
+		       alpha,
+		       dst);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint8_t a;
+    __m128i xmms;
+
+    a = src >> 24;
+
+    if (a == 0xff)
+    {
+	return src;
+    }
+    else if (src)
+    {
+	xmms = unpack_32_1x128 (src);
+	return pack_1x128_32 (
+	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
+			unpack_32_1x128 (dst)));
+    }
+
+    return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    uint32_t s = *ps;
+
+    if (pm)
+    {
+	__m128i ms, mm;
+
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
+
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply_1x128 (ms, mm);
+
+	s = pack_1x128_32 (ms);
+    }
+
+    return s;
+}

Comments

Quoting Raghuveer Devulapalli (2019-01-17 00:59:59)
> From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
> 
> These helper function will be reused in pixman-avx2.c implementations in
> the future.

Are we ever going to run into a naming conflict in the future? Is it
worth prefixing all the inlines with sse2_? Probably makes sense so that
we can see the instruction set used when mixing later.
-Chris
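
A minimal sketch of that suggestion applied to one of the moved inlines (the
sse2_ name is illustrative, not code from the patch):

    /* Hypothetical: an sse2_ prefix makes the instruction set visible
     * at call sites once AVX2 code starts mixing these helpers in. */
    static force_inline __m128i
    sse2_unpack_32_1x128 (uint32_t data)
    {
        return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data),
                                  _mm_setzero_si128 ());
    }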
Sounds good to me. I made the minor fixes. I can submit new patches for merging if everything else looks okay. 

-----Original Message-----
From: Chris Wilson [mailto:chris@chris-wilson.co.uk]
Sent: Thursday, January 17, 2019 12:27 AM
To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>; pixman@lists.freedesktop.org
Cc: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
Subject: Re: [Pixman] [PATCH 2/3] Moving helper functions in pixman-sse2.c to pixman-sse2.h.

Quoting Raghuveer Devulapalli (2019-01-17 00:59:59)
> From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
> 
> These helper function will be reused in pixman-avx2.c implementations
> in the future.

Are we ever going to run into a naming conflict in the future? Is it worth prefixing all the inlines with sse2_? Probably makes sense so that we can see the instruction set used when mixing later.
-Chris
On Thu, Jan 17, 2019 at 12:27 AM Chris Wilson <chris@chris-wilson.co.uk> wrote:
>
> Quoting Raghuveer Devulapalli (2019-01-17 00:59:59)
> > From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
> >
> > These helper function will be reused in pixman-avx2.c implementations in
> > the future.
>
> Are we ever going to run into a naming conflict in the future? Is it
> worth prefixing all the inlines with sse2_? Probably makes sense so that
> we can see the instruction set used when mixing later.
> -Chris

The SSE2 intrinsics will actually be compiled into VEX-prefixed (AVX)
instructions operating on xmm registers when -mavx2 is used.

I can't think of a reason the lack of a prefix would cause any
confusion for the ones that already have "128" in the name. For
unpack_565_to_8888, etc, maybe it would be best to add a _128 suffix.
We have functions (e.g., pack_565_2x128_128) that look like that
already.
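
As a rough sketch of that convention (a hypothetical rename, not part of this
patch):

    /* A trailing _128 marks the vector width, matching existing names
     * like pack_565_2x128_128, and leaves room for a hypothetical
     * 256-bit unpack_565_to_8888_256 in pixman-avx2.c. */
    static force_inline __m128i
    unpack_565_to_8888_128 (__m128i lo);    /* was unpack_565_to_8888 */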
On Wed, Jan 16, 2019 at 4:57 PM Raghuveer Devulapalli
<raghuveer.devulapalli@intel.com> wrote:
>
> From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
>
> These helper function will be reused in pixman-avx2.c implementations in
> the future.
> [...]
> -static __m128i mask_0080;
> -static __m128i mask_00ff;
> -static __m128i mask_0101;
> -static __m128i mask_ffff;
> -static __m128i mask_ff000000;
> -static __m128i mask_alpha;
> -
> -static __m128i mask_565_r;
> -static __m128i mask_565_g1, mask_565_g2;
> -static __m128i mask_565_b;
> -static __m128i mask_red;
> -static __m128i mask_green;
> -static __m128i mask_blue;
> -
> -static __m128i mask_565_fix_rb;
> -static __m128i mask_565_fix_g;
> -
> -static __m128i mask_565_rb;
> -static __m128i mask_565_pack_multiplier;
> -

These are moving to pixman-sse2.h to be used by the code below, which
is to be used by the AVX2 code. But they're initialized in
_pixman_implementation_create_sse2(), which means if you used
PIXMAN_DISABLE=sse2 the AVX2 paths would fail.

I suspect these constants do need to be prefixed with "sse2_", and in
_pixman_x86_get_implementations() you should disable avx2 if
PIXMAN_DISABLE=sse2.
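
A minimal sketch of that gating in _pixman_x86_get_implementations (),
assuming the _pixman_disabled () and have_feature () helpers already used in
pixman-x86.c; the AVX2 names are placeholders for whatever this series ends
up adding:

    /* AVX2 would reuse masks initialized by the SSE2 implementation,
     * so skip AVX2 entirely when SSE2 is disabled or unsupported.
     * AVX2_BITS and _pixman_implementation_create_avx2 are assumed. */
    if (!_pixman_disabled ("sse2") && have_feature (SSE2_BITS))
    {
        imp = _pixman_implementation_create_sse2 (imp);

        if (!_pixman_disabled ("avx2") && have_feature (AVX2_BITS))
            imp = _pixman_implementation_create_avx2 (imp);
    }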
> [...]
> diff --git a/pixman/pixman-sse2.h b/pixman/pixman-sse2.h
> new file mode 100644
> index 0000000..37105e4
> --- /dev/null
> +++ b/pixman/pixman-sse2.h
> @@ -0,0 +1,502 @@
> +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
> +#include <emmintrin.h> /* for SSE2 intrinsics */
> +#include "pixman-private.h"
> +#include "pixman-combine32.h"
> +#include "pixman-inlines.h"
> +
> +__m128i mask_0080;
> +__m128i mask_00ff;
> +__m128i mask_0101;
> +__m128i mask_ffff;
> +__m128i mask_ff000000;
> +__m128i mask_alpha;
> +
> +__m128i mask_565_r;
> +__m128i mask_565_g1, mask_565_g2;
> +__m128i mask_565_b;
> +__m128i mask_red;
> +__m128i mask_green;
> +__m128i mask_blue;
> +
> +__m128i mask_565_fix_rb;
> +__m128i mask_565_fix_g;
> +
> +__m128i mask_565_rb;
> +__m128i mask_565_pack_multiplier;

I think you want to make these declarations 'extern', and keep their
storage in pixman-sse2.c. As it is, we're now allocating space for
them in every translation unit this header is included in.
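
Concretely, that split would look roughly like this (a sketch, not code from
the patch):

    /* pixman-sse2.h: declarations only, so every translation unit that
     * includes this header shares a single set of masks. */
    extern __m128i mask_0080;
    extern __m128i mask_00ff;
    /* ... likewise for the remaining masks ... */

    /* pixman-sse2.c: the one definition of each; values are still set
     * up in _pixman_implementation_create_sse2 (). */
    __m128i mask_0080;
    __m128i mask_00ff;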
Quoting Matt Turner (2019-01-22 00:49:39)
> On Thu, Jan 17, 2019 at 12:27 AM Chris Wilson <chris@chris-wilson.co.uk> wrote:
> >
> > Quoting Raghuveer Devulapalli (2019-01-17 00:59:59)
> > > From: raghuveer devulapalli <raghuveer.devulapalli@intel.com>
> > >
> > > These helper function will be reused in pixman-avx2.c implementations in
> > > the future.
> >
> > Are we ever going to run into a naming conflict in the future? Is it
> > worth prefixing all the inlines with sse2_? Probably makes sense so that
> > we can see the instruction set used when mixing later.
> > -Chris
> 
> The SSE2 intrinsics will actually be compiled into VEX-prefixed (AVX)
> instructions operating on xmm registers when -mavx2 is used.
> 
> I can't think of a reason the lack of a prefix would cause any
> confusion for the ones that already have "128" in the name. For
> unpack_565_to_8888, etc, maybe it would be best to add a _128 suffix.
> We have functions (e.g., pack_565_2x128_128) that look like that
> already.

Ok, that seems like a reasonable means to distinguish these helpers from
possible future ones; vector width is only likely to go up, and an
incompatible ISA will then not include the sse2/avx2 inlines.
-Chris