Add support for aarch64 neon optimization

Submitted by Mizuki Asakura on April 2, 2016, 12:30 p.m.

Details

Message ID CADqU6iT3mFAfBP9y6eAzuqTj6MCY25a736SHL=Ep+fFxigDbBQ@mail.gmail.com
State Superseded
Series "Add support for aarch64 neon optimization" ( rev: 1 ) in Pixman


Commit Message

Mizuki Asakura April 2, 2016, 12:30 p.m.
Since aarch64 uses a different NEON syntax from aarch32 and has no
support for the (older) arm-simd fast paths, there is currently no SIMD
acceleration for pixman on aarch64.

We need new implementations.


This patch only contains the STD_FAST_PATH code; the scaling (nearest,
bilinear) code is not covered yet.
Once the optimizations in this patch are settled, the scaling-related
code should follow.


This is a first step towards optimizations for aarch64-neon.
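
For example, a short NEON sequence written with the aarch32 syntax
(illustrative only; the register choices are arbitrary):

    vld1.32     {d0-d3}, [r0]!
    vmull.u8    q8, d3, d6
    vrshr.u16   q9, q8, #8

has to be expressed with the A64 syntax roughly as:

    ld1         {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
    umull       v8.8h, v3.8b, v6.8b
    urshr       v9.8h, v8.8h, #8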


Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
Signed-off-by: Mizuki Asakura <ed6e117f@gmail.com>
---
 configure.ac                    |   34 +
 pixman/Makefile.am              |   14 +
 pixman/pixman-arm-neon.c        |   10 +-
 pixman/pixman-arm.c             |    6 +
 pixman/pixman-arma64-neon-asm.S | 3771 +++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arma64-neon-asm.h | 1288 +++++++++++++
 pixman/pixman-private.h         |    5 +
 7 files changed, 5127 insertions(+), 1 deletion(-)
 create mode 100644 pixman/pixman-arma64-neon-asm.S
 create mode 100644 pixman/pixman-arma64-neon-asm.h


diff --git a/configure.ac b/configure.ac
old mode 100644
new mode 100755
index 6b2134e..bb0192a
--- a/configure.ac
+++ b/configure.ac
@@ -667,6 +667,40 @@  if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
    AC_MSG_ERROR([ARM NEON intrinsics not detected])
 fi

+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
+have_arm_a64_neon=no
+AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.arch armv8-a
+.altmacro
+prfm pldl2strm, [x0]
+xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-a64-neon,
+   [AC_HELP_STRING([--disable-arm-a64-neon],
+                   [disable ARM A64 NEON fast paths])],
+   [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
+
+if test $enable_arm_a64_neon = no ; then
+   have_arm_a64_neon=disabled
+fi
+
+if test $have_arm_a64_neon = yes ; then
+   AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
+
+AC_MSG_RESULT($have_arm_a64_neon)
+if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon = no ; then
+   AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
+fi
+
 dnl ===========================================================================
 dnl Check for IWMMXT

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
old mode 100644
new mode 100755
index 581b6f6..1b1a8ac
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -94,6 +94,20 @@  libpixman_1_la_LIBADD += libpixman-arm-neon.la
 ASM_CFLAGS_arm_neon=
 endif

+# arm a64 neon code
+if USE_ARM_A64_NEON
+noinst_LTLIBRARIES += libpixman-arma64-neon.la
+libpixman_arma64_neon_la_SOURCES = \
+        pixman-arm-neon.c    \
+        pixman-arm-common.h    \
+        pixman-arma64-neon-asm.S    \
+        pixman-arm-asm.h    \
+        pixman-arma64-neon-asm.h
+libpixman_1_la_LIBADD += libpixman-arma64-neon.la
+
+ASM_CFLAGS_arm_neon=
+endif
+
 # iwmmxt code
 if USE_ARM_IWMMXT
 libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
old mode 100644
new mode 100755
index be761c9..cef8c90
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -121,6 +121,7 @@  PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
 PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
                                         uint16_t, 1, uint8_t, 1, uint16_t, 1)

+#ifndef __aarch64__
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
                                         uint32_t, uint32_t)
 PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
@@ -160,6 +161,7 @@  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OV
                                             uint32_t, uint32_t)
 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
                                             uint32_t, uint32_t)
+#endif

 void
 pixman_composite_src_n_8_asm_neon (int32_t   w,
@@ -194,7 +196,7 @@  arm_neon_fill (pixman_implementation_t *imp,
            uint32_t                 _xor)
 {
     /* stride is always multiple of 32bit units in pixman */
-    uint32_t byte_stride = stride * sizeof(uint32_t);
+    int32_t byte_stride = stride * sizeof(uint32_t);

     switch (bpp)
     {
@@ -362,6 +364,7 @@  static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),

+#ifndef __aarch64__
     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
     SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
     SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
@@ -420,10 +423,12 @@  static const pixman_fast_path_t arm_neon_fast_paths[] =

     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+#endif

     { PIXMAN_OP_NONE },
 };

+#ifndef __aarch64__
 #define BIND_COMBINE_U(name)                                             \
 void                                                                     \
 pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
@@ -454,6 +459,7 @@  neon_combine_##name##_u (pixman_implementation_t *imp,                   \
 BIND_COMBINE_U (over)
 BIND_COMBINE_U (add)
 BIND_COMBINE_U (out_reverse)
+#endif

 pixman_implementation_t *
 _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
@@ -461,9 +467,11 @@  _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
     _pixman_implementation_create (fallback, arm_neon_fast_paths);

+#ifndef __aarch64__
     imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
     imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
+#endif

     imp->blt = arm_neon_blt;
     imp->fill = arm_neon_fill;
diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
old mode 100644
new mode 100755
index 23374e4..734cbea
--- a/pixman/pixman-arm.c
+++ b/pixman/pixman-arm.c
@@ -221,5 +221,11 @@  _pixman_arm_get_implementations (pixman_implementation_t *imp)
     imp = _pixman_implementation_create_arm_neon (imp);
 #endif

+#ifdef USE_ARM_A64_NEON
+    /* neon is a part of aarch64 */
+    if (!_pixman_disabled ("arm-neon"))
+        imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
     return imp;
 }
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
new file mode 100644
index 0000000..f60d1b4
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm.S
@@ -0,0 +1,3771 @@ 
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.arch armv8-a
+
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-asm.h"
+#include "pixman-arma64-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate.exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fallback
+ *       to simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
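+ *
+ * For reference, with pixman's premultiplied-alpha pixels the OVER
+ * operator is simply, per channel:
+ *
+ *   dst = src + (1 - alpha(src)) * dst
+ *
+ * which is what the head/tail macros below compute: the head multiplies
+ * the destination channels by the inverted source alpha, and the tail
+ * adds the source channels with saturation.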
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON register allocation in general is recommended to be the following:
+ * v0,  v1,  v2,  v3  - contain loaded source pixel data
+ * v4,  v5,  v6,  v7  - contain loaded destination pixels (if they are needed)
+ * v24, v25, v26, v27 - contain loaded mask pixel data (if mask is used)
+ * v28, v29, v30, v31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * v0,  v1,  v2,  v3  - contain loaded source pixel data
+ * v4,  v5            - contain loaded destination pixels (they are needed)
+ * v28, v29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetics on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
+ * perform all the needed calculations and write the result to {v28, v29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
+ * actually use v0 register for blue channel (a vector of eight 8-bit
+ * values), v1 register for green, v2 for red and v3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion: // vuzp8 is a wrapper macro
+ *  vuzp8 v0, v1
+ *  vuzp8 v2, v3
+ *  vuzp8 v1, v3
+ *  vuzp8 v0, v2
+ *
+ * Planar to packed conversion: // vzip8 is a wrapper macro
+ *  vzip8 v0, v2
+ *  vzip8 v1, v3
+ *  vzip8 v2, v3
+ *  vzip8 v0, v1
+ *
+ * But pixels can be loaded directly in planar format using the LD4 / b NEON
+ * instruction. It is 1 cycle slower than LD1 / s, so this is not always
+ * desirable, which is why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+       and put data into v6 - red, v7 - green, v30 - blue */
+    mov         v4.d[1], v5.d[0]
+    shrn        v6.8b, v4.8h, #8
+    shrn        v7.8b, v4.8h, #3
+    sli         v4.8h, v4.8h, #5
+    sri         v6.8b, v6.8b, #5
+    mvn         v3.8b, v3.8b      /* invert source alpha */
+    sri         v7.8b, v7.8b, #6
+    shrn        v30.8b, v4.8h, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into v20 - red, v23 - green, v22 - blue */
+    umull       v10.8h, v3.8b, v6.8b
+    umull       v11.8h, v3.8b, v7.8b
+    umull       v12.8h, v3.8b, v30.8b
+    urshr       v17.8h, v10.8h, #8
+    urshr       v18.8h, v11.8h, #8
+    urshr       v19.8h, v12.8h, #8
+    raddhn      v20.8b, v10.8h, v17.8h
+    raddhn      v23.8b, v11.8h, v18.8h
+    raddhn      v22.8b, v12.8h, v19.8h
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    uqadd       v17.8b, v2.8b, v20.8b
+    uqadd       v18.8b, v0.8b, v22.8b
+    uqadd       v19.8b, v1.8b, v23.8b
+    /* convert the result to r5g6b5 and store it into {v14} */
+    ushll       v14.8h, v17.8b, #7
+    sli         v14.8h, v14.8h, #1
+    ushll       v8.8h, v19.8b, #7
+    sli         v8.8h, v8.8h, #1
+    ushll       v9.8h, v18.8b, #7
+    sli         v9.8h, v9.8h, #1
+    sri         v14.8h, v8.8h, #5
+    sri         v14.8h, v9.8h, #11
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup allows us to hide instruction
+ * latencies better and also to utilize the dual-issue capability more
+ * efficiently (making pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   st1         {v28.4h, v29.4h}, [DST_W], #32
+ *   ld1         {v4.4h, v5.4h}, [DST_R], #16
+ *   ld4         {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also contains some LD/ST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetics.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arma64-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few LD/ST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+        uqadd       v17.8b, v2.8b, v20.8b
+    ld1         {v4.4h, v5.4h}, [DST_R], #16
+    mov         v4.d[1], v5.d[0]
+        uqadd       v18.8b, v0.8b, v22.8b
+        uqadd       v19.8b, v1.8b, v23.8b
+    shrn        v6.8b, v4.8h, #8
+    fetch_src_pixblock
+    shrn        v7.8b, v4.8h, #3
+    sli         v4.8h, v4.8h, #5
+        ushll       v14.8h, v17.8b, #7
+        sli         v14.8h, v14.8h, #1
+                                    PF add PF_X, PF_X, #8
+        ushll       v8.8h, v19.8b, #7
+        sli         v8.8h, v8.8h,  #1
+                                    PF tst PF_CTL, #0xF
+    sri         v6.8b, v6.8b, #5
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+10:
+    mvn         v3.8b, v3.8b
+                                    PF beq 10f
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+    sri         v7.8b, v7.8b, #6
+    shrn        v30.8b, v4.8h, #2
+    umull       v10.8h, v3.8b, v6.8b
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+    umull       v11.8h, v3.8b, v7.8b
+    umull       v12.8h, v3.8b, v30.8b
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+        sri         v14.8h, v8.8h, #5
+                                    PF cmp PF_X, ORIG_W
+        ushll       v9.8h, v18.8b, #7
+        sli         v9.8h, v9.8h, #1
+    urshr       v17.8h, v10.8h, #8
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    urshr       v19.8h, v11.8h, #8
+    urshr       v18.8h, v12.8h, #8
+                                    PF ble 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+        sri         v14.8h, v9.8h, #11
+        mov         v28.d[0], v14.d[0]
+        mov         v29.d[0], v14.d[1]
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+10:
+    raddhn      v20.8b, v10.8h, v17.8h
+    raddhn      v23.8b, v11.8h, v19.8h
+                                    PF ble 10f
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+    raddhn      v22.8b, v12.8h, v18.8h
+        st1         {v14.8h}, [DST_W], #16
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail
+    st1         {v14.8h}, [DST_W], #16
+    ld1         {v4.4h, v5.4h}, [DST_R], #16
+    fetch_src_pixblock
+    pixman_composite_over_8888_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written, for write-only buffer we would use
+ *                             FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp format.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum what can fit into four 64-bit NEON registers).
+ *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
+ *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ *    prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
+       and put data into v6 - red, v7 - green, v30 - blue */
+    mov         v4.d[1], v5.d[0]
+    shrn        v6.8b, v4.8h, #8
+    shrn        v7.8b, v4.8h, #3
+    sli         v4.8h, v4.8h, #5
+    sri         v6.8b, v6.8b, #5
+    sri         v7.8b, v7.8b, #6
+    shrn        v30.8b, v4.8h, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into v20 - red, v23 - green, v22 - blue */
+    umull       v10.8h, v3.8b, v6.8b
+    umull       v11.8h, v3.8b, v7.8b
+    umull       v12.8h, v3.8b, v30.8b
+    urshr       v13.8h, v10.8h, #8
+    urshr       v14.8h, v11.8h, #8
+    urshr       v15.8h, v12.8h, #8
+    raddhn      v20.8b, v10.8h, v13.8h
+    raddhn      v23.8b, v11.8h, v14.8h
+    raddhn      v22.8b, v12.8h, v15.8h
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    uqadd       v17.8b, v2.8b, v20.8b
+    uqadd       v18.8b, v0.8b, v22.8b
+    uqadd       v19.8b, v1.8b, v23.8b
+    /* convert the result to r5g6b5 and store it into {v14} */
+    ushll       v14.8h, v17.8b, #7
+    sli         v14.8h, v14.8h, #1
+    ushll       v8.8h, v19.8b, #7
+    sli         v8.8h, v8.8h, #1
+    ushll       v9.8h, v18.8b, #7
+    sli         v9.8h, v9.8h, #1
+    sri         v14.8h, v8.8h, #5
+    sri         v14.8h, v9.8h, #11
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail
+    ld1         {v4.4h, v5.4h}, [DST_R], #16
+    st1         {v14.8h}, [DST_W], #16
+    pixman_composite_over_n_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    mov         v3.s[0], w4
+    dup         v0.8b, v3.b[0]
+    dup         v1.8b, v3.b[1]
+    dup         v2.8b, v3.b[2]
+    dup         v3.8b, v3.b[3]
+    mvn         v3.8b, v3.8b      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+    ushll       v8.8h,  v1.8b,  #7
+    sli         v8.8h,  v8.8h,  #1
+    ushll       v14.8h, v2.8b,  #7
+    sli         v14.8h, v14.8h, #1
+    ushll       v9.8h,  v0.8b,  #7
+    sli         v9.8h,  v9.8h,  #1
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    sri         v14.8h, v8.8h, #5
+    sri         v14.8h, v9.8h, #11
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        sri         v14.8h, v8.8h, #5
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    fetch_src_pixblock
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        sri         v14.8h, v9.8h, #11
+        mov         v28.d[0], v14.d[0]
+        mov         v29.d[0], v14.d[1]
+                                    PF cmp PF_X, ORIG_W
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+    ushll       v8.8h, v1.8b, #7
+    sli         v8.8h, v8.8h, #1
+        st1        {v14.8h}, [DST_W], #16
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    ushll       v14.8h, v2.8b, #7
+    sli         v14.8h, v14.8h, #1
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+10:
+    ushll       v9.8h, v0.8b, #7
+    sli         v9.8h, v9.8h, #1
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+    mov         v0.d[1], v1.d[0]
+    shrn        v30.8b, v0.8h, #8
+    shrn        v29.8b, v0.8h, #3
+    sli         v0.8h,  v0.8h, #5
+    movi        v31.8b, #255
+    sri         v30.8b, v30.8b, #5
+    sri         v29.8b, v29.8b, #6
+    shrn        v28.8b, v0.8h, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail
+    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    fetch_src_pixblock
+    pixman_composite_src_0565_8888_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+    uqadd       v28.8b, v0.8b, v4.8b
+    uqadd       v29.8b, v1.8b, v5.8b
+    uqadd       v30.8b, v2.8b, v6.8b
+    uqadd       v31.8b, v3.8b, v7.8b
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+    fetch_src_pixblock
+                                    PF add PF_X, PF_X, #32
+                                    PF tst PF_CTL, #0xF
+    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #32
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF cmp PF_X, ORIG_W
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    uqadd       v28.8b, v0.8b, v4.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+    uqadd       v29.8b, v1.8b, v5.8b
+    uqadd       v30.8b, v2.8b, v6.8b
+    uqadd       v31.8b, v3.8b, v7.8b
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF cmp PF_X, ORIG_W
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    uqadd       v28.8b, v0.8b, v4.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+    uqadd       v29.8b, v1.8b, v5.8b
+    uqadd       v30.8b, v2.8b, v6.8b
+    uqadd       v31.8b, v3.8b, v7.8b
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+    mvn         v24.8b, v3.8b  /* get inverted alpha */
+    /* do alpha blending */
+    umull       v8.8h, v24.8b, v4.8b
+    umull       v9.8h, v24.8b, v5.8b
+    umull       v10.8h, v24.8b, v6.8b
+    umull       v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    urshr       v14.8h, v8.8h, #8
+    urshr       v15.8h, v9.8h, #8
+    urshr       v16.8h, v10.8h, #8
+    urshr       v17.8h, v11.8h, #8
+    raddhn      v28.8b, v14.8h, v8.8h
+    raddhn      v29.8b, v15.8h, v9.8h
+    raddhn      v30.8b, v16.8h, v10.8h
+    raddhn      v31.8b, v17.8h, v11.8h
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+        urshr       v14.8h, v8.8h, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        urshr       v15.8h, v9.8h, #8
+        urshr       v16.8h, v10.8h, #8
+        urshr       v17.8h, v11.8h, #8
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        raddhn      v28.8b, v14.8h, v8.8h
+        raddhn      v29.8b, v15.8h, v9.8h
+                                    PF cmp PF_X, ORIG_W
+        raddhn      v30.8b, v16.8h, v10.8h
+        raddhn      v31.8b, v17.8h, v11.8h
+    fetch_src_pixblock
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+    mvn         v22.8b, v3.8b
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull      v8.8h, v22.8b, v4.8b
+                                    PF ble 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    umull      v9.8h, v22.8b, v5.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+10:
+    umull      v10.8h, v22.8b, v6.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+     umull     v11.8h, v22.8b, v7.8b
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    uqadd       v28.8b, v0.8b, v28.8b
+    uqadd       v29.8b, v1.8b, v29.8b
+    uqadd       v30.8b, v2.8b, v30.8b
+    uqadd       v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+        urshr       v14.8h, v8.8h, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        urshr       v15.8h, v9.8h, #8
+        urshr       v16.8h, v10.8h, #8
+        urshr       v17.8h, v11.8h, #8
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        raddhn      v28.8b, v14.8h, v8.8h
+        raddhn      v29.8b, v15.8h, v9.8h
+                                    PF cmp PF_X, ORIG_W
+        raddhn      v30.8b, v16.8h, v10.8h
+        raddhn      v31.8b, v17.8h, v11.8h
+        uqadd       v28.8b, v0.8b, v28.8b
+        uqadd       v29.8b, v1.8b, v29.8b
+        uqadd       v30.8b, v2.8b, v30.8b
+        uqadd       v31.8b, v3.8b, v31.8b
+    fetch_src_pixblock
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+    mvn        v22.8b, v3.8b
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull      v8.8h, v22.8b, v4.8b
+                                    PF ble 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    umull      v9.8h, v22.8b, v5.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+10:
+    umull      v10.8h, v22.8b, v6.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+    umull      v11.8h, v22.8b, v7.8b
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+    /* deinterleaved source pixels in {v0, v1, v2, v3} */
+    /* inverted alpha in {v24} */
+    /* destination pixels in {v4, v5, v6, v7} */
+    umull       v8.8h, v24.8b, v4.8b
+    umull       v9.8h, v24.8b, v5.8b
+    umull       v10.8h, v24.8b, v6.8b
+    umull       v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+    urshr       v14.8h, v8.8h, #8
+    urshr       v15.8h, v9.8h, #8
+    urshr       v16.8h, v10.8h, #8
+    urshr       v17.8h, v11.8h, #8
+    raddhn      v28.8b, v14.8h, v8.8h
+    raddhn      v29.8b, v15.8h, v9.8h
+    raddhn      v30.8b, v16.8h, v10.8h
+    raddhn      v31.8b, v17.8h, v11.8h
+    uqadd       v28.8b, v0.8b, v28.8b
+    uqadd       v29.8b, v1.8b, v29.8b
+    uqadd       v30.8b, v2.8b, v30.8b
+    uqadd       v31.8b, v3.8b, v31.8b
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+        urshr       v14.8h, v8.8h, #8
+        urshr       v15.8h, v9.8h, #8
+        urshr       v16.8h, v10.8h, #8
+        urshr       v17.8h, v11.8h, #8
+        raddhn      v28.8b, v14.8h, v8.8h
+        raddhn      v29.8b, v15.8h, v9.8h
+        raddhn      v30.8b, v16.8h, v10.8h
+        raddhn      v31.8b, v17.8h, v11.8h
+    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+        uqadd       v28.8b, v0.8b, v28.8b
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0x0F
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        uqadd       v29.8b, v1.8b, v29.8b
+        uqadd       v30.8b, v2.8b, v30.8b
+        uqadd       v31.8b, v3.8b, v31.8b
+                                    PF cmp PF_X, ORIG_W
+    umull       v8.8h, v24.8b, v4.8b
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+    umull       v9.8h, v24.8b, v5.8b
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull       v10.8h, v24.8b, v6.8b
+                                    PF subs PF_CTL, PF_CTL, #0x10
+    umull       v11.8h, v24.8b, v7.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_n_8888_init
+    mov         v3.s[0], w4
+    dup         v0.8b, v3.b[0]
+    dup         v1.8b, v3.b[1]
+    dup         v2.8b, v3.b[2]
+    dup         v3.8b, v3.b[3]
+    mvn         v24.8b, v3.8b  /* get inverted alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+        urshr       v14.8h, v8.8h, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        urshr       v15.8h, v9.8h, #8
+        urshr       v12.8h, v10.8h, #8
+        urshr       v13.8h, v11.8h, #8
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        raddhn      v28.8b, v14.8h, v8.8h
+        raddhn      v29.8b, v15.8h, v9.8h
+                                    PF cmp PF_X, ORIG_W
+        raddhn      v30.8b, v12.8h, v10.8h
+        raddhn      v31.8b, v13.8h, v11.8h
+        uqadd       v28.8b, v0.8b, v28.8b
+        uqadd       v29.8b, v1.8b, v29.8b
+        uqadd       v30.8b, v2.8b, v30.8b
+        uqadd       v31.8b, v3.8b, v31.8b
+    ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
+    mvn         v22.8b, v3.8b
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF blt 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull       v8.8h, v22.8b, v4.8b
+                                    PF blt 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    umull       v9.8h, v22.8b, v5.8b
+    umull       v10.8h, v22.8b, v6.8b
+                                    PF blt 10f
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+    umull       v11.8h, v22.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+    mov         v7.s[0], w4
+    dup         v4.8b, v7.b[0]
+    dup         v5.8b, v7.b[1]
+    dup         v6.8b, v7.b[2]
+    dup         v7.8b, v7.b[3]
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    4,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+    umull       v0.8h,  v24.8b, v8.8b    /* IN for SRC pixels (part1) */
+    umull       v1.8h,  v24.8b, v9.8b
+    umull       v2.8h,  v24.8b, v10.8b
+    umull       v3.8h,  v24.8b, v11.8b
+        mov         v4.d[1], v5.d[0]
+        shrn        v25.8b,  v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
+        shrn        v26.8b,  v4.8h, #3
+        sli         v4.8h,   v4.8h, #5
+    urshr       v17.8h, v0.8h,  #8    /* IN for SRC pixels (part2) */
+    urshr       v18.8h, v1.8h,  #8
+    urshr       v19.8h, v2.8h,  #8
+    urshr       v20.8h, v3.8h,  #8
+    raddhn      v0.8b,  v0.8h,  v17.8h
+    raddhn      v1.8b,  v1.8h,  v18.8h
+    raddhn      v2.8b,  v2.8h,  v19.8h
+    raddhn      v3.8b,  v3.8h,  v20.8h
+        sri         v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
+        sri         v26.8b, v26.8b, #6
+    mvn         v3.8b,  v3.8b
+        shrn        v30.8b, v4.8h,  #2
+    umull       v18.8h, v3.8b, v25.8b     /* now do alpha blending */
+    umull       v19.8h, v3.8b, v26.8b
+    umull       v20.8h, v3.8b, v30.8b
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+    /* 3 cycle bubble (after vmull.u8) */
+    urshr       v5.8h, v18.8h, #8
+    urshr       v6.8h, v19.8h, #8
+    urshr       v7.8h, v20.8h, #8
+    raddhn      v17.8b, v18.8h, v5.8h
+    raddhn      v19.8b, v19.8h, v6.8h
+    raddhn      v18.8b, v20.8h, v7.8h
+    uqadd       v5.8b, v2.8b,  v17.8b
+    /* 1 cycle bubble */
+    uqadd       v6.8b, v0.8b,  v18.8b
+    uqadd       v7.8b, v1.8b,  v19.8b
+    ushll       v14.8h, v5.8b, #7    /* convert to 16bpp */
+    sli         v14.8h, v14.8h, #1
+    ushll       v18.8h, v7.8b, #7
+    sli         v18.8h, v18.8h, #1
+    ushll       v19.8h, v6.8b, #7
+    sli         v19.8h, v19.8h, #1
+    sri         v14.8h, v18.8h, #5
+    /* 1 cycle bubble */
+    sri         v14.8h, v19.8h, #11
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+#if 0
+    ld1         {v4.8h}, [DST_R], #16
+    shrn        v25.8b,  v4.8h,  #8
+    fetch_mask_pixblock
+    shrn        v26.8b,  v4.8h,  #3
+    fetch_src_pixblock
+    umull       v22.8h,  v24.8b, v10.8b
+        urshr       v13.8h, v18.8h, #8
+        urshr       v11.8h, v19.8h, #8
+        urshr       v15.8h, v20.8h, #8
+        raddhn      v17.8b, v18.8h, v13.8h
+        raddhn      v19.8b, v19.8h, v11.8h
+        raddhn      v18.8b, v20.8h, v15.8h
+        uqadd       v17.8b, v2.8b, v17.8b
+    umull       v21.8h,  v24.8b, v9.8b
+        uqadd       v18.8b, v0.8b, v18.8b
+        uqadd       v19.8b, v1.8b, v19.8b
+        ushll       v14.8h, v17.8b, #7
+        sli         v14.8h, v14.8h, #1
+    umull       v20.8h,  v24.8b, v8.8b
+        ushll       v18.8h,  v18.8b, #7
+        sli         v18.8h,  v18.8h, #1
+        ushll       v19.8h,  v19.8b, #7
+        sli         v19.8h,  v19.8h, #1
+        sri         v14.8h,  v18.8h, #5
+    umull       v23.8h,  v24.8b, v11.8b
+        sri         v14.8h,  v19.8h, #11
+        mov         v28.d[0], v14.d[0]
+        mov         v29.d[0], v14.d[1]
+
+    cache_preload 8, 8
+
+    sli         v4.8h,  v4.8h,   #5
+    urshr       v16.8h, v20.8h,  #8
+    urshr       v17.8h, v21.8h,  #8
+    urshr       v18.8h, v22.8h,  #8
+    urshr       v19.8h, v23.8h,  #8
+    raddhn      v0.8b,  v20.8h, v16.8h
+    raddhn      v1.8b,  v21.8h, v17.8h
+    raddhn      v2.8b,  v22.8h, v18.8h
+    raddhn      v3.8b,  v23.8h, v19.8h
+    sri         v25.8b,  v25.8b,  #5
+    sri         v26.8b,  v26.8b,  #6
+    mvn         v3.8b,  v3.8b
+    shrn        v30.8b, v4.8h,  #2
+    st1         {v14.8h}, [DST_W], #16
+    umull       v18.8h, v3.8b, v25.8b
+    umull       v19.8h, v3.8b, v26.8b
+    umull       v20.8h, v3.8b, v30.8b
+#else
+    pixman_composite_over_8888_8_0565_process_pixblock_tail
+    st1         {v28.4h, v29.4h}, [DST_W], #16
+    ld1         {v4.4h, v5.4h}, [DST_R], #16
+    fetch_mask_pixblock
+    fetch_src_pixblock
+    pixman_composite_over_8888_8_0565_process_pixblock_head
+#endif
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    mov         v11.s[0], w4
+    dup         v8.8b, v11.b[0]
+    dup         v9.8b, v11.b[1]
+    dup         v10.8b, v11.b[2]
+    dup         v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+    mov         v24.s[0], w6
+    dup         v24.8b, v24.b[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_0565_init, \
+    pixman_composite_over_8888_n_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
+    fetch_src_pixblock
+    cache_preload 16, 16
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+    st1         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_8_init
+    mov         v0.s[0], w4
+    dup         v3.8b, v0.b[0]
+    dup         v2.8b, v0.b[0]
+    dup         v1.8b, v0.b[0]
+    dup         v0.8b, v0.b[0]
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_0565_init
+    mov         v0.s[0], w4
+    dup         v3.4h, v0.h[0]
+    dup         v2.4h, v0.h[0]
+    dup         v1.4h, v0.h[0]
+    dup         v0.4h, v0.h[0]
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+.endm
+
+.macro pixman_composite_src_n_8888_init
+    mov         v0.s[0], w4
+    dup         v3.2s, v0.s[0]
+    dup         v2.2s, v0.s[0]
+    dup         v1.2s, v0.s[0]
+    dup         v0.2s, v0.s[0]
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+    st1  {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+    orr      v0.8b, v0.8b, v4.8b
+    orr      v1.8b, v1.8b, v4.8b
+    orr      v2.8b, v2.8b, v4.8b
+    orr      v3.8b, v3.8b, v4.8b
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+    st1      {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
+    fetch_src_pixblock
+    orr      v0.8b, v0.8b, v4.8b
+    orr      v1.8b, v1.8b, v4.8b
+    orr      v2.8b, v2.8b, v4.8b
+    orr      v3.8b, v3.8b, v4.8b
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+    mov     w20, #0xFF
+    dup     v4.8b, w20
+    shl     v4.2s, v4.2s, #24
+.endm
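+/*
+ * After this init, v4 holds 0xff000000 in each 32-bit lane, so the orr in
+ * the pixblock macros above simply forces the alpha byte of every pixel
+ * to 0xff (x8r8g8b8 -> a8r8g8b8).
+ */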
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+    /* expecting solid source in {v0, v1, v2, v3} */
+    /* mask is in v24 (v25, v26, v27 are unused) */
+
+    /* in */
+    umull       v8.8h,  v24.8b, v0.8b
+    umull       v9.8h,  v24.8b, v1.8b
+    umull       v10.8h, v24.8b, v2.8b
+    umull       v11.8h, v24.8b, v3.8b
+    ursra       v8.8h,  v8.8h, #8
+    ursra       v9.8h,  v9.8h, #8
+    ursra       v10.8h, v10.8h, #8
+    ursra       v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+    rshrn       v28.8b, v8.8h, #8
+    rshrn       v29.8b, v9.8h, #8
+    rshrn       v30.8b, v10.8h, #8
+    rshrn       v31.8b, v11.8h, #8
+.endm
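+/*
+ * Note on the multiply/shift pattern above: for a 16-bit product p = s * m
+ * of two 8-bit channels, 'ursra p, p, #8' followed by 'rshrn d, p, #8'
+ * computes (p + ((p + 128) >> 8) + 128) >> 8, which is the correctly
+ * rounded p / 255 used by pixman's C combiners.  The urshr + raddhn pairs
+ * in the macros further down implement the same rounded division.
+ */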
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        rshrn       v28.8b, v8.8h, #8
+                                    PF tst PF_CTL, #0x0F
+        rshrn       v29.8b, v9.8h, #8
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+10:
+        rshrn      v30.8b, v10.8h, #8
+                                    PF beq 10f
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        rshrn      v31.8b, v11.8h, #8
+                                    PF cmp PF_X, ORIG_W
+    umull          v8.8h, v24.8b, v0.8b
+                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
+                                    PF prfm pldl2strm, [PF_MASK, DUMMY]
+    umull          v9.8h, v24.8b, v1.8b
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull          v10.8h, v24.8b, v2.8b
+                                    PF ble 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    umull          v11.8h, v24.8b, v3.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
+                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
+                                    PF add PF_MASK, PF_MASK, #1
+10:
+        st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    ursra       v8.8h, v8.8h, #8
+    ursra       v9.8h, v9.8h, #8
+    ursra       v10.8h, v10.8h, #8
+    ursra       v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+    mov         v3.s[0], w4
+    dup         v0.8b, v3.b[0]
+    dup         v1.8b, v3.b[1]
+    dup         v2.8b, v3.b[2]
+    dup         v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8888_init, \
+    pixman_composite_src_n_8_8888_cleanup, \
+    pixman_composite_src_n_8_8888_process_pixblock_head, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+    umull       v0.8h, v24.8b, v16.8b
+    umull       v1.8h, v25.8b, v16.8b
+    umull       v2.8h, v26.8b, v16.8b
+    umull       v3.8h, v27.8b, v16.8b
+    ursra       v0.8h, v0.8h,  #8
+    ursra       v1.8h, v1.8h,  #8
+    ursra       v2.8h, v2.8h,  #8
+    ursra       v3.8h, v3.8h,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+    rshrn       v28.8b, v0.8h, #8
+    rshrn       v29.8b, v1.8h, #8
+    rshrn       v30.8b, v2.8h, #8
+    rshrn       v31.8b, v3.8h, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        rshrn       v28.8b, v0.8h, #8
+                                    PF tst PF_CTL, #0x0F
+        rshrn       v29.8b, v1.8h, #8
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+10:
+        rshrn       v30.8b, v2.8h, #8
+                                    PF beq 10f
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        rshrn       v31.8b, v3.8h, #8
+                                    PF cmp PF_X, ORIG_W
+    umull       v0.8h,  v24.8b, v16.8b
+                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
+                                    PF prfm pldl2strm, [PF_MASK, DUMMY]
+    umull       v1.8h,  v25.8b, v16.8b
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull       v2.8h,  v26.8b, v16.8b
+                                    PF ble 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    umull       v3.8h,  v27.8b, v16.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
+                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
+                                    PF add PF_MASK, PF_MASK, #1
+10:
+        st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    ursra       v0.8h, v0.8h,  #8
+    ursra       v1.8h, v1.8h,  #8
+    ursra       v2.8h, v2.8h,  #8
+    ursra       v3.8h, v3.8h,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+    mov         v16.s[0], w4
+    dup         v16.8b, v16.b[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8_init, \
+    pixman_composite_src_n_8_8_cleanup, \
+    pixman_composite_src_n_8_8_process_pixblock_head, \
+    pixman_composite_src_n_8_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+    /* expecting deinterleaved source data in {v8, v9, v10, v11} */
+    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
+    /* and destination data in {v4, v5, v6, v7} */
+    /* mask is in v24 (v25, v26, v27 are unused) */
+
+    /* in */
+    umull       v12.8h, v24.8b, v8.8b
+    umull       v13.8h, v24.8b, v9.8b
+    umull       v14.8h, v24.8b, v10.8b
+    umull       v15.8h, v24.8b, v11.8b
+    urshr       v16.8h, v12.8h, #8
+    urshr       v17.8h, v13.8h, #8
+    urshr       v18.8h, v14.8h, #8
+    urshr       v19.8h, v15.8h, #8
+    raddhn      v0.8b, v12.8h, v16.8h
+    raddhn      v1.8b, v13.8h, v17.8h
+    raddhn      v2.8b, v14.8h, v18.8h
+    raddhn      v3.8b, v15.8h, v19.8h
+    mvn         v25.8b, v3.8b  /* get inverted alpha */
+    /* source:      v0 - blue, v1 - green, v2 - red, v3 - alpha */
+    /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
+    /* now do alpha blending */
+    umull       v12.8h, v25.8b, v4.8b
+    umull       v13.8h, v25.8b, v5.8b
+    umull       v14.8h, v25.8b, v6.8b
+    umull       v15.8h, v25.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+    urshr       v16.8h, v12.8h, #8
+    urshr       v17.8h, v13.8h, #8
+    urshr       v18.8h, v14.8h, #8
+    urshr       v19.8h, v15.8h, #8
+    raddhn      v28.8b, v16.8h, v12.8h
+    raddhn      v29.8b, v17.8h, v13.8h
+    raddhn      v30.8b, v18.8h, v14.8h
+    raddhn      v31.8b, v19.8h, v15.8h
+    uqadd       v28.8b, v0.8b, v28.8b
+    uqadd       v29.8b, v1.8b, v29.8b
+    uqadd       v30.8b, v2.8b, v30.8b
+    uqadd       v31.8b, v3.8b, v31.8b
+.endm
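+/*
+ * Taken together, the head and tail above compute the usual premultiplied
+ * OVER with an a8 mask, roughly: s' = src * m / 255, then
+ * dst' = saturate(s' + dst * (255 - s'.alpha) / 255) per channel, using
+ * the rounded division by 255 described earlier.
+ */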
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+        urshr       v16.8h, v12.8h, #8
+     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+        urshr       v17.8h, v13.8h, #8
+    fetch_mask_pixblock
+        urshr       v18.8h, v14.8h, #8
+                                    PF add PF_X, PF_X, #8
+        urshr       v19.8h, v15.8h, #8
+                                    PF tst PF_CTL, #0x0F
+        raddhn      v28.8b, v16.8h, v12.8h
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+10:
+        raddhn      v29.8b, v17.8h, v13.8h
+                                    PF beq 10f
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        raddhn      v30.8b, v18.8h, v14.8h
+                                    PF cmp PF_X, ORIG_W
+        raddhn      v31.8b, v19.8h, v15.8h
+                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
+                                    PF prfm pldl2strm, [PF_DST, DUMMY]
+    umull       v16.8h, v24.8b, v8.8b
+                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
+                                    PF prfm pldl2strm, [PF_MASK, DUMMY]
+    umull       v17.8h, v24.8b, v9.8b
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+10:
+    umull       v18.8h, v24.8b, v10.8b
+                                    PF ble 10f
+                                    PF subs PF_CTL, PF_CTL, #0x10
+10:
+    umull       v19.8h, v24.8b, v11.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
+                                    PF add PF_DST, PF_DST, #1
+10:
+        uqadd       v28.8b, v0.8b, v28.8b
+                                    PF ble 10f
+                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
+                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
+                                    PF add PF_MASK, PF_MASK, #1
+10:
+        uqadd        v29.8b, v1.8b, v29.8b
+        uqadd        v30.8b, v2.8b, v30.8b
+        uqadd        v31.8b, v3.8b, v31.8b
+    urshr       v12.8h, v16.8h, #8
+    urshr       v13.8h, v17.8h, #8
+    urshr       v14.8h, v18.8h, #8
+    urshr       v15.8h, v19.8h, #8
+    raddhn      v0.8b, v16.8h, v12.8h
+    raddhn      v1.8b, v17.8h, v13.8h
+    raddhn      v2.8b, v18.8h, v14.8h
+    raddhn      v3.8b, v19.8h, v15.8h
+        st4          {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    mvn         v25.8b, v3.8b
+    umull       v12.8h, v25.8b, v4.8b
+    umull       v13.8h, v25.8b, v5.8b
+    umull       v14.8h, v25.8b, v6.8b
+    umull       v15.8h, v25.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+    mov         v11.s[0], w4
+    dup         v8.8b, v11.b[0]
+    dup         v9.8b, v11.b[1]
+    dup         v10.8b, v11.b[2]
+    dup         v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, \
+    pixman_composite_over_n_8_8888_cleanup, \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+    umull       v0.8h,  v24.8b, v8.8b
+    umull       v1.8h,  v25.8b, v8.8b
+    umull       v2.8h,  v26.8b, v8.8b
+    umull       v3.8h,  v27.8b, v8.8b
+    urshr       v10.8h, v0.8h,  #8
+    urshr       v11.8h, v1.8h,  #8
+    urshr       v12.8h, v2.8h,  #8
+    urshr       v13.8h, v3.8h,  #8
+    raddhn      v0.8b,  v0.8h,  v10.8h
+    raddhn      v1.8b,  v1.8h,  v11.8h
+    raddhn      v2.8b,  v2.8h,  v12.8h
+    raddhn      v3.8b,  v3.8h,  v13.8h
+    mvn         v24.8b, v0.8b
+    mvn         v25.8b, v1.8b
+    mvn         v26.8b, v2.8b
+    mvn         v27.8b, v3.8b
+    umull       v10.8h, v24.8b, v4.8b
+    umull       v11.8h, v25.8b, v5.8b
+    umull       v12.8h, v26.8b, v6.8b
+    umull       v13.8h, v27.8b, v7.8b
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+    urshr       v14.8h, v10.8h,  #8
+    urshr       v15.8h, v11.8h,  #8
+    urshr       v16.8h, v12.8h, #8
+    urshr       v17.8h, v13.8h, #8
+    raddhn      v28.8b, v14.8h, v10.8h
+    raddhn      v29.8b, v15.8h, v11.8h
+    raddhn      v30.8b, v16.8h, v12.8h
+    raddhn      v31.8b, v17.8h, v13.8h
+    uqadd       v28.8b, v0.8b,  v28.8b
+    uqadd       v29.8b, v1.8b,  v29.8b
+    uqadd       v30.8b, v2.8b,  v30.8b
+    uqadd       v31.8b, v3.8b,  v31.8b
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    pixman_composite_over_n_8_8_process_pixblock_tail
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+    mov         v8.s[0], w4
+    dup         v8.8b, v8.b[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8_init, \
+    pixman_composite_over_n_8_8_cleanup, \
+    pixman_composite_over_n_8_8_process_pixblock_head, \
+    pixman_composite_over_n_8_8_process_pixblock_tail, \
+    pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {v8,  v9,  v10, v11}
+     *         dest in          {v4,  v5,  v6,  v7 }
+     *         mask in          {v24, v25, v26, v27}
+     * output: updated src in   {v0,  v1,  v2,  v3 }
+     *         updated mask in  {v24, v25, v26, v3 }
+     */
+    umull       v0.8h,  v24.8b, v8.8b
+    umull       v1.8h,  v25.8b, v9.8b
+    umull       v2.8h,  v26.8b, v10.8b
+    umull       v3.8h,  v27.8b, v11.8b
+    umull       v12.8h, v11.8b, v25.8b
+    umull       v13.8h, v11.8b, v24.8b
+    umull       v14.8h, v11.8b, v26.8b
+    urshr       v15.8h, v0.8h,  #8
+    urshr       v16.8h, v1.8h,  #8
+    urshr       v17.8h, v2.8h,  #8
+    raddhn      v0.8b,  v0.8h,  v15.8h
+    raddhn      v1.8b,  v1.8h,  v16.8h
+    raddhn      v2.8b,  v2.8h,  v17.8h
+    urshr       v15.8h, v13.8h, #8
+    urshr       v16.8h, v12.8h, #8
+    urshr       v17.8h, v14.8h, #8
+    urshr       v18.8h, v3.8h,  #8
+    raddhn      v24.8b, v13.8h, v15.8h
+    raddhn      v25.8b, v12.8h, v16.8h
+    raddhn      v26.8b, v14.8h, v17.8h
+    raddhn      v3.8b,  v3.8h,  v18.8h
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {v28, v29, v30, v31}
+     */
+    mvn         v24.8b, v24.8b
+    mvn         v25.8b, v25.8b
+    mvn         v26.8b, v26.8b
+    mvn         v27.8b, v3.8b
+    umull       v12.8h, v24.8b, v4.8b
+    umull       v13.8h, v25.8b, v5.8b
+    umull       v14.8h, v26.8b, v6.8b
+    umull       v15.8h, v27.8b, v7.8b
+.endm
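+/*
+ * In scalar terms the head above roughly computes, per channel:
+ *   s'  = src * m / 255          ('combine_mask_ca': source IN mask)
+ *   m'  = src.alpha * m / 255    (mask IN source alpha)
+ *   acc = dst * (255 - m')       (start of 'combine_over_ca')
+ * and the tail below divides acc by 255 (rounded) and adds s' with
+ * saturation, giving the component-alpha OVER result.
+ */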
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    urshr       v16.8h, v12.8h, #8
+    urshr       v17.8h, v13.8h, #8
+    urshr       v18.8h, v14.8h, #8
+    urshr       v19.8h, v15.8h, #8
+    raddhn      v28.8b, v16.8h, v12.8h
+    raddhn      v29.8b, v17.8h, v13.8h
+    raddhn      v30.8b, v18.8h, v14.8h
+    raddhn      v31.8b, v19.8h, v15.8h
+    uqadd       v28.8b, v0.8b,  v28.8b
+    uqadd       v29.8b, v1.8b,  v29.8b
+    uqadd       v30.8b, v2.8b,  v30.8b
+    uqadd       v31.8b, v3.8b,  v31.8b
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+        urshr       v16.8h, v12.8h, #8
+        urshr       v17.8h, v13.8h, #8
+    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+        urshr       v18.8h, v14.8h, #8
+        urshr       v19.8h, v15.8h, #8
+        raddhn      v28.8b, v16.8h, v12.8h
+        raddhn      v29.8b, v17.8h, v13.8h
+        raddhn      v30.8b, v18.8h, v14.8h
+        raddhn      v31.8b, v19.8h, v15.8h
+    fetch_mask_pixblock
+        uqadd       v28.8b, v0.8b, v28.8b
+        uqadd       v29.8b, v1.8b, v29.8b
+        uqadd       v30.8b, v2.8b, v30.8b
+        uqadd       v31.8b, v3.8b, v31.8b
+    cache_preload 8, 8
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+    mov         v13.s[0], w4
+    dup         v8.8b, v13.b[0]
+    dup         v9.8b, v13.b[1]
+    dup         v10.8b, v13.b[2]
+    dup         v11.8b, v13.b[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, \
+    pixman_composite_over_n_8888_8888_ca_cleanup, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
+     *         mask in          {v24, v25, v26}       [B, G, R]
+     * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
+     *         updated mask in  {v24, v25, v26}       [B, G, R]
+     */
+    umull       v0.8h,  v24.8b, v8.8b
+    umull       v1.8h,  v25.8b, v9.8b
+    umull       v2.8h,  v26.8b, v10.8b
+    umull       v12.8h, v11.8b, v24.8b
+    umull       v13.8h, v11.8b, v25.8b
+    umull       v14.8h, v11.8b, v26.8b
+    urshr       v15.8h, v0.8h,  #8
+    urshr       v16.8h, v1.8h,  #8
+    urshr       v17.8h, v2.8h,  #8
+    raddhn      v0.8b,  v0.8h,  v15.8h
+    raddhn      v1.8b,  v1.8h,  v16.8h
+    raddhn      v2.8b,  v2.8h,  v17.8h
+    urshr       v19.8h, v12.8h, #8
+    urshr       v20.8h, v13.8h, #8
+    urshr       v21.8h, v14.8h, #8
+    raddhn      v24.8b, v12.8h, v19.8h
+    raddhn      v25.8b, v13.8h, v20.8h
+    /*
+     * convert 8 r5g6b5 pixel data from {v4, v5} to planar 8-bit format
+     * and put data into v16 - blue, v17 - green, v18 - red
+     */
+       mov         v4.d[1], v5.d[0]
+       shrn        v17.8b, v4.8h,  #3
+       shrn        v18.8b, v4.8h,  #8
+    raddhn      v26.8b, v14.8h, v21.8h
+       sli         v4.8h,  v4.8h,  #5
+       sri         v18.8b, v18.8b, #5
+       sri         v17.8b, v17.8b, #6
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in v16 - blue, v17 - green, v18 - red
+     */
+    mvn         v24.8b, v24.8b
+    mvn         v25.8b, v25.8b
+       shrn        v16.8b, v4.8h,  #2
+    mvn         v26.8b, v26.8b
+    umull       v5.8h, v16.8b, v24.8b
+    umull       v6.8h, v17.8b, v25.8b
+    umull       v7.8h, v18.8b, v26.8b
+.endm
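+/*
+ * The shrn/sli/sri sequence above unpacks r5g6b5 by bit replication, e.g.
+ * for green: 'shrn #3' leaves the six green bits at the top of each byte
+ * and 'sri #6' copies the two highest of them into the low bits, giving
+ * the usual (g << 2) | (g >> 4) expansion; red and blue are expanded
+ * analogously.
+ */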
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    urshr       v13.8h, v5.8h, #8
+    urshr       v14.8h, v6.8h, #8
+    urshr       v15.8h, v7.8h, #8
+    raddhn      v16.8b, v13.8h, v5.8h
+    raddhn      v17.8b, v14.8h, v6.8h
+    raddhn      v18.8b, v15.8h, v7.8h
+    uqadd       v16.8b, v0.8b, v16.8b
+    uqadd       v17.8b, v1.8b, v17.8b
+    uqadd       v18.8b, v2.8b, v18.8b
+    /*
+     * convert the results in v16, v17, v18 to r5g6b5 and store
+     * them into {v14}
+     */
+    ushll       v14.8h, v18.8b, #7
+    sli         v14.8h, v14.8h, #1
+    ushll       v12.8h, v17.8b, #7
+    sli         v12.8h, v12.8h, #1
+    ushll       v13.8h, v16.8b, #7
+    sli         v13.8h, v13.8h, #1
+    sri         v14.8h, v12.8h, #5
+    sri         v14.8h, v13.8h, #11
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
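+/*
+ * The ushll #7 + sli #1 pairs above are just an 8-bit to 16-bit widen that
+ * places each channel byte in the top half of a halfword (x << 8); the two
+ * sri instructions then merge green at bit 5 and blue at bit 0, leaving
+ * packed r5g6b5 pixels in v14.
+ */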
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+    fetch_mask_pixblock
+        urshr       v13.8h, v5.8h, #8
+        urshr       v14.8h, v6.8h, #8
+    ld1         {v4.8h}, [DST_R], #16
+        urshr       v15.8h, v7.8h, #8
+        raddhn      v16.8b, v13.8h, v5.8h
+        raddhn      v17.8b, v14.8h, v6.8h
+        raddhn      v18.8b, v15.8h, v7.8h
+    mov         v5.d[0], v4.d[1]
+            /* process_pixblock_head */
+            /*
+             * 'combine_mask_ca' replacement
+             *
+             * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
+             *         mask in          {v24, v25, v26}       [B, G, R]
+             * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
+             *         updated mask in  {v24, v25, v26}       [B, G, R]
+             */
+        uqadd       v16.8b, v0.8b, v16.8b
+        uqadd       v17.8b, v1.8b, v17.8b
+        uqadd       v18.8b, v2.8b, v18.8b
+            umull       v0.8h,  v24.8b, v8.8b
+            umull       v1.8h,  v25.8b, v9.8b
+            umull       v2.8h,  v26.8b, v10.8b
+        /*
+         * convert the result in v16, v17, v18 to r5g6b5 and store
+         * it into {v14}
+         */
+        ushll       v14.8h, v18.8b, #7
+        sli         v14.8h, v14.8h, #1
+        ushll       v18.8h, v16.8b, #7
+        sli         v18.8h, v18.8h, #1
+        ushll       v19.8h, v17.8b, #7
+        sli         v19.8h, v19.8h, #1
+            umull       v12.8h, v11.8b, v24.8b
+        sri         v14.8h, v19.8h, #5
+            umull       v13.8h, v11.8b, v25.8b
+            umull       v15.8h, v11.8b, v26.8b
+        sri         v14.8h, v18.8h, #11
+        mov         v28.d[0], v14.d[0]
+        mov         v29.d[0], v14.d[1]
+    cache_preload 8, 8
+            urshr       v16.8h, v0.8h,  #8
+            urshr       v17.8h, v1.8h,  #8
+            urshr       v18.8h, v2.8h,  #8
+            raddhn      v0.8b,  v0.8h,  v16.8h
+            raddhn      v1.8b,  v1.8h,  v17.8h
+            raddhn      v2.8b,  v2.8h,  v18.8h
+            urshr       v19.8h, v12.8h, #8
+            urshr       v20.8h, v13.8h, #8
+            urshr       v21.8h, v15.8h, #8
+            raddhn      v24.8b, v12.8h, v19.8h
+            raddhn      v25.8b, v13.8h, v20.8h
+                /*
+                 * convert 8 r5g6b5 pixel data from {v4, v5} to planar
+                 * 8-bit format and put data into v16 - blue, v17 - green,
+                 * v18 - red
+                 */
+        mov         v4.d[1], v5.d[0]
+                shrn        v17.8b, v4.8h,  #3
+                shrn        v18.8b, v4.8h,  #8
+            raddhn      v26.8b, v15.8h, v21.8h
+                sli         v4.8h,  v4.8h,  #5
+                sri         v17.8b, v17.8b, #6
+                sri         v18.8b, v18.8b, #5
+            /*
+             * 'combine_over_ca' replacement
+             *
+             * output: updated dest in v16 - blue, v17 - green, v18 - red
+             */
+            mvn         v24.8b, v24.8b
+            mvn         v25.8b, v25.8b
+                shrn        v16.8b, v4.8h,  #2
+            mvn         v26.8b, v26.8b
+            umull       v5.8h, v16.8b, v24.8b
+            umull       v6.8h, v17.8b, v25.8b
+            umull       v7.8h, v18.8b, v26.8b
+    st1         {v14.8h}, [DST_W], #16
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+    mov         v13.s[0], w4
+    dup         v8.8b, v13.b[0]
+    dup         v9.8b, v13.b[1]
+    dup         v10.8b, v13.b[2]
+    dup         v11.8b, v13.b[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_0565_ca_init, \
+    pixman_composite_over_n_8888_0565_ca_cleanup, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+    /* expecting source data in {v0, v1, v2, v3} */
+    /* and destination data in {v4, v5, v6, v7} */
+    umull       v8.8h,  v4.8b,  v3.8b
+    umull       v9.8h,  v5.8b,  v3.8b
+    umull       v10.8h, v6.8b,  v3.8b
+    umull       v11.8h, v7.8b,  v3.8b
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+    urshr       v14.8h, v8.8h,  #8
+    urshr       v15.8h, v9.8h,  #8
+    urshr       v12.8h, v10.8h, #8
+    urshr       v13.8h, v11.8h, #8
+    raddhn      v28.8b, v8.8h,  v14.8h
+    raddhn      v29.8b, v9.8h,  v15.8h
+    raddhn      v30.8b, v10.8h, v12.8h
+    raddhn      v31.8b, v11.8h, v13.8h
+.endm
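+/*
+ * Head and tail together implement the IN operator with a solid source:
+ * dst' = dst * a / 255 for each byte, where a is the alpha of the solid
+ * source, replicated into v3 by the init macro below.
+ */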
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+    pixman_composite_in_n_8_process_pixblock_tail
+    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    cache_preload 32, 32
+    pixman_composite_in_n_8_process_pixblock_head
+    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_in_n_8_init
+    mov         v3.s[0], w4
+    dup         v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_in_n_8_init, \
+    pixman_composite_in_n_8_cleanup, \
+    pixman_composite_in_n_8_process_pixblock_head, \
+    pixman_composite_in_n_8_process_pixblock_tail, \
+    pixman_composite_in_n_8_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+    /* expecting source data in {v8, v9, v10, v11} */
+    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
+    /* and destination data in {v4, v5, v6, v7} */
+    /* mask is in v24, v25, v26, v27 */
+    umull       v0.8h, v24.8b, v11.8b
+    umull       v1.8h, v25.8b, v11.8b
+    umull       v2.8h, v26.8b, v11.8b
+    umull       v3.8h, v27.8b, v11.8b
+    urshr       v12.8h, v0.8h, #8
+    urshr       v13.8h, v1.8h, #8
+    urshr       v14.8h, v2.8h, #8
+    urshr       v15.8h, v3.8h, #8
+    raddhn      v0.8b, v0.8h, v12.8h
+    raddhn      v1.8b, v1.8h, v13.8h
+    raddhn      v2.8b, v2.8h, v14.8h
+    raddhn      v3.8b, v3.8h, v15.8h
+    uqadd       v28.8b, v0.8b, v4.8b
+    uqadd       v29.8b, v1.8b, v5.8b
+    uqadd       v30.8b, v2.8b, v6.8b
+    uqadd       v31.8b, v3.8b, v7.8b
+.endm
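+/*
+ * Roughly: dst' = saturate(dst + m * a / 255) per byte, where m is the a8
+ * mask and a is the alpha of the solid source, replicated into v11 by the
+ * init macro below.
+ */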
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+    pixman_composite_add_n_8_8_process_pixblock_tail
+    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+    mov         v11.s[0], w4
+    dup         v11.8b, v11.b[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, \
+    pixman_composite_add_n_8_8_cleanup, \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+    /* expecting source data in {v0, v1, v2, v3} */
+    /* destination data in {v4, v5, v6, v7} */
+    /* mask in {v24, v25, v26, v27} */
+    umull       v8.8h, v24.8b, v0.8b
+    umull       v9.8h, v25.8b, v1.8b
+    umull       v10.8h, v26.8b, v2.8b
+    umull       v11.8h, v27.8b, v3.8b
+    urshr       v0.8h, v8.8h, #8
+    urshr       v1.8h, v9.8h, #8
+    urshr       v12.8h, v10.8h, #8
+    urshr       v13.8h, v11.8h, #8
+    raddhn      v0.8b, v0.8h, v8.8h
+    raddhn      v1.8b, v1.8h, v9.8h
+    raddhn      v2.8b, v12.8h, v10.8h
+    raddhn      v3.8b, v13.8h, v11.8h
+    uqadd       v28.8b, v0.8b, v4.8b
+    uqadd       v29.8b, v1.8b, v5.8b
+    uqadd       v30.8b, v2.8b, v6.8b
+    uqadd       v31.8b, v3.8b, v7.8b
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+    pixman_composite_add_8_8_8_process_pixblock_tail
+    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    fetch_mask_pixblock
+    fetch_src_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, \
+    pixman_composite_add_8_8_8_cleanup, \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+    /* expecting source data in {v0, v1, v2, v3} */
+    /* destination data in {v4, v5, v6, v7} */
+    /* mask in {v24, v25, v26, v27} */
+    umull       v8.8h,  v27.8b, v0.8b
+    umull       v9.8h,  v27.8b, v1.8b
+    umull       v10.8h, v27.8b, v2.8b
+    umull       v11.8h, v27.8b, v3.8b
+    /* 1 cycle bubble */
+    ursra       v8.8h,  v8.8h,  #8
+    ursra       v9.8h,  v9.8h,  #8
+    ursra       v10.8h, v10.8h, #8
+    ursra       v11.8h, v11.8h, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+    /* 2 cycle bubble */
+    rshrn       v28.8b, v8.8h,  #8
+    rshrn       v29.8b, v9.8h,  #8
+    rshrn       v30.8b, v10.8h, #8
+    rshrn       v31.8b, v11.8h, #8
+    uqadd       v28.8b, v4.8b,  v28.8b
+    uqadd       v29.8b, v5.8b,  v29.8b
+    uqadd       v30.8b, v6.8b,  v30.8b
+    uqadd       v31.8b, v7.8b,  v31.8b
+.endm
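+/*
+ * Roughly: dst' = saturate(dst + src * a / 255) per channel, where a is
+ * whatever sits in v27 (the mask alpha here; an a8 mask or a replicated
+ * solid alpha in the reuses further down).
+ */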
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+        rshrn       v28.8b, v8.8h,  #8
+    fetch_mask_pixblock
+        rshrn       v29.8b, v9.8h,  #8
+    umull       v8.8h,  v27.8b, v0.8b
+        rshrn       v30.8b, v10.8h, #8
+    umull       v9.8h,  v27.8b, v1.8b
+        rshrn       v31.8b, v11.8h, #8
+    umull       v10.8h, v27.8b, v2.8b
+    umull       v11.8h, v27.8b, v3.8b
+        uqadd       v28.8b, v4.8b,  v28.8b
+        uqadd       v29.8b, v5.8b,  v29.8b
+        uqadd       v30.8b, v6.8b,  v30.8b
+        uqadd       v31.8b, v7.8b,  v31.8b
+    ursra       v8.8h,  v8.8h,  #8
+    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    ursra       v9.8h,  v9.8h,  #8
+        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+    ursra       v10.8h, v10.8h, #8
+
+    cache_preload 8, 8
+
+    ursra       v11.8h, v11.8h, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function \
+    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+    mov         v3.s[0], w4
+    dup         v0.8b, v3.b[0]
+    dup         v1.8b, v3.b[1]
+    dup         v2.8b, v3.b[2]
+    dup         v3.8b, v3.b[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8888_init, \
+    pixman_composite_add_n_8_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+    mov         v27.s[0], w6
+    dup         v27.8b, v27.b[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8888_n_8888_init, \
+    pixman_composite_add_8888_n_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    /* expecting source data in {v0, v1, v2, v3} */
+    /* destination data in {v4, v5, v6, v7} */
+    /* solid mask is in v15 */
+
+    /* 'in' */
+    umull       v11.8h, v15.8b, v3.8b
+    umull       v10.8h, v15.8b, v2.8b
+    umull       v9.8h,  v15.8b, v1.8b
+    umull       v8.8h,  v15.8b, v0.8b
+    urshr       v16.8h, v11.8h, #8
+    urshr       v14.8h, v10.8h, #8
+    urshr       v13.8h,  v9.8h, #8
+    urshr       v12.8h,  v8.8h, #8
+    raddhn      v3.8b, v11.8h, v16.8h
+    raddhn      v2.8b, v10.8h, v14.8h
+    raddhn      v1.8b,  v9.8h, v13.8h
+    raddhn      v0.8b,  v8.8h, v12.8h
+    mvn         v24.8b, v3.8b  /* get inverted alpha */
+    /* now do alpha blending */
+    umull       v8.8h, v24.8b, v4.8b
+    umull       v9.8h, v24.8b, v5.8b
+    umull       v10.8h, v24.8b, v6.8b
+    umull       v11.8h, v24.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    urshr       v16.8h, v8.8h, #8
+    urshr       v17.8h, v9.8h, #8
+    urshr       v18.8h, v10.8h, #8
+    urshr       v19.8h, v11.8h, #8
+    raddhn      v28.8b, v16.8h, v8.8h
+    raddhn      v29.8b, v17.8h, v9.8h
+    raddhn      v30.8b, v18.8h, v10.8h
+    raddhn      v31.8b, v19.8h, v11.8h
+.endm
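+/*
+ * Head and tail together implement OUT_REVERSE with a solid mask:
+ * s' = src * m / 255 (m is the solid mask alpha in v15), then
+ * dst' = dst * (255 - s'.alpha) / 255 per channel.
+ */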
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    uqadd       v28.8b, v0.8b, v28.8b
+    uqadd       v29.8b, v1.8b, v29.8b
+    uqadd       v30.8b, v2.8b, v30.8b
+    uqadd       v31.8b, v3.8b, v31.8b
+.endm
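+/*
+ * The saturating adds on top of the OUT_REVERSE head/tail above turn this
+ * into OVER with a solid mask:
+ *   dst' = saturate(s' + dst * (255 - s'.alpha) / 255),  s' = src * m / 255
+ * where m is the solid mask alpha replicated into v15 by the init below.
+ */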
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+    mov         v15.s[0], w6
+    dup         v15.8b, v15.b[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    st3     {v0.8b, v1.8b, v2.8b}, [DST_W], #24
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+    mov    v31.8b, v2.8b
+    mov    v2.8b, v0.8b
+    mov    v0.8b, v31.8b
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    st4    {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
+    fetch_src_pixblock
+    mov    v31.8b, v2.8b
+    mov    v2.8b, v0.8b
+    mov    v0.8b, v31.8b
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+    eor    v3.8b, v3.8b, v3.8b
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    ushll       v8.8h, v1.8b, #7
+    sli         v8.8h, v8.8h, #1
+    ushll       v9.8h, v2.8b, #7
+    sli         v9.8h, v9.8h, #1
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    ushll       v14.8h, v0.8b, #7
+    sli         v14.8h, v14.8h, #1
+    sri         v14.8h, v8.8h, #5
+    sri         v14.8h, v9.8h, #11
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+        ushll       v14.8h, v0.8b, #7
+        sli         v14.8h, v14.8h, #1
+    fetch_src_pixblock
+        sri         v14.8h, v8.8h, #5
+        sri         v14.8h, v9.8h, #11
+        mov         v28.d[0], v14.d[0]
+        mov         v29.d[0], v14.d[1]
+    ushll       v8.8h, v1.8b, #7
+    sli         v8.8h, v8.8h, #1
+        st1     {v14.8h}, [DST_W], #16
+    ushll       v9.8h, v2.8b, #7
+    sli         v9.8h, v9.8h, #1
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    umull       v8.8h, v3.8b, v0.8b
+    umull       v9.8h, v3.8b, v1.8b
+    umull       v10.8h, v3.8b, v2.8b
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    urshr       v11.8h, v8.8h, #8
+    mov         v30.8b, v31.8b
+    mov         v31.8b, v3.8b
+    mov         v3.8b, v30.8b
+    urshr       v12.8h, v9.8h, #8
+    urshr       v13.8h, v10.8h, #8
+    raddhn      v30.8b, v11.8h, v8.8h
+    raddhn      v29.8b, v12.8h, v9.8h
+    raddhn      v28.8b, v13.8h, v10.8h
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+        urshr       v11.8h, v8.8h, #8
+        mov         v30.8b, v31.8b
+        mov         v31.8b, v3.8b
+        mov         v3.8b, v30.8b
+        urshr       v12.8h, v9.8h, #8
+        urshr       v13.8h, v10.8h, #8
+    fetch_src_pixblock
+        raddhn      v30.8b, v11.8h, v8.8h
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        raddhn      v29.8b, v12.8h, v9.8h
+        raddhn      v28.8b, v13.8h, v10.8h
+    umull       v8.8h, v3.8b, v0.8b
+    umull       v9.8h, v3.8b, v1.8b
+    umull       v10.8h, v3.8b, v2.8b
+         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF cmp PF_X, ORIG_W
+                                    PF lsl DUMMY, PF_X, #src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+                                    PF subs PF_CTL, PF_CTL, #0x10
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+10:
+.endm
+
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+    umull       v8.8h, v3.8b, v0.8b
+    umull       v9.8h, v3.8b, v1.8b
+    umull       v10.8h, v3.8b, v2.8b
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+    urshr       v11.8h, v8.8h, #8
+    mov         v30.8b, v31.8b
+    mov         v31.8b, v3.8b
+    mov         v3.8b, v30.8b
+    urshr       v12.8h, v9.8h, #8
+    urshr       v13.8h, v10.8h, #8
+    raddhn      v28.8b, v11.8h, v8.8h
+    raddhn      v29.8b, v12.8h, v9.8h
+    raddhn      v30.8b, v13.8h, v10.8h
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+        urshr       v11.8h, v8.8h, #8
+        mov         v30.8b, v31.8b
+        mov         v31.8b, v3.8b
+        mov         v3.8b, v30.8b
+        urshr       v12.8h, v9.8h, #8
+        urshr       v13.8h, v10.8h, #8
+    fetch_src_pixblock
+        raddhn      v28.8b, v11.8h, v8.8h
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF beq 10f
+                                    PF add PF_X, PF_X, #8
+                                    PF sub PF_CTL, PF_CTL, #1
+10:
+        raddhn      v29.8b, v12.8h, v9.8h
+        raddhn      v30.8b, v13.8h, v10.8h
+    umull       v8.8h, v3.8b, v0.8b
+    umull       v9.8h, v3.8b, v1.8b
+    umull       v10.8h, v3.8b, v2.8b
+         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+                                    PF cmp PF_X, ORIG_W
+                                    PF lsl DUMMY, PF_X, src_bpp_shift
+                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
+                                    PF ble 10f
+                                    PF sub PF_X, PF_X, ORIG_W
+                                    PF subs PF_CTL, PF_CTL, #0x10
+                                    PF ble 10f
+                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+                                    PF add PF_SRC, PF_SRC, #1
+10:
+.endm
+
+generate_composite_function \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+    /* mask is in v15 */
+    mov         v4.d[0], v8.d[0]
+    mov         v4.d[1], v9.d[0]
+    mov         v13.d[0], v10.d[0]
+    mov         v13.d[1], v11.d[0]
+    convert_0565_to_x888 v4, v2, v1, v0
+    convert_0565_to_x888 v13, v6, v5, v4
+    /* source pixel data is in      {v0, v1, v2, XX} */
+    /* destination pixel data is in {v4, v5, v6, XX} */
+    mvn         v7.8b,  v15.8b
+    umull       v10.8h, v15.8b, v2.8b
+    umull       v9.8h,  v15.8b, v1.8b
+    umull       v8.8h,  v15.8b, v0.8b
+    umull       v11.8h, v7.8b,  v4.8b
+    umull       v12.8h, v7.8b,  v5.8b
+    umull       v13.8h, v7.8b,  v6.8b
+    urshr       v19.8h, v10.8h, #8
+    urshr       v18.8h, v9.8h,  #8
+    urshr       v17.8h, v8.8h,  #8
+    raddhn      v2.8b,  v10.8h, v19.8h
+    raddhn      v1.8b,  v9.8h,  v18.8h
+    raddhn      v0.8b,  v8.8h,  v17.8h
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+    urshr       v17.8h, v11.8h,  #8
+    urshr       v18.8h, v12.8h,  #8
+    urshr       v19.8h, v13.8h,  #8
+    raddhn      v28.8b, v17.8h, v11.8h
+    raddhn      v29.8b, v18.8h, v12.8h
+    raddhn      v30.8b, v19.8h, v13.8h
+    uqadd       v0.8b,  v0.8b,  v28.8b
+    uqadd       v1.8b,  v1.8b,  v29.8b
+    uqadd       v2.8b,  v2.8b,  v30.8b
+    /* 32bpp result is in {v0, v1, v2, XX} */
+    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_over_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    ld1        {v10.4h, v11.4h}, [DST_R], #16
+    cache_preload 8, 8
+    pixman_composite_over_0565_8_0565_process_pixblock_head
+    st1        {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+    mov         v15.s[0], w6
+    dup         v15.8b, v15.b[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_0565_n_0565_init, \
+    pixman_composite_over_0565_n_0565_cleanup, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in v15 */
+    mov         v4.d[0], v8.d[0]
+    mov         v4.d[1], v9.d[0]
+    mov         v13.d[0], v10.d[0]
+    mov         v13.d[1], v11.d[0]
+    convert_0565_to_x888 v4,  v2, v1, v0
+    convert_0565_to_x888 v13, v6, v5, v4
+    /* source pixel data is in      {v0, v1, v2, XX} */
+    /* destination pixel data is in {v4, v5, v6, XX} */
+    umull       v9.8h,  v15.8b, v2.8b
+    umull       v8.8h,  v15.8b, v1.8b
+    umull       v7.8h,  v15.8b, v0.8b
+    urshr       v12.8h, v9.8h,  #8
+    urshr       v11.8h, v8.8h,  #8
+    urshr       v10.8h, v7.8h,  #8
+    raddhn      v2.8b,  v9.8h,  v12.8h
+    raddhn      v1.8b,  v8.8h,  v11.8h
+    raddhn      v0.8b,  v7.8h,  v10.8h
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+    uqadd       v0.8b,  v0.8b,  v4.8b
+    uqadd       v1.8b,  v1.8b,  v5.8b
+    uqadd       v2.8b,  v2.8b,  v6.8b
+    /* 32bpp result is in {v0, v1, v2, XX} */
+    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    ld1        {v10.4h, v11.4h}, [DST_R], #16
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    st1        {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* mask is in v15 */
+    mov         v12.d[0], v10.d[0]
+    mov         v12.d[1], v11.d[0]
+    convert_0565_to_x888 v12, v6, v5, v4
+    /* destination pixel data is in {v4, v5, v6, xx} */
+    mvn         v24.8b, v15.8b /* get inverted alpha */
+    /* now do alpha blending */
+    umull       v8.8h,  v24.8b, v4.8b
+    umull       v9.8h,  v24.8b, v5.8b
+    umull       v10.8h, v24.8b, v6.8b
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    urshr       v11.8h, v8.8h, #8
+    urshr       v12.8h, v9.8h, #8
+    urshr       v13.8h, v10.8h, #8
+    raddhn      v0.8b, v11.8h, v8.8h
+    raddhn      v1.8b, v12.8h, v9.8h
+    raddhn      v2.8b, v13.8h, v10.8h
+    /* 32bpp result is in {v0, v1, v2, XX} */
+    convert_8888_to_0565 v2, v1, v0, v14, v12, v3
+    mov         v28.d[0], v14.d[0]
+    mov         v29.d[0], v14.d[1]
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    ld1        {v10.4h, v11.4h}, [DST_R], #16
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    st1        {v14.8h}, [DST_W], #16
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    15, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+    /* src is in v0 */
+    /* destination pixel data is in {v4, v5, v6, v7} */
+    mvn         v1.8b, v0.8b /* get inverted alpha */
+    /* now do alpha blending */
+    umull       v8.8h, v1.8b, v4.8b
+    umull       v9.8h, v1.8b, v5.8b
+    umull       v10.8h, v1.8b, v6.8b
+    umull       v11.8h, v1.8b, v7.8b
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    urshr       v14.8h, v8.8h, #8
+    urshr       v15.8h, v9.8h, #8
+    urshr       v12.8h, v10.8h, #8
+    urshr       v13.8h, v11.8h, #8
+    raddhn      v28.8b, v14.8h, v8.8h
+    raddhn      v29.8b, v15.8h, v9.8h
+    raddhn      v30.8b, v12.8h, v10.8h
+    raddhn      v31.8b, v13.8h, v11.8h
+    /* 32bpp result is in {v28, v29, v30, v31} */
+.endm
+
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    ld4       {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_8888_process_pixblock_head
+    st4       {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros that can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
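+
+/*
+ * Per channel, the interpolation implemented below roughly computes
+ *
+ *   vert_l = tl * wt + bl * wb       (left column: top and bottom rows)
+ *   vert_r = tr * wt + br * wb       (right column: top and bottom rows)
+ *   result = (vert_l * (S - dist_x) + vert_r * dist_x) / (S * S)
+ *
+ * with S = (1 << BILINEAR_INTERPOLATION_BITS). The umull/umlal pairs do the
+ * vertical pass with the wt/wb weights kept in v28/v29, and the
+ * ushll/umlsl/umlal/shrn sequences do the horizontal pass with the dist_x
+ * weights kept in v30/v31.
+ */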
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP2, TMP1, #2
+    add       TMP1, TOP, TMP2
+    ld1       {&reg1&.2s}, [TMP1], STRIDE
+    ld1       {&reg2&.2s}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    ld1       {&reg2&.s}[0], [TMP1], STRIDE
+    ld1       {&reg2&.s}[1], [TMP1]
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    umull     &acc1&.8h, &reg1&.8b, v28.8b
+    umlal     &acc1&.8h, &reg2&.8b, v29.8b
+    bilinear_load_8888 reg3, reg4, tmp2
+    umull     &acc2&.8h, &reg3&.8b, v28.8b
+    umlal     &acc2&.8h, &reg4&.8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro vzip reg1, reg2
+    umov      TMP4, v31.d[0]
+    zip1      v31.8b, reg1, reg2
+    zip2      reg2,   reg1, reg2
+    mov       reg1,   v31.8b
+    mov       v31.d[0], TMP4
+.endm
+
+.macro vuzp reg1, reg2
+    umov      TMP4, v31.d[0]
+    uzp1      v31.8b, reg1, reg2
+    uzp2      reg2,   reg1, reg2
+    mov       reg1,   v31.8b
+    mov       v31.d[0], TMP4
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    asr       TMP2, X, #16
+    add       X, X, UX
+    lsl       TMP3, TMP2, #1
+    add       TMP2, TOP, TMP3
+    ld1       {&acc2&.s}[0], [TMP1], STRIDE
+    ld1       {&acc2&.s}[2], [TMP2], STRIDE
+    ld1       {&acc2&.s}[1], [TMP1]
+    ld1       {&acc2&.s}[3], [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip      &reg1&.8b, &reg3&.8b
+    vzip      &reg2&.8b, &reg4&.8b
+    vzip      &reg3&.8b, &reg4&.8b
+    vzip      &reg1&.8b, &reg2&.8b
+    umull     &acc1&.8h, &reg1&.8b, v28.8b
+    umlal     &acc1&.8h, &reg2&.8b, v29.8b
+    umull     &acc2&.8h, &reg3&.8b, v28.8b
+    umlal     &acc2&.8h, &reg4&.8b, v29.8b
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    asr       TMP2, X, #16
+    add       X, X, UX
+    lsl       TMP3, TMP2, #1
+    add       TMP2, TOP, TMP3
+    ld1       {&xacc2&.s}[0], [TMP1], STRIDE
+    ld1       {&xacc2&.s}[2], [TMP2], STRIDE
+    ld1       {&xacc2&.s}[1], [TMP1]
+    ld1       {&xacc2&.s}[3], [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP2, TMP1, #1
+    add       TMP1, TOP, TMP2
+    asr       TMP2, X, #16
+    add       X, X, UX
+    lsl       TMP3, TMP2, #1
+    add       TMP2, TOP, TMP3
+    ld1       {&yacc2&.s}[0], [TMP1], STRIDE
+    vzip      &xreg1&.8b, &xreg3&.8b
+    ld1       {&yacc2&.s}[2], [TMP2], STRIDE
+    vzip      &xreg2&.8b, &xreg4&.8b
+    ld1       {&yacc2&.s}[1], [TMP1]
+    vzip      &xreg3&.8b, &xreg4&.8b
+    ld1       {&yacc2&.s}[3], [TMP2]
+    vzip      &xreg1&.8b, &xreg2&.8b
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    umull     &xacc1&.8h, &xreg1&.8b, v28.8b
+    vzip      &yreg1&.8b, &yreg3&.8b
+    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b
+    vzip      &yreg2&.8b, &yreg4&.8b
+    umull     &xacc2&.8h, &xreg3&.8b, v28.8b
+    vzip      &yreg3&.8b, &yreg4&.8b
+    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
+    vzip      &yreg1&.8b, &yreg2&.8b
+    umull     &yacc1&.8h, &yreg1&.8b, v28.8b
+    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
+    umull     &yacc2&.8h, &yreg3&.8b, v28.8b
+    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    st1       {v0.2s, v1.2s}, [OUT], #16
+.elseif numpix == 2
+    st1       {v0.2s}, [OUT], #8
+.elseif numpix == 1
+    st1       {v0.s}[0], [OUT], #4
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp      v0.8b, v1.8b
+    vuzp      v2.8b, v3.8b
+    vuzp      v1.8b, v3.8b
+    vuzp      v0.8b, v2.8b
+    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
+.if numpix == 4
+    st1       {v1.4h}, [OUT], #8
+.elseif numpix == 2
+    st1       {v1.s}[0], [OUT], #4
+.elseif numpix == 1
+    st1       {v1.h}[0], [OUT], #2
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_load_&src_fmt v0, v1, v2
+    umull     v2.8h, v0.8b, v28.8b
+    umlal     v2.8h, v1.8b, v29.8b
+    /* 5 cycles bubble */
+    mov       v3.d[0], v2.d[1]
+    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v2.4h, v30.4h
+    umlal     v0.4s, v3.4h, v30.4h
+    /* 5 cycles bubble */
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    /* 3 cycles bubble */
+    xtn       v0.8b, v0.8h
+    /* 1 cycle bubble */
+    bilinear_store_&dst_fmt 1, v3, v4
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                v1, v11, v2, v3, v20, v21, v22, v23
+    mov       v2.d[0], v1.d[0]
+    mov       v3.d[0], v1.d[1]
+    mov       v22.d[0], v11.d[0]
+    mov       v23.d[0], v11.d[1]
+    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v2.4h, v30.4h
+    umlal     v0.4s, v3.4h, v30.4h
+    ushll     v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v10.4s, v22.4h, v31.4h
+    umlal     v10.4s, v23.4h, v31.4h
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v0.d[1], v1.d[0]
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    add       v12.8h, v12.8h, v13.8h
+    xtn       v0.8b, v0.8h
+    bilinear_store_&dst_fmt 2, v3, v4
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                v1, v11, v14, v15, v16, v17, v22, v23 \
+                v3, v9,  v24, v25, v26, v27, v18, v19
+    prfm      pldl2strm, [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE
+    mov       v6.d[0],  v3.d[0]
+    mov       v7.d[0],  v3.d[1]
+    mov       v18.d[0], v9.d[0]
+    mov       v19.d[0], v9.d[1]
+    mov       v2.d[0],  v1.d[0]
+    mov       v3.d[0],  v1.d[1]
+    mov       v22.d[0], v11.d[0]
+    mov       v23.d[0], v11.d[1]
+    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v2.4h, v30.4h
+    umlal     v0.4s, v3.4h, v30.4h
+    ushll     v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v10.4s, v22.4h, v31.4h
+    umlal     v10.4s, v23.4h, v31.4h
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    ushll     v2.4s, v6.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v2.4s, v6.4h, v30.4h
+    umlal     v2.4s, v7.4h, v30.4h
+    ushll     v8.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
+    prfm      pldl2strm, [TMP2, PF_OFFS]
+    umlsl     v8.4s, v18.4h, v31.4h
+    umlal     v8.4s, v19.4h, v31.4h
+    add       v12.8h, v12.8h, v13.8h
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v0.d[1], v1.d[0]
+    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v5.4h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v2.d[1], v5.d[0]
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    xtn       v0.8b, v0.8h
+    xtn       v1.8b, v2.8h
+    add       v12.8h, v12.8h, v13.8h
+    bilinear_store_&dst_fmt 4, v3, v4
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.else
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4,          0
+.set BILINEAR_FLAG_UNROLL_8,          1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ *  fname             - name of the function to generate
+ *  src_fmt           - source color format (8888 or 0565)
+ *  dst_fmt           - destination color format (8888 or 0565)
+ *  src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel
+ *                      in bytes
+ *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
+ *                      pixel in bytes
+ *  prefetch_distance - prefetch in the source image by that many
+ *                      pixels ahead
+ *  flags             - BILINEAR_FLAG_* bit flags (unroll factor, NEON
+ *                      register usage)
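+ *
+ * For example, the 8888 -> 8888 SRC scanline function is instantiated
+ * near the end of this file as
+ *
+ *   generate_bilinear_scanline_func \
+ *       pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+ *       2, 2, 28, BILINEAR_FLAG_UNROLL_4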
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+                                       src_bpp_shift, dst_bpp_shift, \
+                                       prefetch_distance, flags
+
+pixman_asm_function fname
+    OUT       .req      x0
+    TOP       .req      x1
+    BOTTOM    .req      x2
+    WT        .req      x3
+    WB        .req      x4
+    X         .req      x5
+    UX        .req      x6
+    WIDTH     .req      x7
+    TMP1      .req      x8
+    TMP2      .req      x9
+    PF_OFFS   .req      x10
+    TMP3      .req      x11
+    TMP4      .req      x12
+    STRIDE    .req      x13
+
+    sxtw      x3, w3
+    sxtw      x4, w4
+    sxtw      x5, w5
+    sxtw      x6, w6
+    sxtw      x7, w7
+
+    stp       x29, x30, [sp, -16]!
+    mov       x29, sp
+    sub       sp,  sp, 112  /* push all registers */
+    sub       x29, x29, 64
+    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+    stp        x8,  x9, [x29, -80]
+    stp       x10, x11, [x29, -96]
+    stp       x12, x13, [x29, -112]
+
+    mov       PF_OFFS, #prefetch_distance
+    mul       PF_OFFS, PF_OFFS, UX
+
+    subs      STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       300f
+
+    dup       v12.8h, w5
+    dup       v13.8h, w6
+    dup       v28.8b, w3
+    dup       v29.8b, w4
+    mov       v25.d[0], v12.d[1]
+    mov       v26.d[0], v13.d[0]
+    add       v25.4h, v25.4h, v26.4h
+    mov       v12.d[1], v25.d[0]
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       100f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       100f
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    add       v12.8h, v12.8h, v13.8h
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #1
+100:
+    add       v13.8h, v13.8h, v13.8h
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    add       v12.8h, v12.8h, v13.8h
+
+    cmp       WIDTH, #2
+    blt       100f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       100f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #2
+100:
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+    cmp       WIDTH, #4
+    blt       100f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       100f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #4
+100:
+    subs      WIDTH, WIDTH, #8
+    blt       100f
+    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
+    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    blt       500f
+1000:
+    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    bge       1000b
+500:
+    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+100:
+    tst       WIDTH, #4
+    beq       200f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+200:
+.else
+/*********** 4 pixels per iteration *****************/
+    subs      WIDTH, WIDTH, #4
+    blt       100f
+    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    blt       500f
+1000:
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    bge       1000b
+500:
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+100:
+/****************************************************/
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       200f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+200:
+    tst       WIDTH, #1
+    beq       300f
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+300:
+    sub       x29, x29, 64
+    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+    ldp        x8,  x9, [x29, -80]
+    ldp       x10, x11, [x29, -96]
+    ldp       x12, x13, [x29, -112]
+    mov       sp, x29
+    ldp       x29, x30, [sp], 16
+    ret
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.endfunc
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP4, TMP1, #2
+    add       TMP1, TOP, TMP4
+    asr       TMP2, X, #16
+    add       X, X, UX
+    lsl       TMP4, TMP2, #2
+    add       TMP2, TOP, TMP4
+
+    ld1       {v22.2s}, [TMP1], STRIDE
+    ld1       {v23.2s}, [TMP1]
+    asr       TMP3, X, #16
+    add       X, X, UX
+    lsl       TMP4, TMP3, #2
+    add       TMP3, TOP, TMP4
+    umull     v8.8h, v22.8b, v28.8b
+    umlal     v8.8h, v23.8b, v29.8b
+    mov       v16.d[0], v8.d[0]
+    mov       v17.d[0], v8.d[1]
+
+    ld1       {v22.2s}, [TMP2], STRIDE
+    ld1       {v23.2s}, [TMP2]
+    asr       TMP4, X, #16
+    add       X, X, UX
+    lsl       TMP1, TMP4, #2
+    add       TMP4, TOP, TMP1
+    umull     v9.8h, v22.8b, v28.8b
+    umlal     v9.8h, v23.8b, v29.8b
+    mov       v18.d[0], v9.d[0]
+    mov       v19.d[0], v9.d[1]
+
+    ld1       {v22.2s}, [TMP3], STRIDE
+    ld1       {v23.2s}, [TMP3]
+    umull     v10.8h, v22.8b, v28.8b
+    umlal     v10.8h, v23.8b, v29.8b
+    mov       v20.d[0], v10.d[0]
+    mov       v21.d[0], v10.d[1]
+
+    ushll     v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v0.4s, v16.4h, v30.4h
+    umlal     v0.4s, v17.4h, v30.4h
+
+    prfm      pldl2strm, [TMP4, PF_OFFS]
+    ld1       {v16.2s}, [TMP4], STRIDE
+    ld1       {v17.2s}, [TMP4]
+    prfm      pldl2strm, [TMP4, PF_OFFS]
+    umull     v11.8h, v16.8b, v28.8b
+    umlal     v11.8h, v17.8b, v29.8b
+    mov       v22.d[0], v11.d[0]
+    mov       v23.d[0], v11.d[1]
+
+    ushll     v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v1.4s, v18.4h, v31.4h
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+    umlal     v1.4s, v19.4h, v31.4h
+    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v30.d[0], v15.d[0]
+    mov       v31.d[0], v15.d[1]
+    ushll     v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v2.4s, v20.4h, v30.4h
+    umlal     v2.4s, v21.4h, v30.4h
+    ushll     v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v3.4s, v22.4h, v31.4h
+    umlal     v3.4s, v23.4h, v31.4h
+    add       v12.8h, v12.8h, v13.8h
+    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    shrn      v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v0.d[1], v1.d[0]
+    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+    mov       v31.d[0], v30.d[1]
+    shrn      v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    mov       v2.d[1], v5.d[0]
+    xtn       v6.8b, v0.8h
+    xtn       v7.8b, v2.8h
+    add       v12.8h, v12.8h, v13.8h
+    st1       {v6.2s, v7.2s}, [OUT], #16
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+    asr       TMP1, X, #16
+    add       X, X, UX
+    lsl       TMP2, TMP1, #2
+    add       TMP1, TOP, TMP2
+    asr       TMP2, X, #16
+    add       X, X, UX
+    lsl       TMP3, TMP2, #2
+    add       TMP2, TOP, TMP3
+        umlal     v1.4s, v19.4h, v31.4h
+        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+        mov       v30.d[0], v15.d[0]
+        mov       v31.d[0], v15.d[1]
+        ushll     v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
+        umlsl     v2.4s, v20.4h, v30.4h
+        umlal     v2.4s, v21.4h, v30.4h
+        ushll     v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
+    ld1       {v20.2s}, [TMP1], STRIDE
+        umlsl     v3.4s, v22.4h, v31.4h
+        umlal     v3.4s, v23.4h, v31.4h
+    ld1       {v21.2s}, [TMP1]
+    umull     v8.8h, v20.8b, v28.8b
+    umlal     v8.8h, v21.8b, v29.8b
+    mov       v16.d[0], v8.d[0]
+    mov       v17.d[0], v8.d[1]
+        shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+        shrn      v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+        mov       v0.d[1], v1.d[0]
+        shrn      v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+    ld1       {v22.2s}, [TMP2], STRIDE
+        shrn      v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
+        mov       v2.d[0], v4.d[0]
+        mov       v2.d[1], v5.d[0]
+        add       v12.8h, v12.8h, v13.8h
+    ld1       {v23.2s}, [TMP2]
+    umull     v9.8h, v22.8b, v28.8b
+    asr       TMP3, X, #16
+    add       X, X, UX
+    lsl       TMP4, TMP3, #2
+    add       TMP3, TOP, TMP4
+    asr       TMP4, X, #16
+    add       X, X, UX
+    lsl       TMP1, TMP4, #2
+    add       TMP4, TOP, TMP1
+    umlal     v9.8h, v23.8b, v29.8b
+    mov       v18.d[0], v9.d[0]
+    mov       v19.d[0], v9.d[1]
+    ld1       {v22.2s}, [TMP3], STRIDE
+        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
+        mov       v30.d[0], v15.d[0]
+        mov       v31.d[0], v15.d[1]
+    ld1       {v23.2s}, [TMP3]
+    umull     v10.8h, v22.8b, v28.8b
+    umlal     v10.8h, v23.8b, v29.8b
+    mov       v20.d[0], v10.d[0]
+    mov       v21.d[0], v10.d[1]
+        xtn       v6.8b, v0.8h
+    ushll     v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
+        xtn       v7.8b, v2.8h
+    umlsl     v0.4s, v16.4h, v30.4h
+    umlal     v0.4s, v17.4h, v30.4h
+    prfm      pldl2strm, [TMP4, PF_OFFS]
+    ld1       {v16.2s}, [TMP4], STRIDE
+        add       v12.8h, v12.8h, v13.8h
+    ld1       {v17.2s}, [TMP4]
+    prfm      pldl2strm, [TMP4, PF_OFFS]
+    umull     v11.8h, v16.8b, v28.8b
+    umlal     v11.8h, v17.8b, v29.8b
+    mov       v22.d[0], v11.d[0]
+    mov       v23.d[0], v11.d[1]
+        st1       {v6.2s, v7.2s}, [OUT], #16
+    ushll     v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
+    umlsl     v1.4s, v18.4h, v31.4h
+.endm
+
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+    2, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+    1, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+    1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
new file mode 100644
index 0000000..97cde5d
--- /dev/null
+++ b/pixman/pixman-arma64-neon-asm.h
@@ -0,0 +1,1288 @@ 
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combination of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp or 32bpp color formats is supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encouraging the use of software pipelining for better instruction
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixel processing. See the 'pixman-arma64-neon-asm.S' file for usage
+ * examples.
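+ *
+ * The provided macros follow a head/tail/tail_head pattern: 'head' starts
+ * processing one block of pixels, 'tail' finishes it, and 'tail_head'
+ * interleaves the tail of the current block with the head of the next one
+ * (together with the loads, stores and prefetch), which is what makes
+ * software pipelining of the main loop possible.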
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for the 'generate_composite_function' macro which are used
+ * to tune the behavior of the generated functions.
+ */
+.set FLAG_DST_WRITEONLY,       0
+.set FLAG_DST_READWRITE,       1
+.set FLAG_DEINTERLEAVE_32BPP,  2
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
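+
+/*
+ * For example, 'pixld 8, 16, 0, DST_R' (eight 16bpp pixels, base register
+ * v0) goes through pixldst/pixldst2 and roughly expands to
+ *
+ *   ld1 {v2.4h, v3.4h}, [DST_R], #16
+ *
+ * i.e. a 16-byte transfer lands in registers basereg+2 and basereg+3.
+ */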
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+    op {v&reg1&.&elem_size}, [&mem_operand&], #8
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+    op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
+    op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+    .if elem_size==32
+        pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+    .elseif elem_size==16
+        pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+    .else
+        pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+    .endif
+.elseif numbytes == 16
+    .if elem_size==32
+          pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
+    .elseif elem_size==16
+          pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
+    .else
+          pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
+    .endif
+.elseif numbytes == 8
+    .if elem_size==32
+        pixldst1 op, 2s, %(basereg+1), mem_operand, abits
+    .elseif elem_size==16
+        pixldst1 op, 4h, %(basereg+1), mem_operand, abits
+    .else
+        pixldst1 op, 8b, %(basereg+1), mem_operand, abits
+    .endif
+.elseif numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+        pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
+    .elseif elem_size == 16
+        pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
+        pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
+    .else
+        pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
+        pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
+        pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
+        pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
+    .endif
+.elseif numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
+    .else
+        pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
+        pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
+    .endif
+.elseif numbytes == 1
+        pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
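+/*
+ * VX and UNIT_X are 16.16 fixed-point: each fetch uses (VX >> 16) as the
+ * source pixel index and then advances VX by UNIT_X, while the
+ * adds/bmi/subs/bpl sequences wrap VX by SRC_WIDTH_FIXED so that the
+ * coordinate stays within the source width.
+ */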
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    asr     TMP1, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP1, #1
+    add     TMP1, mem_operand, DUMMY
+    asr     TMP2, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP2, #1
+    add     TMP2, mem_operand, DUMMY
+    ld1     {v&reg1&.h}[0], [TMP1]
+    asr     TMP1, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP1, #1
+    add     TMP1, mem_operand, DUMMY
+    ld1     {v&reg1&.h}[1], [TMP2]
+    asr     TMP2, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP2, #1
+    add     TMP2, mem_operand, DUMMY
+    ld1     {v&reg1&.h}[2], [TMP1]
+    ld1     {v&reg1&.h}[3], [TMP2]
+.elseif elem_size == 32
+    asr     TMP1, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP1, #2
+    add     TMP1, mem_operand, DUMMY
+    asr     TMP2, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP2, #2
+    add     TMP2, mem_operand, DUMMY
+    ld1     {v&reg1&.s}[0], [TMP1]
+    ld1     {v&reg1&.s}[1], [TMP2]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* elem_size == 32 */
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    sub     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    ld1     {v&reg1&.s}[0], [TMP1]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    ld1     {v&reg2&.s}[0], [TMP2, :32]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    ld1     {v&reg1&.s}[1], [TMP1]
+    ld1     {v&reg2&.s}[1], [TMP2]
+.else
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    asr     TMP1, VX, #16
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP1, #1
+    add     TMP1, mem_operand, DUMMY
+    ld1     {v&reg1&.h}[idx], [TMP1]
+.elseif elem_size == 32
+    asr     DUMMY, VX, #16
+    mov     TMP1, DUMMY
+    adds    VX, VX, UNIT_X
+    bmi     55f
+5:  subs    VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+55:
+    lsl     DUMMY, TMP1, #2
+    add     TMP1, mem_operand, DUMMY
+    ld1     {v&reg1&.s}[idx], [TMP1]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+    umov DUMMY, v16.d[0]
+    uzp1 v16.8b,     v&reg1&.8b, v&reg2&.8b
+    uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
+    mov  v&reg1&.8b, v16.8b
+    mov  v16.d[0], DUMMY
+.endm
+
+.macro vzip8 reg1, reg2
+    umov DUMMY, v16.d[0]
+    zip1 v16.8b,     v&reg1&.8b, v&reg2&.8b
+    zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
+    mov  v&reg1&.8b, v16.8b
+    mov  v16.d[0], DUMMY
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that the
+ * cache preload logic is mostly independent from the rest of the pixel
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching the optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (it can still prefetch a bit more
+ * data on the right side of the image - not a big issue, and possibly even
+ * helpful when rendering text glyphs). An additional trick is the use of a
+ * plain load instruction (LDRSB here) for prefetch instead of a PLD-style
+ * hint (PRFM here) when moving to the next line: there is a high chance of
+ * a TLB miss in that case, and a pure prefetch hint would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, thanks to the separate pipeline and
+ * instruction queue of the NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON code and be completely shadowed by it.
+ * Thus we get no performance overhead at all (*). This is a very nice
+ * feature of Cortex-A8, if used wisely. There is no hardware prefetcher,
+ * but some rather advanced prefetch logic can still be implemented in
+ * software for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixel processing like a simple copy. Anyway, having prefetch is a must
+ * when working with graphics data.
+ */
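+/*
+ * In the 'advanced' prefetch code below, PF_X tracks the current x position
+ * (in pixels) and is compared against ORIG_W, PF_CTL packs the prefetch
+ * control counters, PF_SRC/PF_DST/PF_MASK hold the prefetch pointers for the
+ * three images, and DUMMY is a scratch register. The in-line prefetches are
+ * issued with 'prfm pldl2strm', while the next-scanline touch is done with
+ * 'ldrsb' as described above.
+ */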
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+    a x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    PF tst PF_CTL, #0xF
+    PF beq 71f
+    PF add PF_X, PF_X, #boost_increment
+    PF sub PF_CTL, PF_CTL, #1
+71:
+    PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+    PF lsl DUMMY, PF_X, #src_bpp_shift
+    PF prfm pldl2strm, [PF_SRC, DUMMY]
+.endif
+.if dst_r_bpp != 0
+    PF lsl DUMMY, PF_X, #dst_bpp_shift
+    PF prfm pldl2strm, [PF_DST, DUMMY]
+.endif
+.if mask_bpp_shift >= 0
+    PF lsl DUMMY, PF_X, #mask_bpp_shift
+    PF prfm pldl2strm, [PF_MASK, DUMMY]
+.endif
+    PF ble 71f
+    PF sub PF_X, PF_X, ORIG_W
+    PF subs PF_CTL, PF_CTL, #0x10
+71:
+    PF ble 72f
+.if src_bpp_shift >= 0
+    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
+    PF ldrsb DUMMY, [PF_SRC, DUMMY]
+    PF add PF_SRC, PF_SRC, #1
+.endif
+.if dst_r_bpp != 0
+    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
+    PF ldrsb DUMMY, [PF_DST, DUMMY]
+    PF add PF_DST, PF_DST, #1
+.endif
+.if mask_bpp_shift >= 0
+    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
+    PF ldrsb DUMMY, [PF_MASK, DUMMY]
+    PF add PF_MASK, PF_MASK, #1
+.endif
+72:
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+    prfm pldl2strm, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+    prfm pldl2strm, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    prfm pldl2strm, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When the destination
+ * buffer uses a 24bpp format, this step is skipped (the power-of-two
+ * alignment chunks below cannot be expressed in whole 3-byte pixels).
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24
+    tst         DST_R, #0xF
+    beq         52f
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         51f
+.endif
+    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    add         DST_R, DST_R, #lowbit
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+51:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit
+    beq         51f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+51:
+.endif
+.endr
+.endif
+52:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing operates on pixblock_size
+ * pixels at a time, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * cache_preload_flag - allows prefetch to be suppressed
+ *                      when set to 0
+ * dst_aligned_flag   - selects whether destination buffer
+ *                      is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    tst         W, #(pixblock_size - 1)
+    beq         52f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         51f
+    pixld_src   chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+51:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         51f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+51:
+.endif
+.endr
+52:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+    mov         W, ORIG_W
+    lsl         DUMMY, DST_STRIDE, #dst_bpp_shift
+    add         DST_W, DST_W, DUMMY
+.if src_bpp != 0
+    lsl         DUMMY, SRC_STRIDE, #src_bpp_shift
+    add         SRC, SRC, DUMMY
+.endif
+.if mask_bpp != 0
+    lsl         DUMMY, MASK_STRIDE, #mask_bpp_shift
+    add         MASK, MASK, DUMMY
+.endif
+.if (dst_w_bpp != 24)
+    lsl         DUMMY, W, #dst_bpp_shift
+    sub         DST_W, DST_W, DUMMY
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    lsl         DUMMY, W, #src_bpp_shift
+    sub         SRC, SRC, DUMMY
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    lsl         DUMMY, W, #mask_bpp_shift
+    sub         MASK, MASK, DUMMY
+.endif
+    subs        H, H, #1
+    mov         DST_R, DST_W
+    bge         start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * v0, v1, v2, v3     - reserved for loading source pixel data
+ * v4, v5, v6, v7     - reserved for loading destination pixel data
+ * v24, v25, v26, v27 - reserved for loading mask pixel data
+ * v28, v29, v30, v31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    pixman_asm_function fname
+    stp         x29, x30, [sp, -16]!
+    mov         x29, sp
+    sub         sp,   sp, 232  /* push all registers */
+    sub         x29, x29, 64
+    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
+    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
+    stp          x8,   x9, [x29, -80]
+    stp         x10,  x11, [x29, -96]
+    stp         x12,  x13, [x29, -112]
+    stp         x14,  x15, [x29, -128]
+    stp         x16,  x17, [x29, -144]
+    stp         x18,  x19, [x29, -160]
+    stp         x20,  x21, [x29, -176]
+    stp         x22,  x23, [x29, -192]
+    stp         x24,  x25, [x29, -208]
+    stp         x26,  x27, [x29, -224]
+    str         x28, [x29, -232]
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
+ * has to be used instead of ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req       x0      /* width (is updated during processing) */
+    H           .req       x1      /* height (is updated during processing) */
+    DST_W       .req       x2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req       x3      /* destination image stride */
+    SRC         .req       x4      /* source buffer pointer */
+    SRC_STRIDE  .req       x5      /* source image stride */
+    MASK        .req       x6      /* mask pointer */
+    MASK_STRIDE .req       x7      /* mask stride */
+
+    DST_R       .req       x8      /* destination buffer pointer for reads */
+
+    PF_CTL      .req       x9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req       x10     /* pixel index in a scanline for current */
+                                    /* prefetch position */
+    PF_SRC      .req       x11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req       x12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req       x13     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+
+    ORIG_W      .req       x14     /* saved original width */
+    DUMMY       .req       x15     /* temporary register */
+
+    sxtw        x0, w0
+    sxtw        x1, w1
+    sxtw        x3, w3
+    sxtw        x5, w5
+    sxtw        x7, w7
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+    PF mov      PF_X, #0
+    mov         DST_R, DST_W
+
+.if src_bpp == 24
+    sub         SRC_STRIDE, SRC_STRIDE, W
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF lsl      DUMMY, H, #4
+    PF mov      PF_CTL, DUMMY
+    PF add      PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+    subs        H, H, #1
+    mov         ORIG_W, W
+    blt         9f
+    cmp         W, #(pixblock_size * 2)
+    blt         800f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         200f
+
+100:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         100b
+
+200:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+    cleanup
+1000:
+    /* pop all registers */
+    sub         x29, x29, 64
+    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    ldp          x8,   x9, [x29, -80]
+    ldp         x10,  x11, [x29, -96]
+    ldp         x12,  x13, [x29, -112]
+    ldp         x14,  x15, [x29, -128]
+    ldp         x16,  x17, [x29, -144]
+    ldp         x18,  x19, [x29, -160]
+    ldp         x20,  x21, [x29, -176]
+    ldp         x22,  x23, [x29, -192]
+    ldp         x24,  x25, [x29, -208]
+    ldp         x26,  x27, [x29, -224]
+    ldr         x28, [x29, -232]
+    mov         sp, x29
+    ldp         x29, x30, [sp], 16
+    ret  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+800:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         100f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+100:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 800b
+9:
+    cleanup
+    /* pop all registers */
+    sub         x29, x29, 64
+    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    ldp          x8,   x9, [x29, -80]
+    ldp         x10,  x11, [x29, -96]
+    ldp         x12,  x13, [x29, -112]
+    ldp         x14,  x15, [x29, -128]
+    ldp         x16,  x17, [x29, -144]
+    ldp         x18,  x19, [x29, -160]
+    ldp         x20,  x21, [x29, -176]
+    ldp         x22,  x23, [x29, -192]
+    ldp         x24,  x25, [x29, -208]
+    ldp         x26,  x27, [x29, -224]
+    ldr         x28, [x29, -232]
+    mov         sp, x29
+    ldp         x29, x30, [sp], 16
+    ret  /* exit */
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
+
+/*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+
+    pixman_asm_function fname
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        x0
+    DST_W       .req        x1
+    SRC         .req        x2
+    VX          .req        x3
+    UNIT_X      .req        x4
+    SRC_WIDTH_FIXED .req    x5
+    MASK        .req        x6
+    TMP1        .req        x8
+    TMP2        .req        x9
+    DST_R       .req        x10
+    DUMMY       .req        x30
+
+    .macro pixld_src x:vararg
+        pixld_s x
+    .endm
+
+    sxtw        x0, w0
+    sxtw        x3, w3
+    sxtw        x4, w4
+    sxtw        x5, w5
+
+    stp         x29, x30, [sp, -16]!
+    mov         x29, sp
+    sub         sp, sp, 88
+    sub         x29, x29, 64
+    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    stp         x8, x9, [x29, -80]
+    str         x10, [x29, -88]
+.else
+    /*
+     * Assign symbolic names to registers
+     */
+    W           .req        x0      /* width (is updated during processing) */
+    DST_W       .req        x1      /* destination buffer pointer for writes */
+    SRC         .req        x2      /* source buffer pointer */
+    MASK        .req        x3      /* mask pointer */
+    DST_R       .req        x4      /* destination buffer pointer for reads */
+    DUMMY       .req        x30
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+
+    sxtw        x0, w0
+
+    stp         x29, x30, [sp, -16]!
+    mov         x29, sp
+    sub         sp, sp, 64
+    sub         x29, x29, 64
+    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
+    init
+    mov         DST_R, DST_W
+
+    cmp         W, #pixblock_size
+    blt         800f
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    subs        W, W, #pixblock_size
+    blt         700f
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         200f
+100:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         100b
+200:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+700:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+.if use_nearest_scaling != 0
+    sub         x29, x29, 64
+    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    ldp         x8, x9, [x29, -80]
+    ldr         x10, [x29, -88]
+    mov         sp, x29
+    ldp         x29, x30, [sp], 16
+    ret  /* exit */
+.else
+    sub         x29, x29, 64
+    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    mov         sp, x29
+    ldp         x29, x30, [sp], 16
+    ret  /* exit */
+.endif
+800:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+.if use_nearest_scaling != 0
+    sub         x29, x29, 64
+    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    ldp         x8, x9, [x29, -80]
+    ldr         x10, [x29, -88]
+    mov         sp, x29
+    ldp         x29, x30, [sp], 16
+    ret  /* exit */
+
+    .unreq      DUMMY
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+    .unreq      SRC_WIDTH_FIXED
+
+.else
+    sub         x29, x29, 64
+    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
+    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
+    mov          sp, x29
+    ldp          x29, x30, [sp], 16
+    ret  /* exit */
+
+    .unreq      DUMMY
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores v8-v15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+.endm
+
+.macro default_cleanup_need_all_regs
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ *          value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+    shrn        &out_r&.8b, &in&.8h,    #8
+    shrn        &out_g&.8b, &in&.8h,    #3
+    sli         &in&.8h,    &in&.8h,    #5
+    movi        &out_a&.8b, #255
+    sri         &out_r&.8b, &out_r&.8b, #5
+    sri         &out_g&.8b, &out_g&.8b, #6
+    shrn        &out_b&.8b, &in&.8h,    #2
+.endm
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+    shrn        &out_r&.8b, &in&.8h,    #8
+    shrn        &out_g&.8b, &in&.8h,    #3
+    sli         &in&.8h,    &in&.8h,    #5
+    sri         &out_r&.8b, &out_r&.8b, #5
+    sri         &out_g&.8b, &out_g&.8b, #6
+    shrn        &out_b&.8b, &in&.8h,    #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
+ * pixels packed in 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2)
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+    ushll       &tmp1&.8h, &in_g&.8b, #7
+    shl         &tmp1&.8h, &tmp1&.8h, #1
+    ushll       &out&.8h,  &in_r&.8b, #7
+    shl         &out&.8h,  &out&.8h,  #1
+    ushll       &tmp2&.8h, &in_b&.8b, #7
+    shl         &tmp2&.8h, &tmp2&.8h, #1
+    sri         &out&.8h, &tmp1&.8h, #5
+    sri         &out&.8h, &tmp2&.8h, #11
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+    shl         &out0&.4h, &in&.4h,   #5  /* G top 6 bits */
+    shl         &tmp&.4h,  &in&.4h,   #11 /* B top 5 bits */
+    sri         &in&.4h,   &in&.4h,   #5  /* R is ready in top bits */
+    sri         &out0&.4h, &out0&.4h, #6  /* G is ready in top bits */
+    sri         &tmp&.4h,  &tmp&.4h,  #5  /* B is ready in top bits */
+    ushr        &out1&.4h, &in&.4h,   #8  /* R is in place */
+    sri         &out0&.4h, &tmp&.4h,  #8  /* G & B is in place */
+    zip1        &tmp&.4h,  &out0&.4h, &out1&.4h  /* everything is in place */
+    zip2        &out1&.4h, &out0&.4h, &out1&.4h
+    mov         &out0&.d[0], &tmp&.d[0]
+.endm
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
old mode 100644
new mode 100755
index 73a5414..81e0f23
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -607,6 +607,11 @@  pixman_implementation_t *
 _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
 #endif

+#ifdef USE_ARM_A64_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
 #ifdef USE_MIPS_DSPR2
 pixman_implementation_t *
 _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);

Comments

> The 'advanced' prefetch type is implemented by having some branchless ARM code

If the prefetch code relies on being branch-less, it cannot be ported to
aarch64 as-is, since A64 has no conditionally executed arithmetic such as
subge / subges.

If so, we could (or perhaps should) drop all of the prefetch-related code,
because the extra branches might cause a performance regression rather than
deliver the benefit of prefetching.
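
To make the point concrete, here is the pattern in question. The A64 lines
are taken from the cache_preload macro in this patch; the A32 lines are only
a rough sketch of the predicated arithmetic the existing 32-bit template
relies on (symbolic register names reused purely for readability):

    /* A32 template, roughly: predicated arithmetic, no branch */
    PF subge  PF_X,   PF_X,   ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10

    /* A64 port (cache_preload in this patch): the same step needs a
       conditional branch, because sub/subs cannot be predicated; csel
       could remove the branch for the subtraction itself, but it cannot
       conditionally update the flags the way subges does */
    PF ble 71f
    PF sub    PF_X,   PF_X,   ORIG_W
    PF subs   PF_CTL, PF_CTL, #0x10
71: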

We could also remove all of the "tail-head" optimizations, which exist only
to make the most of prefetching. The "tail-head" code is very complicated,
hard to understand and hard to maintain. If we could remove it, the assembly
would be slimmer and easier to maintain.
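
As a rough sketch (hypothetical, reusing the macro names from this template,
with alignment and trailing-pixel handling omitted), a non-pipelined main
loop would reduce to:

    0:  /* load, head, tail, store - one block per iteration */
        pixld_a     pixblock_size, dst_r_bpp, \
                    (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
        fetch_src_pixblock
        fetch_mask_pixblock
        process_pixblock_head
        process_pixblock_tail
        pixst_a     pixblock_size, dst_w_bpp, \
                    (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
        subs        W, W, #pixblock_size
        bge         0b

i.e. no process_pixblock_tail_head and no interleaving of the loads/stores
with the arithmetic of the neighbouring pixel block.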

Of course, this change should not be applied to the original aarch32 NEON
code, where it could cause performance regressions on some cores. But for
aarch64, would it be a change worth considering?


On 2 April 2016 at 21:30, Mizuki Asakura <ed6e117f@gmail.com> wrote:
> Since aarch64 has different neon syntax from aarch32 and has no
> support for (older) arm-simd,
> there are no SIMD accelerations for pixman on aarch64.
>
> We need new implementations.
>
>
> This patch only contains STD_FAST_PATH codes, not scaling (nearest,
> bilinear) codes.
> After completing optimization this patch, scaling related codes should be done.
>
>
> This is a first step towards optimizations for aarch64-neon.
>
>
> Added: https://bugs.freedesktop.org/show_bug.cgi?id=94758
> Signed-off-by: Mizuki Asakura <ed6e117f@gmail.com>
> ---
>  configure.ac                    |   34 +
>  pixman/Makefile.am              |   14 +
>  pixman/pixman-arm-neon.c        |   10 +-
>  pixman/pixman-arm.c             |    6 +
>  pixman/pixman-arma64-neon-asm.S | 3771 +++++++++++++++++++++++++++++++++++++++
>  pixman/pixman-arma64-neon-asm.h | 1288 +++++++++++++
>  pixman/pixman-private.h         |    5 +
>  7 files changed, 5127 insertions(+), 1 deletion(-)
>  create mode 100644 pixman/pixman-arma64-neon-asm.S
>  create mode 100644 pixman/pixman-arma64-neon-asm.h
>
> diff --git a/configure.ac b/configure.ac
> old mode 100644
> new mode 100755
> index 6b2134e..bb0192a
> --- a/configure.ac
> +++ b/configure.ac
> @@ -667,6 +667,40 @@ if test $enable_arm_neon = yes && test
> $have_arm_neon = no ; then
>     AC_MSG_ERROR([ARM NEON intrinsics not detected])
>  fi
>
> +dnl ==========================================================================
> +dnl Check if assembler is gas compatible and supports ARM-a64 NEON instructions
> +have_arm_a64_neon=no
> +AC_MSG_CHECKING(whether to use ARM A64 NEON assembler)
> +xserver_save_CFLAGS=$CFLAGS
> +CFLAGS="-x assembler-with-cpp $CFLAGS"
> +AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
> +.text
> +.arch armv8-a
> +.altmacro
> +prfm pldl2strm, [x0]
> +xtn v0.8b, v0.8h]])], have_arm_a64_neon=yes)
> +CFLAGS=$xserver_save_CFLAGS
> +
> +AC_ARG_ENABLE(arm-a64-neon,
> +   [AC_HELP_STRING([--disable-arm-a64-neon],
> +                   [disable ARM A64 NEON fast paths])],
> +   [enable_arm_a64_neon=$enableval], [enable_arm_a64_neon=auto])
> +
> +if test $enable_arm_a64_neon = no ; then
> +   have_arm_a64_neon=disabled
> +fi
> +
> +if test $have_arm_a64_neon = yes ; then
> +   AC_DEFINE(USE_ARM_A64_NEON, 1, [use ARM A64_NEON assembly optimizations])
> +fi
> +
> +AM_CONDITIONAL(USE_ARM_A64_NEON, test $have_arm_a64_neon = yes)
> +
> +AC_MSG_RESULT($have_arm_a64_neon)
> +if test $enable_arm_a64_neon = yes && test $have_arm_a64_neon4 = no ; then
> +   AC_MSG_ERROR([ARM A64 NEON intrinsics not detected])
> +fi
> +
>  dnl ===========================================================================
>  dnl Check for IWMMXT
>
> diff --git a/pixman/Makefile.am b/pixman/Makefile.am
> old mode 100644
> new mode 100755
> index 581b6f6..1b1a8ac
> --- a/pixman/Makefile.am
> +++ b/pixman/Makefile.am
> @@ -94,6 +94,20 @@ libpixman_1_la_LIBADD += libpixman-arm-neon.la
>  ASM_CFLAGS_arm_neon=
>  endif
>
> +# arm a64 neon code
> +if USE_ARM_A64_NEON
> +noinst_LTLIBRARIES += libpixman-arma64-neon.la
> +libpixman_arma64_neon_la_SOURCES = \
> +        pixman-arm-neon.c    \
> +        pixman-arm-common.h    \
> +        pixman-arma64-neon-asm.S    \
> +        pixman-arm-asm.h    \
> +        pixman-arma64-neon-asm.h
> +libpixman_1_la_LIBADD += libpixman-arma64-neon.la
> +
> +ASM_CFLAGS_arm_neon=
> +endif
> +
>  # iwmmxt code
>  if USE_ARM_IWMMXT
>  libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
> diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
> old mode 100644
> new mode 100755
> index be761c9..cef8c90
> --- a/pixman/pixman-arm-neon.c
> +++ b/pixman/pixman-arm-neon.c
> @@ -121,6 +121,7 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon,
> over_8888_8_0565,
>  PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
>                                          uint16_t, 1, uint8_t, 1, uint16_t, 1)
>
> +#ifndef __aarch64__
>  PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
>                                          uint32_t, uint32_t)
>  PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
> @@ -160,6 +161,7 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST
> (SKIP_ZERO_SRC, neon, 8888_8_8888, OV
>                                              uint32_t, uint32_t)
>  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon,
> 8888_8_8888, ADD,
>                                              uint32_t, uint32_t)
> +#endif
>
>  void
>  pixman_composite_src_n_8_asm_neon (int32_t   w,
> @@ -194,7 +196,7 @@ arm_neon_fill (pixman_implementation_t *imp,
>             uint32_t                 _xor)
>  {
>      /* stride is always multiple of 32bit units in pixman */
> -    uint32_t byte_stride = stride * sizeof(uint32_t);
> +    int32_t byte_stride = stride * sizeof(uint32_t);
>
>      switch (bpp)
>      {
> @@ -362,6 +364,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
>      PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8,
> neon_composite_out_reverse_8_8888),
>      PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8,
> neon_composite_out_reverse_8_8888),
>
> +#ifndef __aarch64__
>      SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
>      SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
>      SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
> @@ -420,10 +423,12 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
>
>      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8,
> neon_8888_8_8888),
>      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8,
> neon_8888_8_8888),
> +#endif
>
>      { PIXMAN_OP_NONE },
>  };
>
> +#ifndef __aarch64__
>  #define BIND_COMBINE_U(name)                                             \
>  void                                                                     \
>  pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
> @@ -454,6 +459,7 @@ neon_combine_##name##_u (pixman_implementation_t
> *imp,                   \
>  BIND_COMBINE_U (over)
>  BIND_COMBINE_U (add)
>  BIND_COMBINE_U (out_reverse)
> +#endif
>
>  pixman_implementation_t *
>  _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
> @@ -461,9 +467,11 @@ _pixman_implementation_create_arm_neon
> (pixman_implementation_t *fallback)
>      pixman_implementation_t *imp =
>      _pixman_implementation_create (fallback, arm_neon_fast_paths);
>
> +#ifndef __aarch64__
>      imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
>      imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
>      imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
> +#endif
>
>      imp->blt = arm_neon_blt;
>      imp->fill = arm_neon_fill;
> diff --git a/pixman/pixman-arm.c b/pixman/pixman-arm.c
> old mode 100644
> new mode 100755
> index 23374e4..734cbea
> --- a/pixman/pixman-arm.c
> +++ b/pixman/pixman-arm.c
> @@ -221,5 +221,11 @@ _pixman_arm_get_implementations
> (pixman_implementation_t *imp)
>      imp = _pixman_implementation_create_arm_neon (imp);
>  #endif
>
> +#ifdef USE_ARM_A64_NEON
> +    /* neon is a part of aarch64 */
> +    if (!_pixman_disabled ("arm-neon"))
> +        imp = _pixman_implementation_create_arm_neon (imp);
> +#endif
> +
>      return imp;
>  }
> diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
> new file mode 100644
> index 0000000..f60d1b4
> --- /dev/null
> +++ b/pixman/pixman-arma64-neon-asm.S
> @@ -0,0 +1,3771 @@
> +/*
> + * Copyright © 2009 Nokia Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + *
> + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
> + */
> +
> +/*
> + * This file contains implementations of NEON optimized pixel processing
> + * functions. There is no full and detailed tutorial, but some functions
> + * (those which are exposing some new or interesting features) are
> + * extensively commented and can be used as examples.
> + *
> + * You may want to have a look at the comments for following functions:
> + *  - pixman_composite_over_8888_0565_asm_neon
> + *  - pixman_composite_over_n_8_0565_asm_neon
> + */
> +
> +/* Prevent the stack from becoming executable for no reason... */
> +#if defined(__linux__) && defined(__ELF__)
> +.section .note.GNU-stack,"",%progbits
> +#endif
> +
> +.text
> +.arch armv8-a
> +
> +.altmacro
> +.p2align 2
> +
> +#include "pixman-private.h"
> +#include "pixman-arm-asm.h"
> +#include "pixman-arma64-neon-asm.h"
> +
> +/* Global configuration options and preferences */
> +
> +/*
> + * The code can optionally make use of unaligned memory accesses to improve
> + * performance of handling leading/trailing pixels for each scanline.
> + * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
> + * example in linux if unaligned memory accesses are not configured to
> + * generate exceptions.
> + */
> +.set RESPECT_STRICT_ALIGNMENT, 1
> +
> +/*
> + * Set default prefetch type. There is a choice between the following options:
> + *
> + * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
> + * as NOP to workaround some HW bugs or for whatever other reason)
> + *
> + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
> + * advanced prefetch introduces heavy overhead)
> + *
> + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
> + * which can run ARM and NEON instructions simultaneously so that extra ARM
> + * instructions do not add (many) extra cycles, but improve prefetch
> efficiency)
> + *
> + * Note: some types of function can't support advanced prefetch and fallback
> + *       to simple one (those which handle 24bpp pixels)
> + */
> +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
> +
> +/* Prefetch distance in pixels for simple prefetch */
> +.set PREFETCH_DISTANCE_SIMPLE, 64
> +
> +/*
> + * Implementation of pixman_composite_over_8888_0565_asm_neon
> + *
> + * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
> + * performs OVER compositing operation. Function fast_composite_over_8888_0565
> + * from pixman-fast-path.c does the same in C and can be used as a reference.
> + *
> + * First we need to have some NEON assembly code which can do the actual
> + * operation on the pixels and provide it to the template macro.
> + *
> + * Template macro quite conveniently takes care of emitting all the necessary
> + * code for memory reading and writing (including quite tricky cases of
> + * handling unaligned leading/trailing pixels), so we only need to deal with
> + * the data in NEON registers.
> + *
> + * NEON register allocation in general is recommended to be the following:
> + * v0,  v1,  v2,  v3  - contain loaded source pixel data
> + * v4,  v5,  v6,  v7  - contain loaded destination pixels (if they are needed)
> + * v24, v25, v26, v27 - contain loading mask pixel data (if mask is used)
> + * v28, v29, v30, v31 - place for storing the result (destination pixels)
> + *
> + * As can be seen above, four 64-bit NEON registers are used for keeping
> + * intermediate pixel data and up to 8 pixels can be processed in one step
> + * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
> + *
> + * This particular function uses the following registers allocation:
> + * v0,  v1,  v2,  v3  - contain loaded source pixel data
> + * v4,  v5            - contain loaded destination pixels (they are needed)
> + * v28, v29           - place for storing the result (destination pixels)
> + */
> +
> +/*
> + * Step one. We need to have some code to do some arithmetics on pixel data.
> + * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
> + * back-to-back, they take pixel data from {v0, v1, v2, v3} and {v4, v5},
> + * perform all the needed calculations and write the result to {v28, v29}.
> + * The rationale for having two macros and not just one will be explained
> + * later. In practice, any single monolithic function which does the work can
> + * be split into two parts in any arbitrary way without affecting correctness.
> + *
> + * There is one special trick here too. Common template macro can optionally
> + * make our life a bit easier by doing R, G, B, A color components
> + * deinterleaving for 32bpp pixel formats (and this feature is used in
> + * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
> + * instead of having 8 packed pixels in {v0, v1, v2, v3} registers, we
> + * actually use v0 register for blue channel (a vector of eight 8-bit
> + * values), v1 register for green, v2 for red and v3 for alpha. This
> + * simple conversion can be also done with a few NEON instructions:
> + *
> + * Packed to planar conversion: // vuzp8 is a wrapper macro
> + *  vuzp8 v0, v1
> + *  vuzp8 v2, v3
> + *  vuzp8 v1, v3
> + *  vuzp8 v0, v2
> + *
> + * Planar to packed conversion: // vzip8 is a wrapper macro
> + *  vzip8 v0, v2
> + *  vzip8 v1, v3
> + *  vzip8 v2, v3
> + *  vzip8 v0, v1
> + *
> + * But pixel can be loaded directly in planar format using LD4 / b NEON
> + * instruction. It is 1 cycle slower than LD1 / s, so this is not always
> + * desirable, that's why deinterleaving is optional.
> + *
> + * But anyway, here is the code:
> + */
> +
> +.macro pixman_composite_over_8888_0565_process_pixblock_head
> +    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
> +       and put data into v6 - red, v7 - green, v30 - blue */
> +    mov         v4.d[1], v5.d[0]
> +    shrn        v6.8b, v4.8h, #8
> +    shrn        v7.8b, v4.8h, #3
> +    sli         v4.8h, v4.8h, #5
> +    sri         v6.8b, v6.8b, #5
> +    mvn         v3.8b, v3.8b      /* invert source alpha */
> +    sri         v7.8b, v7.8b, #6
> +    shrn        v30.8b, v4.8h, #2
> +    /* now do alpha blending, storing results in 8-bit planar format
> +       into v20 - red, v23 - green, v22 - blue */
> +    umull       v10.8h, v3.8b, v6.8b
> +    umull       v11.8h, v3.8b, v7.8b
> +    umull       v12.8h, v3.8b, v30.8b
> +    urshr       v17.8h, v10.8h, #8
> +    urshr       v18.8h, v11.8h, #8
> +    urshr       v19.8h, v12.8h, #8
> +    raddhn      v20.8b, v10.8h, v17.8h
> +    raddhn      v23.8b, v11.8h, v18.8h
> +    raddhn      v22.8b, v12.8h, v19.8h
> +.endm
> +
> +.macro pixman_composite_over_8888_0565_process_pixblock_tail
> +    /* ... continue alpha blending */
> +    uqadd       v17.8b, v2.8b, v20.8b
> +    uqadd       v18.8b, v0.8b, v22.8b
> +    uqadd       v19.8b, v1.8b, v23.8b
> +    /* convert the result to r5g6b5 and store it into {v14} */
> +    ushll       v14.8h, v17.8b, #7
> +    sli         v14.8h, v14.8h, #1
> +    ushll       v8.8h, v19.8b, #7
> +    sli         v8.8h, v8.8h, #1
> +    ushll       v9.8h, v18.8b, #7
> +    sli         v9.8h, v9.8h, #1
> +    sri         v14.8h, v8.8h, #5
> +    sri         v14.8h, v9.8h, #11
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +/*
> + * OK, now we got almost everything that we need. Using the above two
> + * macros, the work can be done right. But now we want to optimize
> + * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
> + * a lot from good code scheduling and software pipelining.
> + *
> + * Let's construct some code, which will run in the core main loop.
> + * Some pseudo-code of the main loop will look like this:
> + *   head
> + *   while (...) {
> + *     tail
> + *     head
> + *   }
> + *   tail
> + *
> + * It may look a bit weird, but this setup allows to hide instruction
> + * latencies better and also utilize dual-issue capability more
> + * efficiently (make pairs of load-store and ALU instructions).
> + *
> + * So what we need now is a '*_tail_head' macro, which will be used
> + * in the core main loop. A trivial straightforward implementation
> + * of this macro would look like this:
> + *
> + *   pixman_composite_over_8888_0565_process_pixblock_tail
> + *   st1         {v28.4h, v29.4h}, [DST_W], #32
> + *   ld1         {v4.4h, v5.4h}, [DST_R], #16
> + *   ld4         {v0.2s, v1.2s, v2.2s, v3.2s}, [SRC], #32
> + *   pixman_composite_over_8888_0565_process_pixblock_head
> + *   cache_preload 8, 8
> + *
> + * Now it also got some VLD/VST instructions. We simply can't move from
> + * processing one block of pixels to the other one with just arithmetics.
> + * The previously processed data needs to be written to memory and new
> + * data needs to be fetched. Fortunately, this main loop does not deal
> + * with partial leading/trailing pixels and can load/store a full block
> + * of pixels in a bulk. Additionally, destination buffer is already
> + * 16 bytes aligned here (which is good for performance).
> + *
> + * New things here are DST_R, DST_W, SRC and MASK identifiers. These
> + * are the aliases for ARM registers which are used as pointers for
> + * accessing data. We maintain separate pointers for reading and writing
> + * destination buffer (DST_R and DST_W).
> + *
> + * Another new thing is 'cache_preload' macro. It is used for prefetching
> + * data into CPU L2 cache and improve performance when dealing with large
> + * images which are far larger than cache size. It uses one argument
> + * (actually two, but they need to be the same here) - number of pixels
> + * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
> + * details about this macro. Moreover, if good performance is needed
> + * the code from this macro needs to be copied into '*_tail_head' macro
> + * and mixed with the rest of code for optimal instructions scheduling.
> + * We are actually doing it below.
> + *
> + * Now after all the explanations, here is the optimized code.
> + * Different instruction streams (originating from '*_head', '*_tail'
> + * and 'cache_preload' macro) use different indentation levels for
> + * better readability. Actually taking the code from one of these
> + * indentation levels and ignoring a few LD/ST instructions would
> + * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
> + * macro!
> + */
> +
> +#if 1
> +
> +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
> +        uqadd       v17.8b, v2.8b, v20.8b
> +    ld1         {v4.4h, v5.4h}, [DST_R], #16
> +    mov         v4.d[1], v5.d[0]
> +        uqadd       v18.8b, v0.8b, v22.8b
> +        uqadd       v19.8b, v1.8b, v23.8b
> +    shrn        v6.8b, v4.8h, #8
> +    fetch_src_pixblock
> +    shrn        v7.8b, v4.8h, #3
> +    sli         v4.8h, v4.8h, #5
> +        ushll       v14.8h, v17.8b, #7
> +        sli         v14.8h, v14.8h, #1
> +                                    PF add PF_X, PF_X, #8
> +        ushll       v8.8h, v19.8b, #7
> +        sli         v8.8h, v8.8h,  #1
> +                                    PF tst PF_CTL, #0xF
> +    sri         v6.8b, v6.8b, #5
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +10:
> +    mvn         v3.8b, v3.8b
> +                                    PF beq 10f
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +    sri         v7.8b, v7.8b, #6
> +    shrn        v30.8b, v4.8h, #2
> +    umull       v10.8h, v3.8b, v6.8b
> +                                    PF lsl DUMMY, PF_X, #src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +    umull       v11.8h, v3.8b, v7.8b
> +    umull       v12.8h, v3.8b, v30.8b
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +        sri         v14.8h, v8.8h, #5
> +                                    PF cmp PF_X, ORIG_W
> +        ushll       v9.8h, v18.8b, #7
> +        sli         v9.8h, v9.8h, #1
> +    urshr       v17.8h, v10.8h, #8
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    urshr       v19.8h, v11.8h, #8
> +    urshr       v18.8h, v12.8h, #8
> +                                    PF ble 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +        sri         v14.8h, v9.8h, #11
> +        mov         v28.d[0], v14.d[0]
> +        mov         v29.d[0], v14.d[1]
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +10:
> +    raddhn      v20.8b, v10.8h, v17.8h
> +    raddhn      v23.8b, v11.8h, v19.8h
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +    raddhn      v22.8b, v12.8h, v18.8h
> +        st1         {v14.8h}, [DST_W], #16
> +.endm
> +
> +#else
> +
> +/* If we did not care much about the performance, we would just use this... */
> +.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
> +    pixman_composite_over_8888_0565_process_pixblock_tail
> +    st1         {v14.8h}, [DST_W], #16
> +    ld1         {v4.4h, v5.4h}, [DST_R], #16
> +    fetch_src_pixblock
> +    pixman_composite_over_8888_0565_process_pixblock_head
> +    cache_preload 8, 8
> +.endm
> +
> +#endif
> +
> +/*
> + * And now the final part. We are using 'generate_composite_function' macro
> + * to put all the stuff together. We are specifying the name of the function
> + * which we want to get, number of bits per pixel for the source, mask and
> + * destination (0 if unused, like mask in this case). Next come some bit
> + * flags:
> + *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
> + *                             and written, for write-only buffer we would use
> + *                             FLAG_DST_WRITEONLY flag instead
> + *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
> + *                             and separate color channels for 32bpp format.
> + * The next things are:
> + *  - the number of pixels processed per iteration (8 in this case, because
> + *    that's the maximum what can fit into four 64-bit NEON registers).
> + *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
> + *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
> + *    prefetch distance can be selected by running some benchmarks.
> + *
> + * After that we specify some macros, these are 'default_init',
> + * 'default_cleanup' here which are empty (but it is possible to have custom
> + * init/cleanup macros to be able to save/restore some extra NEON registers
> + * like d8-d15 or do anything else) followed by
> + * 'pixman_composite_over_8888_0565_process_pixblock_head',
> + * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
> + * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
> + * which we got implemented above.
> + *
> + * The last part is the NEON registers allocation scheme.
> + */
> +generate_composite_function \
> +    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_0565_process_pixblock_head, \
> +    pixman_composite_over_8888_0565_process_pixblock_tail, \
> +    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_0565_process_pixblock_head
> +    /* convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
> +       and put data into v6 - red, v7 - green, v30 - blue */
> +    mov         v4.d[1], v5.d[0]
> +    shrn        v6.8b, v4.8h, #8
> +    shrn        v7.8b, v4.8h, #3
> +    sli         v4.8h, v4.8h, #5
> +    sri         v6.8b, v6.8b, #5
> +    sri         v7.8b, v7.8b, #6
> +    shrn        v30.8b, v4.8h, #2
> +    /* now do alpha blending, storing results in 8-bit planar format
> +       into v20 - red, v23 - green, v22 - blue */
> +    umull       v10.8h, v3.8b, v6.8b
> +    umull       v11.8h, v3.8b, v7.8b
> +    umull       v12.8h, v3.8b, v30.8b
> +    urshr       v13.8h, v10.8h, #8
> +    urshr       v14.8h, v11.8h, #8
> +    urshr       v15.8h, v12.8h, #8
> +    raddhn      v20.8b, v10.8h, v13.8h
> +    raddhn      v23.8b, v11.8h, v14.8h
> +    raddhn      v22.8b, v12.8h, v15.8h
> +.endm
> +
> +.macro pixman_composite_over_n_0565_process_pixblock_tail
> +    /* ... continue alpha blending */
> +    uqadd       v17.8b, v2.8b, v20.8b
> +    uqadd       v18.8b, v0.8b, v22.8b
> +    uqadd       v19.8b, v1.8b, v23.8b
> +    /* convert the result to r5g6b5 and store it into {v14} */
> +    ushll       v14.8h, v17.8b, #7
> +    sli         v14.8h, v14.8h, #1
> +    ushll       v8.8h, v19.8b, #7
> +    sli         v8.8h, v8.8h, #1
> +    ushll       v9.8h, v18.8b, #7
> +    sli         v9.8h, v9.8h, #1
> +    sri         v14.8h, v8.8h, #5
> +    sri         v14.8h, v9.8h, #11
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instructions scheduling */
> +.macro pixman_composite_over_n_0565_process_pixblock_tail_head
> +    pixman_composite_over_n_0565_process_pixblock_tail
> +    ld1         {v4.4h, v5.4h}, [DST_R], #16
> +    st1         {v14.8h}, [DST_W], #16
> +    pixman_composite_over_n_0565_process_pixblock_head
> +    cache_preload 8, 8
> +.endm
> +
> +.macro pixman_composite_over_n_0565_init
> +    mov         v3.s[0], w4
> +    dup         v0.8b, v3.b[0]
> +    dup         v1.8b, v3.b[1]
> +    dup         v2.8b, v3.b[2]
> +    dup         v3.8b, v3.b[3]
> +    mvn         v3.8b, v3.8b      /* invert source alpha */
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_0565_init, \
> +    default_cleanup, \
> +    pixman_composite_over_n_0565_process_pixblock_head, \
> +    pixman_composite_over_n_0565_process_pixblock_tail, \
> +    pixman_composite_over_n_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_8888_0565_process_pixblock_head
> +    ushll       v8.8h,  v1.8b,  #7
> +    sli         v8.8h,  v8.8h,  #1
> +    ushll       v14.8h, v2.8b,  #7
> +    sli         v14.8h, v14.8h, #1
> +    ushll       v9.8h,  v0.8b,  #7
> +    sli         v9.8h,  v9.8h,  #1
> +.endm
> +
> +.macro pixman_composite_src_8888_0565_process_pixblock_tail
> +    sri         v14.8h, v8.8h, #5
> +    sri         v14.8h, v9.8h, #11
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
> +        sri         v14.8h, v8.8h, #5
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +    fetch_src_pixblock
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        sri         v14.8h, v9.8h, #11
> +        mov         v28.d[0], v14.d[0]
> +        mov         v29.d[0], v14.d[1]
> +                                    PF cmp PF_X, ORIG_W
> +                                    PF lsl DUMMY, PF_X, #src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +    ushll       v8.8h, v1.8b, #7
> +    sli         v8.8h, v8.8h, #1
> +        st1        {v14.8h}, [DST_W], #16
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    ushll       v14.8h, v2.8b, #7
> +    sli         v14.8h, v14.8h, #1
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +10:
> +    ushll       v9.8h, v0.8b, #7
> +    sli         v9.8h, v9.8h, #1
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_8888_0565_process_pixblock_head, \
> +    pixman_composite_src_8888_0565_process_pixblock_tail, \
> +    pixman_composite_src_8888_0565_process_pixblock_tail_head
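
The widening "ushll #7; sli #1" pairs above are simply a left shift by 8:
USHLL's immediate is limited to 0-7 for byte elements, so the shift is done in
two steps. In scalar terms the whole fast path is the usual a8r8g8b8 to r5g6b5
packing (sketch only, the function name is made up):

    #include <stdint.h>

    static uint16_t src_8888_0565_pixel(uint32_t s)
    {
        uint8_t r = s >> 16, g = s >> 8, b = s;
        /* sri #5 merges green under red, sri #11 merges blue under green */
        return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }
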
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0565_8888_process_pixblock_head
> +    mov         v0.d[1], v1.d[0]
> +    shrn        v30.8b, v0.8h, #8
> +    shrn        v29.8b, v0.8h, #3
> +    sli         v0.8h,  v0.8h, #5
> +    movi        v31.8b, #255
> +    sri         v30.8b, v30.8b, #5
> +    sri         v29.8b, v29.8b, #6
> +    shrn        v28.8b, v0.8h, #2
> +.endm
> +
> +.macro pixman_composite_src_0565_8888_process_pixblock_tail
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
> +    pixman_composite_src_0565_8888_process_pixblock_tail
> +    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    fetch_src_pixblock
> +    pixman_composite_src_0565_8888_process_pixblock_head
> +    cache_preload 8, 8
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_0565_8888_process_pixblock_head, \
> +    pixman_composite_src_0565_8888_process_pixblock_tail, \
> +    pixman_composite_src_0565_8888_process_pixblock_tail_head
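
The inverse conversion, again as a plain C sketch of the intended result
(illustrative name only): the shrn/sri sequence expands each field with bit
replication and "movi v31.8b, #255" supplies an opaque alpha channel.

    #include <stdint.h>

    static uint32_t src_0565_8888_pixel(uint16_t s)
    {
        uint8_t r = (s >> 8) & 0xf8; r |= r >> 5;
        uint8_t g = (s >> 3) & 0xfc; g |= g >> 6;
        uint8_t b = (s << 3) & 0xf8; b |= b >> 5;
        return 0xff000000u | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
    }
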
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8_8_process_pixblock_head
> +    uqadd       v28.8b, v0.8b, v4.8b
> +    uqadd       v29.8b, v1.8b, v5.8b
> +    uqadd       v30.8b, v2.8b, v6.8b
> +    uqadd       v31.8b, v3.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_add_8_8_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_add_8_8_process_pixblock_tail_head
> +    fetch_src_pixblock
> +                                    PF add PF_X, PF_X, #32
> +                                    PF tst PF_CTL, #0xF
> +    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #32
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF cmp PF_X, ORIG_W
> +                                    PF lsl DUMMY, PF_X, #src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    uqadd       v28.8b, v0.8b, v4.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +    uqadd       v29.8b, v1.8b, v5.8b
> +    uqadd       v30.8b, v2.8b, v6.8b
> +    uqadd       v31.8b, v3.8b, v7.8b
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
> +    FLAG_DST_READWRITE, \
> +    32, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_add_8_8_process_pixblock_head, \
> +    pixman_composite_add_8_8_process_pixblock_tail, \
> +    pixman_composite_add_8_8_process_pixblock_tail_head
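
The ADD fast paths are purely per-byte saturating additions, so the whole
pixblock is four uqadd instructions covering 32 bytes. A one-lane scalar model
(illustrative name only), which also covers the a8r8g8b8 variants below since
they reuse the same pixblock head:

    #include <stdint.h>

    static uint8_t add_sat_u8(uint8_t s, uint8_t d)   /* one lane of uqadd */
    {
        unsigned t = (unsigned)s + d;
        return (uint8_t)(t > 255 ? 255 : t);
    }
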
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
> +    fetch_src_pixblock
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        st1     {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF cmp PF_X, ORIG_W
> +                                    PF lsl DUMMY, PF_X, #src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    uqadd       v28.8b, v0.8b, v4.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +    uqadd       v29.8b, v1.8b, v5.8b
> +    uqadd       v30.8b, v2.8b, v6.8b
> +    uqadd       v31.8b, v3.8b, v7.8b
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_add_8_8_process_pixblock_head, \
> +    pixman_composite_add_8_8_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_process_pixblock_tail_head
> +
> +generate_composite_function_single_scanline \
> +    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_add_8_8_process_pixblock_head, \
> +    pixman_composite_add_8_8_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
> +    mvn         v24.8b, v3.8b  /* get inverted alpha */
> +    /* do alpha blending */
> +    umull       v8.8h, v24.8b, v4.8b
> +    umull       v9.8h, v24.8b, v5.8b
> +    umull       v10.8h, v24.8b, v6.8b
> +    umull       v11.8h, v24.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
> +    urshr       v14.8h, v8.8h, #8
> +    urshr       v15.8h, v9.8h, #8
> +    urshr       v16.8h, v10.8h, #8
> +    urshr       v17.8h, v11.8h, #8
> +    raddhn      v28.8b, v14.8h, v8.8h
> +    raddhn      v29.8b, v15.8h, v9.8h
> +    raddhn      v30.8b, v16.8h, v10.8h
> +    raddhn      v31.8b, v17.8h, v11.8h
> +.endm
> +
> +.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
> +     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +        urshr       v14.8h, v8.8h, #8
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +        urshr       v15.8h, v9.8h, #8
> +        urshr       v16.8h, v10.8h, #8
> +        urshr       v17.8h, v11.8h, #8
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        raddhn      v28.8b, v14.8h, v8.8h
> +        raddhn      v29.8b, v15.8h, v9.8h
> +                                    PF cmp PF_X, ORIG_W
> +        raddhn      v30.8b, v16.8h, v10.8h
> +        raddhn      v31.8b, v17.8h, v11.8h
> +    fetch_src_pixblock
> +                                    PF lsl DUMMY, PF_X, #src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +    mvn         v22.8b, v3.8b
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull      v8.8h, v22.8b, v4.8b
> +                                    PF ble 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    umull      v9.8h, v22.8b, v5.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +10:
> +    umull      v10.8h, v22.8b, v6.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +     umull     v11.8h, v22.8b, v7.8b
> +.endm
> +
> +generate_composite_function_single_scanline \
> +    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
> +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
> +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
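
OUT_REVERSE keeps only the part of the destination not covered by the source,
i.e. every destination channel is scaled by (255 - src_alpha). A scalar sketch
of one pixel (helper names are illustrative only); the umull / urshr #8 /
raddhn triple is the rounded division by 255:

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    {
        return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8);
    }

    static uint32_t out_reverse_8888_8888_pixel(uint32_t src, uint32_t dst)
    {
        uint8_t ia = 255 - (uint8_t)(src >> 24);      /* mvn v24.8b, v3.8b */
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            uint8_t d = (uint8_t)(dst >> shift);
            out |= (uint32_t)div_255(ia * d) << shift;
        }
        return out;
    }
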
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_8888_process_pixblock_head
> +    pixman_composite_out_reverse_8888_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_over_8888_8888_process_pixblock_tail
> +    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
> +    uqadd       v28.8b, v0.8b, v28.8b
> +    uqadd       v29.8b, v1.8b, v29.8b
> +    uqadd       v30.8b, v2.8b, v30.8b
> +    uqadd       v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
> +     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +        urshr       v14.8h, v8.8h, #8
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +        urshr       v15.8h, v9.8h, #8
> +        urshr       v16.8h, v10.8h, #8
> +        urshr       v17.8h, v11.8h, #8
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        raddhn      v28.8b, v14.8h, v8.8h
> +        raddhn      v29.8b, v15.8h, v9.8h
> +                                    PF cmp PF_X, ORIG_W
> +        raddhn      v30.8b, v16.8h, v10.8h
> +        raddhn      v31.8b, v17.8h, v11.8h
> +        uqadd       v28.8b, v0.8b, v28.8b
> +        uqadd       v29.8b, v1.8b, v29.8b
> +        uqadd       v30.8b, v2.8b, v30.8b
> +        uqadd       v31.8b, v3.8b, v31.8b
> +    fetch_src_pixblock
> +                                    PF lsl DUMMY, PF_X, #src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +    mvn        v22.8b, v3.8b
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +         st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull      v8.8h, v22.8b, v4.8b
> +                                    PF ble 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    umull      v9.8h, v22.8b, v5.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +10:
> +    umull      v10.8h, v22.8b, v6.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +    umull      v11.8h, v22.8b, v7.8b
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail_head
> +
> +generate_composite_function_single_scanline \
> +    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail_head
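
OVER with premultiplied pixels is "dst = src + dst * (255 - src_alpha) / 255",
with the final addition saturating. A scalar sketch of one pixel (illustrative
names only, not part of the patch):

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    { return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8); }

    static inline uint8_t add_sat_u8(uint8_t a, uint8_t b)
    { unsigned t = (unsigned)a + b; return (uint8_t)(t > 255 ? 255 : t); }

    static uint32_t over_8888_8888_pixel(uint32_t src, uint32_t dst)
    {
        uint8_t ia = 255 - (uint8_t)(src >> 24);
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            uint8_t s = (uint8_t)(src >> shift);
            uint8_t d = (uint8_t)(dst >> shift);
            out |= (uint32_t)add_sat_u8(s, div_255(ia * d)) << shift;
        }
        return out;
    }
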
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8888_process_pixblock_head
> +    /* deinterleaved source pixels in {v0, v1, v2, v3} */
> +    /* inverted alpha in {v24} */
> +    /* destination pixels in {v4, v5, v6, v7} */
> +    umull       v8.8h, v24.8b, v4.8b
> +    umull       v9.8h, v24.8b, v5.8b
> +    umull       v10.8h, v24.8b, v6.8b
> +    umull       v11.8h, v24.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_process_pixblock_tail
> +    urshr       v14.8h, v8.8h, #8
> +    urshr       v15.8h, v9.8h, #8
> +    urshr       v16.8h, v10.8h, #8
> +    urshr       v17.8h, v11.8h, #8
> +    raddhn      v28.8b, v14.8h, v8.8h
> +    raddhn      v29.8b, v15.8h, v9.8h
> +    raddhn      v30.8b, v16.8h, v10.8h
> +    raddhn      v31.8b, v17.8h, v11.8h
> +    uqadd       v28.8b, v0.8b, v28.8b
> +    uqadd       v29.8b, v1.8b, v29.8b
> +    uqadd       v30.8b, v2.8b, v30.8b
> +    uqadd       v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_process_pixblock_tail_head
> +        urshr       v14.8h, v8.8h, #8
> +        urshr       v15.8h, v9.8h, #8
> +        urshr       v16.8h, v10.8h, #8
> +        urshr       v17.8h, v11.8h, #8
> +        raddhn      v28.8b, v14.8h, v8.8h
> +        raddhn      v29.8b, v15.8h, v9.8h
> +        raddhn      v30.8b, v16.8h, v10.8h
> +        raddhn      v31.8b, v17.8h, v11.8h
> +    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +        uqadd       v28.8b, v0.8b, v28.8b
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0x0F
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        uqadd       v29.8b, v1.8b, v29.8b
> +        uqadd       v30.8b, v2.8b, v30.8b
> +        uqadd       v31.8b, v3.8b, v31.8b
> +                                    PF cmp PF_X, ORIG_W
> +    umull       v8.8h, v24.8b, v4.8b
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +    umull       v9.8h, v24.8b, v5.8b
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull       v10.8h, v24.8b, v6.8b
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +    umull       v11.8h, v24.8b, v7.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_over_n_8888_init
> +    mov         v3.s[0], w4
> +    dup         v0.8b, v3.b[0]
> +    dup         v1.8b, v3.b[1]
> +    dup         v2.8b, v3.b[2]
> +    dup         v3.8b, v3.b[3]
> +    mvn         v24.8b, v3.8b  /* get inverted alpha */
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_8888_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail, \
> +    pixman_composite_over_n_8888_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
> +        urshr       v14.8h, v8.8h, #8
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +        urshr       v15.8h, v9.8h, #8
> +        urshr       v12.8h, v10.8h, #8
> +        urshr       v13.8h, v11.8h, #8
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        raddhn      v28.8b, v14.8h, v8.8h
> +        raddhn      v29.8b, v15.8h, v9.8h
> +                                    PF cmp PF_X, ORIG_W
> +        raddhn      v30.8b, v12.8h, v10.8h
> +        raddhn      v31.8b, v13.8h, v11.8h
> +        uqadd       v28.8b, v0.8b, v28.8b
> +        uqadd       v29.8b, v1.8b, v29.8b
> +        uqadd       v30.8b, v2.8b, v30.8b
> +        uqadd       v31.8b, v3.8b, v31.8b
> +    ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_R], #32
> +    mvn         v22.8b, v3.8b
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF blt 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull       v8.8h, v22.8b, v4.8b
> +                                    PF blt 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    umull       v9.8h, v22.8b, v5.8b
> +    umull       v10.8h, v22.8b, v6.8b
> +                                    PF blt 10f
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +    umull       v11.8h, v22.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_reverse_n_8888_init
> +    mov         v7.s[0], w4
> +    dup         v4.8b, v7.b[0]
> +    dup         v5.8b, v7.b[1]
> +    dup         v6.8b, v7.b[2]
> +    dup         v7.8b, v7.b[3]
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_reverse_n_8888_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail, \
> +    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    0,  /* dst_r_basereg */ \
> +    4,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
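
OVER_REVERSE with a solid source leaves the destination on top and lets the
solid color show through only where the destination is not opaque; in the code
above v22 holds the inverted destination alpha and the solid channels live in
v4-v7. Scalar sketch (illustrative name only):

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    { return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8); }

    static uint32_t over_reverse_n_8888_pixel(uint32_t solid, uint32_t dst)
    {
        uint8_t ia = 255 - (uint8_t)(dst >> 24);
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            unsigned t = (uint8_t)(dst >> shift)
                       + div_255(ia * (uint8_t)(solid >> shift));
            out |= (uint32_t)(t > 255 ? 255 : t) << shift;
        }
        return out;
    }
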
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_8_0565_process_pixblock_head
> +    umull       v0.8h,  v24.8b, v8.8b    /* IN for SRC pixels (part1) */
> +    umull       v1.8h,  v24.8b, v9.8b
> +    umull       v2.8h,  v24.8b, v10.8b
> +    umull       v3.8h,  v24.8b, v11.8b
> +        mov         v4.d[1], v5.d[0]
> +        shrn        v25.8b,  v4.8h, #8 /* convert DST_R data to 32-bpp (part1) */
> +        shrn        v26.8b,  v4.8h, #3
> +        sli         v4.8h,   v4.8h, #5
> +    urshr       v17.8h, v0.8h,  #8    /* IN for SRC pixels (part2) */
> +    urshr       v18.8h, v1.8h,  #8
> +    urshr       v19.8h, v2.8h,  #8
> +    urshr       v20.8h, v3.8h,  #8
> +    raddhn      v0.8b,  v0.8h,  v17.8h
> +    raddhn      v1.8b,  v1.8h,  v18.8h
> +    raddhn      v2.8b,  v2.8h,  v19.8h
> +    raddhn      v3.8b,  v3.8h,  v20.8h
> +        sri         v25.8b, v25.8b, #5 /* convert DST_R data to 32-bpp (part2) */
> +        sri         v26.8b, v26.8b, #6
> +    mvn         v3.8b,  v3.8b
> +        shrn        v30.8b, v4.8h,  #2
> +    umull       v18.8h, v3.8b, v25.8b     /* now do alpha blending */
> +    umull       v19.8h, v3.8b, v26.8b
> +    umull       v20.8h, v3.8b, v30.8b
> +.endm
> +
> +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
> +    /* 3 cycle bubble (after umull) */
> +    urshr       v5.8h, v18.8h, #8
> +    urshr       v6.8h, v19.8h, #8
> +    urshr       v7.8h, v20.8h, #8
> +    raddhn      v17.8b, v18.8h, v5.8h
> +    raddhn      v19.8b, v19.8h, v6.8h
> +    raddhn      v18.8b, v20.8h, v7.8h
> +    uqadd       v5.8b, v2.8b,  v17.8b
> +    /* 1 cycle bubble */
> +    uqadd       v6.8b, v0.8b,  v18.8b
> +    uqadd       v7.8b, v1.8b,  v19.8b
> +    ushll       v14.8h, v5.8b, #7    /* convert to 16bpp */
> +    sli         v14.8h, v14.8h, #1
> +    ushll       v18.8h, v7.8b, #7
> +    sli         v18.8h, v18.8h, #1
> +    ushll       v19.8h, v6.8b, #7
> +    sli         v19.8h, v19.8h, #1
> +    sri         v14.8h, v18.8h, #5
> +    /* 1 cycle bubble */
> +    sri         v14.8h, v19.8h, #11
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
> +#if 0
> +    ld1         {v4.8h}, [DST_R], #16
> +    shrn        v25.8b,  v4.8h,  #8
> +    fetch_mask_pixblock
> +    shrn        v26.8b,  v4.8h,  #3
> +    fetch_src_pixblock
> +    umull       v22.8h,  v24.8b, v10.8b
> +        urshr       v13.8h, v18.8h, #8
> +        urshr       v11.8h, v19.8h, #8
> +        urshr       v15.8h, v20.8h, #8
> +        raddhn      v17.8b, v18.8h, v13.8h
> +        raddhn      v19.8b, v19.8h, v11.8h
> +        raddhn      v18.8b, v20.8h, v15.8h
> +        uqadd       v17.8b, v2.8b, v17.8b
> +    umull       v21.8h,  v24.8b, v9.8b
> +        uqadd       v18.8b, v0.8b, v18.8b
> +        uqadd       v19.8b, v1.8b, v19.8b
> +        ushll       v14.8h, v17.8b, #7
> +        sli         v14.8h, v14.8h, #1
> +    umull       v20.8h,  v24.8b, v8.8b
> +        ushll       v18.8h,  v18.8b, #7
> +        sli         v18.8h,  v18.8h, #1
> +        ushll       v19.8h,  v19.8b, #7
> +        sli         v19.8h,  v19.8h, #1
> +        sri         v14.8h,  v18.8h, #5
> +    umull       v23.8h,  v24.8b, v11.8b
> +        sri         v14.8h,  v19.8h, #11
> +        mov         v28.d[0], v14.d[0]
> +        mov         v29.d[0], v14.d[1]
> +
> +    cache_preload 8, 8
> +
> +    sli         v4.8h,  v4.8h,   #5
> +    urshr       v16.8h, v20.8h,  #8
> +    urshr       v17.8h, v21.8h,  #8
> +    urshr       v18.8h, v22.8h,  #8
> +    urshr       v19.8h, v23.8h,  #8
> +    raddhn      v0.8b,  v20.8h, v16.8h
> +    raddhn      v1.8b,  v21.8h, v17.8h
> +    raddhn      v2.8b,  v22.8h, v18.8h
> +    raddhn      v3.8b,  v23.8h, v19.8h
> +    sri         v25.8b,  v25.8b,  #5
> +    sri         v26.8b,  v26.8b,  #6
> +    mvn         v3.8b,  v3.8b
> +    shrn        v30.8b, v4.8h,  #2
> +    st1         {v14.8h}, [DST_W], #16
> +    umull       v18.8h, v3.8b, v25.8b
> +    umull       v19.8h, v3.8b, v26.8b
> +    umull       v20.8h, v3.8b, v30.8b
> +#else
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail
> +    st1         {v28.4h, v29.4h}, [DST_W], #16
> +    ld1         {v4.4h, v5.4h}, [DST_R], #16
> +    fetch_mask_pixblock
> +    fetch_src_pixblock
> +    pixman_composite_over_8888_8_0565_process_pixblock_head
> +#endif
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
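
This path first applies the a8 mask to all four source channels (the "IN"
steps), unpacks the r5g6b5 destination to 8 bits per channel, blends, and
packs the result back. As a per-pixel scalar sketch (helper names are
illustrative only):

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    { return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8); }

    static inline uint8_t add_sat_u8(uint8_t a, uint8_t b)
    { unsigned t = (unsigned)a + b; return (uint8_t)(t > 255 ? 255 : t); }

    static uint16_t over_8888_8_0565_pixel(uint32_t src, uint8_t m, uint16_t dst)
    {
        uint8_t sa = div_255(m * (uint8_t)(src >> 24));   /* src IN mask */
        uint8_t sr = div_255(m * (uint8_t)(src >> 16));
        uint8_t sg = div_255(m * (uint8_t)(src >> 8));
        uint8_t sb = div_255(m * (uint8_t)src);
        uint8_t dr = (dst >> 8) & 0xf8; dr |= dr >> 5;    /* unpack 565 */
        uint8_t dg = (dst >> 3) & 0xfc; dg |= dg >> 6;
        uint8_t db = (dst << 3) & 0xf8; db |= db >> 5;
        uint8_t ia = 255 - sa;                            /* OVER */
        uint8_t r = add_sat_u8(sr, div_255(ia * dr));
        uint8_t g = add_sat_u8(sg, div_255(ia * dg));
        uint8_t b = add_sat_u8(sb, div_255(ia * db));
        return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }
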
> +
> +/******************************************************************************/
> +
> +/*
> + * This function needs a special initialization of solid mask.
> + * Solid source pixel data is taken from the w4 argument register,
> + * split into color components and replicated in the v8-v11
> + * registers. Additionally, this function needs all the NEON registers,
> + * so it has to save the d8-d15 registers, which are callee saved
> + * according to the ABI. These registers are restored in the 'cleanup'
> + * macro. All the other NEON registers are caller saved, so they can
> + * be clobbered freely without introducing any problems.
> + */
> +.macro pixman_composite_over_n_8_0565_init
> +    mov         v11.s[0], w4
> +    dup         v8.8b, v11.b[0]
> +    dup         v9.8b, v11.b[1]
> +    dup         v10.8b, v11.b[2]
> +    dup         v11.8b, v11.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_8_0565_init, \
> +    pixman_composite_over_n_8_0565_cleanup, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_n_0565_init
> +    mov         v24.s[0], w6
> +    dup         v24.8b, v24.b[3]
> +.endm
> +
> +.macro pixman_composite_over_8888_n_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_8888_n_0565_init, \
> +    pixman_composite_over_8888_n_0565_cleanup, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0565_0565_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_0565_0565_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
> +    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
> +    fetch_src_pixblock
> +    cache_preload 16, 16
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
> +    FLAG_DST_WRITEONLY, \
> +    16, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_0565_0565_process_pixblock_head, \
> +    pixman_composite_src_0565_0565_process_pixblock_tail, \
> +    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_n_8_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_n_8_process_pixblock_tail_head
> +    st1         {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_src_n_8_init
> +    mov         v0.s[0], w4
> +    dup         v3.8b, v0.b[0]
> +    dup         v2.8b, v0.b[0]
> +    dup         v1.8b, v0.b[0]
> +    dup         v0.8b, v0.b[0]
> +.endm
> +
> +.macro pixman_composite_src_n_8_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
> +    FLAG_DST_WRITEONLY, \
> +    32, /* number of pixels, processed in a single block */ \
> +    0,  /* prefetch distance */ \
> +    pixman_composite_src_n_8_init, \
> +    pixman_composite_src_n_8_cleanup, \
> +    pixman_composite_src_n_8_process_pixblock_head, \
> +    pixman_composite_src_n_8_process_pixblock_tail, \
> +    pixman_composite_src_n_8_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_0565_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_n_0565_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_n_0565_process_pixblock_tail_head
> +    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_src_n_0565_init
> +    mov         v0.s[0], w4
> +    dup         v3.4h, v0.h[0]
> +    dup         v2.4h, v0.h[0]
> +    dup         v1.4h, v0.h[0]
> +    dup         v0.4h, v0.h[0]
> +.endm
> +
> +.macro pixman_composite_src_n_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
> +    FLAG_DST_WRITEONLY, \
> +    16, /* number of pixels, processed in a single block */ \
> +    0,  /* prefetch distance */ \
> +    pixman_composite_src_n_0565_init, \
> +    pixman_composite_src_n_0565_cleanup, \
> +    pixman_composite_src_n_0565_process_pixblock_head, \
> +    pixman_composite_src_n_0565_process_pixblock_tail, \
> +    pixman_composite_src_n_0565_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_n_8888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_n_8888_process_pixblock_tail_head
> +    st1         {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_src_n_8888_init
> +    mov         v0.s[0], w4
> +    dup         v3.2s, v0.s[0]
> +    dup         v2.2s, v0.s[0]
> +    dup         v1.2s, v0.s[0]
> +    dup         v0.2s, v0.s[0]
> +.endm
> +
> +.macro pixman_composite_src_n_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
> +    FLAG_DST_WRITEONLY, \
> +    8, /* number of pixels, processed in a single block */ \
> +    0, /* prefetch distance */ \
> +    pixman_composite_src_n_8888_init, \
> +    pixman_composite_src_n_8888_cleanup, \
> +    pixman_composite_src_n_8888_process_pixblock_head, \
> +    pixman_composite_src_n_8888_process_pixblock_tail, \
> +    pixman_composite_src_n_8888_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_8888_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_8888_8888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
> +    st1  {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
> +    fetch_src_pixblock
> +    cache_preload 8, 8
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_WRITEONLY, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_8888_8888_process_pixblock_head, \
> +    pixman_composite_src_8888_8888_process_pixblock_tail, \
> +    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_x888_8888_process_pixblock_head
> +    orr      v0.8b, v0.8b, v4.8b
> +    orr      v1.8b, v1.8b, v4.8b
> +    orr      v2.8b, v2.8b, v4.8b
> +    orr      v3.8b, v3.8b, v4.8b
> +.endm
> +
> +.macro pixman_composite_src_x888_8888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
> +    st1      {v0.2s, v1.2s, v2.2s, v3.2s}, [DST_W], #32
> +    fetch_src_pixblock
> +    orr      v0.8b, v0.8b, v4.8b
> +    orr      v1.8b, v1.8b, v4.8b
> +    orr      v2.8b, v2.8b, v4.8b
> +    orr      v3.8b, v3.8b, v4.8b
> +    cache_preload 8, 8
> +.endm
> +
> +.macro pixman_composite_src_x888_8888_init
> +    mov     w20, #0xFF
> +    dup     v4.8b, w20
> +    shl     v4.2s, v4.2s, #24
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_WRITEONLY, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    pixman_composite_src_x888_8888_init, \
> +    default_cleanup, \
> +    pixman_composite_src_x888_8888_process_pixblock_head, \
> +    pixman_composite_src_x888_8888_process_pixblock_tail, \
> +    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
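
Here the init macro builds the 0xff000000 per-lane constant in v4 (dup of
0xff, then shl #24) and the pixblock just ORs it into every pixel, i.e. in
scalar terms (illustrative name only):

    #include <stdint.h>

    static uint32_t src_x888_8888_pixel(uint32_t s)
    {
        return s | 0xff000000u;   /* force the alpha byte to 0xff */
    }
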
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8_8888_process_pixblock_head
> +    /* expecting solid source in {v0, v1, v2, v3} */
> +    /* mask is in v24 (v25, v26, v27 are unused) */
> +
> +    /* in */
> +    umull       v8.8h,  v24.8b, v0.8b
> +    umull       v9.8h,  v24.8b, v1.8b
> +    umull       v10.8h, v24.8b, v2.8b
> +    umull       v11.8h, v24.8b, v3.8b
> +    ursra       v8.8h,  v8.8h, #8
> +    ursra       v9.8h,  v9.8h, #8
> +    ursra       v10.8h, v10.8h, #8
> +    ursra       v11.8h, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_process_pixblock_tail
> +    rshrn       v28.8b, v8.8h, #8
> +    rshrn       v29.8b, v9.8h, #8
> +    rshrn       v30.8b, v10.8h, #8
> +    rshrn       v31.8b, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
> +    fetch_mask_pixblock
> +                                    PF add PF_X, PF_X, #8
> +        rshrn       v28.8b, v8.8h, #8
> +                                    PF tst PF_CTL, #0x0F
> +        rshrn       v29.8b, v9.8h, #8
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +10:
> +        rshrn      v30.8b, v10.8h, #8
> +                                    PF beq 10f
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        rshrn      v31.8b, v11.8h, #8
> +                                    PF cmp PF_X, ORIG_W
> +    umull          v8.8h, v24.8b, v0.8b
> +                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
> +                                    PF prfm pldl2strm, [PF_MASK, DUMMY]
> +    umull          v9.8h, v24.8b, v1.8b
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull          v10.8h, v24.8b, v2.8b
> +                                    PF ble 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    umull          v11.8h, v24.8b, v3.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
> +                                    PF add PF_MASK, PF_MASK, #1
> +10:
> +        st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    ursra       v8.8h, v8.8h, #8
> +    ursra       v9.8h, v9.8h, #8
> +    ursra       v10.8h, v10.8h, #8
> +    ursra       v11.8h, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_init
> +    mov         v3.s[0], w4
> +    dup         v0.8b, v3.b[0]
> +    dup         v1.8b, v3.b[1]
> +    dup         v2.8b, v3.b[2]
> +    dup         v3.8b, v3.b[3]
> +.endm
> +
> +.macro pixman_composite_src_n_8_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_src_n_8_8888_init, \
> +    pixman_composite_src_n_8_8888_cleanup, \
> +    pixman_composite_src_n_8_8888_process_pixblock_head, \
> +    pixman_composite_src_n_8_8888_process_pixblock_tail, \
> +    pixman_composite_src_n_8_8888_process_pixblock_tail_head
> +
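
This is the "solid color through an a8 mask" source: every channel of the
color, including alpha, is scaled by the mask byte. The umull + ursra #8 +
rshrn #8 sequence used here is just another spelling of the same rounded
division by 255 as the urshr/raddhn pairs elsewhere. Scalar sketch
(illustrative name only):

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    { return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8); }

    static uint32_t src_n_8_8888_pixel(uint32_t solid, uint8_t m)
    {
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8)
            out |= (uint32_t)div_255(m * (uint8_t)(solid >> shift)) << shift;
        return out;
    }
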
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_n_8_8_process_pixblock_head
> +    umull       v0.8h, v24.8b, v16.8b
> +    umull       v1.8h, v25.8b, v16.8b
> +    umull       v2.8h, v26.8b, v16.8b
> +    umull       v3.8h, v27.8b, v16.8b
> +    ursra       v0.8h, v0.8h,  #8
> +    ursra       v1.8h, v1.8h,  #8
> +    ursra       v2.8h, v2.8h,  #8
> +    ursra       v3.8h, v3.8h,  #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_process_pixblock_tail
> +    rshrn       v28.8b, v0.8h, #8
> +    rshrn       v29.8b, v1.8h, #8
> +    rshrn       v30.8b, v2.8h, #8
> +    rshrn       v31.8b, v3.8h, #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
> +    fetch_mask_pixblock
> +                                    PF add PF_X, PF_X, #8
> +        rshrn       v28.8b, v0.8h, #8
> +                                    PF tst PF_CTL, #0x0F
> +        rshrn       v29.8b, v1.8h, #8
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +10:
> +        rshrn       v30.8b, v2.8h, #8
> +                                    PF beq 10f
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        rshrn       v31.8b, v3.8h, #8
> +                                    PF cmp PF_X, ORIG_W
> +    umull       v0.8h,  v24.8b, v16.8b
> +                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
> +                                    PF prfm pldl2strm, [PF_MASK, DUMMY]
> +    umull       v1.8h,  v25.8b, v16.8b
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull       v2.8h,  v26.8b, v16.8b
> +                                    PF ble 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    umull       v3.8h,  v27.8b, v16.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
> +                                    PF add PF_MASK, PF_MASK, #1
> +10:
> +        st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    ursra       v0.8h, v0.8h,  #8
> +    ursra       v1.8h, v1.8h,  #8
> +    ursra       v2.8h, v2.8h,  #8
> +    ursra       v3.8h, v3.8h,  #8
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_init
> +    mov         v16.s[0], w4
> +    dup         v16.8b, v16.b[3]
> +.endm
> +
> +.macro pixman_composite_src_n_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
> +    FLAG_DST_WRITEONLY, \
> +    32, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_src_n_8_8_init, \
> +    pixman_composite_src_n_8_8_cleanup, \
> +    pixman_composite_src_n_8_8_process_pixblock_head, \
> +    pixman_composite_src_n_8_8_process_pixblock_tail, \
> +    pixman_composite_src_n_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8_8888_process_pixblock_head
> +    /* expecting deinterleaved source data in {v8, v9, v10, v11} */
> +    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
> +    /* and destination data in {v4, v5, v6, v7} */
> +    /* mask is in v24 (v25, v26, v27 are unused) */
> +
> +    /* in */
> +    umull       v12.8h, v24.8b, v8.8b
> +    umull       v13.8h, v24.8b, v9.8b
> +    umull       v14.8h, v24.8b, v10.8b
> +    umull       v15.8h, v24.8b, v11.8b
> +    urshr       v16.8h, v12.8h, #8
> +    urshr       v17.8h, v13.8h, #8
> +    urshr       v18.8h, v14.8h, #8
> +    urshr       v19.8h, v15.8h, #8
> +    raddhn      v0.8b, v12.8h, v16.8h
> +    raddhn      v1.8b, v13.8h, v17.8h
> +    raddhn      v2.8b, v14.8h, v18.8h
> +    raddhn      v3.8b, v15.8h, v19.8h
> +    mvn         v25.8b, v3.8b  /* get inverted alpha */
> +    /* source:      v0 - blue, v1 - green, v2 - red, v3 - alpha */
> +    /* destination: v4 - blue, v5 - green, v6 - red, v7 - alpha */
> +    /* now do alpha blending */
> +    umull       v12.8h, v25.8b, v4.8b
> +    umull       v13.8h, v25.8b, v5.8b
> +    umull       v14.8h, v25.8b, v6.8b
> +    umull       v15.8h, v25.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_process_pixblock_tail
> +    urshr       v16.8h, v12.8h, #8
> +    urshr       v17.8h, v13.8h, #8
> +    urshr       v18.8h, v14.8h, #8
> +    urshr       v19.8h, v15.8h, #8
> +    raddhn      v28.8b, v16.8h, v12.8h
> +    raddhn      v29.8b, v17.8h, v13.8h
> +    raddhn      v30.8b, v18.8h, v14.8h
> +    raddhn      v31.8b, v19.8h, v15.8h
> +    uqadd       v28.8b, v0.8b, v28.8b
> +    uqadd       v29.8b, v1.8b, v29.8b
> +    uqadd       v30.8b, v2.8b, v30.8b
> +    uqadd       v31.8b, v3.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
> +        urshr       v16.8h, v12.8h, #8
> +     ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +        urshr       v17.8h, v13.8h, #8
> +    fetch_mask_pixblock
> +        urshr       v18.8h, v14.8h, #8
> +                                    PF add PF_X, PF_X, #8
> +        urshr       v19.8h, v15.8h, #8
> +                                    PF tst PF_CTL, #0x0F
> +        raddhn      v28.8b, v16.8h, v12.8h
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +10:
> +        raddhn      v29.8b, v17.8h, v13.8h
> +                                    PF beq 10f
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        raddhn      v30.8b, v18.8h, v14.8h
> +                                    PF cmp PF_X, ORIG_W
> +        raddhn      v31.8b, v19.8h, v15.8h
> +                                    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +                                    PF prfm pldl2strm, [PF_DST, DUMMY]
> +    umull       v16.8h, v24.8b, v8.8b
> +                                    PF lsl DUMMY, PF_X, #mask_bpp_shift
> +                                    PF prfm pldl2strm, [PF_MASK, DUMMY]
> +    umull       v17.8h, v24.8b, v9.8b
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +10:
> +    umull       v18.8h, v24.8b, v10.8b
> +                                    PF ble 10f
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +10:
> +    umull       v19.8h, v24.8b, v11.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +                                    PF add PF_DST, PF_DST, #1
> +10:
> +        uqadd       v28.8b, v0.8b, v28.8b
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_MASK, DUMMY]
> +                                    PF add PF_MASK, PF_MASK, #1
> +10:
> +        uqadd        v29.8b, v1.8b, v29.8b
> +        uqadd        v30.8b, v2.8b, v30.8b
> +        uqadd        v31.8b, v3.8b, v31.8b
> +    urshr       v12.8h, v16.8h, #8
> +    urshr       v13.8h, v17.8h, #8
> +    urshr       v14.8h, v18.8h, #8
> +    urshr       v15.8h, v19.8h, #8
> +    raddhn      v0.8b, v16.8h, v12.8h
> +    raddhn      v1.8b, v17.8h, v13.8h
> +    raddhn      v2.8b, v18.8h, v14.8h
> +    raddhn      v3.8b, v19.8h, v15.8h
> +        st4          {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    mvn         v25.8b, v3.8b
> +    umull       v12.8h, v25.8b, v4.8b
> +    umull       v13.8h, v25.8b, v5.8b
> +    umull       v14.8h, v25.8b, v6.8b
> +    umull       v15.8h, v25.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_init
> +    mov         v11.s[0], w4
> +    dup         v8.8b, v11.b[0]
> +    dup         v9.8b, v11.b[1]
> +    dup         v10.8b, v11.b[2]
> +    dup         v11.8b, v11.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_8_8888_init, \
> +    pixman_composite_over_n_8_8888_cleanup, \
> +    pixman_composite_over_n_8_8888_process_pixblock_head, \
> +    pixman_composite_over_n_8_8888_process_pixblock_tail, \
> +    pixman_composite_over_n_8_8888_process_pixblock_tail_head
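
This is the main glyph path: a solid color through an a8 mask, composited OVER
an a8r8g8b8 destination. The head computes the masked source and its inverted
alpha, the tail attenuates the destination and adds. Scalar sketch
(illustrative names only):

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    { return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8); }

    static inline uint8_t add_sat_u8(uint8_t a, uint8_t b)
    { unsigned t = (unsigned)a + b; return (uint8_t)(t > 255 ? 255 : t); }

    static uint32_t over_n_8_8888_pixel(uint32_t solid, uint8_t m, uint32_t dst)
    {
        uint8_t ia = 255 - div_255(m * (uint8_t)(solid >> 24));
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            uint8_t s = div_255(m * (uint8_t)(solid >> shift));
            uint8_t d = (uint8_t)(dst >> shift);
            out |= (uint32_t)add_sat_u8(s, div_255(ia * d)) << shift;
        }
        return out;
    }
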
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8_8_process_pixblock_head
> +    umull       v0.8h,  v24.8b, v8.8b
> +    umull       v1.8h,  v25.8b, v8.8b
> +    umull       v2.8h,  v26.8b, v8.8b
> +    umull       v3.8h,  v27.8b, v8.8b
> +    urshr       v10.8h, v0.8h,  #8
> +    urshr       v11.8h, v1.8h,  #8
> +    urshr       v12.8h, v2.8h,  #8
> +    urshr       v13.8h, v3.8h,  #8
> +    raddhn      v0.8b,  v0.8h,  v10.8h
> +    raddhn      v1.8b,  v1.8h,  v11.8h
> +    raddhn      v2.8b,  v2.8h,  v12.8h
> +    raddhn      v3.8b,  v3.8h,  v13.8h
> +    mvn         v24.8b, v0.8b
> +    mvn         v25.8b, v1.8b
> +    mvn         v26.8b, v2.8b
> +    mvn         v27.8b, v3.8b
> +    umull       v10.8h, v24.8b, v4.8b
> +    umull       v11.8h, v25.8b, v5.8b
> +    umull       v12.8h, v26.8b, v6.8b
> +    umull       v13.8h, v27.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8_8_process_pixblock_tail
> +    urshr       v14.8h, v10.8h,  #8
> +    urshr       v15.8h, v11.8h,  #8
> +    urshr       v16.8h, v12.8h, #8
> +    urshr       v17.8h, v13.8h, #8
> +    raddhn      v28.8b, v14.8h, v10.8h
> +    raddhn      v29.8b, v15.8h, v11.8h
> +    raddhn      v30.8b, v16.8h, v12.8h
> +    raddhn      v31.8b, v17.8h, v13.8h
> +    uqadd       v28.8b, v0.8b,  v28.8b
> +    uqadd       v29.8b, v1.8b,  v29.8b
> +    uqadd       v30.8b, v2.8b,  v30.8b
> +    uqadd       v31.8b, v3.8b,  v31.8b
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
> +    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    pixman_composite_over_n_8_8_process_pixblock_tail
> +    fetch_mask_pixblock
> +    cache_preload 32, 32
> +    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    pixman_composite_over_n_8_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_over_n_8_8_init
> +    mov         v8.s[0], w4
> +    dup         v8.8b, v8.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
> +    FLAG_DST_READWRITE, \
> +    32, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_8_8_init, \
> +    pixman_composite_over_n_8_8_cleanup, \
> +    pixman_composite_over_n_8_8_process_pixblock_head, \
> +    pixman_composite_over_n_8_8_process_pixblock_tail, \
> +    pixman_composite_over_n_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
> +    /*
> +     * 'combine_mask_ca' replacement
> +     *
> +     * input:  solid src (n) in {v8,  v9,  v10, v11}
> +     *         dest in          {v4,  v5,  v6,  v7 }
> +     *         mask in          {v24, v25, v26, v27}
> +     * output: updated src in   {v0,  v1,  v2,  v3 }
> +     *         updated mask in  {v24, v25, v26, v3 }
> +     */
> +    umull       v0.8h,  v24.8b, v8.8b
> +    umull       v1.8h,  v25.8b, v9.8b
> +    umull       v2.8h,  v26.8b, v10.8b
> +    umull       v3.8h,  v27.8b, v11.8b
> +    umull       v12.8h, v11.8b, v25.8b
> +    umull       v13.8h, v11.8b, v24.8b
> +    umull       v14.8h, v11.8b, v26.8b
> +    urshr       v15.8h, v0.8h,  #8
> +    urshr       v16.8h, v1.8h,  #8
> +    urshr       v17.8h, v2.8h,  #8
> +    raddhn      v0.8b,  v0.8h,  v15.8h
> +    raddhn      v1.8b,  v1.8h,  v16.8h
> +    raddhn      v2.8b,  v2.8h,  v17.8h
> +    urshr       v15.8h, v13.8h, #8
> +    urshr       v16.8h, v12.8h, #8
> +    urshr       v17.8h, v14.8h, #8
> +    urshr       v18.8h, v3.8h,  #8
> +    raddhn      v24.8b, v13.8h, v15.8h
> +    raddhn      v25.8b, v12.8h, v16.8h
> +    raddhn      v26.8b, v14.8h, v17.8h
> +    raddhn      v3.8b,  v3.8h,  v18.8h
> +    /*
> +     * 'combine_over_ca' replacement
> +     *
> +     * output: updated dest in {v28, v29, v30, v31}
> +     */
> +    mvn         v24.8b, v24.8b
> +    mvn         v25.8b, v25.8b
> +    mvn         v26.8b, v26.8b
> +    mvn         v27.8b, v3.8b
> +    umull       v12.8h, v24.8b, v4.8b
> +    umull       v13.8h, v25.8b, v5.8b
> +    umull       v14.8h, v26.8b, v6.8b
> +    umull       v15.8h, v27.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
> +    /* ... continue 'combine_over_ca' replacement */
> +    urshr       v16.8h, v12.8h, #8
> +    urshr       v17.8h, v13.8h, #8
> +    urshr       v18.8h, v14.8h, #8
> +    urshr       v19.8h, v15.8h, #8
> +    raddhn      v28.8b, v16.8h, v12.8h
> +    raddhn      v29.8b, v17.8h, v13.8h
> +    raddhn      v30.8b, v18.8h, v14.8h
> +    raddhn      v31.8b, v19.8h, v15.8h
> +    uqadd       v28.8b, v0.8b,  v28.8b
> +    uqadd       v29.8b, v1.8b,  v29.8b
> +    uqadd       v30.8b, v2.8b,  v30.8b
> +    uqadd       v31.8b, v3.8b,  v31.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
> +        urshr       v16.8h, v12.8h, #8
> +        urshr       v17.8h, v13.8h, #8
> +    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +        urshr       v18.8h, v14.8h, #8
> +        urshr       v19.8h, v15.8h, #8
> +        raddhn      v28.8b, v16.8h, v12.8h
> +        raddhn      v29.8b, v17.8h, v13.8h
> +        raddhn      v30.8b, v18.8h, v14.8h
> +        raddhn      v31.8b, v19.8h, v15.8h
> +    fetch_mask_pixblock
> +        uqadd       v28.8b, v0.8b, v28.8b
> +        uqadd       v29.8b, v1.8b, v29.8b
> +        uqadd       v30.8b, v2.8b, v30.8b
> +        uqadd       v31.8b, v3.8b, v31.8b
> +    cache_preload 8, 8
> +    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
> +    st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_init
> +    mov         v13.s[0], w4
> +    dup         v8.8b, v13.b[0]
> +    dup         v9.8b, v13.b[1]
> +    dup         v10.8b, v13.b[2]
> +    dup         v11.8b, v13.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8888_8888_ca_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_8888_8888_ca_init, \
> +    pixman_composite_over_n_8888_8888_ca_cleanup, \
> +    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
> +    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
> +    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
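
For the component-alpha case each mask channel scales the matching source
channel, and the destination channel is attenuated by the per-channel product
of solid alpha and mask. A scalar sketch of what the combine_mask_ca /
combine_over_ca replacement above computes per pixel (illustrative names
only):

    #include <stdint.h>

    static inline uint8_t div_255(uint32_t t)
    { return (uint8_t)((t + ((t + 128) >> 8) + 128) >> 8); }

    static inline uint8_t add_sat_u8(uint8_t a, uint8_t b)
    { unsigned t = (unsigned)a + b; return (uint8_t)(t > 255 ? 255 : t); }

    static uint32_t over_n_8888_8888_ca_pixel(uint32_t solid, uint32_t mask,
                                              uint32_t dst)
    {
        uint8_t na = (uint8_t)(solid >> 24);
        uint32_t out = 0;
        for (int shift = 0; shift < 32; shift += 8) {
            uint8_t n = (uint8_t)(solid >> shift);
            uint8_t m = (uint8_t)(mask >> shift);
            uint8_t d = (uint8_t)(dst >> shift);
            uint8_t s  = div_255(n * m);            /* updated src   */
            uint8_t im = 255 - div_255(na * m);     /* inverted mask */
            out |= (uint32_t)add_sat_u8(s, div_255(im * d)) << shift;
        }
        return out;
    }
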
> +
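For readers new to these fast paths: the head/tail pair above is OVER with a
component-alpha mask, and each umull is followed by an urshr #8 / raddhn pair
that performs the usual rounded division by 255.  A rough scalar equivalent
(plain C sketch; div255/sat_add_u8/over_ca are illustrative helpers, not
pixman functions):

    #include <stdint.h>

    /* Rounded x/255 as computed by umull + urshr #8 + raddhn:
     * r = (t + 128) >> 8, then (t + r + 128) >> 8. */
    static inline uint8_t div255 (uint16_t t)
    {
        uint16_t r = (t + 0x80) >> 8;            /* urshr  #8 */
        return (uint8_t) ((t + r + 0x80) >> 8);  /* raddhn    */
    }

    static inline uint8_t sat_add_u8 (uint8_t a, uint8_t b)  /* uqadd */
    {
        unsigned s = (unsigned) a + b;
        return (uint8_t) (s > 255 ? 255 : s);
    }

    /* One channel of over_n_8888_8888_ca: src/srca are the solid colour,
     * m is the matching mask channel, d the destination channel. */
    static inline uint8_t over_ca (uint8_t src, uint8_t srca,
                                   uint8_t m, uint8_t d)
    {
        uint8_t s_in_m  = div255 ((uint16_t) src  * m);  /* combine_mask_ca */
        uint8_t sa_in_m = div255 ((uint16_t) srca * m);
        return sat_add_u8 (s_in_m,                       /* combine_over_ca */
                           div255 ((uint16_t) (255 - sa_in_m) * d));
    }
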
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
> +    /*
> +     * 'combine_mask_ca' replacement
> +     *
> +     * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
> +     *         mask in          {v24, v25, v26}       [B, G, R]
> +     * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
> +     *         updated mask in  {v24, v25, v26}       [B, G, R]
> +     */
> +    umull       v0.8h,  v24.8b, v8.8b
> +    umull       v1.8h,  v25.8b, v9.8b
> +    umull       v2.8h,  v26.8b, v10.8b
> +    umull       v12.8h, v11.8b, v24.8b
> +    umull       v13.8h, v11.8b, v25.8b
> +    umull       v14.8h, v11.8b, v26.8b
> +    urshr       v15.8h, v0.8h,  #8
> +    urshr       v16.8h, v1.8h,  #8
> +    urshr       v17.8h, v2.8h,  #8
> +    raddhn      v0.8b,  v0.8h,  v15.8h
> +    raddhn      v1.8b,  v1.8h,  v16.8h
> +    raddhn      v2.8b,  v2.8h,  v17.8h
> +    urshr       v19.8h, v12.8h, #8
> +    urshr       v20.8h, v13.8h, #8
> +    urshr       v21.8h, v14.8h, #8
> +    raddhn      v24.8b, v12.8h, v19.8h
> +    raddhn      v25.8b, v13.8h, v20.8h
> +    /*
> +     * convert 8 r5g6b5 pixel data from {v4} to planar 8-bit format
> +     * and put data into v16 - blue, v17 - green, v18 - red
> +     */
> +       mov         v4.d[1], v5.d[0]
> +       shrn        v17.8b, v4.8h,  #3
> +       shrn        v18.8b, v4.8h,  #8
> +    raddhn      v26.8b, v14.8h, v21.8h
> +       sli         v4.8h,  v4.8h,  #5
> +       sri         v18.8b, v18.8b, #5
> +       sri         v17.8b, v17.8b, #6
> +    /*
> +     * 'combine_over_ca' replacement
> +     *
> +     * output: updated dest in v16 - blue, v17 - green, v18 - red
> +     */
> +    mvn         v24.8b, v24.8b
> +    mvn         v25.8b, v25.8b
> +       shrn       v16.8b, v4.8h,  #2
> +    mvn         v26.8b, v26.8b
> +    umull       v5.8h, v16.8b, v24.8b
> +    umull       v6.8h, v17.8b, v25.8b
> +    umull       v7.8h, v18.8b, v26.8b
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
> +    /* ... continue 'combine_over_ca' replacement */
> +    urshr       v13.8h, v5.8h, #8
> +    urshr       v14.8h, v6.8h, #8
> +    urshr       v15.8h, v7.8h, #8
> +    raddhn      v16.8b, v13.8h, v5.8h
> +    raddhn      v17.8b, v14.8h, v6.8h
> +    raddhn      v18.8b, v15.8h, v7.8h
> +    uqadd       v16.8b, v0.8b, v16.8b
> +    uqadd       v17.8b, v1.8b, v17.8b
> +    uqadd       v18.8b, v2.8b, v18.8b
> +    /*
> +     * convert the results in v16, v17, v18 to r5g6b5 and store
> +     * them into {v14}
> +     */
> +    ushll       v14.8h, v18.8b, #7
> +    sli         v14.8h, v14.8h, #1
> +    ushll       v12.8h, v17.8b, #7
> +    sli         v12.8h, v12.8h, #1
> +    ushll       v13.8h, v16.8b, #7
> +    sli         v13.8h, v13.8h, #1
> +    sri         v14.8h, v12.8h, #5
> +    sri         v14.8h, v13.8h, #11
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
> +    fetch_mask_pixblock
> +        urshr       v13.8h, v5.8h, #8
> +        urshr       v14.8h, v6.8h, #8
> +    ld1         {v4.8h}, [DST_R], #16
> +        urshr       v15.8h, v7.8h, #8
> +        raddhn      v16.8b, v13.8h, v5.8h
> +        raddhn      v17.8b, v14.8h, v6.8h
> +        raddhn      v18.8b, v15.8h, v7.8h
> +    mov         v5.d[0], v4.d[1]
> +            /* process_pixblock_head */
> +            /*
> +             * 'combine_mask_ca' replacement
> +             *
> +             * input:  solid src (n) in {v8,  v9,  v10, v11}  [B, G, R, A]
> +             *         mask in          {v24, v25, v26}       [B, G, R]
> +             * output: updated src in   {v0,  v1,  v2 }       [B, G, R]
> +             *         updated mask in  {v24, v25, v26}       [B, G, R]
> +             */
> +        uqadd       v16.8b, v0.8b, v16.8b
> +        uqadd       v17.8b, v1.8b, v17.8b
> +        uqadd       v18.8b, v2.8b, v18.8b
> +            umull       v0.8h,  v24.8b, v8.8b
> +            umull       v1.8h,  v25.8b, v9.8b
> +            umull       v2.8h,  v26.8b, v10.8b
> +        /*
> +         * convert the result in v16, v17, v18 to r5g6b5 and store
> +         * it into {v14}
> +         */
> +        ushll       v14.8h, v18.8b, #7
> +        sli         v14.8h, v14.8h, #1
> +        ushll       v18.8h, v16.8b, #7
> +        sli         v18.8h, v18.8h, #1
> +        ushll       v19.8h, v17.8b, #7
> +        sli         v19.8h, v19.8h, #1
> +            umull       v12.8h, v11.8b, v24.8b
> +        sri         v14.8h, v19.8h, #5
> +            umull       v13.8h, v11.8b, v25.8b
> +            umull       v15.8h, v11.8b, v26.8b
> +        sri         v14.8h, v18.8h, #11
> +        mov         v28.d[0], v14.d[0]
> +        mov         v29.d[0], v14.d[1]
> +    cache_preload 8, 8
> +            urshr       v16.8h, v0.8h,  #8
> +            urshr       v17.8h, v1.8h,  #8
> +            urshr       v18.8h, v2.8h,  #8
> +            raddhn      v0.8b,  v0.8h,  v16.8h
> +            raddhn      v1.8b,  v1.8h,  v17.8h
> +            raddhn      v2.8b,  v2.8h,  v18.8h
> +            urshr       v19.8h, v12.8h, #8
> +            urshr       v20.8h, v13.8h, #8
> +            urshr       v21.8h, v15.8h, #8
> +            raddhn      v24.8b, v12.8h, v19.8h
> +            raddhn      v25.8b, v13.8h, v20.8h
> +                /*
> +                 * convert 8 r5g6b5 pixel data from {v4, v5} to planar
> +                 * 8-bit format and put data into v16 - blue, v17 - green,
> +                 * v18 - red
> +                 */
> +        mov         v4.d[1], v5.d[0]
> +                shrn        v17.8b, v4.8h,  #3
> +                shrn        v18.8b, v4.8h,  #8
> +            raddhn      v26.8b, v15.8h, v21.8h
> +                sli         v4.8h,  v4.8h,  #5
> +                sri         v17.8b, v17.8b, #6
> +                sri         v18.8b, v18.8b, #5
> +            /*
> +             * 'combine_over_ca' replacement
> +             *
> +             * output: updated dest in v16 - blue, v17 - green, v18 - red
> +             */
> +            mvn         v24.8b, v24.8b
> +            mvn         v25.8b, v25.8b
> +                shrn        v16.8b, v4.8h,  #2
> +            mvn         v26.8b, v26.8b
> +            umull       v5.8h, v16.8b, v24.8b
> +            umull       v6.8h, v17.8b, v25.8b
> +            umull       v7.8h, v18.8b, v26.8b
> +    st1         {v14.8h}, [DST_W], #16
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_init
> +    mov         v13.s[0], w4
> +    dup         v8.8b, v13.b[0]
> +    dup         v9.8b, v13.b[1]
> +    dup         v10.8b, v13.b[2]
> +    dup         v11.8b, v13.b[3]
> +.endm
> +
> +.macro pixman_composite_over_n_8888_0565_ca_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_n_8888_0565_ca_init, \
> +    pixman_composite_over_n_8888_0565_ca_cleanup, \
> +    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
> +    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
> +    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
> +
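The 0565 variant interleaves the same blend with RGB565 unpacking and packing.
The shrn/sli/sri triples expand each 5- or 6-bit field to 8 bits by
replicating its top bits, and the ushll #7 + sli #1 pairs stand in for a left
shift by 8, which the ushll immediate encoding cannot express.  A scalar
sketch of the two conversions under that reading (helper names mine):

    #include <stdint.h>

    /* r5g6b5 -> 8-bit channels with top-bit replication
     * (what the shrn/sli/sri sequence leaves in v16/v17/v18). */
    static void unpack_0565 (uint16_t p, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        uint8_t r5 = (p >> 11) & 0x1f;
        uint8_t g6 = (p >>  5) & 0x3f;
        uint8_t b5 =  p        & 0x1f;
        *r = (uint8_t) ((r5 << 3) | (r5 >> 2));
        *g = (uint8_t) ((g6 << 2) | (g6 >> 4));
        *b = (uint8_t) ((b5 << 3) | (b5 >> 2));
    }

    /* 8-bit channels -> r5g6b5 by truncation
     * (the ushll #7 + sli #1 + sri #5 / sri #11 sequence). */
    static uint16_t pack_0565 (uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }
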
> +/******************************************************************************/
> +
> +.macro pixman_composite_in_n_8_process_pixblock_head
> +    /* expecting source data in {v0, v1, v2, v3} */
> +    /* and destination data in {v4, v5, v6, v7} */
> +    umull       v8.8h,  v4.8b,  v3.8b
> +    umull       v9.8h,  v5.8b,  v3.8b
> +    umull       v10.8h, v6.8b,  v3.8b
> +    umull       v11.8h, v7.8b,  v3.8b
> +.endm
> +
> +.macro pixman_composite_in_n_8_process_pixblock_tail
> +    urshr       v14.8h, v8.8h,  #8
> +    urshr       v15.8h, v9.8h,  #8
> +    urshr       v12.8h, v10.8h, #8
> +    urshr       v13.8h, v11.8h, #8
> +    raddhn      v28.8b, v8.8h,  v14.8h
> +    raddhn      v29.8b, v9.8h,  v15.8h
> +    raddhn      v30.8b, v10.8h, v12.8h
> +    raddhn      v31.8b, v11.8h, v13.8h
> +.endm
> +
> +.macro pixman_composite_in_n_8_process_pixblock_tail_head
> +    pixman_composite_in_n_8_process_pixblock_tail
> +    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    cache_preload 32, 32
> +    pixman_composite_in_n_8_process_pixblock_head
> +    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_in_n_8_init
> +    mov         v3.s[0], w4
> +    dup         v3.8b, v3.b[3]
> +.endm
> +
> +.macro pixman_composite_in_n_8_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
> +    FLAG_DST_READWRITE, \
> +    32, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_in_n_8_init, \
> +    pixman_composite_in_n_8_cleanup, \
> +    pixman_composite_in_n_8_process_pixblock_head, \
> +    pixman_composite_in_n_8_process_pixblock_tail, \
> +    pixman_composite_in_n_8_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
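in_n_8 is the simplest path here: every destination byte is scaled by the
solid source alpha (the IN operator on a8 surfaces), using the same rounded
division by 255 as above.  Scalar sketch:

    #include <stdint.h>

    /* umull, then the urshr #8 + raddhn rounding. */
    static inline uint8_t in_n_8 (uint8_t dst, uint8_t srca)
    {
        uint16_t t = (uint16_t) dst * srca;
        return (uint8_t) ((t + ((t + 0x80) >> 8) + 0x80) >> 8);
    }
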
> +.macro pixman_composite_add_n_8_8_process_pixblock_head
> +    /* expecting source data in {v8, v9, v10, v11} */
> +    /* v8 - blue, v9 - green, v10 - red, v11 - alpha */
> +    /* and destination data in {v4, v5, v6, v7} */
> +    /* mask is in v24, v25, v26, v27 */
> +    umull       v0.8h, v24.8b, v11.8b
> +    umull       v1.8h, v25.8b, v11.8b
> +    umull       v2.8h, v26.8b, v11.8b
> +    umull       v3.8h, v27.8b, v11.8b
> +    urshr       v12.8h, v0.8h, #8
> +    urshr       v13.8h, v1.8h, #8
> +    urshr       v14.8h, v2.8h, #8
> +    urshr       v15.8h, v3.8h, #8
> +    raddhn      v0.8b, v0.8h, v12.8h
> +    raddhn      v1.8b, v1.8h, v13.8h
> +    raddhn      v2.8b, v2.8h, v14.8h
> +    raddhn      v3.8b, v3.8h, v15.8h
> +    uqadd       v28.8b, v0.8b, v4.8b
> +    uqadd       v29.8b, v1.8b, v5.8b
> +    uqadd       v30.8b, v2.8b, v6.8b
> +    uqadd       v31.8b, v3.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_add_n_8_8_process_pixblock_tail
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
> +    pixman_composite_add_n_8_8_process_pixblock_tail
> +    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    fetch_mask_pixblock
> +    cache_preload 32, 32
> +    pixman_composite_add_n_8_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_add_n_8_8_init
> +    mov         v11.s[0], w4
> +    dup         v11.8b, v11.b[3]
> +.endm
> +
> +.macro pixman_composite_add_n_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
> +    FLAG_DST_READWRITE, \
> +    32, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_add_n_8_8_init, \
> +    pixman_composite_add_n_8_8_cleanup, \
> +    pixman_composite_add_n_8_8_process_pixblock_head, \
> +    pixman_composite_add_n_8_8_process_pixblock_tail, \
> +    pixman_composite_add_n_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8_8_8_process_pixblock_head
> +    /* expecting source data in {v0, v1, v2, v3} */
> +    /* destination data in {v4, v5, v6, v7} */
> +    /* mask in {v24, v25, v26, v27} */
> +    umull       v8.8h, v24.8b, v0.8b
> +    umull       v9.8h, v25.8b, v1.8b
> +    umull       v10.8h, v26.8b, v2.8b
> +    umull       v11.8h, v27.8b, v3.8b
> +    urshr       v0.8h, v8.8h, #8
> +    urshr       v1.8h, v9.8h, #8
> +    urshr       v12.8h, v10.8h, #8
> +    urshr       v13.8h, v11.8h, #8
> +    raddhn      v0.8b, v0.8h, v8.8h
> +    raddhn      v1.8b, v1.8h, v9.8h
> +    raddhn      v2.8b, v12.8h, v10.8h
> +    raddhn      v3.8b, v13.8h, v11.8h
> +    uqadd       v28.8b, v0.8b, v4.8b
> +    uqadd       v29.8b, v1.8b, v5.8b
> +    uqadd       v30.8b, v2.8b, v6.8b
> +    uqadd       v31.8b, v3.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_add_8_8_8_process_pixblock_tail
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
> +    pixman_composite_add_8_8_8_process_pixblock_tail
> +    st1         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    ld1         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    fetch_mask_pixblock
> +    fetch_src_pixblock
> +    cache_preload 32, 32
> +    pixman_composite_add_8_8_8_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_add_8_8_8_init
> +.endm
> +
> +.macro pixman_composite_add_8_8_8_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
> +    FLAG_DST_READWRITE, \
> +    32, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_add_8_8_8_init, \
> +    pixman_composite_add_8_8_8_cleanup, \
> +    pixman_composite_add_8_8_8_process_pixblock_head, \
> +    pixman_composite_add_8_8_8_process_pixblock_tail, \
> +    pixman_composite_add_8_8_8_process_pixblock_tail_head
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
> +    /* expecting source data in {v0, v1, v2, v3} */
> +    /* destination data in {v4, v5, v6, v7} */
> +    /* mask in {v24, v25, v26, v27} */
> +    umull       v8.8h,  v27.8b, v0.8b
> +    umull       v9.8h,  v27.8b, v1.8b
> +    umull       v10.8h, v27.8b, v2.8b
> +    umull       v11.8h, v27.8b, v3.8b
> +    /* 1 cycle bubble */
> +    ursra       v8.8h,  v8.8h,  #8
> +    ursra       v9.8h,  v9.8h,  #8
> +    ursra       v10.8h, v10.8h, #8
> +    ursra       v11.8h, v11.8h, #8
> +.endm
> +
> +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
> +    /* 2 cycle bubble */
> +    rshrn       v28.8b, v8.8h,  #8
> +    rshrn       v29.8b, v9.8h,  #8
> +    rshrn       v30.8b, v10.8h, #8
> +    rshrn       v31.8b, v11.8h, #8
> +    uqadd       v28.8b, v4.8b,  v28.8b
> +    uqadd       v29.8b, v5.8b,  v29.8b
> +    uqadd       v30.8b, v6.8b,  v30.8b
> +    uqadd       v31.8b, v7.8b,  v31.8b
> +.endm
> +
> +.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
> +    fetch_src_pixblock
> +        rshrn       v28.8b, v8.8h,  #8
> +    fetch_mask_pixblock
> +        rshrn       v29.8b, v9.8h,  #8
> +    umull       v8.8h,  v27.8b, v0.8b
> +        rshrn       v30.8b, v10.8h, #8
> +    umull       v9.8h,  v27.8b, v1.8b
> +        rshrn       v31.8b, v11.8h, #8
> +    umull       v10.8h, v27.8b, v2.8b
> +    umull       v11.8h, v27.8b, v3.8b
> +        uqadd       v28.8b, v4.8b,  v28.8b
> +        uqadd       v29.8b, v5.8b,  v29.8b
> +        uqadd       v30.8b, v6.8b,  v30.8b
> +        uqadd       v31.8b, v7.8b,  v31.8b
> +    ursra       v8.8h,  v8.8h,  #8
> +    ld4         {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    ursra       v9.8h,  v9.8h,  #8
> +        st4         {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +    ursra       v10.8h, v10.8h, #8
> +
> +    cache_preload 8, 8
> +
> +    ursra       v11.8h, v11.8h, #8
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +generate_composite_function_single_scanline \
> +    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
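add_8888_8888_8888 uses a slightly different rounding idiom: ursra folds the
rounded high byte back into the product in place and rshrn then narrows, which
gives the same rounded x/255 as the urshr + raddhn pairs used elsewhere in
this file.  Scalar sketch of one channel (helper names mine):

    #include <stdint.h>

    static inline uint8_t div255_ursra (uint16_t t)
    {
        t = (uint16_t) (t + ((t + 0x80) >> 8));  /* ursra v, v, #8 */
        return (uint8_t) ((t + 0x80) >> 8);      /* rshrn      #8  */
    }

    /* One channel of add_8888_8888_8888:
     * dst = saturate (dst + div255 (src * mask_alpha)). */
    static inline uint8_t add_masked (uint8_t dst, uint8_t src, uint8_t ma)
    {
        unsigned s = dst + div255_ursra ((uint16_t) src * ma);
        return (uint8_t) (s > 255 ? 255 : s);    /* uqadd */
    }
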
> +/******************************************************************************/
> +
> +generate_composite_function \
> +    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    27  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_n_8_8888_init
> +    mov         v3.s[0], w4
> +    dup         v0.8b, v3.b[0]
> +    dup         v1.8b, v3.b[1]
> +    dup         v2.8b, v3.b[2]
> +    dup         v3.8b, v3.b[3]
> +.endm
> +
> +.macro pixman_composite_add_n_8_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_add_n_8_8888_init, \
> +    pixman_composite_add_n_8_8888_cleanup, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    27  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_8888_n_8888_init
> +    mov         v27.s[0], w6
> +    dup         v27.8b, v27.b[3]
> +.endm
> +
> +.macro pixman_composite_add_8888_n_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_add_8888_n_8888_init, \
> +    pixman_composite_add_8888_n_8888_cleanup, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
> +    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    27  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
> +    /* expecting source data in {v0, v1, v2, v3} */
> +    /* destination data in {v4, v5, v6, v7} */
> +    /* solid mask is in v15 */
> +
> +    /* 'in' */
> +    umull       v11.8h, v15.8b, v3.8b
> +    umull       v10.8h, v15.8b, v2.8b
> +    umull       v9.8h,  v15.8b, v1.8b
> +    umull       v8.8h,  v15.8b, v0.8b
> +    urshr       v16.8h, v11.8h, #8
> +    urshr       v14.8h, v10.8h, #8
> +    urshr       v13.8h,  v9.8h, #8
> +    urshr       v12.8h,  v8.8h, #8
> +    raddhn      v3.8b, v11.8h, v16.8h
> +    raddhn      v2.8b, v10.8h, v14.8h
> +    raddhn      v1.8b,  v9.8h, v13.8h
> +    raddhn      v0.8b,  v8.8h, v12.8h
> +    mvn         v24.8b, v3.8b  /* get inverted alpha */
> +    /* now do alpha blending */
> +    umull       v8.8h, v24.8b, v4.8b
> +    umull       v9.8h, v24.8b, v5.8b
> +    umull       v10.8h, v24.8b, v6.8b
> +    umull       v11.8h, v24.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
> +    urshr       v16.8h, v8.8h, #8
> +    urshr       v17.8h, v9.8h, #8
> +    urshr       v18.8h, v10.8h, #8
> +    urshr       v19.8h, v11.8h, #8
> +    raddhn      v28.8b, v16.8h, v8.8h
> +    raddhn      v29.8b, v17.8h, v9.8h
> +    raddhn      v30.8b, v18.8h, v10.8h
> +    raddhn      v31.8b, v19.8h, v11.8h
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
> +    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
> +    fetch_src_pixblock
> +    cache_preload 8, 8
> +    fetch_mask_pixblock
> +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
> +    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function_single_scanline \
> +    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
> +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
> +    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    12  /* mask_basereg  */
> +
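out_reverse scales the destination by the inverse alpha of the masked source;
the over_8888_n_8888 / over_8888_*_8888 functions below reuse this head/tail
and simply uqadd the masked source back on top.  One channel in scalar form
(m is the mask alpha: solid in the _n_ variants, per-pixel in the scanline
variant; helper name mine):

    #include <stdint.h>

    /* d' = d * (255 - div255 (sa * m)) / 255, with urshr/raddhn rounding. */
    static inline uint8_t out_reverse_n (uint8_t d, uint8_t sa, uint8_t m)
    {
        uint16_t t  = (uint16_t) sa * m;
        uint8_t  ia = (uint8_t) (255 - ((t + ((t + 0x80) >> 8) + 0x80) >> 8));
        uint16_t u  = (uint16_t) ia * d;
        return (uint8_t) ((u + ((u + 0x80) >> 8) + 0x80) >> 8);
    }
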
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_8888_n_8888_process_pixblock_head
> +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
> +    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
> +    uqadd       v28.8b, v0.8b, v28.8b
> +    uqadd       v29.8b, v1.8b, v29.8b
> +    uqadd       v30.8b, v2.8b, v30.8b
> +    uqadd       v31.8b, v3.8b, v31.8b
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
> +    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail
> +    fetch_src_pixblock
> +    cache_preload 8, 8
> +    pixman_composite_over_8888_n_8888_process_pixblock_head
> +    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +.macro pixman_composite_over_8888_n_8888_init
> +    mov         v15.s[0], w6
> +    dup         v15.8b, v15.b[3]
> +.endm
> +
> +.macro pixman_composite_over_8888_n_8888_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_8888_n_8888_init, \
> +    pixman_composite_over_8888_n_8888_cleanup, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    12  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
> +    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail
> +    fetch_src_pixblock
> +    cache_preload 8, 8
> +    fetch_mask_pixblock
> +    pixman_composite_over_8888_n_8888_process_pixblock_head
> +    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    12  /* mask_basereg  */
> +
> +generate_composite_function_single_scanline \
> +    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    12  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
> +    ld4        {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail
> +    fetch_src_pixblock
> +    cache_preload 8, 8
> +    fetch_mask_pixblock
> +    pixman_composite_over_8888_n_8888_process_pixblock_head
> +    st4        {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    15  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0888_0888_process_pixblock_head
> +.endm
> +
> +.macro pixman_composite_src_0888_0888_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
> +    st3     {v0.8b, v1.8b, v2.8b}, [DST_W], #24
> +    fetch_src_pixblock
> +    cache_preload 8, 8
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
> +    FLAG_DST_WRITEONLY, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_0888_0888_process_pixblock_head, \
> +    pixman_composite_src_0888_0888_process_pixblock_tail, \
> +    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
> +    mov    v31.8b, v2.8b
> +    mov    v2.8b, v0.8b
> +    mov    v0.8b, v31.8b
> +.endm
> +
> +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
> +.endm
> +
> +.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
> +    st4    {v0.8b, v1.8b, v2.8b, v3.8b}, [DST_W], #32
> +    fetch_src_pixblock
> +    mov    v31.8b, v2.8b
> +    mov    v2.8b, v0.8b
> +    mov    v0.8b, v31.8b
> +    cache_preload 8, 8
> +.endm
> +
> +.macro pixman_composite_src_0888_8888_rev_init
> +    eor    v3.8b, v3.8b, v3.8b
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    pixman_composite_src_0888_8888_rev_init, \
> +    default_cleanup, \
> +    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
> +    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
> +    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
> +    0, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
> +    ushll       v8.8h, v1.8b, #7
> +    sli         v8.8h, v8.8h, #1
> +    ushll       v9.8h, v2.8b, #7
> +    sli         v9.8h, v9.8h, #1
> +.endm
> +
> +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
> +    ushll       v14.8h, v0.8b, #7
> +    sli         v14.8h, v14.8h, #1
> +    sri         v14.8h, v8.8h, #5
> +    sri         v14.8h, v9.8h, #11
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
> +        ushll       v14.8h, v0.8b, #7
> +        sli         v14.8h, v14.8h, #1
> +    fetch_src_pixblock
> +        sri         v14.8h, v8.8h, #5
> +        sri         v14.8h, v9.8h, #11
> +        mov         v28.d[0], v14.d[0]
> +        mov         v29.d[0], v14.d[1]
> +    ushll       v8.8h, v1.8b, #7
> +    sli         v8.8h, v8.8h, #1
> +        st1     {v14.8h}, [DST_W], #16
> +    ushll       v9.8h, v2.8b, #7
> +    sli         v9.8h, v9.8h, #1
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
> +    FLAG_DST_WRITEONLY, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
> +    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
> +    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
> +    umull       v8.8h, v3.8b, v0.8b
> +    umull       v9.8h, v3.8b, v1.8b
> +    umull       v10.8h, v3.8b, v2.8b
> +.endm
> +
> +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
> +    urshr       v11.8h, v8.8h, #8
> +    mov         v30.8b, v31.8b
> +    mov         v31.8b, v3.8b
> +    mov         v3.8b, v30.8b
> +    urshr       v12.8h, v9.8h, #8
> +    urshr       v13.8h, v10.8h, #8
> +    raddhn      v30.8b, v11.8h, v8.8h
> +    raddhn      v29.8b, v12.8h, v9.8h
> +    raddhn      v28.8b, v13.8h, v10.8h
> +.endm
> +
> +.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
> +        urshr       v11.8h, v8.8h, #8
> +        mov         v30.8b, v31.8b
> +        mov         v31.8b, v3.8b
> +        mov         v3.8b, v30.8b
> +        urshr       v12.8h, v9.8h, #8
> +        urshr       v13.8h, v10.8h, #8
> +    fetch_src_pixblock
> +        raddhn      v30.8b, v11.8h, v8.8h
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        raddhn      v29.8b, v12.8h, v9.8h
> +        raddhn      v28.8b, v13.8h, v10.8h
> +    umull       v8.8h, v3.8b, v0.8b
> +    umull       v9.8h, v3.8b, v1.8b
> +    umull       v10.8h, v3.8b, v2.8b
> +         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF cmp PF_X, ORIG_W
> +                                    PF lsl DUMMY, PF_X, src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +10:
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
> +    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
> +    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
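src_pixbuf_8888 (and src_rpixbuf_8888 below) premultiply a non-premultiplied
source by its alpha; the three-mov shuffle through v30/v31 carries the alpha
bytes across the deinterleaved registers, and the two variants differ only in
which raddhn result lands in v28 versus v30, i.e. whether the first and third
channels end up swapped.  Scalar sketch of the per-pixel arithmetic (struct
layout and helper name are illustrative only):

    #include <stdint.h>

    static inline uint8_t d255 (uint16_t t)  /* urshr + raddhn rounding */
    {
        return (uint8_t) ((t + ((t + 0x80) >> 8) + 0x80) >> 8);
    }

    typedef struct { uint8_t c0, c1, c2, a; } px32;  /* deinterleaved channels */

    static px32 premultiply (px32 p, int swap_c0_c2)
    {
        px32 q;
        uint8_t m0 = d255 ((uint16_t) p.a * p.c0);
        uint8_t m1 = d255 ((uint16_t) p.a * p.c1);
        uint8_t m2 = d255 ((uint16_t) p.a * p.c2);
        q.c0 = swap_c0_c2 ? m2 : m0;
        q.c1 = m1;
        q.c2 = swap_c0_c2 ? m0 : m2;
        q.a  = p.a;
        return q;
    }
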
> +/******************************************************************************/
> +
> +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
> +    umull       v8.8h, v3.8b, v0.8b
> +    umull       v9.8h, v3.8b, v1.8b
> +    umull       v10.8h, v3.8b, v2.8b
> +.endm
> +
> +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
> +    urshr       v11.8h, v8.8h, #8
> +    mov         v30.8b, v31.8b
> +    mov         v31.8b, v3.8b
> +    mov         v3.8b, v30.8b
> +    urshr       v12.8h, v9.8h, #8
> +    urshr       v13.8h, v10.8h, #8
> +    raddhn      v28.8b, v11.8h, v8.8h
> +    raddhn      v29.8b, v12.8h, v9.8h
> +    raddhn      v30.8b, v13.8h, v10.8h
> +.endm
> +
> +.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
> +        urshr       v11.8h, v8.8h, #8
> +        mov         v30.8b, v31.8b
> +        mov         v31.8b, v3.8b
> +        mov         v3.8b, v30.8b
> +        urshr       v12.8h, v9.8h, #8
> +        urshr       v13.8h, v10.8h, #8
> +    fetch_src_pixblock
> +        raddhn      v28.8b, v11.8h, v8.8h
> +                                    PF add PF_X, PF_X, #8
> +                                    PF tst PF_CTL, #0xF
> +                                    PF beq 10f
> +                                    PF add PF_X, PF_X, #8
> +                                    PF sub PF_CTL, PF_CTL, #1
> +10:
> +        raddhn      v29.8b, v12.8h, v9.8h
> +        raddhn      v30.8b, v13.8h, v10.8h
> +    umull       v8.8h, v3.8b, v0.8b
> +    umull       v9.8h, v3.8b, v1.8b
> +    umull       v10.8h, v3.8b, v2.8b
> +         st4    {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +                                    PF cmp PF_X, ORIG_W
> +                                    PF lsl DUMMY, PF_X, src_bpp_shift
> +                                    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +                                    PF ble 10f
> +                                    PF sub PF_X, PF_X, ORIG_W
> +                                    PF subs PF_CTL, PF_CTL, #0x10
> +                                    PF ble 10f
> +                                    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +                                    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +                                    PF add PF_SRC, PF_SRC, #1
> +10:
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    10, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
> +    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
> +    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    0, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_0565_8_0565_process_pixblock_head
> +    /* mask is in v15 */
> +    mov         v4.d[0], v8.d[0]
> +    mov         v4.d[1], v9.d[0]
> +    mov         v13.d[0], v10.d[0]
> +    mov         v13.d[1], v11.d[0]
> +    convert_0565_to_x888 v4, v2, v1, v0
> +    convert_0565_to_x888 v13, v6, v5, v4
> +    /* source pixel data is in      {v0, v1, v2, XX} */
> +    /* destination pixel data is in {v4, v5, v6, XX} */
> +    mvn         v7.8b,  v15.8b
> +    umull       v10.8h, v15.8b, v2.8b
> +    umull       v9.8h,  v15.8b, v1.8b
> +    umull       v8.8h,  v15.8b, v0.8b
> +    umull       v11.8h, v7.8b,  v4.8b
> +    umull       v12.8h, v7.8b,  v5.8b
> +    umull       v13.8h, v7.8b,  v6.8b
> +    urshr       v19.8h, v10.8h, #8
> +    urshr       v18.8h, v9.8h,  #8
> +    urshr       v17.8h, v8.8h,  #8
> +    raddhn      v2.8b,  v10.8h, v19.8h
> +    raddhn      v1.8b,  v9.8h,  v18.8h
> +    raddhn      v0.8b,  v8.8h,  v17.8h
> +.endm
> +
> +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
> +    urshr       v17.8h, v11.8h,  #8
> +    urshr       v18.8h, v12.8h,  #8
> +    urshr       v19.8h, v13.8h,  #8
> +    raddhn      v28.8b, v17.8h, v11.8h
> +    raddhn      v29.8b, v18.8h, v12.8h
> +    raddhn      v30.8b, v19.8h, v13.8h
> +    uqadd       v0.8b,  v0.8b,  v28.8b
> +    uqadd       v1.8b,  v1.8b,  v29.8b
> +    uqadd       v2.8b,  v2.8b,  v30.8b
> +    /* 32bpp result is in {v0, v1, v2, XX} */
> +    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
> +    fetch_mask_pixblock
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail
> +    fetch_src_pixblock
> +    ld1        {v10.4h, v11.4h}, [DST_R], #16
> +    cache_preload 8, 8
> +    pixman_composite_over_0565_8_0565_process_pixblock_head
> +    st1        {v14.8h}, [DST_W], #16
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_head, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    10,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    15  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_over_0565_n_0565_init
> +    mov         v15.s[0], w6
> +    dup         v15.8b, v15.b[3]
> +.endm
> +
> +.macro pixman_composite_over_0565_n_0565_cleanup
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    pixman_composite_over_0565_n_0565_init, \
> +    pixman_composite_over_0565_n_0565_cleanup, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_head, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    10,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    15  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_add_0565_8_0565_process_pixblock_head
> +    /* mask is in v15 */
> +    mov         v4.d[0], v8.d[0]
> +    mov         v4.d[1], v9.d[0]
> +    mov         v13.d[0], v10.d[0]
> +    mov         v13.d[1], v11.d[0]
> +    convert_0565_to_x888 v4,  v2, v1, v0
> +    convert_0565_to_x888 v13, v6, v5, v4
> +    /* source pixel data is in      {v0, v1, v2, XX} */
> +    /* destination pixel data is in {v4, v5, v6, XX} */
> +    umull       v9.8h,  v15.8b, v2.8b
> +    umull       v8.8h,  v15.8b, v1.8b
> +    umull       v7.8h,  v15.8b, v0.8b
> +    urshr       v12.8h, v9.8h,  #8
> +    urshr       v11.8h, v8.8h,  #8
> +    urshr       v10.8h, v7.8h,  #8
> +    raddhn      v2.8b,  v9.8h,  v12.8h
> +    raddhn      v1.8b,  v8.8h,  v11.8h
> +    raddhn      v0.8b,  v7.8h,  v10.8h
> +.endm
> +
> +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
> +    uqadd       v0.8b,  v0.8b,  v4.8b
> +    uqadd       v1.8b,  v1.8b,  v5.8b
> +    uqadd       v2.8b,  v2.8b,  v6.8b
> +    /* 32bpp result is in {v0, v1, v2, XX} */
> +    convert_8888_to_0565 v2, v1, v0, v14, v30, v13
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
> +    fetch_mask_pixblock
> +    pixman_composite_add_0565_8_0565_process_pixblock_tail
> +    fetch_src_pixblock
> +    ld1        {v10.4h, v11.4h}, [DST_R], #16
> +    cache_preload 8, 8
> +    pixman_composite_add_0565_8_0565_process_pixblock_head
> +    st1        {v14.8h}, [DST_W], #16
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_add_0565_8_0565_process_pixblock_head, \
> +    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
> +    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    10, /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    15  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
> +    /* mask is in v15 */
> +    mov         v12.d[0], v10.d[0]
> +    mov         v12.d[1], v11.d[0]
> +    convert_0565_to_x888 v12, v6, v5, v4
> +    /* destination pixel data is in {v4, v5, v6, xx} */
> +    mvn         v24.8b, v15.8b /* get inverted alpha */
> +    /* now do alpha blending */
> +    umull       v8.8h,  v24.8b, v4.8b
> +    umull       v9.8h,  v24.8b, v5.8b
> +    umull       v10.8h, v24.8b, v6.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
> +    urshr       v11.8h, v8.8h, #8
> +    urshr       v12.8h, v9.8h, #8
> +    urshr       v13.8h, v10.8h, #8
> +    raddhn      v0.8b, v11.8h, v8.8h
> +    raddhn      v1.8b, v12.8h, v9.8h
> +    raddhn      v2.8b, v13.8h, v10.8h
> +    /* 32bpp result is in {v0, v1, v2, XX} */
> +    convert_8888_to_0565 v2, v1, v0, v14, v12, v3
> +    mov         v28.d[0], v14.d[0]
> +    mov         v29.d[0], v14.d[1]
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
> +    fetch_src_pixblock
> +    pixman_composite_out_reverse_8_0565_process_pixblock_tail
> +    ld1        {v10.4h, v11.4h}, [DST_R], #16
> +    cache_preload 8, 8
> +    pixman_composite_out_reverse_8_0565_process_pixblock_head
> +    st1        {v14.8h}, [DST_W], #16
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
> +    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
> +    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    10,  /* dst_r_basereg */ \
> +    15, /* src_basereg   */ \
> +    0   /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
> +    /* src is in v0 */
> +    /* destination pixel data is in {v4, v5, v6, v7} */
> +    mvn         v1.8b, v0.8b /* get inverted alpha */
> +    /* now do alpha blending */
> +    umull       v8.8h, v1.8b, v4.8b
> +    umull       v9.8h, v1.8b, v5.8b
> +    umull       v10.8h, v1.8b, v6.8b
> +    umull       v11.8h, v1.8b, v7.8b
> +.endm
> +
> +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
> +    urshr       v14.8h, v8.8h, #8
> +    urshr       v15.8h, v9.8h, #8
> +    urshr       v12.8h, v10.8h, #8
> +    urshr       v13.8h, v11.8h, #8
> +    raddhn      v28.8b, v14.8h, v8.8h
> +    raddhn      v29.8b, v15.8h, v9.8h
> +    raddhn      v30.8b, v12.8h, v10.8h
> +    raddhn      v31.8b, v13.8h, v11.8h
> +    /* 32bpp result is in {v28, v29, v30, v31} */
> +.endm
> +
> +/* TODO: expand macros and do better instruction scheduling */
> +.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
> +    fetch_src_pixblock
> +    pixman_composite_out_reverse_8_8888_process_pixblock_tail
> +    ld4       {v4.8b, v5.8b, v6.8b, v7.8b}, [DST_R], #32
> +    cache_preload 8, 8
> +    pixman_composite_out_reverse_8_8888_process_pixblock_head
> +    st4       {v28.8b, v29.8b, v30.8b, v31.8b}, [DST_W], #32
> +.endm
> +
> +generate_composite_function \
> +    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    5, /* prefetch distance */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
> +    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
> +    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4, /* dst_r_basereg */ \
> +    0, /* src_basereg   */ \
> +    0   /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +generate_composite_function_nearest_scanline \
> +    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_8888_process_pixblock_head, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail, \
> +    pixman_composite_over_8888_8888_process_pixblock_tail_head
> +
> +generate_composite_function_nearest_scanline \
> +    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_over_8888_0565_process_pixblock_head, \
> +    pixman_composite_over_8888_0565_process_pixblock_tail, \
> +    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    0,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +generate_composite_function_nearest_scanline \
> +    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_8888_0565_process_pixblock_head, \
> +    pixman_composite_src_8888_0565_process_pixblock_tail, \
> +    pixman_composite_src_8888_0565_process_pixblock_tail_head
> +
> +generate_composite_function_nearest_scanline \
> +    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
> +    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init, \
> +    default_cleanup, \
> +    pixman_composite_src_0565_8888_process_pixblock_head, \
> +    pixman_composite_src_0565_8888_process_pixblock_tail, \
> +    pixman_composite_src_0565_8888_process_pixblock_tail_head
> +
> +generate_composite_function_nearest_scanline \
> +    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
> +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_head, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    4,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    24  /* mask_basereg  */
> +
> +generate_composite_function_nearest_scanline \
> +    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
> +    FLAG_DST_READWRITE, \
> +    8, /* number of pixels, processed in a single block */ \
> +    default_init_need_all_regs, \
> +    default_cleanup_need_all_regs, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_head, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
> +    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
> +    28, /* dst_w_basereg */ \
> +    10,  /* dst_r_basereg */ \
> +    8,  /* src_basereg   */ \
> +    15  /* mask_basereg  */
> +
> +/******************************************************************************/
> +
> +/*
> + * Bilinear scaling support code which tries to provide pixel fetching, color
> + * format conversion, and interpolation as separate macros which can be used
> + * as the basic building blocks for constructing bilinear scanline functions.
> + */
> +
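For one output pixel the macros below compute a standard separable bilinear
filter in fixed point: the umull/umlal pair does the vertical blend of the two
source rows with the byte weights held in v28/v29, and the ushll/umlsl/umlal
triple does the horizontal blend with the per-pixel x weight before the result
is narrowed by shrn/xtn.  A scalar sketch, assuming
BILINEAR_INTERPOLATION_BITS = 7 as defined in pixman-private.h (which of
v28/v29 holds the weight versus its complement is set up by the scanline
prologue and is not shown here):

    #include <stdint.h>

    #define BITS 7   /* BILINEAR_INTERPOLATION_BITS (assumed) */

    /* One channel: tl/tr/bl/br are the four neighbouring texels,
     * wy and wx the fractional weights in 0 .. (1 << BITS). */
    static uint8_t bilinear_channel (uint8_t tl, uint8_t tr,
                                     uint8_t bl, uint8_t br,
                                     unsigned wy, unsigned wx)
    {
        unsigned left  = tl * ((1u << BITS) - wy) + bl * wy;  /* umull/umlal */
        unsigned right = tr * ((1u << BITS) - wy) + br * wy;
        unsigned out   = (left << BITS)                       /* ushll       */
                       - left * wx + right * wx;              /* umlsl/umlal */
        return (uint8_t) (out >> (2 * BITS));                 /* shrn + xtn  */
    }
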
> +.macro bilinear_load_8888 reg1, reg2, tmp
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP2, TMP1, #2
> +    add       TMP1, TOP, TMP2
> +    ld1       {&reg1&.2s}, [TMP1], STRIDE
> +    ld1       {&reg2&.2s}, [TMP1]
> +.endm
> +
> +.macro bilinear_load_0565 reg1, reg2, tmp
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP2, TMP1, #1
> +    add       TMP1, TOP, TMP2
> +    ld1       {&reg2&.s}[0], [TMP1], STRIDE
> +    ld1       {&reg2&.s}[1], [TMP1]
> +    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_two_8888 \
> +                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
> +
> +    bilinear_load_8888 reg1, reg2, tmp1
> +    umull     &acc1&.8h, &reg1&.8b, v28.8b
> +    umlal     &acc1&.8h, &reg2&.8b, v29.8b
> +    bilinear_load_8888 reg3, reg4, tmp2
> +    umull     &acc2&.8h, &reg3&.8b, v28.8b
> +    umlal     &acc2&.8h, &reg4&.8b, v29.8b
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_four_8888 \
> +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
> +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
> +
> +    bilinear_load_and_vertical_interpolate_two_8888 \
> +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
> +    bilinear_load_and_vertical_interpolate_two_8888 \
> +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
> +.endm
> +
> +.macro vzip reg1, reg2
> +    umov      TMP4, v31.d[0]
> +    zip1      v31.8b, reg1, reg2
> +    zip2      reg2,   reg1, reg2
> +    mov       reg1,   v31.8b
> +    mov       v31.d[0], TMP4
> +.endm
> +
> +.macro vuzp reg1, reg2
> +    umov      TMP4, v31.d[0]
> +    uzp1      v31.8b, reg1, reg2
> +    uzp2      reg2,   reg1, reg2
> +    mov       reg1,   v31.8b
> +    mov       v31.d[0], TMP4
> +.endm
> +
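The vzip/vuzp helpers above exist because AArch32 VZIP/VUZP rewrite both of
their operand registers in place, while the A64 zip1/zip2 and uzp1/uzp2
instructions each produce a single result; the macros rebuild the two-output
behaviour with v31 as scratch, saving and restoring its low half through TMP4.
A C model of what vzip leaves in its two arguments (zip1 result in reg1, zip2
result in reg2; function name mine):

    #include <stdint.h>
    #include <string.h>

    static void vzip_model (uint8_t a[8], uint8_t b[8])
    {
        uint8_t lo[8], hi[8];
        for (int i = 0; i < 4; i++) {
            lo[2 * i]     = a[i];       /* zip1: interleave low halves  */
            lo[2 * i + 1] = b[i];
            hi[2 * i]     = a[i + 4];   /* zip2: interleave high halves */
            hi[2 * i + 1] = b[i + 4];
        }
        memcpy (a, lo, 8);
        memcpy (b, hi, 8);
    }
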
> +.macro bilinear_load_and_vertical_interpolate_two_0565 \
> +                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP2, TMP1, #1
> +    add       TMP1, TOP, TMP2
> +    asr       TMP2, X, #16
> +    add       X, X, UX
> +    lsl       TMP3, TMP2, #1
> +    add       TMP2, TOP, TMP3
> +    ld1       {&acc2&.s}[0], [TMP1], STRIDE
> +    ld1       {&acc2&.s}[2], [TMP2], STRIDE
> +    ld1       {&acc2&.s}[1], [TMP1]
> +    ld1       {&acc2&.s}[3], [TMP2]
> +    convert_0565_to_x888 acc2, reg3, reg2, reg1
> +    vzip      &reg1&.8b, &reg3&.8b
> +    vzip      &reg2&.8b, &reg4&.8b
> +    vzip      &reg3&.8b, &reg4&.8b
> +    vzip      &reg1&.8b, &reg2&.8b
> +    umull     &acc1&.8h, &reg1&.8b, v28.8b
> +    umlal     &acc1&.8h, &reg2&.8b, v29.8b
> +    umull     &acc2&.8h, &reg3&.8b, v28.8b
> +    umlal     &acc2&.8h, &reg4&.8b, v29.8b
> +.endm
> +
> +.macro bilinear_load_and_vertical_interpolate_four_0565 \
> +                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
> +                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP2, TMP1, #1
> +    add       TMP1, TOP, TMP2
> +    asr       TMP2, X, #16
> +    add       X, X, UX
> +    lsl       TMP3, TMP2, #1
> +    add       TMP2, TOP, TMP3
> +    ld1       {&xacc2&.s}[0], [TMP1], STRIDE
> +    ld1       {&xacc2&.s}[2], [TMP2], STRIDE
> +    ld1       {&xacc2&.s}[1], [TMP1]
> +    ld1       {&xacc2&.s}[3], [TMP2]
> +    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP2, TMP1, #1
> +    add       TMP1, TOP, TMP2
> +    asr       TMP2, X, #16
> +    add       X, X, UX
> +    lsl       TMP3, TMP2, #1
> +    add       TMP2, TOP, TMP3
> +    ld1       {&yacc2&.s}[0], [TMP1], STRIDE
> +    vzip      &xreg1&.8b, &xreg3&.8b
> +    ld1       {&yacc2&.s}[2], [TMP2], STRIDE
> +    vzip      &xreg2&.8b, &xreg4&.8b
> +    ld1       {&yacc2&.s}[1], [TMP1]
> +    vzip      &xreg3&.8b, &xreg4&.8b
> +    ld1       {&yacc2&.s}[3], [TMP2]
> +    vzip      &xreg1&.8b, &xreg2&.8b
> +    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
> +    umull     &xacc1&.8h, &xreg1&.8b, v28.8b
> +    vzip      &yreg1&.8b, &yreg3&.8b
> +    umlal     &xacc1&.8h, &xreg2&.8b, v29.8b
> +    vzip      &yreg2&.8b, &yreg4&.8b
> +    umull     &xacc2&.8h, &xreg3&.8b, v28.8b
> +    vzip      &yreg3&.8b, &yreg4&.8b
> +    umlal     &xacc2&.8h, &xreg4&.8b, v29.8b
> +    vzip      &yreg1&.8b, &yreg2&.8b
> +    umull     &yacc1&.8h, &yreg1&.8b, v28.8b
> +    umlal     &yacc1&.8h, &yreg2&.8b, v29.8b
> +    umull     &yacc2&.8h, &yreg3&.8b, v28.8b
> +    umlal     &yacc2&.8h, &yreg4&.8b, v29.8b
> +.endm
> +
> +.macro bilinear_store_8888 numpix, tmp1, tmp2
> +.if numpix == 4
> +    st1       {v0.2s, v1.2s}, [OUT], #16
> +.elseif numpix == 2
> +    st1       {v0.2s}, [OUT], #8
> +.elseif numpix == 1
> +    st1       {v0.s}[0], [OUT], #4
> +.else
> +    .error bilinear_store_8888 numpix is unsupported
> +.endif
> +.endm
> +
> +.macro bilinear_store_0565 numpix, tmp1, tmp2
> +    vuzp      v0.8b, v1.8b
> +    vuzp      v2.8b, v3.8b
> +    vuzp      v1.8b, v3.8b
> +    vuzp      v0.8b, v2.8b
> +    convert_8888_to_0565 v2, v1, v0, v1, tmp1, tmp2
> +.if numpix == 4
> +    st1       {v1.4h}, [OUT], #8
> +.elseif numpix == 2
> +    st1       {v1.s}[0], [OUT], #4
> +.elseif numpix == 1
> +    st1       {v1.h}[0], [OUT], #2
> +.else
> +    .error bilinear_store_0565 numpix is unsupported
> +.endif
> +.endm
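For readers new to the 0565 paths: after the de-interleave, convert_8888_to_0565 packs each pixel by keeping the top bits of every channel and dropping alpha. A scalar sketch of that per-pixel packing (illustrative helper name; the NEON macro of course handles eight pixels at a time):

    #include <stdint.h>

    /* Scalar model of the a8r8g8b8 -> r5g6b5 packing behind
     * bilinear_store_0565: keep the top bits of r/g/b, drop alpha. */
    static inline uint16_t
    pack_8888_to_0565 (uint32_t p)
    {
        return (uint16_t) (((p >> 8) & 0xf800) |   /* r: bits 23..19 -> 15..11 */
                           ((p >> 5) & 0x07e0) |   /* g: bits 15..10 -> 10..5  */
                           ((p >> 3) & 0x001f));   /* b: bits  7..3  ->  4..0  */
    }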
> +
> +.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
> +    bilinear_load_&src_fmt v0, v1, v2
> +    umull     v2.8h, v0.8b, v28.8b
> +    umlal     v2.8h, v1.8b, v29.8b
> +    /* 5 cycles bubble */
> +    mov       v3.d[0], v2.d[1]
> +    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v0.4s, v2.4h, v30.4h
> +    umlal     v0.4s, v3.4h, v30.4h
> +    /* 5 cycles bubble */
> +    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    /* 3 cycles bubble */
> +    xtn       v0.8b, v0.8h
> +    /* 1 cycle bubble */
> +    bilinear_store_&dst_fmt 1, v3, v4
> +.endm
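The arithmetic above is easier to follow against a scalar model: the umull/umlal pair blends the two scanlines with the vertical weights held in v28/v29, and the ushll/umlsl/umlal sequence then blends the left and right pixels with the horizontal fraction from v30 before narrowing. Per channel it amounts to the sketch below (BILINEAR_INTERPOLATION_BITS is normally 7 in pixman, and wt + wb is assumed to equal 1 << BILINEAR_INTERPOLATION_BITS; names are illustrative):

    #include <stdint.h>

    #define BILINEAR_INTERPOLATION_BITS 7
    #define BASE (1 << BILINEAR_INTERPOLATION_BITS)

    /* tl/tr come from the TOP scanline, bl/br from BOTTOM; 'dist' is the
     * horizontal fraction taken from the top bits of the 16.16 x coordinate:
     * dist = (x & 0xffff) >> (16 - BILINEAR_INTERPOLATION_BITS). */
    static inline uint8_t
    bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                      int wt, int wb, int dist)
    {
        /* vertical pass: umull/umlal with v28 (wt) and v29 (wb) */
        uint32_t left  = tl * wt + bl * wb;
        uint32_t right = tr * wt + br * wb;
        /* horizontal pass: left * (BASE - dist) + right * dist, then the
         * final shrn by 2 * BILINEAR_INTERPOLATION_BITS */
        return (uint8_t) ((left * (BASE - dist) + right * dist)
                          >> (2 * BILINEAR_INTERPOLATION_BITS));
    }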
> +
> +.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
> +    bilinear_load_and_vertical_interpolate_two_&src_fmt \
> +                v1, v11, v2, v3, v20, v21, v22, v23
> +    mov       v2.d[0], v1.d[0]
> +    mov       v3.d[0], v1.d[1]
> +    mov       v22.d[0], v11.d[0]
> +    mov       v23.d[0], v11.d[1]
> +    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v0.4s, v2.4h, v30.4h
> +    umlal     v0.4s, v3.4h, v30.4h
> +    ushll     v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v10.4s, v22.4h, v31.4h
> +    umlal     v10.4s, v23.4h, v31.4h
> +    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    shrn      v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    mov       v0.d[1], v1.d[0]
> +    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v31.d[0], v30.d[1]
> +    add       v12.8h, v12.8h, v13.8h
> +    xtn       v0.8b, v0.8h
> +    bilinear_store_&dst_fmt 2, v3, v4
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +    bilinear_load_and_vertical_interpolate_four_&src_fmt \
> +                v1, v11, v14, v15, v16, v17, v22, v23 \
> +                v3, v9,  v24, v25, v26, v27, v18, v19
> +    prfm      pldl2strm, [TMP1, PF_OFFS]
> +    sub       TMP1, TMP1, STRIDE
> +    mov       v6.d[0],  v3.d[0]
> +    mov       v7.d[0],  v3.d[1]
> +    mov       v18.d[0], v9.d[0]
> +    mov       v19.d[0], v9.d[1]
> +    mov       v2.d[0],  v1.d[0]
> +    mov       v3.d[0],  v1.d[1]
> +    mov       v22.d[0], v11.d[0]
> +    mov       v23.d[0], v11.d[1]
> +    ushll     v0.4s, v2.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v0.4s, v2.4h, v30.4h
> +    umlal     v0.4s, v3.4h, v30.4h
> +    ushll     v10.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v10.4s, v22.4h, v31.4h
> +    umlal     v10.4s, v23.4h, v31.4h
> +    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v31.d[0], v30.d[1]
> +    ushll     v2.4s, v6.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v2.4s, v6.4h, v30.4h
> +    umlal     v2.4s, v7.4h, v30.4h
> +    ushll     v8.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
> +    prfm      pldl2strm, [TMP2, PF_OFFS]
> +    umlsl     v8.4s, v18.4h, v31.4h
> +    umlal     v8.4s, v19.4h, v31.4h
> +    add       v12.8h, v12.8h, v13.8h
> +    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    shrn      v1.4h, v10.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    mov       v0.d[1], v1.d[0]
> +    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    shrn      v5.4h, v8.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    mov       v2.d[1], v5.d[0]
> +    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v31.d[0], v30.d[1]
> +    xtn       v0.8b, v0.8h
> +    xtn       v1.8b, v2.8h
> +    add       v12.8h, v12.8h, v13.8h
> +    bilinear_store_&dst_fmt 4, v3, v4
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
> +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
> +.else
> +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
> +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
> +    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
> +.else
> +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
> +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
> +.else
> +    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
> +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
> +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
> +.else
> +    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
> +.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
> +    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
> +.else
> +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +.endif
> +.endm
> +
> +.set BILINEAR_FLAG_UNROLL_4,          0
> +.set BILINEAR_FLAG_UNROLL_8,          1
> +.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
> +
> +/*
> + * Main template macro for generating NEON optimized bilinear scanline
> + * functions.
> + *
> + * Bilinear scanline scaler macro template uses the following arguments:
> + *  fname             - name of the function to generate
> + *  src_fmt           - source color format (8888 or 0565)
> + *  dst_fmt           - destination color format (8888 or 0565)
> + *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes
> + *  prefetch_distance - prefetch in the source image by that many
> + *                      pixels ahead
> + */
> +
> +.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
> +                                       src_bpp_shift, dst_bpp_shift, \
> +                                       prefetch_distance, flags
> +
> +pixman_asm_function fname
> +    OUT       .req      x0
> +    TOP       .req      x1
> +    BOTTOM    .req      x2
> +    WT        .req      x3
> +    WB        .req      x4
> +    X         .req      x5
> +    UX        .req      x6
> +    WIDTH     .req      x7
> +    TMP1      .req      x8
> +    TMP2      .req      x9
> +    PF_OFFS   .req      x10
> +    TMP3      .req      x11
> +    TMP4      .req      x12
> +    STRIDE    .req      x13
> +
> +    sxtw      x3, w3
> +    sxtw      x4, w4
> +    sxtw      x5, w5
> +    sxtw      x6, w6
> +    sxtw      x7, w7
> +
> +    stp       x29, x30, [sp, -16]!
> +    mov       x29, sp
> +    sub       sp,  sp, 112  /* push all registers */
> +    sub       x29, x29, 64
> +    st1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
> +    st1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
> +    stp        x8,  x9, [x29, -80]
> +    stp       x10, x11, [x29, -96]
> +    stp       x12, x13, [x29, -112]
> +
> +    mov       PF_OFFS, #prefetch_distance
> +    mul       PF_OFFS, PF_OFFS, UX
> +
> +    subs      STRIDE, BOTTOM, TOP
> +    .unreq    BOTTOM
> +
> +    cmp       WIDTH, #0
> +    ble       300f
> +
> +    dup       v12.8h, w5
> +    dup       v13.8h, w6
> +    dup       v28.8b, w3
> +    dup       v29.8b, w4
> +    mov       v25.d[0], v12.d[1]
> +    mov       v26.d[0], v13.d[0]
> +    add       v25.4h, v25.4h, v26.4h
> +    mov       v12.d[1], v25.d[0]
> +
> +    /* ensure good destination alignment  */
> +    cmp       WIDTH, #1
> +    blt       100f
> +    tst       OUT, #(1 << dst_bpp_shift)
> +    beq       100f
> +    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v31.d[0], v30.d[1]
> +    add       v12.8h, v12.8h, v13.8h
> +    bilinear_interpolate_last_pixel src_fmt, dst_fmt
> +    sub       WIDTH, WIDTH, #1
> +100:
> +    add       v13.8h, v13.8h, v13.8h
> +    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v31.d[0], v30.d[1]
> +    add       v12.8h, v12.8h, v13.8h
> +
> +    cmp       WIDTH, #2
> +    blt       100f
> +    tst       OUT, #(1 << (dst_bpp_shift + 1))
> +    beq       100f
> +    bilinear_interpolate_two_pixels src_fmt, dst_fmt
> +    sub       WIDTH, WIDTH, #2
> +100:
> +.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
> +/*********** 8 pixels per iteration *****************/
> +    cmp       WIDTH, #4
> +    blt       100f
> +    tst       OUT, #(1 << (dst_bpp_shift + 2))
> +    beq       100f
> +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +    sub       WIDTH, WIDTH, #4
> +100:
> +    subs      WIDTH, WIDTH, #8
> +    blt       100f
> +    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
> +    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
> +    subs      WIDTH, WIDTH, #8
> +    blt       500f
> +1000:
> +    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
> +    subs      WIDTH, WIDTH, #8
> +    bge       1000b
> +500:
> +    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
> +100:
> +    tst       WIDTH, #4
> +    beq       200f
> +    bilinear_interpolate_four_pixels src_fmt, dst_fmt
> +200:
> +.else
> +/*********** 4 pixels per iteration *****************/
> +    subs      WIDTH, WIDTH, #4
> +    blt       100f
> +    asr       PF_OFFS, PF_OFFS, #(16 - src_bpp_shift)
> +    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
> +    subs      WIDTH, WIDTH, #4
> +    blt       500f
> +1000:
> +    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
> +    subs      WIDTH, WIDTH, #4
> +    bge       1000b
> +500:
> +    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
> +100:
> +/****************************************************/
> +.endif
> +    /* handle the remaining trailing pixels */
> +    tst       WIDTH, #2
> +    beq       200f
> +    bilinear_interpolate_two_pixels src_fmt, dst_fmt
> +200:
> +    tst       WIDTH, #1
> +    beq       300f
> +    bilinear_interpolate_last_pixel src_fmt, dst_fmt
> +300:
> +    sub       x29, x29, 64
> +    ld1       {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
> +    ld1       {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
> +    ldp        x8,  x9, [x29, -80]
> +    ldp       x10, x11, [x29, -96]
> +    ldp       x12, x13, [x29, -112]
> +    mov       sp, x29
> +    ldp       x29, x30, [sp], 16
> +    ret
> +
> +    .unreq    OUT
> +    .unreq    TOP
> +    .unreq    WT
> +    .unreq    WB
> +    .unreq    X
> +    .unreq    UX
> +    .unreq    WIDTH
> +    .unreq    TMP1
> +    .unreq    TMP2
> +    .unreq    PF_OFFS
> +    .unreq    TMP3
> +    .unreq    TMP4
> +    .unreq    STRIDE
> +.endfunc
> +
> +.endm
> +
> +/*****************************************************************************/
> +
> +.set have_bilinear_interpolate_four_pixels_8888_8888, 1
> +
> +.macro bilinear_interpolate_four_pixels_8888_8888_head
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP4, TMP1, #2
> +    add       TMP1, TOP, TMP4
> +    asr       TMP2, X, #16
> +    add       X, X, UX
> +    lsl       TMP4, TMP2, #2
> +    add       TMP2, TOP, TMP4
> +
> +    ld1       {v22.2s}, [TMP1], STRIDE
> +    ld1       {v23.2s}, [TMP1]
> +    asr       TMP3, X, #16
> +    add       X, X, UX
> +    lsl       TMP4, TMP3, #2
> +    add       TMP3, TOP, TMP4
> +    umull     v8.8h, v22.8b, v28.8b
> +    umlal     v8.8h, v23.8b, v29.8b
> +    mov       v16.d[0], v8.d[0]
> +    mov       v17.d[0], v8.d[1]
> +
> +    ld1       {v22.2s}, [TMP2], STRIDE
> +    ld1       {v23.2s}, [TMP2]
> +    asr       TMP4, X, #16
> +    add       X, X, UX
> +    lsl       TMP1, TMP4, #2
> +    add       TMP4, TOP, TMP1
> +    umull     v9.8h, v22.8b, v28.8b
> +    umlal     v9.8h, v23.8b, v29.8b
> +    mov       v18.d[0], v9.d[0]
> +    mov       v19.d[0], v9.d[1]
> +
> +    ld1       {v22.2s}, [TMP3], STRIDE
> +    ld1       {v23.2s}, [TMP3]
> +    umull     v10.8h, v22.8b, v28.8b
> +    umlal     v10.8h, v23.8b, v29.8b
> +    mov       v20.d[0], v10.d[0]
> +    mov       v21.d[0], v10.d[1]
> +
> +    ushll     v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v0.4s, v16.4h, v30.4h
> +    umlal     v0.4s, v17.4h, v30.4h
> +
> +    prfm      pldl2strm, [TMP4, PF_OFFS]
> +    ld1       {v16.2s}, [TMP4], STRIDE
> +    ld1       {v17.2s}, [TMP4]
> +    prfm      pldl2strm, [TMP4, PF_OFFS]
> +    umull     v11.8h, v16.8b, v28.8b
> +    umlal     v11.8h, v17.8b, v29.8b
> +    mov       v22.d[0], v11.d[0]
> +    mov       v23.d[0], v11.d[1]
> +
> +    ushll     v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v1.4s, v18.4h, v31.4h
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_8888_8888_tail
> +    umlal     v1.4s, v19.4h, v31.4h
> +    ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v30.d[0], v15.d[0]
> +    mov       v31.d[0], v15.d[1]
> +    ushll     v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v2.4s, v20.4h, v30.4h
> +    umlal     v2.4s, v21.4h, v30.4h
> +    ushll     v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v3.4s, v22.4h, v31.4h
> +    umlal     v3.4s, v23.4h, v31.4h
> +    add       v12.8h, v12.8h, v13.8h
> +    shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    shrn      v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    mov       v0.d[1], v1.d[0]
> +    shrn      v2.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    ushr      v30.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +    mov       v31.d[0], v30.d[1]
> +    shrn      v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    mov       v2.d[1], v5.d[0]
> +    xtn       v6.8b, v0.8h
> +    xtn       v7.8b, v2.8h
> +    add       v12.8h, v12.8h, v13.8h
> +    st1       {v6.2s, v7.2s}, [OUT], #16
> +.endm
> +
> +.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
> +    asr       TMP1, X, #16
> +    add       X, X, UX
> +    lsl       TMP2, TMP1, #2
> +    add       TMP1, TOP, TMP2
> +    asr       TMP2, X, #16
> +    add       X, X, UX
> +    lsl       TMP3, TMP2, #2
> +    add       TMP2, TOP, TMP3
> +        umlal     v1.4s, v19.4h, v31.4h
> +        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +        mov       v30.d[0], v15.d[0]
> +        mov       v31.d[0], v15.d[1]
> +        ushll     v2.4s, v20.4h, #BILINEAR_INTERPOLATION_BITS
> +        umlsl     v2.4s, v20.4h, v30.4h
> +        umlal     v2.4s, v21.4h, v30.4h
> +        ushll     v3.4s, v22.4h, #BILINEAR_INTERPOLATION_BITS
> +    ld1       {v20.2s}, [TMP1], STRIDE
> +        umlsl     v3.4s, v22.4h, v31.4h
> +        umlal     v3.4s, v23.4h, v31.4h
> +    ld1       {v21.2s}, [TMP1]
> +    umull     v8.8h, v20.8b, v28.8b
> +    umlal     v8.8h, v21.8b, v29.8b
> +    mov       v16.d[0], v8.d[0]
> +    mov       v17.d[0], v8.d[1]
> +        shrn      v0.4h, v0.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +        shrn      v1.4h, v1.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +        mov       v0.d[1], v1.d[0]
> +        shrn      v4.4h, v2.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +    ld1       {v22.2s}, [TMP2], STRIDE
> +        shrn      v5.4h, v3.4s, #(2 * BILINEAR_INTERPOLATION_BITS)
> +        mov       v2.d[0], v4.d[0]
> +        mov       v2.d[1], v5.d[0]
> +        add       v12.8h, v12.8h, v13.8h
> +    ld1       {v23.2s}, [TMP2]
> +    umull     v9.8h, v22.8b, v28.8b
> +    asr       TMP3, X, #16
> +    add       X, X, UX
> +    lsl       TMP4, TMP3, #2
> +    add       TMP3, TOP, TMP4
> +    asr       TMP4, X, #16
> +    add       X, X, UX
> +    lsl       TMP1, TMP4, #2
> +    add       TMP4, TOP, TMP1
> +    umlal     v9.8h, v23.8b, v29.8b
> +    mov       v18.d[0], v9.d[0]
> +    mov       v19.d[0], v9.d[1]
> +    ld1       {v22.2s}, [TMP3], STRIDE
> +        ushr      v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
> +        mov       v30.d[0], v15.d[0]
> +        mov       v31.d[0], v15.d[1]
> +    ld1       {v23.2s}, [TMP3]
> +    umull     v10.8h, v22.8b, v28.8b
> +    umlal     v10.8h, v23.8b, v29.8b
> +    mov       v20.d[0], v10.d[0]
> +    mov       v21.d[0], v10.d[1]
> +        xtn       v6.8b, v0.8h
> +    ushll     v0.4s, v16.4h, #BILINEAR_INTERPOLATION_BITS
> +        xtn       v7.8b, v2.8h
> +    umlsl     v0.4s, v16.4h, v30.4h
> +    umlal     v0.4s, v17.4h, v30.4h
> +    prfm      pldl2strm, [TMP4, PF_OFFS]
> +    ld1       {v16.2s}, [TMP4], STRIDE
> +        add       v12.8h, v12.8h, v13.8h
> +    ld1       {v17.2s}, [TMP4]
> +    prfm      pldl2strm, [TMP4, PF_OFFS]
> +    umull     v11.8h, v16.8b, v28.8b
> +    umlal     v11.8h, v17.8b, v29.8b
> +    mov       v22.d[0], v11.d[0]
> +    mov       v23.d[0], v11.d[1]
> +        st1       {v6.2s, v7.2s}, [OUT], #16
> +    ushll     v1.4s, v18.4h, #BILINEAR_INTERPOLATION_BITS
> +    umlsl     v1.4s, v18.4h, v31.4h
> +.endm
> +
> +/*****************************************************************************/
> +
> +generate_bilinear_scanline_func \
> +    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
> +    2, 2, 28, BILINEAR_FLAG_UNROLL_4
> +
> +generate_bilinear_scanline_func \
> +    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
> +    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
> +
> +generate_bilinear_scanline_func \
> +    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
> +    1, 2, 28, BILINEAR_FLAG_UNROLL_4
> +
> +generate_bilinear_scanline_func \
> +    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
> +    1, 1, 28, BILINEAR_FLAG_UNROLL_4
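If/when the scaling paths get hooked up on the C side, the four functions generated above would presumably be bound through pixman-arm-common.h like their aarch32 counterparts. The argument order follows the register aliases used in the template (OUT, TOP, BOTTOM, WT, WB, X, UX, WIDTH in x0..x7), so the declarations should look roughly like this (sketch only, not part of this patch):

    #include <stdint.h>

    typedef int32_t pixman_fixed_t;   /* 16.16 fixed point, as in pixman.h */

    /* Sketch of the expected C-side prototypes; the real declarations would
     * come from PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST in pixman-arm-common.h. */
    void pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (
        uint32_t *dst, const uint32_t *top, const uint32_t *bottom,
        int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux, int width);

    void pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon (
        uint16_t *dst, const uint16_t *top, const uint16_t *bottom,
        int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux, int width);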
> diff --git a/pixman/pixman-arma64-neon-asm.h b/pixman/pixman-arma64-neon-asm.h
> new file mode 100644
> index 0000000..97cde5d
> --- /dev/null
> +++ b/pixman/pixman-arma64-neon-asm.h
> @@ -0,0 +1,1288 @@
> +/*
> + * Copyright © 2009 Nokia Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + *
> + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
> + */
> +
> +/*
> + * This file contains a macro ('generate_composite_function') which can
> + * construct 2D image processing functions, based on a common template.
> + * Any combinations of source, destination and mask images with 8bpp,
> + * 16bpp, 24bpp, 32bpp color formats are supported.
> + *
> + * This macro takes care of:
> + *  - handling of leading and trailing unaligned pixels
> + *  - doing most of the work related to L2 cache preload
> + *  - encouraging the use of software pipelining for better instruction
> + *    scheduling
> + *
> + * The user of this macro has to provide some configuration parameters
> + * (bit depths for the images, prefetch distance, etc.) and a set of
> + * macros, which should implement the basic code chunks responsible for
> + * pixel processing. See the 'pixman-arma64-neon-asm.S' file for usage
> + * examples.
> + *
> + * TODO:
> + *  - try overlapped pixel method (from Ian Rickards) when processing
> + *    exactly two blocks of pixels
> + *  - maybe add an option to do reverse scanline processing
> + */
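The generated composite functions take their arguments in the order of the register aliases assigned further down (W, H, DST_W, DST_STRIDE, SRC, SRC_STRIDE, MASK, MASK_STRIDE in x0..x7), with strides counted in pixels. For a plain source-to-destination fast path the C-side binding would look roughly like this (illustrative sketch; the real prototypes come from pixman-arm-common.h):

    #include <stdint.h>

    /* Illustrative prototype only: argument order mirrors the register
     * aliases used by generate_composite_function below. */
    void pixman_composite_src_8888_8888_asm_neon (
        int32_t   width,        /* W           (x0) */
        int32_t   height,       /* H           (x1) */
        uint32_t *dst,          /* DST_W       (x2) */
        int32_t   dst_stride,   /* DST_STRIDE  (x3), in pixels */
        uint32_t *src,          /* SRC         (x4) */
        int32_t   src_stride);  /* SRC_STRIDE  (x5), in pixels */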
> +
> +/*
> + * Bit flags for 'generate_composite_function' macro which are used
> + * to tune generated functions behavior.
> + */
> +.set FLAG_DST_WRITEONLY,       0
> +.set FLAG_DST_READWRITE,       1
> +.set FLAG_DEINTERLEAVE_32BPP,  2
> +
> +/*
> + * Constants for selecting preferable prefetch type.
> + */
> +.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
> +.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
> +.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
> +
> +/*
> + * Definitions of supplementary pixld/pixst macros (for partial load/store of
> + * pixel data).
> + */
> +
> +.macro pixldst1 op, elem_size, reg1, mem_operand, abits
> +    op {v&reg1&.&elem_size}, [&mem_operand&], #8
> +.endm
> +
> +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
> +    op {v&reg1&.&elem_size, v&reg2&.&elem_size}, [&mem_operand&], #16
> +.endm
> +
> +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
> +    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size, v&reg4&.&elem_size}, [&mem_operand&], #32
> +.endm
> +
> +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
> +    op {v&reg1&.&elem_size}[idx], [&mem_operand&], #&bytes&
> +.endm
> +
> +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
> +    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}, [&mem_operand&], #24
> +.endm
> +
> +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
> +    op {v&reg1&.&elem_size, v&reg2&.&elem_size, v&reg3&.&elem_size}[idx], [&mem_operand&], #3
> +.endm
> +
> +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
> +.if numbytes == 32
> +    .if elem_size==32
> +        pixldst4 op, 2s, %(basereg+4), %(basereg+5), \
> +                              %(basereg+6), %(basereg+7), mem_operand, abits
> +    .elseif elem_size==16
> +        pixldst4 op, 4h, %(basereg+4), %(basereg+5), \
> +                              %(basereg+6), %(basereg+7), mem_operand, abits
> +    .else
> +        pixldst4 op, 8b, %(basereg+4), %(basereg+5), \
> +                              %(basereg+6), %(basereg+7), mem_operand, abits
> +    .endif
> +.elseif numbytes == 16
> +    .if elem_size==32
> +          pixldst2 op, 2s, %(basereg+2), %(basereg+3), mem_operand, abits
> +    .elseif elem_size==16
> +          pixldst2 op, 4h, %(basereg+2), %(basereg+3), mem_operand, abits
> +    .else
> +          pixldst2 op, 8b, %(basereg+2), %(basereg+3), mem_operand, abits
> +    .endif
> +.elseif numbytes == 8
> +    .if elem_size==32
> +        pixldst1 op, 2s, %(basereg+1), mem_operand, abits
> +    .elseif elem_size==16
> +        pixldst1 op, 4h, %(basereg+1), mem_operand, abits
> +    .else
> +        pixldst1 op, 8b, %(basereg+1), mem_operand, abits
> +    .endif
> +.elseif numbytes == 4
> +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
> +        pixldst0 op, s, %(basereg+0), 1, mem_operand, abits, 4
> +    .elseif elem_size == 16
> +        pixldst0 op, h, %(basereg+0), 2, mem_operand, abits, 2
> +        pixldst0 op, h, %(basereg+0), 3, mem_operand, abits, 2
> +    .else
> +        pixldst0 op, b, %(basereg+0), 4, mem_operand, abits, 1
> +        pixldst0 op, b, %(basereg+0), 5, mem_operand, abits, 1
> +        pixldst0 op, b, %(basereg+0), 6, mem_operand, abits, 1
> +        pixldst0 op, b, %(basereg+0), 7, mem_operand, abits, 1
> +    .endif
> +.elseif numbytes == 2
> +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
> +        pixldst0 op, h, %(basereg+0), 1, mem_operand, abits, 2
> +    .else
> +        pixldst0 op, b, %(basereg+0), 2, mem_operand, abits, 1
> +        pixldst0 op, b, %(basereg+0), 3, mem_operand, abits, 1
> +    .endif
> +.elseif numbytes == 1
> +        pixldst0 op, b, %(basereg+0), 1, mem_operand, abits, 1
> +.else
> +    .error "unsupported size: numbytes"
> +.endif
> +.endm
> +
> +.macro pixld numpix, bpp, basereg, mem_operand, abits=0
> +.if bpp > 0
> +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> +    pixldst4 ld4, 8b, %(basereg+4), %(basereg+5), \
> +                      %(basereg+6), %(basereg+7), mem_operand, abits
> +.elseif (bpp == 24) && (numpix == 8)
> +    pixldst3 ld3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
> +.elseif (bpp == 24) && (numpix == 4)
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
> +.elseif (bpp == 24) && (numpix == 2)
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
> +.elseif (bpp == 24) && (numpix == 1)
> +    pixldst30 ld3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
> +.else
> +    pixldst %(numpix * bpp / 8), ld1, %(bpp), basereg, mem_operand, abits
> +.endif
> +.endif
> +.endm
> +
> +.macro pixst numpix, bpp, basereg, mem_operand, abits=0
> +.if bpp > 0
> +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> +    pixldst4 st4, 8b, %(basereg+4), %(basereg+5), \
> +                      %(basereg+6), %(basereg+7), mem_operand, abits
> +.elseif (bpp == 24) && (numpix == 8)
> +    pixldst3 st3, 8b, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
> +.elseif (bpp == 24) && (numpix == 4)
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
> +.elseif (bpp == 24) && (numpix == 2)
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
> +.elseif (bpp == 24) && (numpix == 1)
> +    pixldst30 st3, b, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
> +.else
> +    pixldst %(numpix * bpp / 8), st1, %(bpp), basereg, mem_operand, abits
> +.endif
> +.endif
> +.endm
> +
> +.macro pixld_a numpix, bpp, basereg, mem_operand
> +.if (bpp * numpix) <= 128
> +    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
> +.else
> +    pixld numpix, bpp, basereg, mem_operand, 128
> +.endif
> +.endm
> +
> +.macro pixst_a numpix, bpp, basereg, mem_operand
> +.if (bpp * numpix) <= 128
> +    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
> +.else
> +    pixst numpix, bpp, basereg, mem_operand, 128
> +.endif
> +.endm
> +
> +/*
> + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
> + * aliases to be defined)
> + */
> +.macro pixld1_s elem_size, reg1, mem_operand
> +.if elem_size == 16
> +    asr     TMP1, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP1, #1
> +    add     TMP1, mem_operand, DUMMY
> +    asr     TMP2, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP2, #1
> +    add     TMP2, mem_operand, DUMMY
> +    ld1     {v&reg1&.h}[0], [TMP1]
> +    asr     TMP1, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP1, #1
> +    add     TMP1, mem_operand, DUMMY
> +    ld1     {v&reg1&.h}[1], [TMP2]
> +    asr     TMP2, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP2, #1
> +    add     TMP2, mem_operand, DUMMY
> +    ld1     {v&reg1&.h}[2], [TMP1]
> +    ld1     {v&reg1&.h}[3], [TMP2]
> +.elseif elem_size == 32
> +    asr     TMP1, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP1, #2
> +    add     TMP1, mem_operand, DUMMY
> +    asr     TMP2, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP2, #2
> +    add     TMP2, mem_operand, DUMMY
> +    ld1     {v&reg1&.s}[0], [TMP1]
> +    ld1     {v&reg1&.s}[1], [TMP2]
> +.else
> +    .error "unsupported"
> +.endif
> +.endm
> +
> +.macro pixld2_s elem_size, reg1, reg2, mem_operand
> +.if 0 /* elem_size == 32 */
> +    mov     TMP1, VX, asr #16
> +    add     VX, VX, UNIT_X, asl #1
> +    add     TMP1, mem_operand, TMP1, asl #2
> +    mov     TMP2, VX, asr #16
> +    sub     VX, VX, UNIT_X
> +    add     TMP2, mem_operand, TMP2, asl #2
> +    ld1     {v&reg1&.s}[0], [TMP1]
> +    mov     TMP1, VX, asr #16
> +    add     VX, VX, UNIT_X, asl #1
> +    add     TMP1, mem_operand, TMP1, asl #2
> +    ld1     {v&reg2&.s}[0], [TMP2, :32]
> +    mov     TMP2, VX, asr #16
> +    add     VX, VX, UNIT_X
> +    add     TMP2, mem_operand, TMP2, asl #2
> +    ld1     {v&reg1&.s}[1], [TMP1]
> +    ld1     {v&reg2&.s}[1], [TMP2]
> +.else
> +    pixld1_s elem_size, reg1, mem_operand
> +    pixld1_s elem_size, reg2, mem_operand
> +.endif
> +.endm
> +
> +.macro pixld0_s elem_size, reg1, idx, mem_operand
> +.if elem_size == 16
> +    asr     TMP1, VX, #16
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP1, #1
> +    add     TMP1, mem_operand, DUMMY
> +    ld1     {v&reg1&.h}[idx], [TMP1]
> +.elseif elem_size == 32
> +    asr     DUMMY, VX, #16
> +    mov     TMP1, DUMMY
> +    adds    VX, VX, UNIT_X
> +    bmi     55f
> +5:  subs    VX, VX, SRC_WIDTH_FIXED
> +    bpl     5b
> +55:
> +    lsl     DUMMY, TMP1, #2
> +    add     TMP1, mem_operand, DUMMY
> +    ld1     {v&reg1&.s}[idx], [TMP1]
> +.endif
> +.endm
> +
> +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
> +.if numbytes == 32
> +    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
> +    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
> +    pixdeinterleave elem_size, %(basereg+4)
> +.elseif numbytes == 16
> +    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
> +.elseif numbytes == 8
> +    pixld1_s elem_size, %(basereg+1), mem_operand
> +.elseif numbytes == 4
> +    .if elem_size == 32
> +        pixld0_s elem_size, %(basereg+0), 1, mem_operand
> +    .elseif elem_size == 16
> +        pixld0_s elem_size, %(basereg+0), 2, mem_operand
> +        pixld0_s elem_size, %(basereg+0), 3, mem_operand
> +    .else
> +        pixld0_s elem_size, %(basereg+0), 4, mem_operand
> +        pixld0_s elem_size, %(basereg+0), 5, mem_operand
> +        pixld0_s elem_size, %(basereg+0), 6, mem_operand
> +        pixld0_s elem_size, %(basereg+0), 7, mem_operand
> +    .endif
> +.elseif numbytes == 2
> +    .if elem_size == 16
> +        pixld0_s elem_size, %(basereg+0), 1, mem_operand
> +    .else
> +        pixld0_s elem_size, %(basereg+0), 2, mem_operand
> +        pixld0_s elem_size, %(basereg+0), 3, mem_operand
> +    .endif
> +.elseif numbytes == 1
> +    pixld0_s elem_size, %(basereg+0), 1, mem_operand
> +.else
> +    .error "unsupported size: numbytes"
> +.endif
> +.endm
> +
> +.macro pixld_s numpix, bpp, basereg, mem_operand
> +.if bpp > 0
> +    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
> +.endif
> +.endm
> +
> +.macro vuzp8 reg1, reg2
> +    umov DUMMY, v16.d[0]
> +    uzp1 v16.8b,     v&reg1&.8b, v&reg2&.8b
> +    uzp2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
> +    mov  v&reg1&.8b, v16.8b
> +    mov  v16.d[0], DUMMY
> +.endm
> +
> +.macro vzip8 reg1, reg2
> +    umov DUMMY, v16.d[0]
> +    zip1 v16.8b,     v&reg1&.8b, v&reg2&.8b
> +    zip2 v&reg2&.8b, v&reg1&.8b, v&reg2&.8b
> +    mov  v&reg1&.8b, v16.8b
> +    mov  v16.d[0], DUMMY
> +.endm
> +
> +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
> +.macro pixdeinterleave bpp, basereg
> +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> +    vuzp8 %(basereg+0), %(basereg+1)
> +    vuzp8 %(basereg+2), %(basereg+3)
> +    vuzp8 %(basereg+1), %(basereg+3)
> +    vuzp8 %(basereg+0), %(basereg+2)
> +.endif
> +.endm
> +
> +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
> +.macro pixinterleave bpp, basereg
> +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
> +    vzip8 %(basereg+0), %(basereg+2)
> +    vzip8 %(basereg+1), %(basereg+3)
> +    vzip8 %(basereg+2), %(basereg+3)
> +    vzip8 %(basereg+0), %(basereg+1)
> +.endif
> +.endm
> +
> +/*
> + * This is a macro for implementing cache preload. The main idea is that
> + * cache preload logic is mostly independent of the rest of the pixel
> + * processing code. It starts at the top left pixel and moves forward
> + * across pixels and can jump across scanlines. Prefetch distance is
> + * handled in an 'incremental' way: it starts from 0 and advances to the
> + * optimal distance over time. After reaching optimal prefetch distance,
> + * it is kept constant. There are some checks which prevent prefetching
> + * unneeded pixel lines below the image (but it still can prefetch a bit
> + * more data on the right side of the image - not a big issue and may
> + * be actually helpful when rendering text glyphs). An additional trick is
> + * the use of a plain load (LDRSB here) for prefetch instead of a prefetch
> + * instruction when moving to the next line: we have a high chance of
> + * getting a TLB miss in this case, and a normal prefetch would be useless.
> + *
> + * This sounds like it may introduce a noticeable overhead (when working with
> + * fully cached data). But in reality, due to having a separate pipeline and
> + * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
> + * execute simultaneously with NEON and be completely shadowed by it. Thus
> + * we get no performance overhead at all (*). This looks like a very nice
> + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
> + * but still can implement some rather advanced prefetch logic in software
> + * for almost zero cost!
> + *
> + * (*) The overhead of the prefetcher is visible when running some trivial
> + * pixels processing like simple copy. Anyway, having prefetch is a must
> + * when working with the graphics data.
> + */
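A rough C model of the bookkeeping described above may help when reading the cache_preload macro below. It is simplified to a single (source) image, the field names are made up for the sketch, and __builtin_prefetch stands in for prfm:

    #include <stdint.h>
    #include <stddef.h>

    /* pf_x walks ahead of the pixels being processed; the low 4 bits of
     * pf_ctl let the prefetch distance grow a little on every block until
     * they hit zero, and the high bits count the scanlines left. */
    typedef struct
    {
        const uint8_t *pf_src;      /* scanline currently being prefetched */
        ptrdiff_t      src_stride;  /* in pixels                           */
        int            pf_x;        /* prefetch position within the line   */
        int            pf_ctl;      /* (lines << 4) | distance increments  */
        int            orig_w;      /* scanline width in pixels            */
    } prefetch_state_t;

    static void
    cache_preload_model (prefetch_state_t *s, int std_inc, int boost_inc,
                         int bpp_shift)
    {
        s->pf_x += std_inc;
        if (s->pf_ctl & 0xf)             /* distance still growing */
        {
            s->pf_x += boost_inc;
            s->pf_ctl -= 1;
        }

        /* prefetch at the current look-ahead position */
        __builtin_prefetch (s->pf_src + ((ptrdiff_t) s->pf_x << bpp_shift));

        if (s->pf_x > s->orig_w)         /* ran past the end of the line */
        {
            s->pf_x -= s->orig_w;
            s->pf_ctl -= 0x10;           /* one line fewer to prefetch */
            if (s->pf_ctl > 0)
            {
                /* touch the next scanline with a real byte load, since a
                 * plain prefetch would likely be dropped on a TLB miss */
                volatile uint8_t sink = s->pf_src[s->src_stride << bpp_shift];
                (void) sink;
                s->pf_src += 1;          /* the asm also nudges PF_SRC by one */
            }
        }
    }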
> +.macro PF a, x:vararg
> +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
> +    a x
> +.endif
> +.endm
> +
> +.macro cache_preload std_increment, boost_increment
> +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
> +.if std_increment != 0
> +    PF add PF_X, PF_X, #std_increment
> +.endif
> +    PF tst PF_CTL, #0xF
> +    PF beq 71f
> +    PF add PF_X, PF_X, #boost_increment
> +    PF sub PF_CTL, PF_CTL, #1
> +71:
> +    PF cmp PF_X, ORIG_W
> +.if src_bpp_shift >= 0
> +    PF lsl DUMMY, PF_X, #src_bpp_shift
> +    PF prfm pldl2strm, [PF_SRC, DUMMY]
> +.endif
> +.if dst_r_bpp != 0
> +    PF lsl DUMMY, PF_X, #dst_bpp_shift
> +    PF prfm pldl2strm, [PF_DST, DUMMY]
> +.endif
> +.if mask_bpp_shift >= 0
> +    PF lsl DUMMY, PF_X, #mask_bpp_shift
> +    PF prfm pldl2strm, [PF_MASK, DUMMY]
> +.endif
> +    PF ble 71f
> +    PF sub PF_X, PF_X, ORIG_W
> +    PF subs PF_CTL, PF_CTL, #0x10
> +71:
> +    PF ble 72f
> +.if src_bpp_shift >= 0
> +    PF lsl DUMMY, SRC_STRIDE, #src_bpp_shift
> +    PF ldrsb DUMMY, [PF_SRC, DUMMY]
> +    PF add PF_SRC, PF_SRC, #1
> +.endif
> +.if dst_r_bpp != 0
> +    PF lsl DUMMY, DST_STRIDE, #dst_bpp_shift
> +    PF ldrsb DUMMY, [PF_DST, DUMMY]
> +    PF add PF_DST, PF_DST, #1
> +.endif
> +.if mask_bpp_shift >= 0
> +    PF lsl DUMMY, MASK_STRIDE, #mask_bpp_shift
> +    PF ldrsb DUMMY, [PF_MASK, DUMMY]
> +    PF add PF_MASK, PF_MASK, #1
> +.endif
> +72:
> +.endif
> +.endm
> +
> +.macro cache_preload_simple
> +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
> +.if src_bpp > 0
> +    prfm pldl2strm, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
> +.endif
> +.if dst_r_bpp > 0
> +    prfm pldl2strm, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
> +.endif
> +.if mask_bpp > 0
> +    prfm pldl2strm, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
> +.endif
> +.endif
> +.endm
> +
> +.macro fetch_mask_pixblock
> +    pixld       pixblock_size, mask_bpp, \
> +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> +.endm
> +
> +/*
> + * Macro which is used to process leading pixels until destination
> + * pointer is properly aligned (at 16 bytes boundary). When destination
> + * buffer uses 16bpp format, this is unnecessary, or even pointless.
> + */
> +.macro ensure_destination_ptr_alignment process_pixblock_head, \
> +                                        process_pixblock_tail, \
> +                                        process_pixblock_tail_head
> +.if dst_w_bpp != 24
> +    tst         DST_R, #0xF
> +    beq         52f
> +.irp lowbit, 1, 2, 4, 8, 16
> +local skip1
> +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
> +.if lowbit < 16 /* we don't need more than 16-byte alignment */
> +    tst         DST_R, #lowbit
> +    beq         51f
> +.endif
> +    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
> +    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
> +.if dst_r_bpp > 0
> +    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
> +.else
> +    add         DST_R, DST_R, #lowbit
> +.endif
> +    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
> +    sub         W, W, #(lowbit * 8 / dst_w_bpp)
> +51:
> +.endif
> +.endr
> +    pixdeinterleave src_bpp, src_basereg
> +    pixdeinterleave mask_bpp, mask_basereg
> +    pixdeinterleave dst_r_bpp, dst_r_basereg
> +
> +    process_pixblock_head
> +    cache_preload 0, pixblock_size
> +    cache_preload_simple
> +    process_pixblock_tail
> +
> +    pixinterleave dst_w_bpp, dst_w_basereg
> +
> +.irp lowbit, 1, 2, 4, 8, 16
> +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
> +.if lowbit < 16 /* we don't need more than 16-byte alignment */
> +    tst         DST_W, #lowbit
> +    beq         51f
> +.endif
> +    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
> +51:
> +.endif
> +.endr
> +.endif
> +52:
> +.endm
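In scalar terms the alignment pass above is just a walk over the low address bits, processing one small group per set bit until DST reaches a 16-byte boundary (sketch; the asm additionally bounds lowbit by the pixel block size and assumes enough pixels are left):

    #include <stdint.h>

    /* Process leading pixels until the destination pointer is 16-byte
     * aligned; 'process' stands for one head+tail pass over n pixels. */
    static int
    align_leading_pixels (uintptr_t *dst_addr, int w, int dst_bpp,
                          void (*process) (int n))
    {
        int lowbit;

        for (lowbit = 1; lowbit < 16; lowbit <<= 1)
        {
            int npix = lowbit * 8 / dst_bpp;  /* pixels covered by lowbit bytes */

            if (npix == 0)
                continue;                     /* lowbit smaller than one pixel */
            if (*dst_addr & lowbit)
            {
                process (npix);
                *dst_addr += lowbit;
                w -= npix;
            }
        }
        return w;                             /* pixels left for the main loop */
    }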
> +
> +/*
> + * Special code for processing up to (pixblock_size - 1) remaining
> + * trailing pixels. As the SIMD code operates on pixblock_size pixels
> + * at a time, anything smaller than this has to be loaded
> + * and stored in a special way. Loading and storing of pixel data is
> + * performed in such a way that we fill some 'slots' in the NEON
> + * registers (some slots naturally are unused), then perform compositing
> + * operation as usual. In the end, the data is taken from these 'slots'
> + * and saved to memory.
> + *
> + * cache_preload_flag - set to 0 to suppress
> + *                      prefetching
> + * dst_aligned_flag   - selects whether destination buffer
> + *                      is aligned
> + */
> +.macro process_trailing_pixels cache_preload_flag, \
> +                               dst_aligned_flag, \
> +                               process_pixblock_head, \
> +                               process_pixblock_tail, \
> +                               process_pixblock_tail_head
> +    tst         W, #(pixblock_size - 1)
> +    beq         52f
> +.irp chunk_size, 16, 8, 4, 2, 1
> +.if pixblock_size > chunk_size
> +    tst         W, #chunk_size
> +    beq         51f
> +    pixld_src   chunk_size, src_bpp, src_basereg, SRC
> +    pixld       chunk_size, mask_bpp, mask_basereg, MASK
> +.if dst_aligned_flag != 0
> +    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
> +.else
> +    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
> +.endif
> +.if cache_preload_flag != 0
> +    PF add      PF_X, PF_X, #chunk_size
> +.endif
> +51:
> +.endif
> +.endr
> +    pixdeinterleave src_bpp, src_basereg
> +    pixdeinterleave mask_bpp, mask_basereg
> +    pixdeinterleave dst_r_bpp, dst_r_basereg
> +
> +    process_pixblock_head
> +.if cache_preload_flag != 0
> +    cache_preload 0, pixblock_size
> +    cache_preload_simple
> +.endif
> +    process_pixblock_tail
> +    pixinterleave dst_w_bpp, dst_w_basereg
> +.irp chunk_size, 16, 8, 4, 2, 1
> +.if pixblock_size > chunk_size
> +    tst         W, #chunk_size
> +    beq         51f
> +.if dst_aligned_flag != 0
> +    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
> +.else
> +    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
> +.endif
> +51:
> +.endif
> +.endr
> +52:
> +.endm
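Conceptually the chunk handling above decomposes the leftover pixel count into descending power-of-two chunks, so at most one partial load/store is emitted per chunk size (sketch):

    /* Process the last (w % pixblock_size) pixels of a scanline the way
     * process_trailing_pixels does with its .irp chunk_size loop. */
    static void
    process_trailing (int w, int pixblock_size, void (*process_chunk) (int n))
    {
        int chunk;

        for (chunk = 16; chunk >= 1; chunk >>= 1)
        {
            if (pixblock_size > chunk && (w & chunk))
                process_chunk (chunk);
        }
    }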
> +
> +/*
> + * Macro, which performs all the needed operations to switch to the next
> + * scanline and start the next loop iteration unless all the scanlines
> + * are already processed.
> + */
> +.macro advance_to_next_scanline start_of_loop_label
> +    mov         W, ORIG_W
> +    lsl         DUMMY, DST_STRIDE, #dst_bpp_shift
> +    add         DST_W, DST_W, DUMMY
> +.if src_bpp != 0
> +    lsl         DUMMY, SRC_STRIDE, #src_bpp_shift
> +    add         SRC, SRC, DUMMY
> +.endif
> +.if mask_bpp != 0
> +    lsl         DUMMY, MASK_STRIDE, #mask_bpp_shift
> +    add         MASK, MASK, DUMMY
> +.endif
> +.if (dst_w_bpp != 24)
> +    lsl         DUMMY, W, #dst_bpp_shift
> +    sub         DST_W, DST_W, DUMMY
> +.endif
> +.if (src_bpp != 24) && (src_bpp != 0)
> +    lsl         DUMMY, W, #src_bpp_shift
> +    sub         SRC, SRC, DUMMY
> +.endif
> +.if (mask_bpp != 24) && (mask_bpp != 0)
> +    lsl         DUMMY, W, #mask_bpp_shift
> +    sub         MASK, MASK, DUMMY
> +.endif
> +    subs        H, H, #1
> +    mov         DST_R, DST_W
> +    bge         start_of_loop_label
> +.endm
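Since DST_W has already advanced by one scanline's worth of pixels at this point, stepping to the next row is just "add the stride, subtract what was walked"; in C, for a 32bpp destination (strides in pixels, matching the lsl by dst_bpp_shift above; 24bpp is special-cased in the macro and ignored here):

    #include <stdint.h>

    /* Pointer bookkeeping equivalent of advance_to_next_scanline. */
    static uint32_t *
    next_scanline (uint32_t *dst_after_row, int dst_stride, int w)
    {
        return dst_after_row + dst_stride - w;
    }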
> +
> +/*
> + * Registers are allocated in the following way by default:
> + * v0, v1, v2, v3     - reserved for loading source pixel data
> + * v4, v5, v6, v7     - reserved for loading destination pixel data
> + * v24, v25, v26, v27 - reserved for loading mask pixel data
> + * v28, v29, v30, v31 - final destination pixel data for writeback to memory
> + */
> +.macro generate_composite_function fname, \
> +                                   src_bpp_, \
> +                                   mask_bpp_, \
> +                                   dst_w_bpp_, \
> +                                   flags, \
> +                                   pixblock_size_, \
> +                                   prefetch_distance, \
> +                                   init, \
> +                                   cleanup, \
> +                                   process_pixblock_head, \
> +                                   process_pixblock_tail, \
> +                                   process_pixblock_tail_head, \
> +                                   dst_w_basereg_ = 28, \
> +                                   dst_r_basereg_ = 4, \
> +                                   src_basereg_   = 0, \
> +                                   mask_basereg_  = 24
> +
> +    pixman_asm_function fname
> +    stp         x29, x30, [sp, -16]!
> +    mov         x29, sp
> +    sub         sp,   sp, 232  /* push all registers */
> +    sub         x29, x29, 64
> +    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
> +    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
> +    stp          x8,   x9, [x29, -80]
> +    stp         x10,  x11, [x29, -96]
> +    stp         x12,  x13, [x29, -112]
> +    stp         x14,  x15, [x29, -128]
> +    stp         x16,  x17, [x29, -144]
> +    stp         x18,  x19, [x29, -160]
> +    stp         x20,  x21, [x29, -176]
> +    stp         x22,  x23, [x29, -192]
> +    stp         x24,  x25, [x29, -208]
> +    stp         x26,  x27, [x29, -224]
> +    str         x28, [x29, -232]
> +
> +/*
> + * Select prefetch type for this function. If prefetch distance is
> + * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
> + * has to be used instead of ADVANCED.
> + */
> +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
> +.if prefetch_distance == 0
> +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
> +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
> +        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
> +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
> +.endif
> +
> +/*
> + * Make some macro arguments globally visible and accessible
> + * from other macros
> + */
> +    .set src_bpp, src_bpp_
> +    .set mask_bpp, mask_bpp_
> +    .set dst_w_bpp, dst_w_bpp_
> +    .set pixblock_size, pixblock_size_
> +    .set dst_w_basereg, dst_w_basereg_
> +    .set dst_r_basereg, dst_r_basereg_
> +    .set src_basereg, src_basereg_
> +    .set mask_basereg, mask_basereg_
> +
> +    .macro pixld_src x:vararg
> +        pixld x
> +    .endm
> +    .macro fetch_src_pixblock
> +        pixld_src   pixblock_size, src_bpp, \
> +                    (src_basereg - pixblock_size * src_bpp / 64), SRC
> +    .endm
> +/*
> + * Assign symbolic names to registers
> + */
> +    W           .req       x0      /* width (is updated during processing) */
> +    H           .req       x1      /* height (is updated during processing) */
> +    DST_W       .req       x2      /* destination buffer pointer for writes */
> +    DST_STRIDE  .req       x3      /* destination image stride */
> +    SRC         .req       x4      /* source buffer pointer */
> +    SRC_STRIDE  .req       x5      /* source image stride */
> +    MASK        .req       x6      /* mask pointer */
> +    MASK_STRIDE .req       x7      /* mask stride */
> +
> +    DST_R       .req       x8      /* destination buffer pointer for reads */
> +
> +    PF_CTL      .req       x9      /* combined line counter and prefetch */
> +                                    /* distance increment counter */
> +    PF_X        .req       x10     /* pixel index in a scanline for current */
> +                                    /* prefetch position */
> +    PF_SRC      .req       x11     /* pointer to source scanline start */
> +                                    /* for prefetch purposes */
> +    PF_DST      .req       x12     /* pointer to destination scanline start */
> +                                    /* for prefetch purposes */
> +    PF_MASK     .req       x13     /* pointer to mask scanline start */
> +                                    /* for prefetch purposes */
> +
> +    ORIG_W      .req       x14     /* saved original width */
> +    DUMMY       .req       x15     /* temporary register */
> +
> +    sxtw        x0, w0
> +    sxtw        x1, w1
> +    sxtw        x3, w3
> +    sxtw        x5, w5
> +    sxtw        x7, w7
> +
> +    .set mask_bpp_shift, -1
> +.if src_bpp == 32
> +    .set src_bpp_shift, 2
> +.elseif src_bpp == 24
> +    .set src_bpp_shift, 0
> +.elseif src_bpp == 16
> +    .set src_bpp_shift, 1
> +.elseif src_bpp == 8
> +    .set src_bpp_shift, 0
> +.elseif src_bpp == 0
> +    .set src_bpp_shift, -1
> +.else
> +    .error "requested src bpp (src_bpp) is not supported"
> +.endif
> +.if mask_bpp == 32
> +    .set mask_bpp_shift, 2
> +.elseif mask_bpp == 24
> +    .set mask_bpp_shift, 0
> +.elseif mask_bpp == 8
> +    .set mask_bpp_shift, 0
> +.elseif mask_bpp == 0
> +    .set mask_bpp_shift, -1
> +.else
> +    .error "requested mask bpp (mask_bpp) is not supported"
> +.endif
> +.if dst_w_bpp == 32
> +    .set dst_bpp_shift, 2
> +.elseif dst_w_bpp == 24
> +    .set dst_bpp_shift, 0
> +.elseif dst_w_bpp == 16
> +    .set dst_bpp_shift, 1
> +.elseif dst_w_bpp == 8
> +    .set dst_bpp_shift, 0
> +.else
> +    .error "requested dst bpp (dst_w_bpp) is not supported"
> +.endif
> +
> +.if (((flags) & FLAG_DST_READWRITE) != 0)
> +    .set dst_r_bpp, dst_w_bpp
> +.else
> +    .set dst_r_bpp, 0
> +.endif
> +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
> +    .set DEINTERLEAVE_32BPP_ENABLED, 1
> +.else
> +    .set DEINTERLEAVE_32BPP_ENABLED, 0
> +.endif
> +
> +.if prefetch_distance < 0 || prefetch_distance > 15
> +    .error "invalid prefetch distance (prefetch_distance)"
> +.endif
> +
> +    PF mov      PF_X, #0
> +    mov         DST_R, DST_W
> +
> +.if src_bpp == 24
> +    sub         SRC_STRIDE, SRC_STRIDE, W
> +    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
> +.endif
> +.if mask_bpp == 24
> +    sub         MASK_STRIDE, MASK_STRIDE, W
> +    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
> +.endif
> +.if dst_w_bpp == 24
> +    sub         DST_STRIDE, DST_STRIDE, W
> +    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
> +.endif
> +
> +/*
> + * Setup advanced prefetcher initial state
> + */
> +    PF mov      PF_SRC, SRC
> +    PF mov      PF_DST, DST_R
> +    PF mov      PF_MASK, MASK
> +    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
> +    PF lsl      DUMMY, H, #4
> +    PF mov      PF_CTL, DUMMY
> +    PF add      PF_CTL, PF_CTL, #(prefetch_distance - 0x10)
> +
> +    init
> +    subs        H, H, #1
> +    mov         ORIG_W, W
> +    blt         9f
> +    cmp         W, #(pixblock_size * 2)
> +    blt         800f
> +/*
> + * This is the start of the pipelined loop, which is optimized for
> + * long scanlines
> + */
> +0:
> +    ensure_destination_ptr_alignment process_pixblock_head, \
> +                                     process_pixblock_tail, \
> +                                     process_pixblock_tail_head
> +
> +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
> +    pixld_a     pixblock_size, dst_r_bpp, \
> +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
> +    fetch_src_pixblock
> +    pixld       pixblock_size, mask_bpp, \
> +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> +    PF add      PF_X, PF_X, #pixblock_size
> +    process_pixblock_head
> +    cache_preload 0, pixblock_size
> +    cache_preload_simple
> +    subs        W, W, #(pixblock_size * 2)
> +    blt         200f
> +
> +100:
> +    process_pixblock_tail_head
> +    cache_preload_simple
> +    subs        W, W, #pixblock_size
> +    bge         100b
> +
> +200:
> +    process_pixblock_tail
> +    pixst_a     pixblock_size, dst_w_bpp, \
> +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
> +
> +    /* Process the remaining trailing pixels in the scanline */
> +    process_trailing_pixels 1, 1, \
> +                            process_pixblock_head, \
> +                            process_pixblock_tail, \
> +                            process_pixblock_tail_head
> +    advance_to_next_scanline 0b
> +
> +    cleanup
> +1000:
> +    /* pop all registers */
> +    sub         x29, x29, 64
> +    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    ldp          x8,   x9, [x29, -80]
> +    ldp         x10,  x11, [x29, -96]
> +    ldp         x12,  x13, [x29, -112]
> +    ldp         x14,  x15, [x29, -128]
> +    ldp         x16,  x17, [x29, -144]
> +    ldp         x18,  x19, [x29, -160]
> +    ldp         x20,  x21, [x29, -176]
> +    ldp         x22,  x23, [x29, -192]
> +    ldp         x24,  x25, [x29, -208]
> +    ldp         x26,  x27, [x29, -224]
> +    ldr         x28, [x29, -232]
> +    mov         sp, x29
> +    ldp         x29, x30, [sp], 16
> +    ret  /* exit */
> +/*
> + * This is the start of the loop, designed to process images with small width
> + * (less than pixblock_size * 2 pixels). In this case neither pipelining
> + * nor prefetch is used.
> + */
> +800:
> +    /* Process exactly pixblock_size pixels if needed */
> +    tst         W, #pixblock_size
> +    beq         100f
> +    pixld       pixblock_size, dst_r_bpp, \
> +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
> +    fetch_src_pixblock
> +    pixld       pixblock_size, mask_bpp, \
> +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> +    process_pixblock_head
> +    process_pixblock_tail
> +    pixst       pixblock_size, dst_w_bpp, \
> +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
> +100:
> +    /* Process the remaining trailing pixels in the scanline */
> +    process_trailing_pixels 0, 0, \
> +                            process_pixblock_head, \
> +                            process_pixblock_tail, \
> +                            process_pixblock_tail_head
> +    advance_to_next_scanline 800b
> +9:
> +    cleanup
> +    /* pop all registers */
> +    sub         x29, x29, 64
> +    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    ldp          x8,   x9, [x29, -80]
> +    ldp         x10,  x11, [x29, -96]
> +    ldp         x12,  x13, [x29, -112]
> +    ldp         x14,  x15, [x29, -128]
> +    ldp         x16,  x17, [x29, -144]
> +    ldp         x18,  x19, [x29, -160]
> +    ldp         x20,  x21, [x29, -176]
> +    ldp         x22,  x23, [x29, -192]
> +    ldp         x24,  x25, [x29, -208]
> +    ldp         x26,  x27, [x29, -224]
> +    ldr         x28, [x29, -232]
> +    mov         sp, x29
> +    ldp         x29, x30, [sp], 16
> +    ret  /* exit */
> +
> +    .purgem     fetch_src_pixblock
> +    .purgem     pixld_src
> +
> +    .unreq      SRC
> +    .unreq      MASK
> +    .unreq      DST_R
> +    .unreq      DST_W
> +    .unreq      ORIG_W
> +    .unreq      W
> +    .unreq      H
> +    .unreq      SRC_STRIDE
> +    .unreq      DST_STRIDE
> +    .unreq      MASK_STRIDE
> +    .unreq      PF_CTL
> +    .unreq      PF_X
> +    .unreq      PF_SRC
> +    .unreq      PF_DST
> +    .unreq      PF_MASK
> +    .unreq      DUMMY
> +    .endfunc
> +.endm
> +
> +/*
> + * A simplified variant of function generation template for a single
> + * scanline processing (for implementing pixman combine functions)
> + */
> +.macro generate_composite_function_scanline        use_nearest_scaling, \
> +                                                   fname, \
> +                                                   src_bpp_, \
> +                                                   mask_bpp_, \
> +                                                   dst_w_bpp_, \
> +                                                   flags, \
> +                                                   pixblock_size_, \
> +                                                   init, \
> +                                                   cleanup, \
> +                                                   process_pixblock_head, \
> +                                                   process_pixblock_tail, \
> +                                                   process_pixblock_tail_head, \
> +                                                   dst_w_basereg_ = 28, \
> +                                                   dst_r_basereg_ = 4, \
> +                                                   src_basereg_   = 0, \
> +                                                   mask_basereg_  = 24
> +
> +    pixman_asm_function fname
> +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
> +
> +/*
> + * Make some macro arguments globally visible and accessible
> + * from other macros
> + */
> +    .set src_bpp, src_bpp_
> +    .set mask_bpp, mask_bpp_
> +    .set dst_w_bpp, dst_w_bpp_
> +    .set pixblock_size, pixblock_size_
> +    .set dst_w_basereg, dst_w_basereg_
> +    .set dst_r_basereg, dst_r_basereg_
> +    .set src_basereg, src_basereg_
> +    .set mask_basereg, mask_basereg_
> +
> +.if use_nearest_scaling != 0
> +    /*
> +     * Assign symbolic names to registers for nearest scaling
> +     */
> +    W           .req        x0
> +    DST_W       .req        x1
> +    SRC         .req        x2
> +    VX          .req        x3
> +    UNIT_X      .req        x4
> +    SRC_WIDTH_FIXED .req    x5
> +    MASK        .req        x6
> +    TMP1        .req        x8
> +    TMP2        .req        x9
> +    DST_R       .req        x10
> +    DUMMY       .req        x30
> +
> +    .macro pixld_src x:vararg
> +        pixld_s x
> +    .endm
> +
> +    sxtw        x0, w0
> +    sxtw        x3, w3
> +    sxtw        x4, w4
> +    sxtw        x5, w5
> +
> +    stp         x29, x30, [sp, -16]!
> +    mov         x29, sp
> +    sub         sp, sp, 88
> +    sub         x29, x29, 64
> +    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    stp         x8, x9, [x29, -80]
> +    str         x10, [x29, -88]
> +.else
> +    /*
> +     * Assign symbolic names to registers
> +     */
> +    W           .req        x0      /* width (is updated during processing) */
> +    DST_W       .req        x1      /* destination buffer pointer for writes */
> +    SRC         .req        x2      /* source buffer pointer */
> +    MASK        .req        x3      /* mask pointer */
> +    DST_R       .req        x4      /* destination buffer pointer for reads */
> +    DUMMY       .req        x30
> +
> +    .macro pixld_src x:vararg
> +        pixld x
> +    .endm
> +
> +    sxtw        x0, w0
> +
> +    stp         x29, x30, [sp, -16]!
> +    mov         x29, sp
> +    sub         sp, sp, 64
> +    sub         x29, x29, 64
> +    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +.endif
> +
> +.if (((flags) & FLAG_DST_READWRITE) != 0)
> +    .set dst_r_bpp, dst_w_bpp
> +.else
> +    .set dst_r_bpp, 0
> +.endif
> +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
> +    .set DEINTERLEAVE_32BPP_ENABLED, 1
> +.else
> +    .set DEINTERLEAVE_32BPP_ENABLED, 0
> +.endif
> +
> +    .macro fetch_src_pixblock
> +        pixld_src   pixblock_size, src_bpp, \
> +                    (src_basereg - pixblock_size * src_bpp / 64), SRC
> +    .endm
> +
> +    init
> +    mov         DST_R, DST_W
> +
> +    cmp         W, #pixblock_size
> +    blt         800f
> +
> +    ensure_destination_ptr_alignment process_pixblock_head, \
> +                                     process_pixblock_tail, \
> +                                     process_pixblock_tail_head
> +
> +    subs        W, W, #pixblock_size
> +    blt         700f
> +
> +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
> +    pixld_a     pixblock_size, dst_r_bpp, \
> +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
> +    fetch_src_pixblock
> +    pixld       pixblock_size, mask_bpp, \
> +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
> +    process_pixblock_head
> +    subs        W, W, #pixblock_size
> +    blt         200f
> +100:
> +    process_pixblock_tail_head
> +    subs        W, W, #pixblock_size
> +    bge         100b
> +200:
> +    process_pixblock_tail
> +    pixst_a     pixblock_size, dst_w_bpp, \
> +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
> +700:
> +    /* Process the remaining trailing pixels in the scanline (dst aligned) */
> +    process_trailing_pixels 0, 1, \
> +                            process_pixblock_head, \
> +                            process_pixblock_tail, \
> +                            process_pixblock_tail_head
> +
> +    cleanup
> +.if use_nearest_scaling != 0
> +    sub         x29, x29, 64
> +    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    ldp         x8, x9, [x29, -80]
> +    ldr         x10, [x29, -88]
> +    mov         sp, x29
> +    ldp         x29, x30, [sp], 16
> +    ret  /* exit */
> +.else
> +    sub         x29, x29, 64
> +    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    mov         sp, x29
> +    ldp         x29, x30, [sp], 16
> +    ret  /* exit */
> +.endif
> +800:
> +    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
> +    process_trailing_pixels 0, 0, \
> +                            process_pixblock_head, \
> +                            process_pixblock_tail, \
> +                            process_pixblock_tail_head
> +
> +    cleanup
> +.if use_nearest_scaling != 0
> +    sub         x29, x29, 64
> +    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    ldp         x8, x9, [x29, -80]
> +    ldr         x10, [x29, -88]
> +    mov         sp, x29
> +    ldp         x29, x30, [sp], 16
> +    ret  /* exit */
> +
> +    .unreq      DUMMY
> +    .unreq      DST_R
> +    .unreq      SRC
> +    .unreq      W
> +    .unreq      VX
> +    .unreq      UNIT_X
> +    .unreq      TMP1
> +    .unreq      TMP2
> +    .unreq      DST_W
> +    .unreq      MASK
> +    .unreq      SRC_WIDTH_FIXED
> +
> +.else
> +    sub         x29, x29, 64
> +    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
> +    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
> +    mov          sp, x29
> +    ldp          x29, x30, [sp], 16
> +    ret  /* exit */
> +
> +    .unreq      DUMMY
> +    .unreq      SRC
> +    .unreq      MASK
> +    .unreq      DST_R
> +    .unreq      DST_W
> +    .unreq      W
> +.endif
> +
> +    .purgem     fetch_src_pixblock
> +    .purgem     pixld_src
> +
> +    .endfunc
> +.endm
> +
> +.macro generate_composite_function_single_scanline x:vararg
> +    generate_composite_function_scanline 0, x
> +.endm
> +
> +.macro generate_composite_function_nearest_scanline x:vararg
> +    generate_composite_function_scanline 1, x
> +.endm
> +
> +/* Default prologue/epilogue, nothing special needs to be done */
> +
> +.macro default_init
> +.endm
> +
> +.macro default_cleanup
> +.endm
> +
> +/*
> + * Prologue/epilogue variant which additionally saves/restores v8-v15
> + * registers (they need to be saved/restored by callee according to ABI).
> + * This is required if the code needs to use all the NEON registers.
> + */
> +
> +.macro default_init_need_all_regs
> +.endm
> +
> +.macro default_cleanup_need_all_regs
> +.endm
> +
> +/******************************************************************************/
> +
> +/*
> + * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
> + * into a planar a8r8g8b8 format (with a, r, g, b color components
> + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
> + *
> + * Warning: the conversion is destructive and the original
> + *          value (in) is lost.
> + */
> +.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
> +    shrn        &out_r&.8b, &in&.8h,    #8
> +    shrn        &out_g&.8b, &in&.8h,    #3
> +    sli         &in&.8h,    &in&.8h,    #5
> +    movi        &out_a&.8b, #255
> +    sri         &out_r&.8b, &out_r&.8b, #5
> +    sri         &out_g&.8b, &out_g&.8b, #6
> +    shrn        &out_b&.8b, &in&.8h,    #2
> +.endm
> +
> +.macro convert_0565_to_x888 in, out_r, out_g, out_b
> +    shrn        &out_r&.8b, &in&.8h,    #8
> +    shrn        &out_g&.8b, &in&.8h,    #3
> +    sli         &in&.8h,    &in&.8h,    #5
> +    sri         &out_r&.8b, &out_r&.8b, #5
> +    sri         &out_g&.8b, &out_g&.8b, #6
> +    shrn        &out_b&.8b, &in&.8h,    #2
> +.endm
> +
> +/*
> + * Conversion from planar a8r8g8b8 format (with the r, g, b color components
> + * in 64-bit registers in_r, in_g, in_b respectively; alpha is ignored) into 8 r5g6b5
> + * pixels packed in 128-bit register (out). Requires two temporary 128-bit
> + * registers (tmp1, tmp2)
> + */
> +.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
> +    ushll       &tmp1&.8h, &in_g&.8b, #7
> +    shl         &tmp1&.8h, &tmp1&.8h, #1
> +    ushll       &out&.8h,  &in_r&.8b, #7
> +    shl         &out&.8h,  &out&.8h,  #1
> +    ushll       &tmp2&.8h, &in_b&.8b, #7
> +    shl         &tmp2&.8h, &tmp2&.8h, #1
> +    sri         &out&.8h, &tmp1&.8h, #5
> +    sri         &out&.8h, &tmp2&.8h, #11
> +.endm
> +
> +/*
> + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
> + * returned in (out0, out1) registers pair. Requires one temporary
> + * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
> + * value from 'in' is lost
> + */
> +.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
> +    shl         &out0&.4h, &in&.4h,   #5  /* G top 6 bits */
> +    shl         &tmp&.4h,  &in&.4h,   #11 /* B top 5 bits */
> +    sri         &in&.4h,   &in&.4h,   #5  /* R is ready in top bits */
> +    sri         &out0&.4h, &out0&.4h, #6  /* G is ready in top bits */
> +    sri         &tmp&.4h,  &tmp&.4h,  #5  /* B is ready in top bits */
> +    ushr        &out1&.4h, &in&.4h,   #8  /* R is in place */
> +    sri         &out0&.4h, &tmp&.4h,  #8  /* G & B is in place */
> +    zip1        &tmp&.4h,  &out0&.4h, &out1&.4h  /* everything is in place */
> +    zip2        &out1&.4h, &out0&.4h, &out1&.4h
> +    mov         &out0&.d[0], &tmp&.d[0]
> +.endm
> diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
> old mode 100644
> new mode 100755
> index 73a5414..81e0f23
> --- a/pixman/pixman-private.h
> +++ b/pixman/pixman-private.h
> @@ -607,6 +607,11 @@ pixman_implementation_t *
>  _pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
>  #endif
>
> +#ifdef USE_ARM_A64_NEON
> +pixman_implementation_t *
> +_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
> +#endif
> +
>  #ifdef USE_MIPS_DSPR2
>  pixman_implementation_t *
>  _pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
> --
> 2.7.4
On Sat, 02 Apr 2016 13:30:58 +0100, Mizuki Asakura <ed6e117f@gmail.com> wrote:
> This patch only contains STD_FAST_PATH codes, not scaling (nearest,
> bilinear) codes.

Hi Mizuki,

It looks like you have used an automated process to convert the AArch32
NEON code to AArch64. Will you be able to repeat that process for other
code, or at least assist others to repeat your steps?

The reason I ask is that I have a large number of outstanding patches to
the ARM NEON support. The process of getting them merged into the
FreeDesktop git repository has been very slow because there aren't many
people on this list with the time and ability to review them; however, my
versions are in many cases up to twice the speed of the FreeDesktop
versions, and it would be a shame if AArch64 couldn't benefit from them.
If your AArch64 conversion is a one-time thing, it will make it
extremely difficult to merge my changes in.

> After completing optimization this patch, scaling related codes should be done.

One of my aims was to implement missing "iter" routines so as to accelerate
scaled plots for a much wider combination of pixel formats and Porter-Duff
combiner rules than the existing limited selection of fast paths could
cover. If you look towards the end of my patch series here:

https://github.com/bavison/pixman/commits/arm-neon-release1

you'll see that I discovered that I was actually outperforming Pixman's
existing bilinear plotters so consistently that I'm advocating removing
them entirely, with the additional advantage that it simplifies the code
base a lot. So you might want to consider whether it's worth bothering
converting those to AArch64 in the first place.

I would maybe go so far as to suggest that you try converting all the iters
first and only add fast paths if you find they do better than the iters.
One of the drawbacks of using iters is that the prefetch code can't be as
sophisticated - it can't easily be prefetching the start of the next row
while it is still working on the end of the current one. But since hardware
prefetchers are better now and conditional execution is hard in AArch64,
this will be less of a drawback with AArch64 CPUs.

I'll also repeat what has been said, that it's very neat the way the
existing prefetch code sneaks calculations into pipeline stalls, but it was
only ever really ideal for Cortex-A8. With Cortex-A7 (despite the number,
actually a much more recent 32-bit core) I noted that it was impossible to
schedule such complex prefetch code without adding to the cycle count, at
least when the images were already in the cache.

Ben
On Sun, 3 Apr 2016 20:17:45 +0900
Mizuki Asakura <ed6e117f@gmail.com> wrote:

> > The 'advanced' prefetch type is implemented by having some branchless ARM code  
> 
> If the prefetch code assumes being "branch-less", it cannot be done on aarch64,
> since aarch64 doesn't support conditional arithmetic such as subge / subges.
> 
> If so, we could / should remove all the prefetch-related code, because it
> might cause a
> performance regression (from the branching) rather than bring the benefit of prefetching.

Yes, I'm fine and actually in favour of removing the prefetch related
AArch64 code (assuming that it does not do anything good for us).
Something similar happened to the pixman x86 prefetch code in the past:
    https://lists.freedesktop.org/archives/pixman/2010-June/000231.html

But I'm going to run some additional higher level benchmarks to be
sure.
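
To make the constraint concrete, here is a rough sketch (the register
numbers are arbitrary, not taken from the real code) of what a single
AArch32 conditionally executed subtract has to become on AArch64, which
only provides conditional select/compare instructions:

    /* AArch32: subtract only when an earlier compare left GE set */
    subge   r10, r10, r6

    /* AArch64: no conditional execution; compute, then select */
    sub     w9,  w10, w6         /* unconditional subtract into a scratch reg */
    csel    w10, w9,  w10, ge    /* keep the new value only when GE held      */
    /* (the flag-setting "subges" form would also need an extra cmp) */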

> And also, we could remove all the "tail-head" optimizations that are only
> there to make the most of prefetching.

This code is not just there for prefetching. It is an example of
using software pipelining:
    https://cgit.freedesktop.org/pixman/tree/pixman/pixman-arm-neon-asm.S?id=pixman-0.34.0#n191
    https://en.wikipedia.org/wiki/Software_pipelining
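
Schematically, the loop generated by the composite function template is
(simplified pseudo-assembly, following the "head (tail_head) ... tail"
comment in the source):

    process_pixblock_head              /* start pixel block 0             */
1:
    process_pixblock_tail_head         /* finish block N while already    */
                                       /* starting block N+1              */
    subs    W, W, #pixblock_size
    bge     1b
    process_pixblock_tail              /* finish the last block           */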

> "tail-head" codes are very complicated, hard to understand and hard to maintain.
> If we could remove these codes, asm code could be more slimmer and
> easy-to-maintain.

If we were favouring ease of maintenance over performance, then we
would have used intrinsics instead of assembly in the first place.

> Of course, the modification shouldn't be applied to the original
> aarch32-neon code, as it may cause a
> performance regression on some architectures.
> But for aarch64, would it be a change worth considering?

Well, just to make sure that there is no misunderstanding between
us. I would like to keep and do AArch64 conversion for all the
parts of code, which are well optimized and not planned to be
replaced in the near future. And I suggested not to bother with
the 'pixman-arm-neon-asm-bilinear.S' file, because this code
is not the best way to do the job and it had to be eventually
replaced with iterators:
    https://lists.freedesktop.org/archives/pixman/2013-September/002889.html
    https://lists.freedesktop.org/archives/pixman/2013-September/002892.html

Now it looks like Ben Avison has NEON patches for doing
separable bilinear scaling, so this makes the
'pixman-arm-neon-asm-bilinear.S' file really obsolete.

The nearest scaling and rgb565 format support code is still useful.
The bilinear scaling code from pixman-arm-neon-asm.S is useful too, at
least the 'pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon' and
'pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon' functions.
But 'pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon' and
'pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon' are bad.

Anyway, your first patch was already usable. I only see that just a
few minor tweaks are needed and it will be good enough for pushing
to git. But if I'm mistaken and something is actually difficult,
then you don't need to spend too much time on it. Thanks.
> It looks like you have used an automated process to convert the AArch32
> NEON code to AArch64. Will you be able to repeat that process for other
> code, or at least assist others to repeat your steps?

Sorry, but as I've written before, all of the patch was converted by hand.
The "converter script" didn't work correctly.
# But the script was very helpful for me to understand the difference
# between aarch32 and aarch64 :)


> The reason I ask is that I have a large number of outstanding patches to
> the ARM NEON support.

Hmm...
How should we proceed with the implementation?

I've seen a comment that the current pixman-arm-neon-asm*.S (which I based my
work on) was optimized for the older Cortex-A8, while your new patches seem to
work well on the latest Cortex chips.
If so, we should first apply your latest patches to master, and then
someone (or I?) could do the conversion to aarch64 again. That would be good
for both the aarch32 and aarch64 worlds.

# FYI: I spent 1 week converting all of the code,
# and 2 weeks getting it to pass all the tests.


> This code is not just there for prefetching. It is an example of
> using software pipelining:

OK, I understand.
But the code is very hard to maintain... I've run into too many register
conflicts.
# q2 and d2 were used in the same sequence. That cannot exist in aarch64-neon.

Anyway, I'll try to remove unnecessary register copies as you've suggested.
After that, I'll also try to make benchmarks comparing
* advanced prefetch vs none
* L1 / L2 / L3 (which Cortex-A53 doesn't have), keep / strm
to find the best configuration (see the prfm examples below).
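
The prfm hints I mean are along these lines (the offsets here are just
examples, not taken from the actual code):

    prfm    pldl1keep, [x0]          /* prefetch for load into L1, "keep" (temporal)  */
    prfm    pldl2strm, [x0, #64]     /* prefetch for load into L2, "strm" (streaming) */
    prfm    pldl3keep, [x0, #128]    /* L3 hint (Cortex-A53 has no L3)                */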

But that would only be a result for the Cortex-A53 (which both you and I have). Can anyone
test other (expensive :) aarch64 environments?
(Cortex-Axx, Apple Ax, NVidia Denver, etc...)


On Tue, Apr 05, 2016 at 08:20:54PM +0900, Mizuki Asakura wrote:
> > This code is not just there for prefetching. It is an example of
> > using software pipelining:
> 
> OK, I understand.
> But the code is very hard to maintain... I've run into too many register
> conflicts.
> # q2 and d2 were used in the same sequence. That cannot exist in aarch64-neon.
> 
> Anyway, I'll try to remove unnecessary register copies as you've suggested.
> After that, I'll also try to make benchmarks comparing
> * advanced prefetch vs none
> * L1 / L2 / L3 (which Cortex-A53 doesn't have), keep / strm
> to find the best configuration.
> 
> But that would only be a result for the Cortex-A53 (which both you and I have). Can anyone
> test other (expensive :) aarch64 environments?
> (Cortex-Axx, Apple Ax, NVidia Denver, etc...)

If someone can list what to run for a test I can probably run it on an A57.
Hi Ben,

On Mon, 04 Apr 2016 19:53:36 +0100
"Ben Avison" <bavison@riscosopen.org> wrote:

> On Sat, 02 Apr 2016 13:30:58 +0100, Mizuki Asakura <ed6e117f@gmail.com> wrote:
> > This patch only contains STD_FAST_PATH codes, not scaling (nearest,
> > bilinear) codes.  
> 
> Hi Mizuki,
> 
> It looks like you have used an automated process to convert the AArch32
> NEON code to AArch64. Will you be able to repeat that process for other
> code, or at least assist others to repeat your steps?
> 
> The reason I ask is that I have a large number of outstanding patches to
> the ARM NEON support. The process of getting them merged into the
> FreeDesktop git repository has been very slow because there aren't many
> people on this list with the time and ability to review them; however, my
> versions are in many cases up to twice the speed of the FreeDesktop
> versions, and it would be a shame if AArch64 couldn't benefit from them.

It is always possible to find time for fixing bugs and reviewing the
code, which is doing something obviously useful.

> If your AArch64 conversion is a one-time thing, it will make it
> extremely difficult to merge my changes in.

Yes, how we are going to keep the 32-bit and 64-bit code in sync
is one of the concerns that have to be addressed.

But you don't need to worry about this right now. Let's focus on one
task at a time. So it's probably best to use the current pixman code
for the initial AArch64 conversion round. And at the same time we can
try to look at integrating your patches as 32-bit code first.

> > After completing optimization this patch, scaling related codes should be done.  
> 
> One of my aims was to implement missing "iter" routines so as to accelerate
> scaled plots for a much wider combination of pixel formats and Porter-Duff
> combiner rules than the existing limited selection of fast paths could
> cover. If you look towards the end of my patch series here:
> 
> https://github.com/bavison/pixman/commits/arm-neon-release1
> 
> you'll see that I discovered that I was actually outperforming Pixman's
> existing bilinear plotters so consistently that I'm advocating removing
> them entirely,

Please hold your horses!

I did give this branch a try. And I'm not sure if you genuinely did
not notice this fact, but with your runs of "lowlevel-blt-bench -b"
tests, you are actually benchmarking a special code path for
horizontal-only scaling against the code that is doing scaling in both
directions. That's an obvious flaw in the benchmark itself, which gives
you misleading results! I have already sent a fix for this problem:

    https://lists.freedesktop.org/archives/pixman/2016-April/004511.html

And I have also pushed your patches with this lowlevel-blt-bench fix to
the following git branch:

    https://cgit.freedesktop.org/~siamashka/pixman/log/?h=20160405-arm-neon-release1-from-bavison

Reverting your removal of the existing bilinear plotters allows us to
benchmark the implementations against each other via setting the
PIXMAN_DISABLE=wholeops environment variable. Here are the results
that I got:

===========================
== ARM Cortex-A7 @1.3GHz ==
===========================

    $ ./lowlevel-blt-bench -b ... (old NEON bilinear fast paths)

           src_8888_8888 =  L1:  51.40  L2:  43.62  M: 48.08 ( 35.48%)  HT: 32.84  VT: 31.14  R: 27.95  RT: 14.12 ( 102Kops/s)
           src_8888_0565 =  L1:  48.61  L2:  44.98  M: 46.56 ( 25.72%)  HT: 32.31  VT: 32.22  R: 26.04  RT: 13.08 (  98Kops/s)
          over_8888_8888 =  L1:  40.44  L2:  34.28  M: 34.44 ( 25.39%)  HT: 22.75  VT: 22.67  R: 19.45  RT:  9.56 (  82Kops/s)

    $ PIXMAN_DISABLE=wholeops ./lowlevel-blt-bench -b ... (new separable NEON iterators)

           src_8888_8888 =  L1:  45.32  L2:  56.91  M: 47.55 ( 34.96%)  HT: 34.17  VT: 29.90  R: 26.83  RT: 10.79 (  80Kops/s)
           src_8888_0565 =  L1:  38.08  L2:  47.27  M: 48.13 ( 26.61%)  HT: 31.18  VT: 25.89  R: 23.72  RT:  8.87 (  70Kops/s)
          over_8888_8888 =  L1:  36.15  L2:  34.68  M: 33.45 ( 24.66%)  HT: 24.91  VT: 20.88  R: 19.38  RT:  8.23 (  68Kops/s)

    $ PIXMAN_DISABLE=wholeops ./lowlevel-blt-bench -bh ... (new separable NEON iterators, only horizontal)

           src_8888_8888 =  L1:  92.51  L2:  74.43  M: 65.66 ( 47.27%)  HT: 40.34  VT: 33.82  R: 30.02  RT: 11.76 (  84Kops/s)
           src_8888_0565 =  L1:  71.68  L2:  63.86  M: 58.99 ( 32.20%)  HT: 36.19  VT: 28.96  R: 26.57  RT:  9.68 (  74Kops/s)
          over_8888_8888 =  L1:  61.80  L2:  44.28  M: 40.05 ( 29.13%)  HT: 27.69  VT: 22.71  R: 21.11  RT:  8.86 (  71Kops/s)

=============================
=== ARM Cortex-A9 @1.4GHz ===
=============================

    $ ./lowlevel-blt-bench -b ... (old NEON bilinear fast paths)

           src_8888_8888 =  L1: 115.82  L2: 113.35  M:107.33 (117.45%)  HT: 57.95  VT: 48.20  R: 39.86  RT: 20.75 ( 140Kops/s)
           src_8888_0565 =  L1: 105.67  L2: 104.18  M: 99.80 ( 82.02%)  HT: 54.40  VT: 46.61  R: 37.24  RT: 19.10 ( 134Kops/s)
          over_8888_8888 =  L1:  80.68  L2:  79.07  M: 75.95 ( 83.09%)  HT: 38.69  VT: 29.25  R: 26.37  RT: 13.79 ( 112Kops/s)

    $ PIXMAN_DISABLE=wholeops ./lowlevel-blt-bench -b ... (new separable NEON iterators)

           src_8888_8888 =  L1:  52.17  L2:  69.91  M: 49.56 ( 54.43%)  HT: 43.70  VT: 36.51  R: 31.96  RT: 16.02 ( 112Kops/s)
           src_8888_0565 =  L1:  43.51  L2:  61.72  M: 47.09 ( 38.76%)  HT: 37.19  VT: 31.19  R: 27.30  RT: 12.64 (  97Kops/s)
          over_8888_8888 =  L1:  44.85  L2:  52.69  M: 25.41 ( 27.85%)  HT: 24.53  VT: 21.33  R: 20.29  RT: 11.73 (  94Kops/s)

    $ PIXMAN_DISABLE=wholeops ./lowlevel-blt-bench -bh ... (new separable NEON iterators, only horizontal)

           src_8888_8888 =  L1:  99.47  L2:  87.14  M: 63.50 ( 69.66%)  HT: 48.91  VT: 40.11  R: 35.05  RT: 17.69 ( 118Kops/s)
           src_8888_0565 =  L1:  81.97  L2:  76.98  M: 61.69 ( 50.79%)  HT: 40.95  VT: 33.84  R: 29.54  RT: 13.74 ( 102Kops/s)
          over_8888_8888 =  L1:  82.53  L2:  62.73  M: 29.01 ( 31.82%)  HT: 26.11  VT: 22.32  R: 21.50  RT: 12.67 (  99Kops/s)

============================

The accuracy of measurements is not perfect, but it is good enough to
easily see the pattern.

Your implementations are providing better performance on ARM Cortex-A7
for the special case of horizontal-only scaling. But when we are doing
general purpose scaling (for example the same scale factor for the x
and y axes), the performance is more or less comparable between the
existing bilinear fast paths and your new implementation.

And if we take a different processor, for example ARM Cortex-A9, then
everything changes. Now the general purpose scaling is much slower
with your code. And even the horizontal-only special case is somewhat
slower than the old fast paths which run general purpose bilinear
scaling.

I would say that rather than dropping the existing bilinear scaling
code, it might be worth trying to implement the horizontal-only and
the vertical-only special cases to see how much faster it can get.


However if we try to do a really fair comparison, then we need to get
rid of the redundant memcpy for the bilinear src_8888_8888 operation in
the iterators based implementation. I had some old patches for this:
    https://lists.freedesktop.org/archives/pixman/2013-September/002879.html

These patches were used in an old git branch:
    https://cgit.freedesktop.org/~siamashka/pixman/log/?h=ssse3-bilinear-fast-path-test

which I used for comparing the fast path based SSE2/SSSE3 bilinear
scaling implementation with the separable SSSE3 implementation on
two x86 processors: Core2 T7300 (Merom) and Core i7 860 (Nehalem):
    https://lists.freedesktop.org/archives/pixman/2013-September/002892.html
    https://people.freedesktop.org/~siamashka/files/20130905/ssse3-scaling-bench.png

It shows that on x86 processors, the straightforward single pass
implementation works faster for downscaling, but becomes worse than
the separable SSSE3 scaling code on upscaling. There is a crossover
point, which may be different for different processors.

Now I have rebased all this old code on a current pixman master
branch, applied your patches with the separable NEON bilinear scaling
implementation and have done pretty much the same benchmarks. This
git branch is here:
    https://cgit.freedesktop.org/~siamashka/pixman/log/?h=20160405-separable-neon-bilinear-test

This particular change makes your implementation faster by getting rid
of the redundant memcpy:
    https://cgit.freedesktop.org/~siamashka/pixman/commit/?h=20160405-separable-neon-bilinear-test&id=c8c12d62098bf5b22828f851a035b6c4ecf0fc0b

And here are the results from different ARM processors (all the
benchmark scripts are also included):
    https://people.freedesktop.org/~siamashka/files/20160405-arm-bilinear/

It appears that the existing bilinear scaling code (src_8888_8888
ARM NEON fast path) works faster for downscaling on Cortex-A8,
Cortex-A9, Cortex-A15 and Qualcomm Krait:
    https://people.freedesktop.org/~siamashka/files/20160405-arm-bilinear/neon-scaling-bench-a8-a9.png
    https://people.freedesktop.org/~siamashka/files/20160405-arm-bilinear/neon-scaling-bench-a15-krait.png

Also your separable implementation is always faster for both
upscaling/downscaling on ARM Cortex-A7, while the crossover point
for ARM Cortex-A53 is currently around ~0.7x scaling factor, which
is not exactly great:
    https://people.freedesktop.org/~siamashka/files/20160405-arm-bilinear/neon-scaling-bench-a7-a53.png

> with the additional advantage that it simplifies the code
> base a lot. So you might want to consider whether it's worth bothering
> converting those to AArch64 in the first place.

I'll try to tune the single-pass bilinear scaling code for Cortex-A53
to see if it can become more competitive. There are some obvious
performance problems in it. For example, Cortex-A53 wants the ARM
and NEON instructions to be interleaved in perfect pairs, so we can't
have separate chunks of ARM and NEON code anymore:
    https://cgit.freedesktop.org/pixman/tree/pixman/pixman-arm-neon-asm.S?id=pixman-0.34.0#n3313

Now the instructions need to be mixed. Cortex-A8 had a separate
deep queue for NEON instructions, so it did not mind. As a side
effect, now the "advanced prefetch" instruction sequence in the
32-bit ARM code needs to be carefully interleaved with the NEON
instructions if we want it to run fast on Cortex-A53.
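
Purely as an illustration (the instructions and registers below are made
up, not taken from the real code), "interleaved" means something like the
second form instead of the first:

    /* chunked: a run of ARM ops followed by a run of NEON ops */
    add       r8,  r8,  r9
    add       r10, r10, r11
    vmull.u8  q8,  d0,  d28
    vmull.u8  q9,  d1,  d28

    /* interleaved: one ARM op paired with one NEON op */
    add       r8,  r8,  r9
    vmull.u8  q8,  d0,  d28
    add       r10, r10, r11
    vmull.u8  q9,  d1,  d28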

Another problem is that there is now at least a 1-cycle stall between
the vshll.u16 and vmlsl.u16 instructions here:

        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
        vmlsl.u16 q2, d20, d30
        vmlal.u16 q2, d21, d30

Cortex-A8 could run this instruction sequence in 3 cycles without
stalls. Maybe there is something else. I have some sample code here
if anyone cares to experiment:
    https://gist.github.com/ssvb/343379ceeb6d017c0023424b70fc90e2

Anyway, I'll try to make a Cortex-A53 optimized version and also a
special horizontal scaling shortcut. Let's see how fast it can
be :-) 

> I would maybe go so far as to suggest that you try converting all the iters
> first and only add fast paths if you find they do better than the iters.
> One of the drawbacks of using iters is that the prefetch code can't be as
> sophisticated - it can't easily be prefetching the start of the next row
> while it is still working on the end of the current one. But since hardware
> prefetchers are better now and conditional execution is hard in AArch64,
> this will be less of a drawback with AArch64 CPUs.
> 
> I'll also repeat what has been said, that it's very neat the way the
> existing prefetch code sneaks calculations into pipeline stalls, but it was
> only ever really ideal for Cortex-A8. With Cortex-A7 (despite the number,
> actually a much more recent 32-bit core) I noted that it was impossible to
> schedule such complex prefetch code without adding to the cycle count, at
> least when the images were already in the cache.

Yes, Cortex-A7 is a pretty awful processor. It had the NEON unit cut
in half (can process only 64 bits of data per cycle) and also its
automatic hardware prefetcher can track just a single stream, which
is not good enough for pixman. This is oddball hardware, which
differs a lot from most of the other ARM processors. I have not had
any experience with any Cortex-A5 based device, though; it is
probably roughly the same or even worse.

Regarding your patches in general: it would be great to have more
comments about the algorithm details, register layout documentation
and other such things. It would be great not to regress performance on
non-Cortex-A7 processors (we probably need runtime detection and multiple
implementations). And it would also be great to have your new iterators
framework eventually shared between ARM, x86 and other architectures
to avoid unnecessary code duplication.

Thanks for your work.
On Tue, 5 Apr 2016 08:26:38 -0400
"Lennart Sorensen" <lsorense@csclub.uwaterloo.ca> wrote:

> On Tue, Apr 05, 2016 at 08:20:54PM +0900, Mizuki Asakura wrote:
> > > This code is not just there for prefetching. It is an example of
> > > using software pipelining:  
> > 
> > OK, I understand.
> > But the code is very hard to maintain... I've run into too many register
> > conflicts.
> > # q2 and d2 were used in the same sequence. That cannot exist in aarch64-neon.
> > 
> > Anyway, I'll try to remove unnecessary register copies as you've suggested.
> > After that, I'll also try to make benchmarks comparing
> > * advanced prefetch vs none
> > * L1 / L2 / L3 (which Cortex-A53 doesn't have), keep / strm
> > to find the best configuration.
> > 
> > But that would only be a result for the Cortex-A53 (which both you and I have). Can anyone
> > test other (expensive :) aarch64 environments?
> > (Cortex-Axx, Apple Ax, NVidia Denver, etc...)
> 
> If someone can list what to run for a test I can probably run it on an A57.

Hi Lennart,

This is great, thanks. Could you please clone the following branch?

    https://cgit.freedesktop.org/~siamashka/pixman/log/?h=20160405-separable-neon-bilinear-test

And then try to compile static 32-bit pixman test programs using an
ARM crosscompiler? 

   ./autogen.sh
   ./configure --host=arm-linux-gnueabihf --enable-static-testprogs \
               --disable-libpng --disable-gtk
   make

Then run the "scaling-bench" program from the "test" directory on your
A57 device?

   PIXMAN_DISABLE="" ./scaling-bench > cortex-a57-neon-single-pass.txt
   PIXMAN_DISABLE="wholeops" ./scaling-bench > cortex-a57-neon-separable.txt

This information can be used to see whether the Cortex-A57 fits a
common pattern observed with other ARM processors:

   https://people.freedesktop.org/~siamashka/files/20160405-arm-bilinear/

I suspect that it will show results similar to Cortex-A15, but we will
never know until we try.

This can help to identify an optimal bilinear scaling strategy. And
also decide which parts of the existing 32-bit ARM assembly code are
worth converting to AArch64.
On Tue, 5 Apr 2016 20:20:54 +0900
Mizuki Asakura <ed6e117f@gmail.com> wrote:

> > This code is not just there for prefetching. It is an example of
> > using software pipelining:  
> 
> OK, I understand.
> But the code is very hard to maintain... I've run into too many register
> conflicts.

The *_tail_head variant has exactly the same code as the individual
*_head and *_tail macros, but the instructions are just reordered.
There should be no additional register clashes if you are doing the
exact 1-to-1 conversion.

If you are considering modifying the algorithm, then for now it's better
not to touch these problematic parts of the code and to keep them the way
they were in your first patch.

> # q2 and d2 were used in the same sequence. That cannot exist in aarch64-neon.

Why not? The register mapping is something like this:
   q2 -> v2.16b
   d2 -> v1.8b  (because d2 is the lower 64-bit half of the 128-bit q1 register)

Do you have a more specific example of a code fragment that needs
conversion?

> Anyway, I'll try to remove unnecessary register copies as you've suggested.
> After that, I'll also try to make benchmarks comparing
> * advanced prefetch vs none
> * L1 / L2 / L3 (which Cortex-A53 doesn't have), keep / strm
> to find the best configuration.

OK, thanks. Just don't overwork yourself. What I suggested was only
fixing a few very obvious and trivial things in the next revision of
your patch. Then I could have a look at what is remaining and maybe
have some ideas about how to fix it (or maybe not).

The benchmark against the 32-bit code is useful for prioritizing
this work (pay more attention to the things that have slowed down
the most).

But I think that your patch is already almost good enough. And it is
definitely very useful for the users of AArch64 hardware, so we
probably want to have it applied and released as soon as possible.

> But that would only be a result for the Cortex-A53 (which both you and I have). Can anyone
> test other (expensive :) aarch64 environments?
> (Cortex-Axx, Apple Ax, NVidia Denver, etc...)

I have ssh access to Cavium ThunderX and APM X-Gene. Both of these
are "server" ARM processors and taking care of graphics/multimedia is
not their primary task.

The ThunderX is a 48-core processor with very small and simple
individual cores, optimized for reducing the number of transistors.
It does not even implement the 32-bit mode at all. So we can't
compare the performance of the 32-bit and the 64-bit pixman code
on it. ThunderX has a 64-bit NEON data path. Moreover, it has a
particularly bad microcoded implementation of some NEON instructions,
for example the TBL instruction needs 320 (!) cycles:
    https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01676.html

The X-Gene is a reasonably fast out-of-order processor with a wide
instruction decoder, which can run normal code reasonably well.
However it also only has a 64-bit NEON data path.

A low-power but more multimedia-oriented Cortex-A53 with a full
128-bit NEON data path is faster than either of these when running
NEON code.

I can try to run the lowlevel-blt-bench on X-Gene and provide the
32-bit and 64-bit logs. However I'm not the only user of that
machine and running the benchmark undisturbed may be problematic.

Either way, we are very likely just going to see that reducing the
number of redundant instructions has a positive impact on performance.
In a pretty much similar way as on Cortex-A53.
On Tue, 5 Apr 2016 19:28:19 +0900
Mizuki Asakura <ed6e117f@gmail.com> wrote:

> > It looks like you have used an automated process to convert the AArch32
> > NEON code to AArch64. Will you be able to repeat that process for other
> > code, or at least assist others to repeat your steps?  
> 
> Sorry, but as I've written before, all of the patch was converted by hand.
> The "converter script" didn't work correctly.
> # But the script was very helpful for me to understand the difference
> # between aarch32 and aarch64 :)
> 
> 
> > The reason I ask is that I have a large number of outstanding patches to
> > the ARM NEON support.  
> 
> Hmm...
> How should we proceed with the implementation?

Unless there are some objections, I would prefer to see your patch
just cleaned up a bit and pushed to the pixman repository. And then
we could have the next pixman version tagged soon.

> I've seen a comment that the current pixman-arm-neon-asm*.S (which I based my
> work on) was optimized for the older Cortex-A8, while your new patches seem to
> work well on the latest Cortex chips.

Different ARM processors behave in a slightly different way, but that's
not the main point.

As far as the assembly code is concerned, Ben's patches are introducing
a new way of doing bilinear scaling: a separable algorithm,
previously implemented by Søren for x86 using SSSE3 instructions:

    https://lists.freedesktop.org/archives/pixman/2013-September/002900.html

There are both advantages and drawbacks to this scaling method, but it
is clearly useful if used wisely.

> If so, we should first apply your latest patches to master, and then
> someone (or I?) could do the conversion to aarch64 again. That would be good
> for both the aarch32 and aarch64 worlds.

There is no point suddenly turning the ARM assembly code into a moving
target right now. Ben's code is not the first and hopefully not the
last change to the 32-bit ARM assembly source files in pixman history.

We will have to find a way to keep this stuff maintainable. That's why
I still think that a fully automated conversion and code sharing
between AArch64 and AArch32 is possible. But it can be introduced at a
bit later date. We should prioritize getting the AArch64 optimized
pixman release to the users as soon as possible.

> # FYI: I spent 1 week converting all of the code,
> # and 2 weeks getting it to pass all the tests.

Thanks, that was good work. I guess we now only need a few more
days, maybe a week at most, to clean this patch up a bit. For example,
I spent roughly one evening doing most of these cleanups:

   https://cgit.freedesktop.org/~siamashka/pixman/commit/?h=20160401-arm64-review&id=2f8c71416232bb714bb2420440333496b36fbfae
   https://cgit.freedesktop.org/~siamashka/pixman/commit/?h=20160401-arm64-review&id=76ad1ba645489e6f987da72a7e7f9fa3ef72141c

And while 2f8c71416232bb714bb2420440333496b36fbfae may look like
some major code changes, in fact I was changing it to reduce the
differences between the 32-bit and 64-bit code, looking at the
sources side by side.

I don't think that splitting the patches is necessary (other than
handling pixman-arm-neon-asm-bilinear.S separately), because this
is not a usual patch review process (looking at the incremental
code changes), but more like a validation of the code conversion
quality. Either way, I'll be looking at the 32-bit and the 64-bit
code side by side when reviewing it. And splitting the patches may
make this more difficult.

And as we seem to be running in circles in this discussion (mostly
the same questions are getting repeated multiple times), I have also
created the following wiki page in the hope of keeping the relevant
information more structured:

    https://pixman.miraheze.org/wiki/AArch64_Support

I'll try to add more information about the fully automated conversion
to the wiki later.
> Do you have a more specific example of a code fragment that needs
> conversion?

In original pixman-arm-neon-asm.S:

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
...
vsli.u16    q2,  q2, #5
...
vraddhn.u16 d2,  q6,  q10
...
vshrn.u16   d30, q2, #2


If all registers just converted to Vn, it would be as follows:

.macro pixman_composite_over_8888_8_0565_process_pixblock_head
...
sli    v2.8h,  v2.8h, #5
...
raddhn v2.8b,  v6.8h,  v10.8h
...
shrn   v30.8b, v2.8h, #2


The second instruction (raddhn) corrupts v2, so the following shrn v30.8b, v2.8h, #2
would not be correct.

There are many other conflicts like this that I've run into.
I didn't find anything in ARM's documentation specifying that
Dn can be the lower part of V(n/2).


On Thu, 7 Apr 2016 19:45:03 +0900
Mizuki Asakura <ed6e117f@gmail.com> wrote:

> > Do you have a more specific example of a code fragment that needs
> > conversion?  
> 
> In original pixman-arm-neon-asm.S:
> 
> .macro pixman_composite_over_8888_8_0565_process_pixblock_head
> ...
> vsli.u16    q2,  q2, #5
> ...
> vraddhn.u16 d2,  q6,  q10

This 'd2' register is in fact one of the 64-bit halves of the
128-bit 'q1' register and does not clash with 'q2'.

> ...
> vshrn.u16   d30, q2, #2
> 
> 
> If all registers just converted to Vn, it would be as follows:
> 
> .macro pixman_composite_over_8888_8_0565_process_pixblock_head
> ...
> sli    v2.8h,  v2.8h, #5
> ...
> raddhn v2.8b,  v6.8h,  v10.8h

Hence here we need to convert it to 'v1.8b'.

And if, for example, we had to convert the "vraddhn.u16 d3, q6, q10"
instruction ('d3' instead of 'd2'), then the conversion result
would change to "raddhn2 v1.16b, v6.8h, v10.8h".


> ...
> shrn   v30.8b, v2.8h, #2

For the sake of consistency, here we need 'v15.8b' instead of 'v30.8b'
too.

> 
> 
> The second instruction (raddhn) corrupts v2, so the following shrn v30.8b, v2.8h, #2
> would not be correct.
> 
> There are many other conflicts like this that I've run into.
> I didn't find anything in ARM's documentation specifying that
> Dn can be the lower part of V(n/2).

I guess the whole source of confusion is that the AArch64 syntax
has 'Dn' registers too, but they are all mapped to the lower halves
of the 'Vn' registers with the same number, which is different
from the AArch32 Dn register naming convention.
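
Putting the two conventions side by side, using the instruction from your
example (this is just a restatement of the above):

    /* AArch32: d2 and d3 are the two halves of q1 */
    vraddhn.u16  d2, q6, q10              /* writes the low half of q1 */

    /* AArch64: "d2" would be the low 64 bits of v2, so the correct */
    /* destination register here is v1                              */
    raddhn       v1.8b, v6.8h, v10.8h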

But in order to see through the deception, we really need to pay
attention to what exactly the instruction *does* instead of how it
*looks* in the AArch64 assembler syntax. Just because:

    https://en.wikipedia.org/wiki/A_rose_by_any_other_name_would_smell_as_sweet

And as I mentioned earlier, I hope to roll out a full fledged automatic
converter soon.
Hi Siarhei,

> It is always possible to find time for fixing bugs and reviewing the
> code, which is doing something obviously useful.

I think we have been unlucky because it seems to me that our schedules
haven't aligned very well. I have put time aside on a few occasions over
the last couple of years to make a concentrated effort to get my ARM
patches accepted, but there hasn't been anyone around with the ability to
review them at those times. I can see you're working on it at present, but
for personal reasons I don't have much time right now for detailed
discussion, benchmarking and reworking of my patches!

I still read the list though, so I was naturally very interested to see
AArch64 being discussed. I hate to see effort going to waste,
whether it's mine or anyone else's, and it looked like Mizuki was unaware
of my work, so I thought I should mention it.

> I did give this branch a try. And I'm not sure if you genuinely did
> not notice this fact, but with your runs of "lowlevel-blt-bench -b"
> tests, you are actually benchmarking a special code path for
> horizontal-only scaling against the code that is doing scaling in both
> directions. That's an obvious flaw in the benchmark itself, which gives
> you misleading results!

In the past I had criticism for posting patch series that were too long to
be easily reviewed, so I have been dribbling them in piece by piece. I see
you also identified the limitations of lowlevel-blt-bench when it comes to
scaled plots - that's why I created affine-bench. That program *has* been
accepted into git, although my separable bilinear fetchers haven't yet,
even though they were the reason why I wrote it! Perhaps you may not have
noticed it? That specifically allows different scaling factors to be
tested. For example, for my ARMv6 separable bilinear filter code, I
recorded benchmarks in my commit log as follows:

   Improvement:
       x increment   0.5     0.75    1.0     1.5     2.0
   y increment
       0.5           +196.7% +183.6% +181.8% +206.6% +198.4%
       0.75          +182.2% +166.2% +164.0% +194.8% +185.8%
       1.0           +271.7% +234.4%         +282.7% +257.9%
       1.5           +154.6% +135.3% +134.3% +173.3% +164.8%
       2.0           +144.1% +124.2% +123.3% +165.6% +155.5%

It looks like I didn't record them in the ARMv7 separable bilinear filter
commit, but some of my private notes indicate that before I wrote the ARMv7
separable bilinear filter, I was measuring my ARMv6 separable bilinear
filter as around 20% faster on Cortex-A7 than the old ARMv7 bilinear fast
path for small y increments, even though it didn't have the advantage of
being able to use NEON instructions. That's how I decided it was worth the
effort of writing the ARMv7 separable filter in the first place.

The reason why I didn't include them is because profiling takes a lot of
time, and because I was intending to dribble the commits to the mailing
list in small batches, I decided to defer the detailed profiling until I
was ready to post those patches, in case any other patches accepted in the
meantime affected the numbers.

It is interesting to notice that the ARMv6 improvements in my table above
agree with your assessment that y factors of 1 are not particularly
representative. It also shows that the separable filters show the most
improvement when the y increment is small. This makes sense because the
buffer containing first-pass scaled data gets more re-use when the y factor
is small. I don't think it's unreasonable to pay close attention to the
performance for small y increments, because that's exactly the case where
you would expect to use bilinear filtering if you care about image quality.
For increments above 1, image quality is best served by a multi-tap filter
instead.

> Your implementations are providing better performance on ARM Cortex-A7
> for the special case of horizontal-only scaling. But when we are doing
> general purpose scaling (for example the same scale factor for the x
> and y axes), the performance is more or less comparable between the
> existing bilinear fast paths and your new implementation.
>
> And if we take a different processor, for example ARM Cortex-A9, then
> everything changes. Now the general purpose scaling is much slower
> with your code. And even the horizontal-only special case is somewhat
> slower than the old fast paths which run general purpose bilinear
> scaling.

OK, that's a bit disappointing, though it should be offset to some extent
by the reduction in calculations required when using scaling factors less
than 1, and of course the fact that the fetchers are applicable to a much
wider range of operations than the fast paths are.

It's possible I've chosen some code sequences that are particularly painful
for Cortex-A9. Since ARM have stopped publishing details of cycle counts,
interlocks etc, it has become quite difficult to hand-schedule code; IIRC
at least the NEON pipeline in the A9 is in-order, like the A8, A5, A7
and A53. Whilst these things can be figured out by experiment, it's very
time consuming and requires access to lots of hardware.

There is the question of whether we would want Pixman to have separate
routines optimized for each core, or just try to choose one routine which
is best all-round. But then who decides which cores are most important to
test with?
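
Just to make that question concrete, per-core dispatch would presumably
look something along these lines - entirely hypothetical, nothing like this
exists in pixman today, and the /proc/cpuinfo "CPU part" values would need
checking against real hardware:

#include <stdio.h>
#include <string.h>

typedef void (*bilinear_scanline_func)(void);

/* Stand-ins for per-core tuned routines; only the selection logic matters. */
static void bilinear_scanline_generic(void) { }   /* best all-round schedule */
static void bilinear_scanline_a9(void)      { }   /* Cortex-A9 tuned         */
static void bilinear_scanline_a53(void)     { }   /* Cortex-A53 tuned        */

static bilinear_scanline_func select_bilinear_scanline(void)
{
    char line[256], part[32] = "";
    FILE *f = fopen("/proc/cpuinfo", "r");

    if (f) {
        /* Take the first "CPU part" line; big.LITTLE systems report several. */
        while (fgets(line, sizeof line, f)) {
            if (sscanf(line, "CPU part : %31s", part) == 1)
                break;
        }
        fclose(f);
    }

    if (strcmp(part, "0xc09") == 0)    /* Cortex-A9, if I have the ID right  */
        return bilinear_scanline_a9;
    if (strcmp(part, "0xd03") == 0)    /* Cortex-A53, ditto                  */
        return bilinear_scanline_a53;

    return bilinear_scanline_generic;
}

int main(void)
{
    bilinear_scanline_func f = select_bilinear_scanline();
    f();
    return 0;
}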

> However if we try to do a really fair comparison, then we need to get
> rid of the redundant memcpy for the bilinear src_8888_8888 operation in
> the iterators based implementation. I had some old patches for this:
>     https://lists.freedesktop.org/archives/pixman/2013-September/002879.html

Interesting. That seems a reasonable approach, and levels the playing field
a bit between iterators and fast paths. My pass-2 (vertical interpolation)
code currently assumes that the cacheline alignment is the same between its
input and output buffers, but I can imagine that losing the memcpy that
follows it for SRC operations could make it a net win. And in theory I
could add versions of the pass-2 code to support differing alignments to
improve the situation further.
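
To spell out what losing that memcpy amounts to, in terms of the toy scaler
sketched earlier (again, these names are mine, not the ones used in the
patches at that link), the pass-2 loop would be factored out so it can
write to either a scratch scanline or the destination itself:

#include <stdint.h>

/* Pass 2 (vertical blend), factored out so that its output pointer can be
 * chosen by the caller. */
void vscale_row(const uint8_t *row0, const uint8_t *row1, uint32_t yf,
                uint8_t *out, int dst_w)
{
    for (int i = 0; i < dst_w; i++)
        out[i] = (row0[i] * (0x10000 - yf) + row1[i] * yf) >> 16;
}

/* Hypothetical scanline emitter.  For a general operator the blended
 * scanline has to go through a scratch buffer so that a combiner can merge
 * it with what is already in the destination; for SRC that combine step
 * degenerates into a straight copy, so pass 2 might as well write directly
 * into the destination row and the memcpy disappears. */
void emit_scanline(int is_src_op,
                   const uint8_t *row0, const uint8_t *row1, uint32_t yf,
                   uint8_t *dst_line, uint8_t *scratch, int dst_w,
                   void (*combine)(uint8_t *dst, const uint8_t *src, int w))
{
    if (is_src_op) {
        vscale_row(row0, row1, yf, dst_line, dst_w);    /* no copy needed */
    } else {
        vscale_row(row0, row1, yf, scratch, dst_w);
        combine(dst_line, scratch, dst_w);              /* e.g. OVER */
    }
}

The catch is the one noted above: once pass 2 writes straight into the
destination, the relative cacheline alignment of its input and output
buffers is no longer under the scaler's control, which is where the
differing-alignment pass-2 variants would come in.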

> I'll try to tune the single-pass bilinear scaling code for Cortex-A53
> to see if it can become more competitive. There are some obvious
> performance problems in it. For example, Cortex-A53 wants the ARM
> and NEON instructions to be interleaved in perfect pairs, so we can't
> have separate chunks of ARM and NEON code anymore

Also interesting. I wasn't aware of that about the A53, although I admit I
haven't gone looking for information about its microarchitecture yet. Are
you aware of anywhere that information about the quirks of this and other
cores is documented?

> But you don't need to worry about this right now. Let's focus at one
> task at a time. So it's probably best to use the current pixman code
> for the initial AArch64 conversion round. And at the same time we can
> try to look at integrating your patches as 32-bit code first.

Thanks for all your detailed analysis, Siarhei. It's a very pleasant change
after months of my patches being ignored or getting bogged down in
discussing minutiae. If you've actually got time to look at some of my code
at present, I could repost a few of the patches to get things started,
perhaps focusing on the NEON ones as they're the ones pertinent to the
AArch64 conversion. I can't promise to make major reworks to anything in
the short term, but maybe at least some of them might be straightforward to
get accepted. I'd love to be able to merge my branch fully eventually; at
the moment Raspberry Pi is using my branch, but it'll cause a big headache
if/when they move to AArch64.

Ben

On Tue, Apr 05, 2016 at 05:12:13PM +0300, Siarhei Siamashka wrote:
> Hi Lennart,
> 
> This is great, thanks. Could you please clone the following branch?
> 
>     https://cgit.freedesktop.org/~siamashka/pixman/log/?h=20160405-separable-neon-bilinear-test
> 
> And then try to compile static 32-bit pixman test programs using an
> ARM crosscompiler? 

Hmm, OK, I did a native compile, which is probably 64-bit. I will do a
32-bit build too.

>    ./autogen.sh
>    ./configure --host=arm-linux-gnueabihf --enable-static-testprogs \
>                --disable-libpng --disable-gtk
>    make
> 
> Then run the "scaling-bench" program from the "test" directory on your
> A57 device?
> 
>    PIXMAN_DISABLE="" ./scaling-bench > cortex-a57-neon-single-pass.txt
>    PIXMAN_DISABLE="wholeops" ./scaling-bench > cortex-a57-neon-separable.txt
> 
> This information can be used to see whether the Cortex-A57 fits a
> common pattern observed with other ARM processors:
> 
>    https://people.freedesktop.org/~siamashka/files/20160405-arm-bilinear/
> 
> I suspect that it will show results similar to Cortex-A15, but we will
> never know until we try.
> 
> This can help to identify an optimal bilinear scaling strategy. And
> also decide which parts of the existing 32-bit ARM assembly code are
> worth converting to AArch64.

I have attached results from a 32-bit chroot build (which says it used NEON
and SIMD) and a 64-bit build (which did not use NEON or SIMD). Both are on
Debian Jessie.

As far as I can read the logs, it seems the 64-bit C code is faster than
the 32-bit NEON code on this CPU, but I might not be understanding the
output correctly. It certainly felt like the test ran faster on the 64-bit
build.