[3/5] armv7: Add in_n_8888 fast path

Submitted by Ben Avison on April 21, 2015, 12:16 a.m.

Details

Message ID 1429575381-9879-3-git-send-email-bavison@riscosopen.org
State New
Headers show

Not browsing as part of any series.

Commit Message

Ben Avison April 21, 2015, 12:16 a.m.
This is tuned for Cortex-A7 (Raspberry Pi 2).
lowlevel-blt-bench results, compared to the ARMv6 fast path:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  104.6  0.5      119.4  0.1     100.0%      +14.1%
L2  106.8  0.6      121.4  0.1     100.0%      +13.6%
M   100.3  1.3      116.4  0.0     100.0%      +16.0%
HT  64.5   1.0      70.8   0.1     100.0%      +9.8%
VT  56.0   0.8      62.2   0.1     100.0%      +11.1%
R   54.1   0.9      55.2   0.0     100.0%      +1.9%
RT  24.6   0.5      26.6   0.0     100.0%      +8.3%
---
 pixman/pixman-arm-neon-asm.S |   64 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-neon.c     |    4 ++
 2 files changed, 68 insertions(+), 0 deletions(-)

Patch hide | download patch | download mbox

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 2fecb5b..8554e0c 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2792,6 +2792,70 @@  generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_in_n_8888_init /* one-time setup: fill d0-d3 from four consecutive stacked bytes */
+    add         DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET; presumably the address of the solid source colour - TODO confirm against the template */
+    vld1.8      {d0[]}, [DUMMY]! /* d0 = byte 0 copied into every lane; DUMMY advances by one byte */
+    vld1.8      {d1[]}, [DUMMY]! /* d1 = byte 1 copied into every lane */
+    vld1.8      {d2[]}, [DUMMY]! /* d2 = byte 2 copied into every lane */
+    vld1.8      {d3[]}, [DUMMY]! /* d3 = byte 3 copied into every lane */
+.endm
+
+.macro pixman_composite_in_n_8888_process_pixblock_head /* scale each source byte by the destination alpha, 8 pixels at a time */
+    /* src is in d0-d3 (deinterleaved) */
+    /* destination pixel data is in d4-d7 (deinterleaved, though we only need alpha from d7) */
+    vmull.u8    q11, d3, d7 /* q11 = widened 16-bit products t = d3 * d7, one per lane */
+    vmull.u8    q10, d2, d7 /* q10 = d2 * d7 */
+    vmull.u8    q9, d1, d7 /* q9 = d1 * d7 */
+    vmull.u8    q8, d0, d7 /* q8 = d0 * d7 */
+    vrshr.u16   q15, q11, #8 /* q15 = (t + 128) >> 8: first half of the rounded divide by 255 */
+    vrshr.u16   q14, q10, #8 /* same step for the d2 products */
+    vrshr.u16   q13, q9, #8 /* same step for the d1 products */
+    vrshr.u16   q12, q8, #8 /* same step for the d0 products */
+    vraddhn.u16 d31, q11, q15 /* d31 = (t + q15 + 128) >> 8 narrowed to 8 bits, i.e. t / 255 rounded */
+    vraddhn.u16 d30, q10, q14 /* reads q14 (d28:d29) before d28 and d29 are overwritten below */
+    vraddhn.u16 d28, q8, q12 /* d28 = scaled byte 0 */
+    vraddhn.u16 d29, q9, q13 /* d29 = scaled byte 1 */
+.endm
+
+.macro pixman_composite_in_n_8888_process_pixblock_tail /* intentionally empty */
+    /* nothing left to do: the head macro already leaves the finished, still deinterleaved result in d28-d31 */
+.endm
+
+.macro pixman_composite_in_n_8888_process_pixblock_tail_head /* store the previous block while computing the next; the 8-space-indented lines repeat the head sequence */
+        vld4.8      {d4-d7}, [DST_R :128]! /* load the next 8 destination pixels split by byte position into d4-d7; DST_R advances */
+            cache_preload 8, 8 /* prefetch helper defined outside this hunk; argument meaning not visible here */
+    vzip.8      d28, d30 /* begin re-interleaving the previous result: pair bytes 0 and 2 of each pixel */
+        vmull.u8    q11, d3, d7 /* next block: d3 * newly loaded alpha */
+    vzip.8      d29, d31 /* pair bytes 1 and 3 of each pixel */
+        vmull.u8    q10, d2, d7 /* next block: d2 * alpha */
+        vmull.u8    q9, d1, d7 /* next block: d1 * alpha */
+        vmull.u8    q8, d0, d7 /* next block: d0 * alpha */
+    vzip.8      d28, d29 /* d28, d29 = pixels 0-3 as whole 4-byte pixels */
+    vzip.8      d30, d31 /* d30, d31 = pixels 4-7 as whole 4-byte pixels */
+    vst1.8      {d28-d31}, [DST_W :128]! /* write the 8 finished pixels (32 bytes); DST_W advances */
+        vrshr.u16   q15, q11, #8 /* d28-d31 have been stored, so q14/q15 can be reused as rounding temporaries */
+        vrshr.u16   q14, q10, #8 /* (t + 128) >> 8 for the d2 products */
+        vrshr.u16   q13, q9, #8 /* (t + 128) >> 8 for the d1 products */
+        vrshr.u16   q12, q8, #8 /* (t + 128) >> 8 for the d0 products */
+        vraddhn.u16 d31, q11, q15 /* d31 = t / 255 rounded, as in the head macro */
+        vraddhn.u16 d30, q10, q14 /* reads q14 (d28:d29) before d28 and d29 are overwritten below */
+        vraddhn.u16 d28, q8, q12 /* d28 = scaled byte 0 of the next block */
+        vraddhn.u16 d29, q9, q13 /* d29 = scaled byte 1 of the next block */
+.endm
+
+generate_composite_function /* instantiate the template (defined outside this hunk) with the macros above */ \
+    pixman_composite_in_n_8888_asm_neon, 0, 0, 32, /* presumably function name, then src, mask and dst bits per pixel - TODO confirm against the template */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    6, /* prefetch distance */ \
+    pixman_composite_in_n_8888_init, /* setup macro: loads d0-d3 */ \
+    default_cleanup, /* cleanup macro defined outside this hunk */ \
+    pixman_composite_in_n_8888_process_pixblock_head, /* computes one block into d28-d31 */ \
+    pixman_composite_in_n_8888_process_pixblock_tail, /* empty: head already finishes the block */ \
+    pixman_composite_in_n_8888_process_pixblock_tail_head /* stores the previous block and computes the next */
+
+/******************************************************************************/
+
 generate_composite_function_nearest_scanline \
     pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 52ee9a4..ab8a58c 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -77,6 +77,8 @@  PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
                                  uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
                                  uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8888, /* presumably declares the C wrapper neon_composite_in_n_8888 used in the fast path table - macro is defined outside this hunk */
+                                 uint32_t, 1) /* destination element type and, presumably, its stride multiplier - TODO confirm */
 
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
                                       uint8_t, 1, uint16_t, 1)
@@ -376,6 +378,8 @@  static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
     PIXMAN_STD_FAST_PATH (IN,   a8r8g8b8, null,     a8,       neon_composite_in_8888_8),
     PIXMAN_STD_FAST_PATH (IN,   a8b8g8r8, null,     a8,       neon_composite_in_8888_8),
+    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8r8g8b8, neon_composite_in_n_8888),
+    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8b8g8r8, neon_composite_in_n_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
     PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),