[24/37,v2] armv6: Add optimised scanline fetcher for a1r5g5b5

Submitted by Ben Avison on April 22, 2015, 11:54 p.m.

Details

Message ID 1429746841-25607-3-git-send-email-bavison@riscosopen.org
State New
Headers show

Commit Message

Ben Avison April 22, 2015, 11:54 p.m.
This supports a1r5g5b5 source images.

lowlevel-blt-bench results for src_1555_8888, which does not yet have a
dedicated fast path:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  24.5   0.2      57.0   1.1     100.0%      +132.2%
L2  19.3   0.4      41.4   1.0     100.0%      +114.3%
M   20.4   0.0      49.8   0.1     100.0%      +144.7%
HT  12.8   0.1      21.4   0.3     100.0%      +67.0%
VT  12.7   0.1      21.0   0.3     100.0%      +65.4%
R   12.1   0.1      19.7   0.2     100.0%      +63.1%
RT  5.6    0.1      7.0    0.2     100.0%      +24.8%
---
 pixman/pixman-arm-simd-asm.S |   70 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |   12 +++++++
 2 files changed, 82 insertions(+), 0 deletions(-)

Patch hide | download patch | download mbox

diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index b251187..6674a9d 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -2744,3 +2744,73 @@  generate_composite_function_single_scanline \
     inout_reverse_process_tail
 
 /******************************************************************************/
+
+.macro src_1555_8888_init
+        /* Hold loop invariant in MASK: 0x83E0 in each halfword keeps the alpha bit (15) and green field (9:5) of an a1r5g5b5 pixel */
+        ldr     MASK, =0x83E083E0
+        /* Set GE[3:0] to 0101 so SEL takes bytes 0 and 2 from its first source operand and bytes 1 and 3 from its second */
+        msr     CPSR_s, #0x50000
+.endm
+
+.macro src_1555_8888_2pixels  reg1, reg2, tmp1, tmp2, mask  /* Expand the two a1r5g5b5 pixels packed in WK&reg1 to a8r8g8b8: low-halfword pixel ends up in WK&reg1, high-halfword pixel in WK&reg2. Clobbers tmp1 and tmp2; relies on GE[3:0] = 0101 for SEL. In the diagrams below '-' marks a don't-care bit. */
+        bic     WK&reg2, WK&reg1, mask              @ 0RRRRR00000BBBBB0rrrrr00000bbbbb
+        and     tmp1, WK&reg1, mask                 @ A00000GGGGG00000a00000ggggg00000
+        mov     tmp2, WK&reg2, lsr #16              @ 00000000000000000RRRRR00000BBBBB
+        orr     tmp1, tmp1, tmp1, lsr #5            @ A0000-GGGGGGGGGGa0000-gggggggggg
+        uxth    WK&reg2, WK&reg2                    @ 00000000000000000rrrrr00000bbbbb
+        mov     WK&reg1, tmp1, lsl #16              @ a0000-gggggggggg0000000000000000
+        orr     tmp2, tmp2, tmp2, lsl #5            @ 000000000000RRRRRRRRRRBBBBBBBBBB
+        orr     WK&reg2, WK&reg2, WK&reg2, lsl #5   @ 000000000000rrrrrrrrrrbbbbbbbbbb
+        mov     tmp1, tmp1, asr #10                 @ AAAAAAAAAAA0000-GGGGGGGGGG------
+        pkhbt   tmp2, tmp2, tmp2, lsl #4            @ 00000000RRRRRRRR------BBBBBBBBBB
+        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #4   @ 00000000rrrrrrrr------bbbbbbbbbb
+        mov     WK&reg1, WK&reg1, asr #10           @ aaaaaaaaaaa0000-gggggggggg000000
+        pkhtb   tmp2, tmp2, tmp2, asr #2            @ 00000000RRRRRRRR--------BBBBBBBB
+        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #2   @ 00000000rrrrrrrr--------bbbbbbbb
+        sel     WK&reg1, WK&reg2, WK&reg1           @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
+        sel     WK&reg2, tmp2, tmp1                 @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+.macro src_1555_8888_1pixel  reg, tmp, mask  /* Expand the single a1r5g5b5 pixel in the low halfword of WK&reg to a8r8g8b8 in place. Clobbers tmp; relies on GE[3:0] = 0101 for SEL. The diagrams assume the upper halfword of WK&reg is zero on entry; '-' marks a don't-care bit. */
+        bic     tmp, WK&reg, mask                   @ 00000000000000000rrrrr00000bbbbb
+        and     WK&reg, mask, WK&reg, lsl #16       @ a00000ggggg000000000000000000000
+        orr     tmp, tmp, tmp, lsl #5               @ 000000000000rrrrrrrrrrbbbbbbbbbb
+        orr     WK&reg, WK&reg, WK&reg, lsr #5      @ a0000-gggggggggg0000000000000000
+        pkhbt   tmp, tmp, tmp, lsl #4               @ 00000000rrrrrrrr------bbbbbbbbbb
+        mov     WK&reg, WK&reg, asr #10             @ aaaaaaaaaaa0000-gggggggggg000000
+        pkhtb   tmp, tmp, tmp, asr #2               @ 00000000rrrrrrrr--------bbbbbbbb
+        sel     WK&reg, tmp, WK&reg                 @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_1555_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload  /* Load stage only: fetches source pixels from SRC. cond, unaligned_mask and preload are not referenced in the body. */
+ .if numbytes == 16  /* requests 8 bytes into registers firstreg and firstreg+2, leaving firstreg+1 and firstreg+3 free for the tail's expansion */
+        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8  /* requests 4 bytes (two 16-bit pixels) into firstreg */
+        pixld   , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4  /* requests 2 bytes (one 16-bit pixel) into firstreg; any other numbytes loads nothing */
+        pixld   , 2, firstreg, SRC, unaligned_src
+ .endif  /* each branch requests numbytes/2 bytes - presumably numbytes counts 32bpp output bytes; confirm against the generator macro */
+.endm
+
+.macro src_1555_8888_process_tail   cond, numbytes, firstreg  /* Convert stage: expands the pixels loaded by the head in place, using STRIDE_M and SCRATCH as temporaries. cond is not referenced in the body. */
+ .if numbytes == 16  /* two packed pairs: firstreg -> (firstreg, firstreg+1) and firstreg+2 -> (firstreg+2, firstreg+3) */
+        src_1555_8888_2pixels firstreg, %(firstreg+1), STRIDE_M, SCRATCH, MASK
+        src_1555_8888_2pixels %(firstreg+2), %(firstreg+3), STRIDE_M, SCRATCH, MASK
+ .elseif numbytes == 8  /* one packed pair: firstreg -> (firstreg, firstreg+1) */
+        src_1555_8888_2pixels firstreg, %(firstreg+1), STRIDE_M, SCRATCH, MASK
+ .else  /* every other size is treated as a single pixel in firstreg (the head only loads for numbytes == 4) */
+        src_1555_8888_1pixel firstreg, SCRATCH, MASK
+ .endif
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_get_scanline_a1r5g5b5_asm_armv6, 16, 0, 32, /* presumably src, mask, dst bits per pixel - confirm against the generator macro */ \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+    3, /* prefetch distance */ \
+    src_1555_8888_init, /* init: loads MASK and sets GE[3:0] */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_1555_8888_process_head, /* head: loads the 16-bit source pixels */ \
+    src_1555_8888_process_tail /* tail: expands them to a8r8g8b8 in the WK registers */
+
+/******************************************************************************/
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 2e5c229..82ac8ed 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -111,6 +111,7 @@  PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
 
 PIXMAN_ARM_BIND_GET_SCANLINE (armv6, r5g6b5)
 PIXMAN_ARM_BIND_WRITE_BACK   (armv6, r5g6b5)
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a1r5g5b5)
 PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a8)
 
 static uint32_t *
@@ -368,6 +369,17 @@  static const pixman_iter_info_t arm_simd_iters[] =
       armv6_write_back_r5g6b5
     },
 
+    { PIXMAN_a1r5g5b5,
+      (FAST_PATH_STANDARD_FLAGS             |
+       FAST_PATH_ID_TRANSFORM               |
+       FAST_PATH_SAMPLES_COVER_CLIP_NEAREST |
+       FAST_PATH_BITS_IMAGE),
+      ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride,
+      armv6_get_scanline_a1r5g5b5,
+      NULL
+    },
+
     { PIXMAN_a8,
       (FAST_PATH_STANDARD_FLAGS             |
        FAST_PATH_ID_TRANSFORM               |