[23/37,v2] armv6: Add optimised scanline fetchers and writeback for r5g6b5 and a8

Submitted by Ben Avison on April 22, 2015, 11:54 p.m.

Details

Message ID 1429746841-25607-2-git-send-email-bavison@riscosopen.org
State New
Headers show

Not browsing as part of any series.

Commit Message

Ben Avison April 22, 2015, 11:54 p.m.
This supports r5g6b5 source and destination images, and a8 source images.

lowlevel-blt-bench results for example operations which use these because
they lack a dedicated fast path at the time of writing:

in_reverse_8_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  30.0   0.3      37.0   0.3     100.0%      +23.2%
L2  23.3   0.3      29.4   0.4     100.0%      +26.1%
M   24.0   0.0      31.3   0.1     100.0%      +30.5%
HT  12.8   0.1      16.1   0.1     100.0%      +25.8%
VT  11.9   0.1      14.8   0.1     100.0%      +24.6%
R   11.7   0.1      14.6   0.1     100.0%      +24.5%
RT  5.1    0.1      6.2    0.1     100.0%      +20.2%

in_0565_8888

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  22.0   0.1      28.3   0.2     100.0%      +28.4%
L2  16.6   0.2      23.6   0.3     100.0%      +42.2%
M   16.5   0.0      24.7   0.1     100.0%      +49.5%
HT  11.0   0.1      13.7   0.1     100.0%      +24.4%
VT  10.7   0.0      13.1   0.1     100.0%      +22.0%
R   10.3   0.0      12.6   0.1     100.0%      +22.5%
RT  5.3    0.1      5.7    0.1     100.0%      +9.0%

in_reverse_8888_0565

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  16.6   0.1      20.9   0.1     100.0%      +25.5%
L2  13.1   0.1      17.7   0.3     100.0%      +35.3%
M   13.2   0.0      19.2   0.0     100.0%      +45.3%
HT  9.6    0.0      11.7   0.1     100.0%      +21.8%
VT  9.3    0.0      11.4   0.1     100.0%      +22.4%
R   9.0    0.0      10.9   0.1     100.0%      +21.1%
RT  4.7    0.1      5.2    0.1     100.0%      +8.7%
---
 pixman/pixman-arm-common.h   |   31 ++++++++++++++
 pixman/pixman-arm-simd-asm.S |   94 ++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-arm-simd.c     |   55 ++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 0 deletions(-)

Patch hide | download patch | download mbox

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 3558c15..f4632b2 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -453,4 +453,35 @@  cputype##_combine_##name##_u (pixman_implementation_t *imp,                   \
         pixman_composite_scanline_##name##_asm_##cputype (width, dest, src);  \
 }
 
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_GET_SCANLINE(cputype, name)                         \
+void                                                                        \
+pixman_get_scanline_##name##_asm_##cputype (int32_t        w,               \
+                                            uint32_t       *dst,            \
+                                            const uint32_t *src);           \
+                                                                            \
+uint32_t *                                                                  \
+cputype##_get_scanline_##name (pixman_iter_t *iter, const uint32_t *mask)   \
+{                                                                           \
+    pixman_get_scanline_##name##_asm_##cputype (iter->width, iter->buffer,  \
+                                                (uint32_t *) iter->bits);   \
+    iter->bits += iter->stride;                                             \
+    return iter->buffer;                                                    \
+}
+
+#define PIXMAN_ARM_BIND_WRITE_BACK(cputype, name)                                      \
+void                                                                                   \
+pixman_write_back_##name##_asm_##cputype (int32_t        w,                            \
+                                          uint32_t       *dst,                         \
+                                          const uint32_t *src);                        \
+                                                                                       \
+void                                                                                   \
+cputype##_write_back_##name (pixman_iter_t *iter)                                      \
+{                                                                                      \
+    pixman_write_back_##name##_asm_##cputype (iter->width,                             \
+                                              (uint32_t *)(iter->bits - iter->stride), \
+                                              iter->buffer);                           \
+}
+
 #endif
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index f61b715..b251187 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -388,6 +388,16 @@  generate_composite_function \
     src_0565_8888_process_head, \
     src_0565_8888_process_tail
 
+generate_composite_function_single_scanline \
+    pixman_get_scanline_r5g6b5_asm_armv6, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+    3, /* prefetch distance */ \
+    src_0565_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_0565_8888_process_head, \
+    src_0565_8888_process_tail
+
 /******************************************************************************/
 
 .macro src_x888_0565_init
@@ -465,6 +475,90 @@  generate_composite_function \
     src_x888_0565_process_head, \
     src_x888_0565_process_tail
 
+generate_composite_function_single_scanline \
+    pixman_write_back_r5g6b5_asm_armv6, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, \
+    3, /* prefetch distance */ \
+    src_x888_0565_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_x888_0565_process_head, \
+    src_x888_0565_process_tail
+
+/******************************************************************************/
+
+.macro src_8_8888_init
+        mov     MASK, #0xff000000
+.endm
+
+.macro src_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes/4, firstreg, SRC, unaligned_src
+.endm
+
+.macro src_8_8888_1pixel  cond, d0
+        mov&cond WK&d0, WK&d0, lsl #24
+.endm
+
+.macro src_8_8888_2pixels  cond, d0, d1
+        and&cond WK&d1, MASK, WK&d0, lsl #16
+        mov&cond WK&d0, WK&d0, lsl #24
+.endm
+
+.macro src_8_8888_4pixels  cond, d0, d1, d2, d3
+        and&cond WK&d3, MASK, WK&d0
+        and&cond WK&d2, MASK, WK&d0, lsl #8
+        and&cond WK&d1, MASK, WK&d0, lsl #16
+        mov&cond WK&d0, WK&d0, lsl #24
+.endm
+
+.macro src_8_8888_process_tail  cond, numbytes, firstreg
+ .if numbytes == 16
+        src_8_8888_4pixels  cond, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        src_8_8888_2pixels  cond, %(firstreg+0), %(firstreg+1)
+ .else // numbytes == 4
+        src_8_8888_1pixel  cond, %(firstreg+0)
+ .endif
+.endm
+
+.macro src_8_8888_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110: /* Length of inner loop is set to allow one preload per 32 source pixels */
+        ldr     STRIDE_M, [SRC], #4
+        and     WK3, MASK, STRIDE_M
+        and     WK2, MASK, STRIDE_M, lsl #8
+        and     WK1, MASK, STRIDE_M, lsl #16
+        mov     WK0, STRIDE_M, lsl #24
+        ldr     STRIDE_M, [SRC], #4
+ .rept 6
+        pixst   , 16, 0, DST
+        and     WK3, MASK, STRIDE_M
+        and     WK2, MASK, STRIDE_M, lsl #8
+        and     WK1, MASK, STRIDE_M, lsl #16
+        mov     WK0, STRIDE_M, lsl #24
+        ldr     STRIDE_M, [SRC], #4
+ .endr
+        pld     [SRC, SCRATCH]
+        pixst   , 16, 0, DST
+        and     WK3, MASK, STRIDE_M
+        and     WK2, MASK, STRIDE_M, lsl #8
+        and     WK1, MASK, STRIDE_M, lsl #16
+        mov     WK0, STRIDE_M, lsl #24
+        pixst   , 16, 0, DST
+        subs    X, X, #32
+        bhs     110b
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_get_scanline_a8_asm_armv6, 8, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    src_8_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_8_8888_process_head, \
+    src_8_8888_process_tail, \
+    src_8_8888_inner_loop
+
 /******************************************************************************/
 
 .macro add_8_8_8pixels  cond, dst1, dst2
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index f938342..2e5c229 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -109,6 +109,17 @@  PIXMAN_ARM_BIND_COMBINE_U (armv6, out)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, out_reverse)
 PIXMAN_ARM_BIND_COMBINE_U (armv6, add)
 
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, r5g6b5)
+PIXMAN_ARM_BIND_WRITE_BACK   (armv6, r5g6b5)
+PIXMAN_ARM_BIND_GET_SCANLINE (armv6, a8)
+
+static uint32_t *
+fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->bits += iter->stride;
+    return iter->buffer;
+}
+
 void
 pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
                                        int32_t   h,
@@ -328,6 +339,49 @@  static const pixman_fast_path_t arm_simd_fast_paths[] =
     { PIXMAN_OP_NONE },
 };
 
+static const pixman_iter_info_t arm_simd_iters[] =
+{
+    { PIXMAN_r5g6b5,
+      (FAST_PATH_STANDARD_FLAGS             |
+       FAST_PATH_ID_TRANSFORM               |
+       FAST_PATH_SAMPLES_COVER_CLIP_NEAREST |
+       FAST_PATH_BITS_IMAGE),
+      ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride,
+      armv6_get_scanline_r5g6b5,
+      NULL
+    },
+
+    { PIXMAN_r5g6b5,
+      FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST,
+      _pixman_iter_init_bits_stride,
+      armv6_get_scanline_r5g6b5,
+      armv6_write_back_r5g6b5
+    },
+
+    { PIXMAN_r5g6b5,
+      FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA,
+      _pixman_iter_init_bits_stride,
+      fast_dest_fetch_noop,
+      armv6_write_back_r5g6b5
+    },
+
+    { PIXMAN_a8,
+      (FAST_PATH_STANDARD_FLAGS             |
+       FAST_PATH_ID_TRANSFORM               |
+       FAST_PATH_SAMPLES_COVER_CLIP_NEAREST |
+       FAST_PATH_BITS_IMAGE),
+      ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride,
+      armv6_get_scanline_a8,
+      NULL
+    },
+
+    { PIXMAN_null },
+};
+
 pixman_implementation_t *
 _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
 {
@@ -342,6 +396,7 @@  _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = armv6_combine_out_reverse_u;
     imp->combine_32[PIXMAN_OP_ADD] = armv6_combine_add_u;
 
+    imp->iter_info = arm_simd_iters;
     imp->blt = arm_simd_blt;
     imp->fill = arm_simd_fill;