[2/9,repost] armv7: Faster fill operations

Submitted by Ben Avison on April 11, 2016, 12:26 p.m.

Details

Message ID 1460377590-23285-3-git-send-email-bavison@riscosopen.org
State New
Series "Changes to existing ARMv7 routines"
Headers show

Commit Message

Ben Avison April 11, 2016, 12:26 p.m.
For fill operations (those that read no source, mask or destination
pixels), this eliminates a number of branches at the start and end of each
scanline. The blocks of code they skipped over are either empty or can be
trivially combined with a separate code block. This has a surprisingly big
effect, at least on Cortex-A7, for src_n_8:

    Before          After
    Mean   StdDev   Mean   StdDev  Confidence  Change
L1  1570.4 133.1    1639.6 110.7   100.0%      +4.4%
L2  1042.6 19.9     1086.6 23.4    100.0%      +4.2%
M   1030.8 7.2      1036.8 3.2     100.0%      +0.6%
HT  287.4  3.5      303.3  2.9     100.0%      +5.5%
VT  262.0  2.6      263.3  2.6     99.9%       +0.5%
R   206.5  2.4      209.9  2.4     100.0%      +1.7%
RT  56.5   1.0      59.2   0.5     100.0%      +4.7%

Signed-off-by: Ben Avison <bavison@riscosopen.org>
---
 pixman/pixman-arm-neon-asm.h |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

Patch hide | download patch | download mbox

diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 76b3985..03257cc 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -468,6 +468,7 @@ 
     tst         DST_R, #0xF
     beq         2f
 
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
 .irp lowbit, 1, 2, 4, 8, 16
 local skip1
 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
@@ -487,6 +488,7 @@  local skip1
 1:
 .endif
 .endr
+.endif
     pixdeinterleave src_bpp, src_basereg
     pixdeinterleave mask_bpp, mask_basereg
     pixdeinterleave dst_r_bpp, dst_r_basereg
@@ -503,6 +505,9 @@  local skip1
     tst         DST_W, #lowbit
     beq         1f
 .endif
+.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+.endif
     pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
 1:
 .endif
@@ -533,6 +538,7 @@  local skip1
                                process_pixblock_tail_head
     tst         W, #(pixblock_size - 1)
     beq         2f
+.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
 .irp chunk_size, 16, 8, 4, 2, 1
 .if pixblock_size > chunk_size
     tst         W, #chunk_size
@@ -550,6 +556,7 @@  local skip1
 1:
 .endif
 .endr
+.endif
     pixdeinterleave src_bpp, src_basereg
     pixdeinterleave mask_bpp, mask_basereg
     pixdeinterleave dst_r_bpp, dst_r_basereg