mmx: compile on MIPS for Loongson-3A MMI optimizations

Submitted by xianjudiao@gmail.com on Sept. 18, 2018, 9:33 a.m.

Details

Message ID 1537263206-6102-1-git-send-email-xianjudiao@gamil.com
State New
Series "mmx: compile on MIPS for Loongson-3A MMI optimizations"

Commit Message

xianjudiao@gmail.com Sept. 18, 2018, 9:33 a.m.
From: Xianju Diao <xianjudiao@gmail.com>

make check:
	With USE_OPENMP enabled, the 'glyph-test' and 'cover-test' tests fail on Loongson-3A3000.
	Neither test passes even without the optimized code, so this may be a multi-core
	synchronization bug in the CPU; I will continue to debug it. For now, serializing the
	shared update with an OpenMP critical section makes 'glyph-test' and 'cover-test' pass.
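
	The shape of that workaround, as a minimal sketch (illustrative only: the
	function and its arguments are invented for the example, not taken from the
	actual test code):

		#include <stdint.h>

		/* serialize updates to a shared accumulator inside a parallel loop */
		static uint32_t
		checksum_rows (uint8_t **rows, int row_len, int n)
		{
		    uint32_t sum = 0;
		    int i;

		#pragma omp parallel for
		    for (i = 0; i < n; i++)
		    {
		        uint32_t c = 0;
		        int j;

		        for (j = 0; j < row_len; j++)   /* per-thread work */
		            c += rows[i][j];

		#pragma omp critical                    /* only one thread updates 'sum' at a time */
		        sum += c;
		    }
		    return sum;
		}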

benchmark:
	Running the cairo-perf-trace benchmarks on Loongson-3A (times in seconds, before -> after):
	                                  image              image16
	gvim                              5.425 -> 5.069     5.531 -> 5.236
	poppler-reseau                    2.149 -> 2.130     2.152 -> 2.139
	swfdec-giant-steps-full          18.672 -> 8.215    33.167 -> 18.28
	swfdec-giant-steps                7.014 -> 2.455    12.48  -> 5.982
	xfce4-terminal-a1                13.695 -> 5.241    15.703 -> 5.859
	gnome-system-monitor             12.783 -> 7.058    12.780 -> 7.104
	grads-heat-map                    0.482 -> 0.486     0.516 -> 0.514
	firefox-talos-svg               141.138 -> 134.621 152.495 -> 159.069
	firefox-talos-gfx                23.119 -> 14.437   24.870 -> 15.161
	firefox-world-map                32.018 -> 27.139   33.817 -> 28.085
	firefox-periodic-table           12.305 -> 12.443   12.876 -> 12.913
	evolution                         7.071 -> 3.564     8.550 -> 3.784
	firefox-planet-gnome             77.926 -> 67.526   81.554 -> 65.840
	ocitysmap                         4.934 -> 1.702     4.937 -> 1.701
---
 configure.ac                    |    7 +-
 pixman/Makefile.am              |    4 +-
 pixman/loongson-mmintrin.h      |   46 ++
 pixman/pixman-combine32.h       |    6 +
 pixman/pixman-mips-dspr2-asm.h  |    2 +-
 pixman/pixman-mips-memcpy-asm.S |  324 +++++-------
 pixman/pixman-mmx.c             | 1088 ++++++++++++++++++++++++++++++++++++++-
 pixman/pixman-private.h         |   32 +-
 pixman/pixman-solid-fill.c      |   49 +-
 pixman/pixman-utils.c           |   65 ++-
 test/Makefile.am                |    2 +-
 test/utils.c                    |    8 +
 12 files changed, 1418 insertions(+), 215 deletions(-)


diff --git a/configure.ac b/configure.ac
index e833e45..3e3dde5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,9 +154,9 @@  AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
 # has set CFLAGS.
 if test $SUNCC = yes &&			\
    test "x$test_CFLAGS" = "x" &&	\
-   test "$CFLAGS" = "-g"
+   test "$CFLAGS" = "-g -mabi=n64"
 then
-  CFLAGS="-O -g"
+  CFLAGS="-O -g -mabi=n64"
 fi
 
 # 
@@ -183,6 +183,7 @@  AC_SUBST(LT_VERSION_INFO)
 # Check for dependencies
 
 PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-mabi=n64])
 PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
 PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
 PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
@@ -273,7 +274,7 @@  dnl ===========================================================================
 dnl Check for Loongson Multimedia Instructions
 
 if test "x$LS_CFLAGS" = "x" ; then
-    LS_CFLAGS="-march=loongson2f"
+    LS_CFLAGS="-march=loongson3a"
 fi
 
 have_loongson_mmi=no
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index 581b6f6..e3a080c 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -122,7 +122,7 @@  libpixman_mips_dspr2_la_SOURCES = \
         pixman-mips-dspr2.h \
         pixman-mips-dspr2-asm.S \
         pixman-mips-dspr2-asm.h \
-        pixman-mips-memcpy-asm.S
+        #pixman-mips-memcpy-asm.S
 libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
 
 ASM_CFLAGS_mips_dspr2=
@@ -131,7 +131,7 @@  endif
 # loongson code
 if USE_LOONGSON_MMI
 noinst_LTLIBRARIES += libpixman-loongson-mmi.la
-libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h pixman-mips-memcpy-asm.S
 libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
 libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
 libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..f049463 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -89,6 +89,17 @@  _mm_adds_pu8 (__m64 __m1, __m64 __m2)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andn_si64 (__m64 __m1, __m64 __m2)
+{
+	 __m64 ret;
+	asm("pandn %0, %1, %2\n\t"
+		: "=f" (ret)
+		: "f" (__m1), "f"(__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_and_si64 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
@@ -100,6 +111,17 @@  _mm_and_si64 (__m64 __m1, __m64 __m2)
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pcmpeqh %0, %1, %2\n\t"
+		: "=f" (ret)
+		: "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 {
 	__m64 ret;
@@ -110,6 +132,30 @@  _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
 	return ret;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_fand (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("fand %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pcmpgth %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_empty (void)
 {
diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..27f62d9 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -14,6 +14,12 @@ 
 #define RB_ONE_HALF 0x800080
 #define RB_MASK_PLUS_ONE 0x10000100
 
+#define RGB_MASK 0xffffff
+#define RGB_DMASK 0xffffffffffffULL
+#define R_DMASK 0x0000ffff00000000ULL
+#define G_DMASK 0x00000000ffff0000ULL
+#define B_DMASK 0x000000000000ffffULL
+
 #define ALPHA_8(x) ((x) >> A_SHIFT)
 #define RED_8(x) (((x) >> R_SHIFT) & MASK)
 #define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index e238566..63d7d96 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -77,7 +77,7 @@ 
                 .ent    symbol, 0;                      \
 symbol:         .frame  sp, 0, ra;                      \
                 .set    push;                           \
-                .set    arch=mips32r2;                  \
+                .set    arch=mips64r2;                  \
                 .set    noreorder;                      \
                 .set    noat;
 
diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S
index 9ad6da5..a140191 100644
--- a/pixman/pixman-mips-memcpy-asm.S
+++ b/pixman/pixman-mips-memcpy-asm.S
@@ -54,19 +54,20 @@  LEAF_MIPS32R2(pixman_mips_fast_memcpy)
 
 /* Test if the src and dst are word-aligned, or can be made word-aligned */
 	xor	t8, a1, a0
-	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */
+	andi	t8, t8, 0x7		/* t8 is a0/a1 word-displacement */
 
 	bne	t8, zero, $unaligned
 	negu	a3, a0
 
-	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
+	andi	a3, a3, 0x7	/* we need to copy a3 bytes to make a0/a1 aligned */
 	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
 	subu	a2, a2, a3	/* now a2 is the remining bytes count */
 
-	LWHI	t8, 0(a1)
-	addu	a1, a1, a3
-	SWHI	t8, 0(a0)
-	addu	a0, a0, a3
+	ld	t8, 0(a1)
+	daddu	a1, a1, a3
+	sdl	t8, 7(a0)
+	sdr	t8, 0(a0)
+	daddu	a0, a0, a3
 
 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
 $chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
@@ -76,9 +77,9 @@  $chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
 				/* There will be at most 1 32-byte chunk after it */
 	subu	a3, a2, t8	/* subtract from a2 the reminder */
                                 /* Here a3 counts bytes in 16w chunks */
-	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
+	daddu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
 
-	addu	t0, a0, a2	/* t0 is the "past the end" address */
+	daddu	t0, a0, a2	/* t0 is the "past the end" address */
 
 /*
  * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
@@ -89,119 +90,98 @@  $chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
  */
 	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
 
-	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
-	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
-	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
-	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
+	lw    $0, 0(a1)		/* bring the first line of src, addr 0 */
+	lw    $0, 32(a1)	/* bring the second line of src, addr 32 */
+	lw    $0, 64(a1)	/* bring the third line of src, addr 64 */
+	lw	$0, 32(a0)	/* safe, as we have at least 64 bytes ahead */
 /* In case the a0 > t9 don't use "pref 30" at all */
 	sgtu	v1, a0, t9
 	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
 	nop
 /* otherwise, start with using pref30 */
-	pref	30, 64(a0)
+	lw	$0, 64(a0)
 $loop16w:
-	pref	0, 96(a1)
-	lw	t0, 0(a1)
+	lw	$0, 96(a1)
+	ld	t0, 0(a1)
 	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
-	lw	t1, 4(a1)
-	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
+	lw    $0, 96(a0)   /* continue setting up the dest, addr 96 */
 $skip_pref30_96:
-	lw	t2, 8(a1)
-	lw	t3, 12(a1)
-	lw	t4, 16(a1)
-	lw	t5, 20(a1)
-	lw	t6, 24(a1)
-	lw	t7, 28(a1)
-        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
-
-	sw	t0, 0(a0)
-	sw	t1, 4(a0)
-	sw	t2, 8(a0)
-	sw	t3, 12(a0)
-	sw	t4, 16(a0)
-	sw	t5, 20(a0)
-	sw	t6, 24(a0)
-	sw	t7, 28(a0)
-
-	lw	t0, 32(a1)
+	ld	t2, 8(a1)
+	ld	t4, 16(a1)
+	ld	t6, 24(a1)
+        lw	$0, 128(a1)    /* bring the next lines of src, addr 128 */
+	lw	$0, 0x0(a0)
+
+	sd	t0, 0(a0)
+	sd	t2, 8(a0)
+	sd	t4, 16(a0)
+	sd	t6, 24(a0)
+
+	ld	t0, 32(a1)
 	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
-	lw	t1, 36(a1)
-	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
+	lw    $0, 128(a0)   /* continue setting up the dest, addr 128 */
 $skip_pref30_128:
-	lw	t2, 40(a1)
-	lw	t3, 44(a1)
-	lw	t4, 48(a1)
-	lw	t5, 52(a1)
-	lw	t6, 56(a1)
-	lw	t7, 60(a1)
-        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
-
-	sw	t0, 32(a0)
-	sw	t1, 36(a0)
-	sw	t2, 40(a0)
-	sw	t3, 44(a0)
-	sw	t4, 48(a0)
-	sw	t5, 52(a0)
-	sw	t6, 56(a0)
-	sw	t7, 60(a0)
-
-	addiu	a0, a0, 64	/* adding 64 to dest */
+	ld	t2, 40(a1)
+	ld	t4, 48(a1)
+	ld	t6, 56(a1)
+        lw    $0, 160(a1)    /* bring the next lines of src, addr 160 */
+	lw	$0, 32(a0)
+
+	sd	t0, 32(a0)
+	sd	t2, 40(a0)
+	sd	t4, 48(a0)
+	sd	t6, 56(a0)
+
+	daddiu	a0, a0, 64	/* adding 64 to dest */
 	sgtu	v1, a0, t9
 	bne	a0, a3, $loop16w
-	addiu	a1, a1, 64	/* adding 64 to src */
+	daddiu	a1, a1, 64	/* adding 64 to src */
 	move	a2, t8
 
 /* Here we have src and dest word-aligned but less than 64-bytes to go */
 
 $chk8w:
-	pref 0, 0x0(a1)
+	lw	$0, 0x0(a1)
 	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
 				/* the t8 is the reminder count past 32-bytes */
 	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
 	 nop
 
-	lw	t0, 0(a1)
-	lw	t1, 4(a1)
-	lw	t2, 8(a1)
-	lw	t3, 12(a1)
-	lw	t4, 16(a1)
-	lw	t5, 20(a1)
-	lw	t6, 24(a1)
-	lw	t7, 28(a1)
-	addiu	a1, a1, 32
-
-	sw	t0, 0(a0)
-	sw	t1, 4(a0)
-	sw	t2, 8(a0)
-	sw	t3, 12(a0)
-	sw	t4, 16(a0)
-	sw	t5, 20(a0)
-	sw	t6, 24(a0)
-	sw	t7, 28(a0)
-	addiu	a0, a0, 32
+	ld	t0, 0(a1)
+	ld	t2, 8(a1)
+	ld	t4, 16(a1)
+	ld	t6, 24(a1)
+	lw	$0, 0x0(a0)
+	daddiu	a1, a1, 32
+
+	sd	t0, 0(a0)
+	sd	t2, 8(a0)
+	sd	t4, 16(a0)
+	sd	t6, 24(a0)
+	daddiu	a0, a0, 32
 
 $chk1w:
 	andi	a2, t8, 0x3	/* now a2 is the reminder past 1w chunks */
 	beq	a2, t8, $last8
 	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
-	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
+	daddu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
 
 /* copying in words (4-byte chunks) */
 $wordCopy_loop:
 	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
-	addiu	a1, a1, 4
-	addiu	a0, a0, 4
+	daddiu	a1, a1, 4
+	daddiu	a0, a0, 4
 	bne	a0, a3, $wordCopy_loop
 	sw	t3, -4(a0)
 
 /* For the last (<8) bytes */
 $last8:
 	blez	a2, leave
-	addu	a3, a0, a2	/* a3 is the last dst address */
+	daddu	a3, a0, a2	/* a3 is the last dst address */
 $last8loop:
 	lb	v1, 0(a1)
-	addiu	a1, a1, 1
-	addiu	a0, a0, 1
+	daddiu	a1, a1, 1
+	daddiu	a0, a0, 1
 	bne	a0, a3, $last8loop
 	sb	v1, -1(a0)
 
@@ -214,15 +194,16 @@  leave:	j	ra
 
 $unaligned:
 	/* got here with a3="negu a0" */
-	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
+	andi	a3, a3, 0x7	/* test if the a0 is word aligned */
 	beqz	a3, $ua_chk16w
 	subu	a2, a2, a3	/* bytes left after initial a3 bytes */
 
-	LWHI	v1, 0(a1)
-	LWLO	v1, 3(a1)
-	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
-	SWHI	v1, 0(a0)
-	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
+        ldl     v1, 7(a1)
+        ldr     v1, 0(a1)
+	daddu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
+        sdl     v1, 7(a0)
+        sdr     v1, 0(a0)
+	daddu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
 
 $ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
 				/* t8 is the byte count after 64-byte chunks */
@@ -230,149 +211,116 @@  $ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
 				/* There will be at most 1 32-byte chunk after it */
 	subu	a3, a2, t8	/* subtract from a2 the reminder */
                                 /* Here a3 counts bytes in 16w chunks */
-	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
+	daddu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
 
-	addu	t0, a0, a2	/* t0 is the "past the end" address */
+	daddu	t0, a0, a2	/* t0 is the "past the end" address */
 
 	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
 
-	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
-	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
-	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
-	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
+	lw    $0, 0(a1)		/* bring the first line of src, addr 0 */
+	lw    $0, 32(a1)	/* bring the second line of src, addr 32 */
+	lw    $0, 64(a1)	/* bring the third line of src, addr 64 */
+	lw	$0, 32(a0)	/* safe, as we have at least 64 bytes ahead */
 /* In case the a0 > t9 don't use "pref 30" at all */
 	sgtu	v1, a0, t9
 	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
 	nop
 /* otherwise,  start with using pref30 */
-	pref	30, 64(a0)
+	lw	$0, 64(a0)
 $ua_loop16w:
-	pref	0, 96(a1)
-	LWHI	t0, 0(a1)
-	LWLO	t0, 3(a1)
-	LWHI	t1, 4(a1)
+	lw	$0, 96(a1)
+        ldl     t0, 7(a1)
+        ldr     t0, 0(a1)
 	bgtz	v1, $ua_skip_pref30_96
-	LWLO	t1, 7(a1)
-	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
+	lw    $0, 96(a0)   /* continue setting up the dest, addr 96 */
 $ua_skip_pref30_96:
-	LWHI	t2, 8(a1)
-	LWLO	t2, 11(a1)
-	LWHI	t3, 12(a1)
-	LWLO	t3, 15(a1)
-	LWHI	t4, 16(a1)
-	LWLO	t4, 19(a1)
-	LWHI	t5, 20(a1)
-	LWLO	t5, 23(a1)
-	LWHI	t6, 24(a1)
-	LWLO	t6, 27(a1)
-	LWHI	t7, 28(a1)
-	LWLO	t7, 31(a1)
-        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
-
-	sw	t0, 0(a0)
-	sw	t1, 4(a0)
-	sw	t2, 8(a0)
-	sw	t3, 12(a0)
-	sw	t4, 16(a0)
-	sw	t5, 20(a0)
-	sw	t6, 24(a0)
-	sw	t7, 28(a0)
-
-	LWHI	t0, 32(a1)
-	LWLO	t0, 35(a1)
-	LWHI	t1, 36(a1)
+	ldl	t2, 15(a1)
+	ldr	t2, 8(a1)
+	ldl	t4, 23(a1)
+	ldr	t4, 16(a1)
+	ldl	t6, 31(a1)
+	ldr	t6, 24(a1)
+        lw    $0, 128(a1)    /* bring the next lines of src, addr 128 */
+	lw	$0, 0(a0)
+
+	sd	t0, 0(a0)
+	sd	t2, 8(a0)
+	sd	t4, 16(a0)
+	sd	t6, 24(a0)
+
+	ldl	t0, 39(a1)
+	ldr	t0, 32(a1)
 	bgtz	v1, $ua_skip_pref30_128
-	LWLO	t1, 39(a1)
-	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
+	lw    $0, 128(a0)   /* continue setting up the dest, addr 128 */
 $ua_skip_pref30_128:
-	LWHI	t2, 40(a1)
-	LWLO	t2, 43(a1)
-	LWHI	t3, 44(a1)
-	LWLO	t3, 47(a1)
-	LWHI	t4, 48(a1)
-	LWLO	t4, 51(a1)
-	LWHI	t5, 52(a1)
-	LWLO	t5, 55(a1)
-	LWHI	t6, 56(a1)
-	LWLO	t6, 59(a1)
-	LWHI	t7, 60(a1)
-	LWLO	t7, 63(a1)
-        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
-
-	sw	t0, 32(a0)
-	sw	t1, 36(a0)
-	sw	t2, 40(a0)
-	sw	t3, 44(a0)
-	sw	t4, 48(a0)
-	sw	t5, 52(a0)
-	sw	t6, 56(a0)
-	sw	t7, 60(a0)
-
-	addiu	a0, a0, 64	/* adding 64 to dest */
+	ldl	t2, 47(a1)
+	ldr	t2, 40(a1)
+	ldl	t4, 55(a1)
+	ldr	t4, 48(a1)
+	ldl	t6, 63(a1)
+	ldr	t6, 56(a1)
+	lw	$0, 32(a0)
+        lw    $0, 160(a1)    /* bring the next lines of src, addr 160 */
+
+	sd	t0, 32(a0)
+	sd	t2, 40(a0)
+	sd	t4, 48(a0)
+	sd	t6, 56(a0)
+
+	daddiu	a0, a0, 64	/* adding 64 to dest */
 	sgtu	v1, a0, t9
 	bne	a0, a3, $ua_loop16w
-	addiu	a1, a1, 64	/* adding 64 to src */
+	daddiu	a1, a1, 64	/* adding 64 to src */
 	move	a2, t8
 
 /* Here we have src and dest word-aligned but less than 64-bytes to go */
 
 $ua_chk8w:
-	pref 0, 0x0(a1)
+	lw	$0, 0x0(a1)
 	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
 				/* the t8 is the reminder count */
 	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
 
-	LWHI	t0, 0(a1)
-	LWLO	t0, 3(a1)
-	LWHI	t1, 4(a1)
-	LWLO	t1, 7(a1)
-	LWHI	t2, 8(a1)
-	LWLO	t2, 11(a1)
-	LWHI	t3, 12(a1)
-	LWLO	t3, 15(a1)
-	LWHI	t4, 16(a1)
-	LWLO	t4, 19(a1)
-	LWHI	t5, 20(a1)
-	LWLO	t5, 23(a1)
-	LWHI	t6, 24(a1)
-	LWLO	t6, 27(a1)
-	LWHI	t7, 28(a1)
-	LWLO	t7, 31(a1)
-	addiu	a1, a1, 32
-
-	sw	t0, 0(a0)
-	sw	t1, 4(a0)
-	sw	t2, 8(a0)
-	sw	t3, 12(a0)
-	sw	t4, 16(a0)
-	sw	t5, 20(a0)
-	sw	t6, 24(a0)
-	sw	t7, 28(a0)
-	addiu	a0, a0, 32
+	ldl	t0, 7(a1)
+	ldr	t0, 0(a1)
+	ldl	t2, 15(a1)
+	ldr	t2, 8(a1)
+	ldl	t4, 23(a1)
+	ldr	t4, 16(a1)
+	ldl	t6, 31(a1)
+	ldr	t6, 24(a1)
+	lw	$0, 0x0(a0)
+	daddiu	a1, a1, 32
+
+	sd	t0, 0(a0)
+	sd	t2, 8(a0)
+	sd	t4, 16(a0)
+	sd	t6, 24(a0)
+	daddiu	a0, a0, 32
 
 $ua_chk1w:
 	andi	a2, t8, 0x3	/* now a2 is the reminder past 1w chunks */
 	beq	a2, t8, $ua_smallCopy
 	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
-	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
+	daddu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
 
 /* copying in words (4-byte chunks) */
 $ua_wordCopy_loop:
 	LWHI	v1, 0(a1)
 	LWLO	v1, 3(a1)
-	addiu	a1, a1, 4
-	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
+	daddiu	a1, a1, 4
+	daddiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
 	bne	a0, a3, $ua_wordCopy_loop
 	sw	v1, -4(a0)
 
 /* Now less than 4 bytes (value in a2) left to copy */
 $ua_smallCopy:
 	beqz	a2, leave
-	addu	a3, a0, a2	/* a3 is the last dst address */
+	daddu	a3, a0, a2	/* a3 is the last dst address */
 $ua_smallCopy_loop:
 	lb	v1, 0(a1)
-	addiu	a1, a1, 1
-	addiu	a0, a0, 1
+	daddiu	a1, a1, 1
+	daddiu	a0, a0, 1
 	bne	a0, a3, $ua_smallCopy_loop
 	sb	v1, -1(a0)
 
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index dec3974..edbf16b 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -59,6 +59,71 @@  _mm_empty (void)
 }
 #endif
 
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN  2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN  8
+
+#define COMBINE_CLEAR   0
+#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* There is no SIMD instruction for division, so these helpers stay scalar.
+ * portion covered by a but not b:
+ * min (1, (1-b) / a)
+ */
+static uint8_t
+combine_disjoint_out_part (uint8_t a, uint8_t b)
+{
+
+    b = ~b;                
+    if (b >= a)            
+	return MASK;       
+    return DIV_UN8 (b, a);    
+}
+
+/* portion covered by both a and b 
+ * max (1-(1-b)/a, 0)
+ */
+static uint8_t
+combine_disjoint_in_part (uint8_t a, uint8_t b)
+{
+
+    b = ~b;                 
+    if (b >= a)            
+	return 0;          
+    return ~DIV_UN8(b, a);   
+}
+
+/* portion covered by a but not b
+ * max (1 - b/a, 0)
+ */
+static uint8_t
+combine_conjoint_out_part (uint8_t a, uint8_t b)
+{
+
+    if (b >= a)             
+	return 0x00;       
+    return ~DIV_UN8(b, a);   
+}
+
+/* portion covered by both a and b 
+ * min (1, b/a)
+ */
+static uint8_t
+combine_conjoint_in_part (uint8_t a, uint8_t b)
+{
+
+    if (b >= a)            
+	return MASK;       
+    return DIV_UN8 (b, a);    
+}
+
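
A worked example of the four part functions, assuming pixman's usual
DIV_UN8 (a, b) = rounded a*255/b: for a = 0x80 (~0.5) and b = 0x40 (~0.25),
disjoint out = MASK because ~b = 0xbf >= a (min (1, 0.75/0.5) = 1), and
disjoint in = 0 (max (1 - 1.5, 0) = 0); conjoint out = ~DIV_UN8 (0x40, 0x80) =
0x7f (~= 1 - 0.25/0.5) and conjoint in = DIV_UN8 (0x40, 0x80) = 0x80 (~= 0.5).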
 #ifdef USE_X86_MMX
 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
 #  include <xmmintrin.h>
@@ -78,7 +143,8 @@  _mm_movemask_pi8 (__m64 __A)
 
     return ret;
 }
-
+#define __OPTIMIZE__
+#ifdef  __OPTIMIZE__
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
 {
@@ -88,7 +154,7 @@  _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     );
     return __A;
 }
-
+#else
 # define _mm_shuffle_pi16(A, N)						\
     ({									\
 	__m64 ret;							\
@@ -102,7 +168,7 @@  _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     })
 # endif
 #endif
-
+#endif
 #ifndef _MSC_VER
 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
@@ -710,6 +776,34 @@  combine (const uint32_t *src, const uint32_t *mask)
     return vsrc;
 }
 
+static force_inline void
+mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64 *s64, __m64 *m64)
+{
+    __m64 res, tmp;
+    
+    if(!(*mask))
+	{
+	    *s64 = 0;
+	    *m64 = 0;
+	    return;
+	}
+    
+    *s64 = load8888(src);
+    
+    if (*mask == ~0)
+	{
+	    *m64 = expand_alpha(*s64);
+	    return;
+	}
+    
+    *m64 = load8888(mask);
+    
+    res = pix_multiply(*s64, *m64);
+    tmp = expand_alpha(*s64);
+    *s64 = res;
+    *m64 = pix_multiply(*m64, tmp);
+}
+
 static force_inline __m64
 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 {
@@ -729,6 +823,39 @@  core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 }
 
 static void
+mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
+			 pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    uint32_t *end = dest + width;
+    uint32_t s32;
+    uint64_t sa64;
+    __m64 s64, d64;
+
+    while (dest < end)
+	{
+	    s64 = combine (src, mask);
+	    
+	    if (s64)
+		{
+		    store8888(&s32, s64);
+		    sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> A_SHIFT);
+		    d64 = pix_add (pix_multiply (load8888 (dest),expand_alpha_rev ((*(__m64*)&sa64))), s64);
+		    store8888 (dest, d64);
+		}
+	    
+	    ++dest;
+	    ++src;
+	    if (mask)
+		++mask;
+	    
+	}
+}
+
+static void
 mmx_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
@@ -1062,7 +1189,294 @@  mmx_combine_saturate_u (pixman_implementation_t *imp,
     }
     _mm_empty ();
 }
+/* Functions such as 'combine_conjoint_general_u' contain multiple branches selected by the
+ * parameter 'comb'. That value never changes while the function runs, so there is no need to
+ * re-test it for every pixel as the original code does: it can be evaluated once at function
+ * entry to pick the matching function pointer, which is then called directly. */
+#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res)					 \
+    static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, type io_flag) \
+    {											 \
+	return res;									 \
+    }
+									
+/* The 'conjoint' ops share their code structure with the 'disjoint' ones; only the underlying
+ * part function differs, so this macro generates the wrappers. The parameter order, selected
+ * by 'io_flag', also differs: '0' means 'in_part' ordering and '1' means 'out_part'. */
+#define DEF_FUNC_COMBINE_JOINT_U(cd, io)								 \
+    static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, uint8_t da, uint8_t io_flag) \
+    {													 \
+	uint8_t parm[2];										 \
+	parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0);						 \
+	parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1);						 \
+	return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]);					 \
+    }
+/* Macro defining the per-operator array of function pointers from which the right handler is picked at function entry */
+#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix)				\
+    COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={	\
+	combine_joint_zero_ ##suffix,					\
+	combine_ ##cd## joint_out_part_ ##suffix,			\
+	combine_ ##cd## joint_in_part_ ##suffix,			\
+	combine_joint_mask_ ##suffix					\
+    };
+
+typedef  uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0)
+DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_U(dis, in);
+DEF_FUNC_COMBINE_JOINT_U(dis, out);
+DEF_COMB_FUNC_ARR(dis,U,u)
+
+DEF_FUNC_COMBINE_JOINT_U(con, in);
+DEF_FUNC_COMBINE_JOINT_U(con, out);
+DEF_COMB_FUNC_ARR(con, U, u)
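
For reference, DEF_COMB_FUNC_ARR(dis, U, u) above expands to the table below;
the array index is built directly from the COMBINE_* bits, so 'out_part' sits
at COMBINE_A_OUT (1), 'in_part' at COMBINE_A_IN (2), and both bits together
select the constant-one 'mask' handler:

	COMBINE_JOINT_FUNC_U combine_disjoint_u[4] = {
	    combine_joint_zero_u,		/* 0: factor is 0 */
	    combine_disjoint_out_part_u,	/* 1: COMBINE_A_OUT */
	    combine_disjoint_in_part_u,		/* 2: COMBINE_A_IN */
	    combine_joint_mask_u		/* 3: both bits, factor is 1 */
	};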
+/* Common worker through which both the 'conjoint' and 'disjoint' combiners are implemented. */
+static void
+mmx_combine_joint_general_u (uint32_t * dest,
+			 const uint32_t *src,
+			 const uint32_t *mask,
+			 int            width,
+			 uint8_t        comb,
+			 COMBINE_JOINT_FUNC_U *cjf)
+{
+    COMBINE_JOINT_FUNC_U combine_joint_u[2];
+    combine_joint_u[0] = cjf[comb & COMBINE_A]; /* in_part */
+    combine_joint_u[1] = cjf[(comb & COMBINE_B)>>2]; /* out_part */
+    
+    uint32_t *end = dest + width;
+    while (dest < end)
+	{
+	    __m64 s64 = combine (src, mask);
+	    __m64 d64,sa64,da64;
+	    uint8_t sa, da;
+	    uint32_t tmp;
+	    uint64_t Fa, Fb;
+	    
+	    /* These factors involve division, for which there is
+	     * no multimedia instruction, so they are computed in
+	     * scalar code rather than optimized. */
+	    store8888(&tmp, s64);
+	    sa = tmp >> A_SHIFT;
+	    da = *dest >> A_SHIFT;
+	        
+	    Fa = combine_joint_u[0](sa, da, 0);
+	    Fb = combine_joint_u[1](sa, da, 1);
+	        
+	    d64 = load8888(dest);
+	    sa64 = expand_alpha_rev (*(__m64*)&Fa);
+	    da64 = expand_alpha_rev (*(__m64*)&Fb);
+	        
+	    d64 = pix_add_mul (s64, sa64, d64, da64);
+	        
+	    store8888 (dest, d64);
+	        
+	    ++dest;
+	    ++src;
+	    if (mask)
+		++mask;
+	}
+}
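
As an example of the dispatch: COMBINE_A_OVER is COMBINE_A_OUT | COMBINE_B_OUT
| COMBINE_A_IN = 0x7, so 'comb & COMBINE_A' = 3 selects the constant-one mask
handler for Fa, while '(comb & COMBINE_B) >> 2' = 1 selects out_part for Fb
with io_flag = 1, i.e. with swapped operands -- giving the expected OVER
factors Fa = 1 and Fb = combine_*_out_part (da, sa).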
+
+
+static void
+mmx_combine_disjoint_general_u (uint32_t * dest,
+				const uint32_t *src,
+				const uint32_t *mask,
+				int            width,
+				uint8_t        comb)
+{
+    mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_disjoint_u);
+}
+
+static void
+mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
+			   pixman_op_t              op,
+			   uint32_t *                dest,
+			   const uint32_t *          src,
+			   const uint32_t *          mask,
+			   int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+				   pixman_op_t              op,
+				   uint32_t *                dest,
+				   const uint32_t *          src,
+				   const uint32_t *          mask,
+				   int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t *                dest,
+			    const uint32_t *          src,
+			    const uint32_t *          mask,
+			    int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+				    pixman_op_t              op,
+				    uint32_t *                dest,
+				    const uint32_t *          src,
+				    const uint32_t *          mask,
+				    int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+				     pixman_op_t              op,
+				     uint32_t *                dest,
+				     const uint32_t *          src,
+				     const uint32_t *          mask,
+				     int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t *                dest,
+			    const uint32_t *          src,
+			    const uint32_t *          mask,
+			    int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/* Conjoint */
+static void
+mmx_combine_conjoint_general_u(uint32_t * dest,
+			       const uint32_t *src,
+			       const uint32_t *mask,
+			       int            width,
+			       uint8_t        comb)
+{
+    mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_conjoint_u);
+}
+
+static void
+mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+				     pixman_op_t              op,
+				     uint32_t *                dest,
+				     const uint32_t *          src,
+				     const uint32_t *          mask,
+				     int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
+			   pixman_op_t              op,
+			   uint32_t *                dest,
+			   const uint32_t *          src,
+			   const uint32_t *          mask,
+			   int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+				   pixman_op_t              op,
+				   uint32_t *                dest,
+				   const uint32_t *          src,
+				   const uint32_t *          mask,
+				   int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t *                dest,
+			    const uint32_t *          src,
+			    const uint32_t *          mask,
+			    int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+				    pixman_op_t              op,
+				    uint32_t *                dest,
+				    const uint32_t *          src,
+				    const uint32_t *          mask,
+				    int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+				     pixman_op_t              op,
+				     uint32_t *                dest,
+				     const uint32_t *          src,
+				     const uint32_t *          mask,
+				     int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t *                dest,
+			    const uint32_t *          src,
+			    const uint32_t *          mask,
+			    int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
 
+/* Component alpha combiners */
 static void
 mmx_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
@@ -1089,6 +1503,410 @@  mmx_combine_src_ca (pixman_implementation_t *imp,
 }
 
 static void
+mmx_combine_saturate_ca (pixman_implementation_t *imp,
+			 pixman_op_t              op,
+			 uint32_t *                dest,
+			 const uint32_t *          src,
+			 const uint32_t *          mask,
+			 int                      width)
+{
+    uint32_t *end = dest + width;
+    while (dest < end)
+	{
+	    uint16_t sa, sr, sg, sb;
+	    uint32_t sa32, m32;
+	    __m64 m64, s64, d64, sa64, da64, cmpf, res;
+	    
+	    mmx_combine_mask_ca (src, mask, &s64, &m64);
+	    
+	    d64 = load8888 (dest);
+	    da64 = expand_alpha (negate(d64));
+	    cmpf = _mm_cmpgt_pi16 (m64, da64);
+	    if (cmpf)
+		{
+		    store8888 (&m32, m64);
+		    sa = (m32 >> (A_SHIFT));
+		    sr = (m32 >> (R_SHIFT)) & MASK;
+		    sg = (m32 >> (G_SHIFT)) & MASK;
+		    sb =  m32               & MASK;
+		    sa32 = (~(*dest) >> A_SHIFT) & MASK;
+		    
+		    sa = (sa) ? sa : 0x1;
+		    sr = (sr) ? sr : 0x1;
+		    sg = (sg) ? sg : 0x1;
+		    sb = (sb) ? sb : 0x1;
+		    
+		    sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
+			((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
+			((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
+			((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
+		    sa64 = load8888 (&sa32);
+		    da64 = MC (4x00ff);
+		    res = pix_multiply (s64, sa64);
+		    s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (s64, negate (cmpf)));
+		    res = pix_multiply (d64, da64);
+		    d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (d64, negate (cmpf)));
+		}
+	    res = _mm_adds_pu8 (s64, d64);
+	    store8888 (dest, res);
+	    
+	    ++dest;
+	    ++src;
+	    if (mask)
+		++mask;
+	}
+}
+
+#define DEF_FUNC_COMBINE_JOINT_CA(cd, io)				\
+    static uint32_t inline combine_ ##cd## joint_ ##io## _part_ca(uint32_t sa, uint32_t da, uint32_t io_flag) \
+    {									\
+	uint8_t da8 = da >> A_SHIFT;					\
+	uint32_t m, n, o, p, res;					\
+	uint8_t i, parm[2][4], shift=0;					\
+	for (i=0; i<4; i++)						\
+	    {								\
+		parm[0][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x1) + da8 * (io_flag ^ 0x0); \
+		parm[1][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x0) + da8 * (io_flag ^ 0x1); \
+		shift += G_SHIFT;					\
+	    }								\
+	m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], parm[1][0]); \
+	n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], parm[1][1]) << G_SHIFT; \
+	o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], parm[1][2]) << R_SHIFT; \
+	p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], parm[1][3]) << A_SHIFT; \
+	res = m | n | o | p;						\
+	return res;							\
+    }
+
+typedef  uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da, uint32_t io_flag);
+
+DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
+DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_CA(dis, in);
+DEF_FUNC_COMBINE_JOINT_CA(dis, out);
+DEF_COMB_FUNC_ARR(dis, CA, ca)
+
+DEF_FUNC_COMBINE_JOINT_CA(con, in);
+DEF_FUNC_COMBINE_JOINT_CA(con, out);
+DEF_COMB_FUNC_ARR(con, CA, ca)
+
+static void
+mmx_combine_joint_general_ca (uint32_t * dest,
+			      const uint32_t *src,
+			      const uint32_t *mask,
+			      int            width,
+			      uint8_t        comb,
+			      COMBINE_JOINT_FUNC_CA *cjf)
+{
+    COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
+    combine_joint_ca[0] = cjf[comb & COMBINE_A];
+    combine_joint_ca[1] = cjf[(comb & COMBINE_B)>>2];
+    
+    uint32_t *end = dest + width;
+    while (dest < end)
+	{
+	    __m64 m64, s64, sa64, da64, d64;
+	    uint32_t m32, Fa, Fb;
+	    
+	    mmx_combine_mask_ca (src, mask, &s64, &m64);
+	    store8888(&m32, m64);
+	    
+	    Fa = combine_joint_ca[0](m32, *dest, 0);
+	    Fb = combine_joint_ca[1](m32, *dest, 1);
+	    
+	    sa64 = load8888 (&Fa);
+	    da64 = load8888 (&Fb);
+	    
+	    d64 = load8888 (dest);
+	    d64 = pix_add_mul(s64, sa64, d64, da64);
+	    
+	    store8888 (dest, d64);
+	    
+	    ++dest;
+	    ++src;
+	    if (mask)
+		++mask;
+	}
+    
+}
+
+static void
+mmx_combine_disjoint_general_ca (uint32_t * dest,
+				 const uint32_t *src,
+				 const uint32_t *mask,
+				 int            width,
+				 uint8_t        comb)
+{
+    mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_disjoint_ca);
+}
+
+static void
+mmx_combine_disjoint_over_ca (pixman_implementation_t *imp,
+			      pixman_op_t              op,
+			      uint32_t *                dest,
+			      const uint32_t *          src,
+			      const uint32_t *          mask,
+			      int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_disjoint_in_ca (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t *                dest,
+			    const uint32_t *          src,
+			    const uint32_t *          mask,
+			    int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
+				    pixman_op_t              op,
+				    uint32_t *                dest,
+				    const uint32_t *          src,
+				    const uint32_t *          mask,
+				    int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_ca (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
+				     pixman_op_t              op,
+				     uint32_t *                dest,
+				     const uint32_t *          src,
+				     const uint32_t *          mask,
+				     int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp,
+			      pixman_op_t              op,
+			      uint32_t *                dest,
+			      const uint32_t *          src,
+			      const uint32_t *          mask,
+			      int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
+				      pixman_op_t              op,
+				      uint32_t *                dest,
+				      const uint32_t *          src,
+				      const uint32_t *          mask,
+				      int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_ca (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+mmx_combine_conjoint_general_ca(uint32_t * dest,
+				const uint32_t *src,
+				const uint32_t *mask,
+				int            width,
+				uint8_t        comb)
+{
+    mmx_combine_joint_general_ca(dest,src,mask,width,comb,combine_conjoint_ca);
+}
+
+/*
+ * Multiply
+ * B(Dca, Da, Sca, Sa) = Dca.Sca
+ */
+ 
+static void
+mmx_combine_multiply_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *                dest,
+                    const uint32_t *          src,
+                    const uint32_t *          mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 dia, d, sia;
+	__m64 s = combine (src, mask); 
+	__m64 ss = s;
+	d = load8888 (dest);   
+	sia = negate (expand_alpha (s));     
+	dia = negate (expand_alpha (d));
+	ss = pix_add_mul (ss, dia, d, sia);
+	d = pix_multiply (d, s);
+	d = pix_add (d, ss);	
+	store8888 (dest, d);
+
+	++dest;
+	++src;
+	if (mask)
+		mask++;
+    }
+    _mm_empty ();
+}
+ 
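
Written out, the loop above computes the standard PDF multiply equation; the
pix_add_mul() call produces the two "outside" terms and pix_multiply() the
blend term:

	Dca' = Sca.Dca + Sca.(1 - Da) + Dca.(1 - Sa)
	Da'  = Sa + Da - Sa.Da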
+static void
+mmx_combine_multiply_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *                dest,
+                     const uint32_t *          src,
+                     const uint32_t *          mask,
+                     int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 r = d;	
+	__m64 da = negate (expand_alpha (d));
+	__m64 sa = expand_alpha (s);
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	a = negate (a);
+	r = pix_add_mul (r, a, s, da);
+	d = pix_multiply (d, s);
+	r = pix_add (r, d);
+	store8888 (dest, r);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
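
The component-alpha variant is the same equation with the mask folded in per
channel: Dca' = Sca.m.Dca + Sca.m.(1 - Da) + Dca.(1 - Sa.m), where m is the
per-channel mask value.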
+
+static void
+mmx_combine_conjoint_over_ca (pixman_implementation_t *imp,
+			      pixman_op_t              op,
+			      uint32_t *                dest,
+			      const uint32_t *          src,
+			      const uint32_t *          mask,
+			      int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
+				      pixman_op_t              op,
+				      uint32_t *                dest,
+				      const uint32_t *          src,
+				      const uint32_t *          mask,
+				      int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_ca (pixman_implementation_t *imp,
+			    pixman_op_t              op,
+			    uint32_t *                dest,
+			    const uint32_t *          src,
+			    const uint32_t *          mask,
+			    int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
+				    pixman_op_t              op,
+				    uint32_t *                dest,
+				    const uint32_t *          src,
+				    const uint32_t *          mask,
+				    int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_ca (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+				     pixman_op_t              op,
+				     uint32_t *                dest,
+				     const uint32_t *          src,
+				     const uint32_t *          mask,
+				     int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_ca (pixman_implementation_t *imp,
+			      pixman_op_t              op,
+			      uint32_t *                dest,
+			      const uint32_t *          src,
+			      const uint32_t *          mask,
+			      int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+				      pixman_op_t              op,
+				      uint32_t *                dest,
+				      const uint32_t *          src,
+				      const uint32_t *          mask,
+				      int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_ca (pixman_implementation_t *imp,
+			     pixman_op_t              op,
+			     uint32_t *                dest,
+			     const uint32_t *          src,
+			     const uint32_t *          mask,
+			     int                      width)
+{
+    mmx_combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
 mmx_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dest,
@@ -2089,23 +2907,34 @@  mmx_fill (pixman_implementation_t *imp,
 	stride = stride * (int) sizeof (uint32_t) / 1;
 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
 	byte_width = width;
-	stride *= 1;
+/* multiplying the stride by 1 is a no-op, so the statement is dropped */
+/*	stride *= 1; */
         filler = (filler & 0xff) * 0x01010101;
     }
     else if (bpp == 16)
     {
 	stride = stride * (int) sizeof (uint32_t) / 2;
 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+#if 0
 	byte_width = 2 * width;
 	stride *= 2;
+#else
+	byte_width = width << 1;
+	stride <<= 1;
+#endif
         filler = (filler & 0xffff) * 0x00010001;
     }
     else
     {
 	stride = stride * (int) sizeof (uint32_t) / 4;
 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+#if 0
 	byte_width = 4 * width;
 	stride *= 4;
+#else
+	byte_width = width << 2;
+	stride <<= 2;
+#endif
     }
 
     fill = ((uint64_t)filler << 32) | filler;
@@ -3274,9 +4103,15 @@  mmx_blt (pixman_implementation_t *imp,
 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+#if 0
 	byte_width = 2 * width;
 	src_stride *= 2;
 	dst_stride *= 2;
+#else
+	byte_width = width << 1;
+	src_stride <<= 1;
+	dst_stride <<= 1;
+#endif
     }
     else if (src_bpp == 32)
     {
@@ -3284,9 +4119,15 @@  mmx_blt (pixman_implementation_t *imp,
 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+#if 0
 	byte_width = 4 * width;
 	src_stride *= 4;
 	dst_stride *= 4;
+#else
+	byte_width = width << 2;
+	src_stride <<= 2;
+	dst_stride <<= 2;
+#endif
     }
     else
     {
@@ -4003,6 +4844,186 @@  static const pixman_iter_info_t mmx_iters[] =
     { PIXMAN_null },
 };
 
+#define MMX_PDF_SEPARABLE_BLEND_MODE(name)					  \
+static void                            						  \
+mmx_combine_ ## name ## _u (pixman_implementation_t *imp,                         \
+			                pixman_op_t              op,              \
+			                uint32_t *                dest,           \
+			                const uint32_t *          src,            \
+			                const uint32_t *          mask,           \
+			                int                      width)           \
+{										  \
+	int i;								          \
+	for (i = 0; i < width; ++i) {		                                  \
+		__m64 s = load8888(src + i);	                                  \
+		__m64 d = load8888(dest + i);	                                  \
+		__m64 da = expand_alpha(d);		                          \
+		                                                                  \
+		if(mask)                                                          \
+		{                                                                 \
+			__m64 m = load8888(mask + i);                             \
+			__m64 ma = expand_alpha(m);                               \
+			s = pix_multiply(s,ma);                                   \
+		}                                                                 \
+		__m64 sa = expand_alpha(s);                                       \
+		                                                                  \
+		__m64 isa = negate(sa);			                          \
+		__m64 ida = negate(da);			                          \
+									          \
+		uint32_t result,sada,res;				          \
+		__m64 temp;						          \
+		store8888(&result,pix_add_mul(d,isa,s,ida));		          \
+		store8888(&sada,pix_multiply(sa,da));			          \
+		store8888(&res,mmx_blend_ ## name(d,da,s,sa));		          \
+									          \
+	        sada &= A_MASK;						          \
+		res  &= RGB_MASK;				                  \
+		temp = pix_add( pix_add(load8888(&result), load8888(&sada)),      \
+				load8888(&res));			          \
+		store8888(dest+i, temp);				          \
+	}	                                                                  \
+}                                                                                 \
+static void									  \
+mmx_combine_ ## name ## _ca (pixman_implementation_t *imp,		          \
+			     pixman_op_t              op,		          \
+                 uint32_t *                dest,			          \
+			     const uint32_t *          src,			  \
+			     const uint32_t *          mask,			  \
+			     int                     width)			  \
+    {										  \
+	int i;								          \
+	for (i = 0; i < width; ++i) {						  \
+		__m64 m = load8888(mask + i);		                          \
+		__m64 s = load8888(src + i);		                          \
+		__m64 d = load8888(dest + i);		                          \
+		__m64 sa = expand_alpha(s);			                  \
+		__m64 da = expand_alpha(d);			                  \
+		__m64 ida = negate(da);				                  \
+		                                                                  \
+		s = pix_multiply(s,m);				                  \
+		m = pix_multiply(m,sa);				                  \
+		__m64 im = negate(m);				                  \
+		__m64 ima = expand_alpha(m);		                          \
+                                                                                  \
+		uint32_t result,mada,res;			                  \
+		__m64 temp;				                          \
+		store8888(&result,pix_add_mul(d,im,s,ida));			  \
+		store8888(&mada,pix_multiply(ima,da));				  \
+		store8888(&res,mmx_blend_ ## name(d,da,s,m));			  \
+                                                                                  \
+	        mada &= A_MASK;						          \
+		res  &= RGB_MASK;				                  \
+		temp = pix_add( pix_add(load8888(&result), load8888(&mada)),      \
+				load8888(&res));			          \
+		store8888(dest+i, temp);				          \
+	}                                                                         \
+}                                                                                 \
+
+static inline __m64
+_emulate_pminuh(__m64 s, __m64 d)
+{
+	uint64_t tmp_s = to_uint64(s);
+	uint64_t tmp_d = to_uint64(d);
+
+	__m64 res = to_m64(MIN((tmp_s & R_DMASK), (tmp_d & R_DMASK)) 
+		| MIN((tmp_s & G_DMASK), (tmp_d & G_DMASK)) 
+		| MIN((tmp_s & B_DMASK), (tmp_d & B_DMASK)));	
+
+	return res; 
+}
+
+static inline __m64
+_emulate_pmaxuh(__m64 s, __m64 d)
+{
+	uint64_t tmp_s = to_uint64(s);
+	uint64_t tmp_d = to_uint64(d);
+
+	__m64 res = to_m64(MAX((tmp_s & R_DMASK), (tmp_d & R_DMASK)) 
+		| MAX((tmp_s & G_DMASK), (tmp_d & G_DMASK)) 
+		| MAX((tmp_s & B_DMASK), (tmp_d & B_DMASK)));	
+
+	return res; 
+}
+
+#define R_GREATER(a, b) ((a > b) ? 0x0000ffff00000000ULL : 0)
+#define G_GREATER(a, b) ((a > b) ? 0x00000000ffff0000ULL : 0)
+#define B_GREATER(a, b) ((a > b) ? 0x000000000000ffffULL : 0)
+
+static inline __m64
+_emulate_pcmpgtuh(__m64 s, __m64 d)
+{
+	uint64_t tmp_s = to_uint64(s);
+	uint64_t tmp_d = to_uint64(d);
+
+	__m64 res = to_m64(R_GREATER((tmp_s & R_DMASK), (tmp_d & R_DMASK)) 
+		| G_GREATER((tmp_s & G_DMASK), (tmp_d & G_DMASK)) 
+		| B_GREATER((tmp_s & B_DMASK), (tmp_d & B_DMASK)));	
+
+	return res; 
+}
+
+static inline __m64
+_emulate_paddcmpgtuh(__m64 s, __m64 d1, __m64 d2)
+{
+	uint64_t tmp_s = to_uint64(s);
+	uint64_t tmp_d1 = to_uint64(d1);
+	uint64_t tmp_d2 = to_uint64(d2);
+
+	__m64 res = to_m64(R_GREATER((tmp_s & R_DMASK), (tmp_d1 & R_DMASK) + (tmp_d2 & R_DMASK)) 
+		| G_GREATER((tmp_s & G_DMASK), (tmp_d1 & G_DMASK) + (tmp_d2 & G_DMASK)) 
+		| B_GREATER((tmp_s & B_DMASK), (tmp_d1 & B_DMASK) + (tmp_d2 & B_DMASK)));	
+
+	return res; 
+}
+
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ */
+static inline __m64
+mmx_blend_darken (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+	__m64 res;
+
+	__m64 s = _mm_mullo_pi16(sca,da);
+	__m64 d = _mm_mullo_pi16(dca,sa);
+
+
+	res = _emulate_pminuh(s, d);
+	res = _mm_adds_pu16(res,MC(4x0080));
+	res = _mm_mulhi_pu16(res,MC(4x0101));
+	
+	return res;
+}
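
The _mm_adds_pu16 / _mm_mulhi_pu16 pair at the end is the usual rounding trick
for dividing each 16-bit per-channel product by 255:

	x / 255  ~=  ((x + 0x0080) * 0x0101) >> 16

the same reduction this file already uses for its other 8-bit multiplies.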
+
+MMX_PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ */
+static inline __m64
+mmx_blend_lighten (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+	__m64 res;
+
+	__m64 s = _mm_mullo_pi16(sca,da);
+	__m64 d = _mm_mullo_pi16(dca,sa);
+
+	res = _emulate_pmaxuh(s, d);
+	res = _mm_adds_pu16(res,MC(4x0080));
+	res = _mm_mulhi_pu16(res,MC(4x0101));
+	
+	return res;
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (lighten)
+
+
+#undef MMX_PDF_SEPARABLE_BLEND_MODE
+
+
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
@@ -4114,8 +5135,37 @@  _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
 
+    /* Unified alpha */
     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+    /* Disjoint, unified */
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_u;
+
+    /* Multiply, unified */
+    imp->combine_32[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_u;
+    imp->combine_32[PIXMAN_OP_DARKEN] = mmx_combine_darken_u;
+    imp->combine_32[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_u;
+    
+    /* Unified alpha, continued */
     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
@@ -4137,7 +5187,35 @@  _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
-
+    imp->combine_32_ca[PIXMAN_OP_SATURATE] = mmx_combine_saturate_ca;
+
+    /* Disjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = mmx_combine_disjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = mmx_combine_disjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = mmx_combine_disjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_ca;
+
+    /* Conjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = mmx_combine_conjoint_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = mmx_combine_conjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = mmx_combine_conjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = mmx_combine_conjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_ca;
+
+    /* Multiply CA */
+    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_ca;
+    imp->combine_32_ca[PIXMAN_OP_DARKEN] = mmx_combine_darken_ca;
+    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_ca;
+    
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 73a5414..93660b6 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -916,9 +916,39 @@  convert_8888_to_0565 (uint32_t s)
 static force_inline uint32_t
 convert_0565_to_0888 (uint16_t s)
 {
+    uint32_t ret;
+#if USE_LOONGSON_MMI	
+    asm(".set noreorder\r\n"
+        "sll $8, %1, 3\r\n"
+	"andi $8, 0xf8\r\n"
+	"sll $6, %1, 5\r\n"
+	"andi $6, 0xfc00\r\n"
+	"sll $4, %1, 8\r\n"
+	"li $2, 0xf80000\r\n"
+	"and $4, $2\r\n"
+	"or $6, $6, $4\r\n"
+	"or $8, $6\r\n"
+	"srl $4, %1, 2\r\n"
+	"andi $4, 0x7\r\n"
+	"srl $6, %1, 1\r\n"
+	"andi $6, 0x300\r\n"
+	"or $6, $6, $4\r\n"
+	"or $8, $6\r\n"
+	"sll $6, %1, 3\r\n"
+	"li $2, 0x70000\r\n"
+	"and $6, $2\r\n"
+	"or %0, $8, $6\r\n"
+        ".set reorder\r\n"
+	: "=r" (ret)
+	: "r" (s)
+	: "$8","$6","$4","$2"
+	);
+#else 
     return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |
             ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |
             ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)));
+#endif
+	return ret;
 }
 
 static force_inline uint32_t
@@ -991,7 +1021,7 @@  unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
 	{								\
 	    result |= result >> from_bits;				\
 									\
-	    from_bits *= 2;						\
+	    from_bits <<= 1;						\
 	}								\
     }									\
     while (0)
diff --git a/pixman/pixman-solid-fill.c b/pixman/pixman-solid-fill.c
index 4694ebc..c0ca417 100644
--- a/pixman/pixman-solid-fill.c
+++ b/pixman/pixman-solid-fill.c
@@ -40,12 +40,53 @@  static argb_t
 color_to_float (const pixman_color_t *color)
 {
     argb_t result;
+#ifdef USE_LOONGSON_MMI
+    uint32_t a = color->alpha;
+    uint32_t r = color->red;
+    uint32_t g = color->green;
+    uint32_t b = color->blue; 
+    uint32_t m;
+    float tmp;
+    float counta, countr, countg, countb;
+    /*m=((1<<16)-1)*/
+     m=65535;
+    /* tmp=1.f / (float)m;*/
+     float data = 65535.f; 
+     asm(".set noreorder\r\n"
+         "recip.s %4,%5\r\n"
 
-    result.a = pixman_unorm_to_float (color->alpha, 16);
-    result.r = pixman_unorm_to_float (color->red, 16);
-    result.g = pixman_unorm_to_float (color->green, 16);
-    result.b = pixman_unorm_to_float (color->blue, 16);
+         "mtc1 %6, $f0\r\n"
+         "cvt.s.w $f2, $f0\r\n"
+         "mul.s %0,$f2,%4\r\n"
 
+         "mtc1 %7, $f10\r\n"
+         "cvt.s.w $f4, $f10\r\n"
+         "mul.s %1,$f4,%4\r\n"
+
+         "mtc1 %8, $f12\r\n"
+         "cvt.s.w $f6, $f12\r\n"
+         "mul.s %2,$f6,%4\r\n"
+        
+         "mtc1 %9, $f14\r\n"
+         "cvt.s.w $f8, $f14\r\n"
+         "mul.s %3,$f8,%4\r\n"
+
+         ".set reorder\r\n"
+         :"=f"(counta),"=f"(countr),"=f"(countg),"=f"(countb),"=f"(tmp)
+         :"f"(data),"r"(a),"r" (r),"r" (g),"r" (b)
+         :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14"
+        );
+
+    result.a = counta;
+    result.r = countr;
+    result.g = countg;
+    result.b = countb;
+#else
+    result.a = pixman_unorm_to_float(color->alpha, 16);
+    result.r = pixman_unorm_to_float(color->red, 16);
+    result.g = pixman_unorm_to_float(color->green, 16);
+    result.b = pixman_unorm_to_float(color->blue, 16);
+#endif
     return result;
 }
 
diff --git a/pixman/pixman-utils.c b/pixman/pixman-utils.c
index 4a3a835..51f5cd8 100644
--- a/pixman/pixman-utils.c
+++ b/pixman/pixman-utils.c
@@ -80,28 +80,73 @@  pixman_malloc_abc (unsigned int a,
 	return malloc (a * b * c);
 }
 
+
 static force_inline uint16_t
 float_to_unorm (float f, int n_bits)
 {
     uint32_t u;
 
-    if (f > 1.0)
-	f = 1.0;
-    if (f < 0.0)
-	f = 0.0;
-
-    u = f * (1 << n_bits);
-    u -= (u >> n_bits);
-
+    if (f >= 1.0)
+    {
+    	u = 1 << (n_bits);
+	u--;
+    	return u;
+    }
+    else if (f <= 0.0)
+    {
+	return 0;
+    }
+    else
+    {
+#ifdef USE_LOONGSON_MMI
+	asm(".set noreorder\r\n"
+	    "li $8, 0x1\r\n"
+	    "sll $8, %2\r\n"
+	    "mtc1 $8, $f2\r\n"
+	    "cvt.s.w $f0, $f2\r\n"
+	    "mul.s $f0, $f0, %1\r\n"
+	    "floor.w.s %0, $f0\r\n"
+	    ".set reorder\r\n"
+	   : "=f" (u)
+	   : "f" (f), "r" (n_bits)
+	   : "$8","$f0", "$f2"
+	);
+#else
+	u = f * (1 << n_bits);
+	u -= (u >> n_bits);
+#endif
+    }
     return u;
 }
 
 static force_inline float
 unorm_to_float (uint16_t u, int n_bits)
 {
+    float result;
+#ifdef USE_LOONGSON_MMI
+    asm(".set noreorder\r\n"
+	"li $8, 0x1\r\n"
+	"sll $8, %2\r\n"
+	"addu $8, -1\r\n"
+	"mtc1 $8, $f8\r\n"
+	"cvt.s.w $f2, $f8\r\n"
+	"and $8,%1\r\n"
+	"mtc1 $8, $f6\r\n"
+	"cvt.s.w $f4, $f6\r\n"
+        "recip.s $f0, $f2\r\n"
+	"mul.s %0,$f0,$f4\r\n"
+
+	".set reorder\r\n"
+	: "=f" (result)
+	: "r"(u), "r" (n_bits)
+	: "$8","$f0", "$f2","$f4","$f6","$f8"
+       );
+    return result;
+#else
     uint32_t m = ((1 << n_bits) - 1);
 
     return (u & m) * (1.f / (float)m);
+#endif
 }
 
 /*
@@ -206,8 +251,8 @@  pixman_contract_from_float (uint32_t     *dst,
 
     for (i = 0; i < width; ++i)
     {
-	uint8_t a, r, g, b;
-
+    	uint8_t a, r, g, b;
+	
 	a = float_to_unorm (src[i].a, 8);
 	r = float_to_unorm (src[i].r, 8);
 	g = float_to_unorm (src[i].g, 8);
diff --git a/test/Makefile.am b/test/Makefile.am
index 88dc36d..43cafb8 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -2,7 +2,7 @@  include $(top_srcdir)/test/Makefile.sources
 
 AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
 AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS) $(PTHREAD_LIBS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la $(top_builddir)/pixman/libpixman-loongson-mmi.la -lm  $(PNG_LIBS) $(PTHREAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
 
 libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
diff --git a/test/utils.c b/test/utils.c
index f8e42a5..73ddb6f 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -877,7 +877,15 @@  fuzzer_test_main (const char *test_name,
 #endif
     for (i = n1; i <= n2; i++)
     {
+#ifdef USE_LOONGSON_MMI
+	uint32_t crc;
+    #pragma omp critical
+	{
+	    crc = call_test_function (test_function, i, 0);
+	}
+#else
 	uint32_t crc = call_test_function (test_function, i, 0);
+#endif
 	if (verbose)
 	    printf ("%d: %08X\n", i, crc);
 	checksum += crc;

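
Note on the rounding used above: the darken/lighten blends added to pixman-mmx.c round each 16-bit product with _mm_adds_pu16 (res, MC (4x0080)) followed by _mm_mulhi_pu16 (res, MC (4x0101)). That is the usual exact divide-by-255; a minimal scalar sketch of the identity it relies on (my reading of the pattern, matching pixman's MUL_UN8, not code from this patch):

    /* For t = a * b + 0x80, (t * 0x0101) >> 16 == (t + (t >> 8)) >> 8,
     * which is a * b / 255 rounded to nearest. */
    static inline uint8_t
    mul_un8_sketch (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }
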
Comments

Matt Turner Sept. 19, 2018, 6:56 p.m.
On Tue, Sep 18, 2018 at 2:34 AM <xianjudiao@gmail.com> wrote:
>
> From: Xianju Diao <xianjudiao@gmail.com>
>
> make check:
>         When I enable USE_OPENMP, the 'glyph-test' and 'cover-test' tests fail on Loongson-3A3000.
>         Neither test passes even without the optimized code, so this may be a multi-core synchronization
>         bug in the CPU; I will continue to debug this problem. For now I wrap the test in an OpenMP
>         critical section, and 'glyph-test' and 'cover-test' pass.
>
> benchmark:
>         Running cairo-perf-trace benchmark on Loongson-3A.
>                                               image             image16
>         gvim                              5.425 -> 5.069     5.531 -> 5.236
>         poppler-reseau                    2.149 -> 2.13      2.152 -> 2.139
>         swfdec-giant-steps-full          18.672 -> 8.215    33.167 -> 18.28
>         swfdec-giant-steps                7.014 -> 2.455    12.48  -> 5.982
>         xfce4-terminal-a1                13.695 -> 5.241    15.703 -> 5.859
>         gnome-system-monitor             12.783 -> 7.058    12.780 -> 7.104
>         grads-heat-map                    0.482 -> 0.486     0.516 -> 0.514
>         firefox-talos-svg               141.138 -> 134.621 152.495 -> 159.069
>         firefox-talos-gfx                23.119 -> 14.437   24.870 -> 15.161
>         firefox-world-map                32.018 -> 27.139   33.817 -> 28.085
>         firefox-periodic-table           12.305 -> 12.443   12.876 -> 12.913
>         evolution                         7.071 -> 3.564     8.550 -> 3.784
>         firefox-planet-gnome             77.926 -> 67.526   81.554 -> 65.840
>         ocitysmap                         4.934 -> 1.702     4.937 -> 1.701
> ---

Thanks for the patch. I will review it when I have time (I'm preparing
for a trip at the moment).

I have a Loongson3 system that I have found to be unstable. I assume
it is due to the hardware bugs that must be worked around in gcc and
binutils. I have patched both of them with the patches I found in
https://github.com/loongson-community/binutils-gdb etc, but I still
have instability. I would appreciate it very much if you could offer
some suggestions or help in improving the stability of my system.

Looks like there are a couple of different things happening in this
patch. We should try to split them up. One patch could be making the
assembly memcpy implementation usable on mips64. A separate patch
would add new functions to pixman-mmx.c.

A few quick comments inline.

>  configure.ac                    |    7 +-
>  pixman/Makefile.am              |    4 +-
>  pixman/loongson-mmintrin.h      |   46 ++
>  pixman/pixman-combine32.h       |    6 +
>  pixman/pixman-mips-dspr2-asm.h  |    2 +-
>  pixman/pixman-mips-memcpy-asm.S |  324 +++++-------
>  pixman/pixman-mmx.c             | 1088 ++++++++++++++++++++++++++++++++++++++-
>  pixman/pixman-private.h         |   32 +-
>  pixman/pixman-solid-fill.c      |   49 +-
>  pixman/pixman-utils.c           |   65 ++-
>  test/Makefile.am                |    2 +-
>  test/utils.c                    |    8 +

This diff stat doesn't correspond to this patch.

>  12 files changed, 1418 insertions(+), 215 deletions(-)
>
> diff --git a/configure.ac b/configure.ac
> index e833e45..3e3dde5 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -154,9 +154,9 @@ AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
>  # has set CFLAGS.
>  if test $SUNCC = yes &&                        \
>     test "x$test_CFLAGS" = "x" &&       \
> -   test "$CFLAGS" = "-g"
> +   test "$CFLAGS" = "-g -mabi=n64"
>  then
> -  CFLAGS="-O -g"
> +  CFLAGS="-O -g -mabi=n64"

This isn't acceptable.

>  fi
>
>  #
> @@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
>  # Check for dependencies
>
>  PIXMAN_CHECK_CFLAG([-Wall])
> +PIXMAN_CHECK_CFLAG([-mabi=n64])
>  PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
>  PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
>  PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
> @@ -273,7 +274,7 @@ dnl ===========================================================================
>  dnl Check for Loongson Multimedia Instructions
>
>  if test "x$LS_CFLAGS" = "x" ; then
> -    LS_CFLAGS="-march=loongson2f"
> +    LS_CFLAGS="-march=loongson3a"

Also not acceptable. I see that recent gcc and binutils have gotten
new options for enabling MMI separately from -march=loongson*. Maybe
we could use those if available.

I'm not sure there is currently a good solution. Let me think about it.
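
(For reference: the options in question appear to be -mloongson-mmi and -mloongson-ext, added on the gcc and binutils development branches. A PIXMAN_CHECK_CFLAG([-mloongson-mmi]) probe that leaves LS_CFLAGS alone on older toolchains might be the shape of such a solution.)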

>  fi
>
>  have_loongson_mmi=no
> diff --git a/pixman/Makefile.am b/pixman/Makefile.am
> index 581b6f6..e3a080c 100644
> --- a/pixman/Makefile.am
> +++ b/pixman/Makefile.am
> @@ -122,7 +122,7 @@ libpixman_mips_dspr2_la_SOURCES = \
>          pixman-mips-dspr2.h \
>          pixman-mips-dspr2-asm.S \
>          pixman-mips-dspr2-asm.h \
> -        pixman-mips-memcpy-asm.S
> +        #pixman-mips-memcpy-asm.S

Can't do this.

>  libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
>
>  ASM_CFLAGS_mips_dspr2=
> @@ -131,7 +131,7 @@ endif
>  # loongson code
>  if USE_LOONGSON_MMI
>  noinst_LTLIBRARIES += libpixman-loongson-mmi.la
> -libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
> +libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h pixman-mips-memcpy-asm.S
>  libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
>  libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
>  libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
> diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
> index 086c6e0..f049463 100644
> --- a/pixman/loongson-mmintrin.h
> +++ b/pixman/loongson-mmintrin.h
> @@ -89,6 +89,17 @@ _mm_adds_pu8 (__m64 __m1, __m64 __m2)
>  }
>
>  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_andn_si64 (__m64 __m1, __m64 __m2)

Doesn't seem to be used.

> +{
> +        __m64 ret;
> +       asm("pandn %0, %1, %2\n\t"
> +               : "=f" (ret)
> +               : "f" (__m1), "f"(__m2)
> +       );
> +       return ret;
> +}
> +
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_and_si64 (__m64 __m1, __m64 __m2)
>  {
>         __m64 ret;
> @@ -100,6 +111,17 @@ _mm_and_si64 (__m64 __m1, __m64 __m2)
>  }
>
>  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)

Doesn't seem to be used.

> +{
> +    __m64 ret;

Whitespace mistake.

> +       asm("pcmpeqh %0, %1, %2\n\t"
> +               : "=f" (ret)
> +               : "f" (__m1), "f" (__m2)
> +       );
> +       return ret;
> +}
> +
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
>  {
>         __m64 ret;
> @@ -110,6 +132,30 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
>         return ret;
>  }
>
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +loongson_fand (__m64 __m1, __m64 __m2)

Doesn't seem to be used.

> +{
> +       __m64 ret;
> +       asm("fand %0, %1, %2\n\t"
> +          : "=f" (ret)
> +          : "f" (__m1), "f" (__m2)
> +       );
> +       return ret;
> +}
> +
> +
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
> +{
> +       __m64 ret;
> +       asm("pcmpgth %0, %1, %2\n\t"
> +          : "=f" (ret)
> +          : "f" (__m1), "f" (__m2)
> +       );
> +       return ret;
> +}
> +
> +
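
This intrinsic is what the new mmx_combine_saturate_ca below builds its branchless select on. Note that pcmpgth, like x86 pcmpgtw, compares the 16-bit lanes as signed values and yields an all-ones/all-zeros mask per lane; the expanded pixel lanes in this file stay at or below 0x00ff, so the signed compare appears safe here. A sketch of the select idiom it enables (illustrative name, not from the patch; it would also give the _mm_andn_si64 above a user):

    /* Where cmp is all-ones take a, elsewhere take b (b & ~cmp). */
    static inline __m64
    select_lanes (__m64 cmp, __m64 a, __m64 b)
    {
        return _mm_or_si64 (_mm_and_si64 (a, cmp),
                            _mm_andn_si64 (cmp, b));
    }
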
>  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_empty (void)
>  {
> diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
> index cdd56a6..27f62d9 100644
> --- a/pixman/pixman-combine32.h
> +++ b/pixman/pixman-combine32.h
> @@ -14,6 +14,12 @@
>  #define RB_ONE_HALF 0x800080
>  #define RB_MASK_PLUS_ONE 0x10000100
>
> +#define RGB_MASK 0xffffff
> +#define RGB_DMASK 0xffffffffffffULL
> +#define R_DMASK 0x0000ffff00000000ULL
> +#define G_DMASK 0x00000000ffff0000ULL
> +#define B_DMASK 0x000000000000ffffULL
> +
>  #define ALPHA_8(x) ((x) >> A_SHIFT)
>  #define RED_8(x) (((x) >> R_SHIFT) & MASK)
>  #define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
> diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
> index e238566..63d7d96 100644
> --- a/pixman/pixman-mips-dspr2-asm.h
> +++ b/pixman/pixman-mips-dspr2-asm.h
> @@ -77,7 +77,7 @@
>                  .ent    symbol, 0;                      \
>  symbol:         .frame  sp, 0, ra;                      \
>                  .set    push;                           \
> -                .set    arch=mips32r2;                  \
> +                .set    arch=mips64r2;                  \

Can't do this.

>                  .set    noreorder;                      \
>                  .set    noat;
>
> diff --git a/pixman/pixman-mips-memcpy-asm.S b/pixman/pixman-mips-memcpy-asm.S
> index 9ad6da5..a140191 100644
> --- a/pixman/pixman-mips-memcpy-asm.S
> +++ b/pixman/pixman-mips-memcpy-asm.S
> @@ -54,19 +54,20 @@ LEAF_MIPS32R2(pixman_mips_fast_memcpy)
>
>  /* Test if the src and dst are word-aligned, or can be made word-aligned */
>         xor     t8, a1, a0
> -       andi    t8, t8, 0x3             /* t8 is a0/a1 word-displacement */
> +       andi    t8, t8, 0x7             /* t8 is a0/a1 word-displacement */
>
>         bne     t8, zero, $unaligned
>         negu    a3, a0
>
> -       andi    a3, a3, 0x3     /* we need to copy a3 bytes to make a0/a1 aligned */
> +       andi    a3, a3, 0x7     /* we need to copy a3 bytes to make a0/a1 aligned */
>         beq     a3, zero, $chk16w       /* when a3=0 then the dst (a0) is word-aligned */
>         subu    a2, a2, a3      /* now a2 is the remining bytes count */
>
> -       LWHI    t8, 0(a1)
> -       addu    a1, a1, a3
> -       SWHI    t8, 0(a0)
> -       addu    a0, a0, a3
> +       ld      t8, 0(a1)
> +       daddu   a1, a1, a3
> +       sdl     t8, 7(a0)
> +       sdr     t8, 0(a0)
> +       daddu   a0, a0, a3
>
>  /* Now the dst/src are mutually word-aligned with word-aligned addresses */
>  $chk16w:       andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
> @@ -76,9 +77,9 @@ $chk16w:      andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
>                                 /* There will be at most 1 32-byte chunk after it */
>         subu    a3, a2, t8      /* subtract from a2 the reminder */
>                                  /* Here a3 counts bytes in 16w chunks */
> -       addu    a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
> +       daddu   a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
>
> -       addu    t0, a0, a2      /* t0 is the "past the end" address */
> +       daddu   t0, a0, a2      /* t0 is the "past the end" address */
>
>  /*
>   * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
> @@ -89,119 +90,98 @@ $chk16w:   andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
>   */
>         subu    t9, t0, 160     /* t9 is the "last safe pref 30, 128(a0)" address */
>
> -       pref    0, 0(a1)                /* bring the first line of src, addr 0 */
> -       pref    0, 32(a1)       /* bring the second line of src, addr 32 */
> -       pref    0, 64(a1)       /* bring the third line of src, addr 64 */
> -       pref    30, 32(a0)      /* safe, as we have at least 64 bytes ahead */
> +       lw    $0, 0(a1)         /* bring the first line of src, addr 0 */
> +       lw    $0, 32(a1)        /* bring the second line of src, addr 32 */
> +       lw    $0, 64(a1)        /* bring the third line of src, addr 64 */
> +       lw      $0, 32(a0)      /* safe, as we have at least 64 bytes ahead */
>  /* In case the a0 > t9 don't use "pref 30" at all */
>         sgtu    v1, a0, t9
>         bgtz    v1, $loop16w    /* skip "pref 30, 64(a0)" for too short arrays */
>         nop
>  /* otherwise, start with using pref30 */
> -       pref    30, 64(a0)
> +       lw      $0, 64(a0)
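
This substitution deserves a comment in the source: pref is an architectural hint that never faults, while lw into $0 is a real load that must complete and can fault or take a TLB miss if it strays past the buffer; the pref 30 (prepare-for-store) lines additionally become reads of the destination. Roughly, in C terms (a sketch assuming GCC's builtin):

    static void
    touch (const char *p)
    {
        __builtin_prefetch (p + 96);                 /* like pref: advisory, non-faulting */
        (void) *(volatile const char *) (p + 96);    /* like lw $0: an ordinary load */
    }

Presumably Loongson-3A ignores pref, which is why the change helps, but that reasoning should be recorded next to the code.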
>  $loop16w:
> -       pref    0, 96(a1)
> -       lw      t0, 0(a1)
> +       lw      $0, 96(a1)
> +       ld      t0, 0(a1)
>         bgtz    v1, $skip_pref30_96     /* skip "pref 30, 96(a0)" */
> -       lw      t1, 4(a1)
> -       pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
> +       lw    $0, 96(a0)   /* continue setting up the dest, addr 96 */
>  $skip_pref30_96:
> -       lw      t2, 8(a1)
> -       lw      t3, 12(a1)
> -       lw      t4, 16(a1)
> -       lw      t5, 20(a1)
> -       lw      t6, 24(a1)
> -       lw      t7, 28(a1)
> -        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
> -
> -       sw      t0, 0(a0)
> -       sw      t1, 4(a0)
> -       sw      t2, 8(a0)
> -       sw      t3, 12(a0)
> -       sw      t4, 16(a0)
> -       sw      t5, 20(a0)
> -       sw      t6, 24(a0)
> -       sw      t7, 28(a0)
> -
> -       lw      t0, 32(a1)
> +       ld      t2, 8(a1)
> +       ld      t4, 16(a1)
> +       ld      t6, 24(a1)
> +        lw     $0, 128(a1)    /* bring the next lines of src, addr 128 */
> +       lw      $0, 0x0(a0)
> +
> +       sd      t0, 0(a0)
> +       sd      t2, 8(a0)
> +       sd      t4, 16(a0)
> +       sd      t6, 24(a0)
> +
> +       ld      t0, 32(a1)
>         bgtz    v1, $skip_pref30_128    /* skip "pref 30, 128(a0)" */
> -       lw      t1, 36(a1)
> -       pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
> +       lw    $0, 128(a0)   /* continue setting up the dest, addr 128 */
>  $skip_pref30_128:
> -       lw      t2, 40(a1)
> -       lw      t3, 44(a1)
> -       lw      t4, 48(a1)
> -       lw      t5, 52(a1)
> -       lw      t6, 56(a1)
> -       lw      t7, 60(a1)
> -        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
> -
> -       sw      t0, 32(a0)
> -       sw      t1, 36(a0)
> -       sw      t2, 40(a0)
> -       sw      t3, 44(a0)
> -       sw      t4, 48(a0)
> -       sw      t5, 52(a0)
> -       sw      t6, 56(a0)
> -       sw      t7, 60(a0)
> -
> -       addiu   a0, a0, 64      /* adding 64 to dest */
> +       ld      t2, 40(a1)
> +       ld      t4, 48(a1)
> +       ld      t6, 56(a1)
> +        lw    $0, 160(a1)    /* bring the next lines of src, addr 160 */
> +       lw      $0, 0x32(a0)
> +
> +       sd      t0, 32(a0)
> +       sd      t2, 40(a0)
> +       sd      t4, 48(a0)
> +       sd      t6, 56(a0)
> +
> +       daddiu  a0, a0, 64      /* adding 64 to dest */
>         sgtu    v1, a0, t9
>         bne     a0, a3, $loop16w
> -       addiu   a1, a1, 64      /* adding 64 to src */
> +       daddiu  a1, a1, 64      /* adding 64 to src */
>         move    a2, t8
>
>  /* Here we have src and dest word-aligned but less than 64-bytes to go */
>
>  $chk8w:
> -       pref 0, 0x0(a1)
> +       lw      $0, 0x0(a1)
>         andi    t8, a2, 0x1f    /* is there a 32-byte chunk? */
>                                 /* the t8 is the reminder count past 32-bytes */
>         beq     a2, t8, $chk1w  /* when a2=t8, no 32-byte chunk */
>          nop
>
> -       lw      t0, 0(a1)
> -       lw      t1, 4(a1)
> -       lw      t2, 8(a1)
> -       lw      t3, 12(a1)
> -       lw      t4, 16(a1)
> -       lw      t5, 20(a1)
> -       lw      t6, 24(a1)
> -       lw      t7, 28(a1)
> -       addiu   a1, a1, 32
> -
> -       sw      t0, 0(a0)
> -       sw      t1, 4(a0)
> -       sw      t2, 8(a0)
> -       sw      t3, 12(a0)
> -       sw      t4, 16(a0)
> -       sw      t5, 20(a0)
> -       sw      t6, 24(a0)
> -       sw      t7, 28(a0)
> -       addiu   a0, a0, 32
> +       ld      t0, 0(a1)
> +       ld      t2, 8(a1)
> +       ld      t4, 16(a1)
> +       ld      t6, 24(a1)
> +       lw      $0, 0x0(a0)
> +       daddiu  a1, a1, 32
> +
> +       sd      t0, 0(a0)
> +       sd      t2, 8(a0)
> +       sd      t4, 16(a0)
> +       sd      t6, 24(a0)
> +       daddiu  a0, a0, 32
>
>  $chk1w:
>         andi    a2, t8, 0x3     /* now a2 is the reminder past 1w chunks */
>         beq     a2, t8, $last8
>         subu    a3, t8, a2      /* a3 is count of bytes in 1w chunks */
> -       addu    a3, a0, a3      /* now a3 is the dst address past the 1w chunks */
> +       daddu   a3, a0, a3      /* now a3 is the dst address past the 1w chunks */
>
>  /* copying in words (4-byte chunks) */
>  $wordCopy_loop:
>         lw      t3, 0(a1)       /* the first t3 may be equal t0 ... optimize? */
> -       addiu   a1, a1, 4
> -       addiu   a0, a0, 4
> +       daddiu  a1, a1, 4
> +       daddiu  a0, a0, 4
>         bne     a0, a3, $wordCopy_loop
>         sw      t3, -4(a0)
>
>  /* For the last (<8) bytes */
>  $last8:
>         blez    a2, leave
> -       addu    a3, a0, a2      /* a3 is the last dst address */
> +       daddu   a3, a0, a2      /* a3 is the last dst address */
>  $last8loop:
>         lb      v1, 0(a1)
> -       addiu   a1, a1, 1
> -       addiu   a0, a0, 1
> +       daddiu  a1, a1, 1
> +       daddiu  a0, a0, 1
>         bne     a0, a3, $last8loop
>         sb      v1, -1(a0)
>
> @@ -214,15 +194,16 @@ leave:    j       ra
>
>  $unaligned:
>         /* got here with a3="negu a0" */
> -       andi    a3, a3, 0x3     /* test if the a0 is word aligned */
> +       andi    a3, a3, 0x7     /* test if the a0 is word aligned */
>         beqz    a3, $ua_chk16w
>         subu    a2, a2, a3      /* bytes left after initial a3 bytes */
>
> -       LWHI    v1, 0(a1)
> -       LWLO    v1, 3(a1)
> -       addu    a1, a1, a3      /* a3 may be here 1, 2 or 3 */
> -       SWHI    v1, 0(a0)
> -       addu    a0, a0, a3      /* below the dst will be word aligned (NOTE1) */
> +        ldl     v1, 7(a1)
> +        ldr     v1, 0(a1)
> +       daddu   a1, a1, a3      /* a3 may be here 1, 2 or 3 */
> +        sdl     v1, 7(a0)
> +        sdr     v1, 0(a0)
> +       daddu   a0, a0, a3      /* below the dst will be word aligned (NOTE1) */
>
>  $ua_chk16w:    andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
>                                 /* t8 is the byte count after 64-byte chunks */
> @@ -230,149 +211,116 @@ $ua_chk16w:     andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
>                                 /* There will be at most 1 32-byte chunk after it */
>         subu    a3, a2, t8      /* subtract from a2 the reminder */
>                                  /* Here a3 counts bytes in 16w chunks */
> -       addu    a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
> +       daddu   a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
>
> -       addu    t0, a0, a2      /* t0 is the "past the end" address */
> +       daddu   t0, a0, a2      /* t0 is the "past the end" address */
>
>         subu    t9, t0, 160     /* t9 is the "last safe pref 30, 128(a0)" address */
>
> -       pref    0, 0(a1)                /* bring the first line of src, addr 0 */
> -       pref    0, 32(a1)       /* bring the second line of src, addr 32 */
> -       pref    0, 64(a1)       /* bring the third line of src, addr 64 */
> -       pref    30, 32(a0)      /* safe, as we have at least 64 bytes ahead */
> +       lw    $0, 0(a1)         /* bring the first line of src, addr 0 */
> +       lw    $0, 32(a1)        /* bring the second line of src, addr 32 */
> +       lw    $0, 64(a1)        /* bring the third line of src, addr 64 */
> +       lw      $0, 32(a0)      /* safe, as we have at least 64 bytes ahead */
>  /* In case the a0 > t9 don't use "pref 30" at all */
>         sgtu    v1, a0, t9
>         bgtz    v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */
>         nop
>  /* otherwise,  start with using pref30 */
> -       pref    30, 64(a0)
> +       lw      $0, 64(a0)
>  $ua_loop16w:
> -       pref    0, 96(a1)
> -       LWHI    t0, 0(a1)
> -       LWLO    t0, 3(a1)
> -       LWHI    t1, 4(a1)
> +       lw      $0, 96(a1)
> +        ldl     t0, 7(a1)
> +        ldr     t0, 0(a1)
>         bgtz    v1, $ua_skip_pref30_96
> -       LWLO    t1, 7(a1)
> -       pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
> +       lw    $0, 96(a0)   /* continue setting up the dest, addr 96 */
>  $ua_skip_pref30_96:
> -       LWHI    t2, 8(a1)
> -       LWLO    t2, 11(a1)
> -       LWHI    t3, 12(a1)
> -       LWLO    t3, 15(a1)
> -       LWHI    t4, 16(a1)
> -       LWLO    t4, 19(a1)
> -       LWHI    t5, 20(a1)
> -       LWLO    t5, 23(a1)
> -       LWHI    t6, 24(a1)
> -       LWLO    t6, 27(a1)
> -       LWHI    t7, 28(a1)
> -       LWLO    t7, 31(a1)
> -        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
> -
> -       sw      t0, 0(a0)
> -       sw      t1, 4(a0)
> -       sw      t2, 8(a0)
> -       sw      t3, 12(a0)
> -       sw      t4, 16(a0)
> -       sw      t5, 20(a0)
> -       sw      t6, 24(a0)
> -       sw      t7, 28(a0)
> -
> -       LWHI    t0, 32(a1)
> -       LWLO    t0, 35(a1)
> -       LWHI    t1, 36(a1)
> +       ldl     t2, 15(a1)
> +       ldr     t2, 8(a1)
> +       ldl     t4, 23(a1)
> +       ldr     t4, 16(a1)
> +       ldl     t6, 31(a1)
> +       ldr     t6, 24(a1)
> +        lw    $0, 128(a1)    /* bring the next lines of src, addr 128 */
> +       lw      $0, 0(a0)
> +
> +       sd      t0, 0(a0)
> +       sd      t2, 8(a0)
> +       sd      t4, 16(a0)
> +       sd      t6, 24(a0)
> +
> +       ldl     t0, 39(a1)
> +       ldr     t0, 32(a1)
>         bgtz    v1, $ua_skip_pref30_128
> -       LWLO    t1, 39(a1)
> -       pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
> +       lw    $0, 128(a0)   /* continue setting up the dest, addr 128 */
>  $ua_skip_pref30_128:
> -       LWHI    t2, 40(a1)
> -       LWLO    t2, 43(a1)
> -       LWHI    t3, 44(a1)
> -       LWLO    t3, 47(a1)
> -       LWHI    t4, 48(a1)
> -       LWLO    t4, 51(a1)
> -       LWHI    t5, 52(a1)
> -       LWLO    t5, 55(a1)
> -       LWHI    t6, 56(a1)
> -       LWLO    t6, 59(a1)
> -       LWHI    t7, 60(a1)
> -       LWLO    t7, 63(a1)
> -        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
> -
> -       sw      t0, 32(a0)
> -       sw      t1, 36(a0)
> -       sw      t2, 40(a0)
> -       sw      t3, 44(a0)
> -       sw      t4, 48(a0)
> -       sw      t5, 52(a0)
> -       sw      t6, 56(a0)
> -       sw      t7, 60(a0)
> -
> -       addiu   a0, a0, 64      /* adding 64 to dest */
> +       ldl     t2, 47(a1)
> +       ldr     t2, 40(a1)
> +       ldl     t4, 55(a1)
> +       ldr     t4, 48(a1)
> +       ldl     t6, 63(a1)
> +       ldr     t6, 56(a1)
> +       lw      $0, 32(a0)
> +        lw    $0, 160(a1)    /* bring the next lines of src, addr 160 */
> +
> +       sd      t0, 32(a0)
> +       sd      t2, 40(a0)
> +       sd      t4, 48(a0)
> +       sd      t6, 56(a0)
> +
> +       daddiu  a0, a0, 64      /* adding 64 to dest */
>         sgtu    v1, a0, t9
>         bne     a0, a3, $ua_loop16w
> -       addiu   a1, a1, 64      /* adding 64 to src */
> +       daddiu  a1, a1, 64      /* adding 64 to src */
>         move    a2, t8
>
>  /* Here we have src and dest word-aligned but less than 64-bytes to go */
>
>  $ua_chk8w:
> -       pref 0, 0x0(a1)
> +       lw      $0, 0x0(a1)
>         andi    t8, a2, 0x1f    /* is there a 32-byte chunk? */
>                                 /* the t8 is the reminder count */
>         beq     a2, t8, $ua_chk1w       /* when a2=t8, no 32-byte chunk */
>
> -       LWHI    t0, 0(a1)
> -       LWLO    t0, 3(a1)
> -       LWHI    t1, 4(a1)
> -       LWLO    t1, 7(a1)
> -       LWHI    t2, 8(a1)
> -       LWLO    t2, 11(a1)
> -       LWHI    t3, 12(a1)
> -       LWLO    t3, 15(a1)
> -       LWHI    t4, 16(a1)
> -       LWLO    t4, 19(a1)
> -       LWHI    t5, 20(a1)
> -       LWLO    t5, 23(a1)
> -       LWHI    t6, 24(a1)
> -       LWLO    t6, 27(a1)
> -       LWHI    t7, 28(a1)
> -       LWLO    t7, 31(a1)
> -       addiu   a1, a1, 32
> -
> -       sw      t0, 0(a0)
> -       sw      t1, 4(a0)
> -       sw      t2, 8(a0)
> -       sw      t3, 12(a0)
> -       sw      t4, 16(a0)
> -       sw      t5, 20(a0)
> -       sw      t6, 24(a0)
> -       sw      t7, 28(a0)
> -       addiu   a0, a0, 32
> +       ldl     t0, 7(a1)
> +       ldr     t0, 0(a1)
> +       ldl     t2, 15(a1)
> +       ldr     t2, 8(a1)
> +       ldl     t4, 23(a1)
> +       ldr     t4, 16(a1)
> +       ldl     t6, 31(a1)
> +       ldr     t6, 24(a1)
> +       lw      $0, 0x0(a0)
> +       daddiu  a1, a1, 32
> +
> +       sd      t0, 0(a0)
> +       sd      t2, 8(a0)
> +       sd      t4, 16(a0)
> +       sd      t6, 24(a0)
> +       daddiu  a0, a0, 32
>
>  $ua_chk1w:
>         andi    a2, t8, 0x3     /* now a2 is the reminder past 1w chunks */
>         beq     a2, t8, $ua_smallCopy
>         subu    a3, t8, a2      /* a3 is count of bytes in 1w chunks */
> -       addu    a3, a0, a3      /* now a3 is the dst address past the 1w chunks */
> +       daddu   a3, a0, a3      /* now a3 is the dst address past the 1w chunks */
>
>  /* copying in words (4-byte chunks) */
>  $ua_wordCopy_loop:
>         LWHI    v1, 0(a1)
>         LWLO    v1, 3(a1)
> -       addiu   a1, a1, 4
> -       addiu   a0, a0, 4               /* note: dst=a0 is word aligned here, see NOTE1 */
> +       daddiu  a1, a1, 4
> +       daddiu  a0, a0, 4               /* note: dst=a0 is word aligned here, see NOTE1 */
>         bne     a0, a3, $ua_wordCopy_loop
>         sw      v1, -4(a0)
>
>  /* Now less than 4 bytes (value in a2) left to copy */
>  $ua_smallCopy:
>         beqz    a2, leave
> -       addu    a3, a0, a2      /* a3 is the last dst address */
> +       daddu   a3, a0, a2      /* a3 is the last dst address */
>  $ua_smallCopy_loop:
>         lb      v1, 0(a1)
> -       addiu   a1, a1, 1
> -       addiu   a0, a0, 1
> +       daddiu  a1, a1, 1
> +       daddiu  a0, a0, 1
>         bne     a0, a3, $ua_smallCopy_loop
>         sb      v1, -1(a0)
>
> diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
> index dec3974..edbf16b 100644
> --- a/pixman/pixman-mmx.c
> +++ b/pixman/pixman-mmx.c
> @@ -59,6 +59,71 @@ _mm_empty (void)
>  }
>  #endif
>
> +#define COMBINE_A_OUT 1
> +#define COMBINE_A_IN  2
> +#define COMBINE_B_OUT 4
> +#define COMBINE_B_IN  8
> +
> +#define COMBINE_CLEAR   0
> +#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
> +#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
> +#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
> +#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
> +#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
> +#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
> +#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
> +
> +/* no SIMD instructions for div, so leave it alone
> + * portion covered by a but not b
> + * min (1, (1-b) / a)
> + */
> +static uint8_t
> +combine_disjoint_out_part (uint8_t a, uint8_t b)
> +{
> +
> +    b = ~b;
> +    if (b >= a)
> +       return MASK;
> +    return DIV_UN8 (b, a);
> +}
> +
> +/* portion covered by both a and b
> + * max (1-(1-b)/a, 0)
> + */
> +static uint8_t
> +combine_disjoint_in_part (uint8_t a, uint8_t b)
> +{
> +
> +    b = ~b;
> +    if (b >= a)
> +       return 0;
> +    return ~DIV_UN8(b, a);
> +}
> +
> +/* portion covered by a but not b
> + * max (1 - b/a, 0)
> + */
> +static uint8_t
> +combine_conjoint_out_part (uint8_t a, uint8_t b)
> +{
> +
> +    if (b >= a)
> +       return 0x00;
> +    return ~DIV_UN8(b, a);
> +}
> +
> +/* portion covered by both a and b
> + * min (1, b/a)
> + */
> +static uint8_t
> +combine_conjoint_in_part (uint8_t a, uint8_t b)
> +{
> +
> +    if (b >= a)
> +       return MASK;
> +    return DIV_UN8 (b, a);
> +}
> +
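
These four scalar helpers supply the Porter-Duff disjoint/conjoint factors that the SIMD paths cannot compute (there is no vector divide). A worked example, assuming pixman's usual DIV_UN8 (a, b) = ((uint16_t) a * 255 + b / 2) / b:

    /* a = 0xc0 (0.75), b = 0x80 (0.50):
     * combine_disjoint_out_part: ~b = 0x7f < a, so
     *   DIV_UN8 (0x7f, 0xc0) = (127 * 255 + 96) / 192 = 169 = 0xa9   ~ (1 - 0.5) / 0.75
     * combine_disjoint_in_part:  ~0xa9 = 0x56                        ~ 1 - 0.66
     */
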
>  #ifdef USE_X86_MMX
>  # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
>  #  include <xmmintrin.h>
> @@ -78,7 +143,8 @@ _mm_movemask_pi8 (__m64 __A)
>
>      return ret;
>  }
> -
> +#define __OPTIMIZE__
> +#ifdef  __OPTIMIZE__
>  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>  _mm_mulhi_pu16 (__m64 __A, __m64 __B)
>  {
> @@ -88,7 +154,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
>      );
>      return __A;
>  }
> -
> +#else
>  # define _mm_shuffle_pi16(A, N)                                                \
>      ({                                                                 \
>         __m64 ret;                                                      \
> @@ -102,7 +168,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
>      })
>  # endif
>  #endif
> -
> +#endif
>  #ifndef _MSC_VER
>  #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
>   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
> @@ -710,6 +776,34 @@ combine (const uint32_t *src, const uint32_t *mask)
>      return vsrc;
>  }
>
> +static force_inline void
> +mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64 *s64, __m64 *m64)
> +{
> +    __m64 res, tmp;
> +
> +    if(!(*mask))
> +       {
> +           *s64 = 0;
> +           *m64 = 0;
> +           return;
> +       }
> +
> +    *s64 = load8888(src);
> +
> +    if (*mask == ~0)
> +       {
> +           *m64 = expand_alpha(*s64);
> +           return;
> +       }
> +
> +    *m64 = load8888(mask);
> +
> +    res = pix_multiply(*s64, *m64);
> +    tmp = expand_alpha(*s64);
> +    *s64 = res;
> +    *m64 = pix_multiply(*m64, tmp);
> +}
> +
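
For readers new to the component-alpha path: this helper premultiplies per channel, with shortcuts for the all-zero and all-ones masks. A scalar sketch of one channel (illustrative, using the same round-to-nearest divide by 255 as elsewhere in the file):

    /* s' = s * m / 255 and m' = m * sa / 255, where sa is the source alpha. */
    static inline void
    combine_mask_ca_channel (uint8_t *s, uint8_t *m, uint8_t sa)
    {
        uint16_t t1 = (uint16_t) *s * *m + 0x80;
        uint16_t t2 = (uint16_t) *m * sa + 0x80;
        *s = (uint8_t) ((t1 + (t1 >> 8)) >> 8);
        *m = (uint8_t) ((t2 + (t2 >> 8)) >> 8);
    }
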
>  static force_inline __m64
>  core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
>  {
> @@ -729,6 +823,39 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
>  }
>
>  static void
> +mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
> +                        pixman_op_t              op,
> +                         uint32_t *                dest,
> +                         const uint32_t *          src,
> +                         const uint32_t *          mask,
> +                         int                      width)
> +{
> +    uint32_t *end = dest + width;
> +    uint32_t s32;
> +    uint64_t sa64;
> +    __m64 s64, d64;
> +
> +    while (dest < end)
> +       {
> +           s64 = combine (src, mask);
> +
> +           if (s64)
> +               {
> +                   store8888(&s32, s64);
> +                   sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> A_SHIFT);
> +                   d64 = pix_add (pix_multiply (load8888 (dest),expand_alpha_rev ((*(__m64*)&sa64))), s64);
> +                   store8888 (dest, d64);
> +               }
> +
> +           ++dest;
> +           ++src;
> +           if (mask)
> +               ++mask;
> +
> +       }
> +}
> +
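
In scalar terms the loop above computes, per pixel, dest' = src + dest * Fb with Fb = combine_disjoint_out_part (da, sa) = min (1, (1 - sa) / da): disjoint OVER keeps all of the source and scales the destination by whatever the source's alpha left uncovered. The s64 != 0 test only short-circuits fully transparent source pixels.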
> +static void
>  mmx_combine_over_u (pixman_implementation_t *imp,
>                      pixman_op_t              op,
>                      uint32_t *               dest,
> @@ -1062,7 +1189,294 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
>      }
>      _mm_empty ();
>  }
> +/* In functions such as 'combine_conjoint_general_u' there are multiple branches, selected by the parameter 'combine'.
> + * That value does not change while the function runs, so there is no need to test it per pixel as the original code
> + * does; it can be tested once at function entry to pick the corresponding function pointer, which is then called directly.
> + */
> +#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res)                                       \
> +    static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, type io_flag) \
> +    {                                                                                   \
> +       return res;                                                                      \
> +    }
> +
> +/* 'conjoint' has the same code structure as 'disjoint'; only the function names differ, so this macro generates the
> + * corresponding functions. The parameter order also differs and is selected by 'io_flag': '0' for 'in_part' and '1' for 'out_part'.
> + */
> +#define DEF_FUNC_COMBINE_JOINT_U(cd, io)                                                                \
> +    static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, uint8_t da, uint8_t io_flag) \
> +    {                                                                                                   \
> +       uint8_t parm[2];                                                                                 \
> +       parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0);                                           \
> +       parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1);                                           \
> +       return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]);                                   \
> +    }
> +/* Defines the array of function pointers from which the correct handler is picked at function entry */
> +#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix)                            \
> +    COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={  \
> +       combine_joint_zero_ ##suffix,                                   \
> +       combine_ ##cd## joint_out_part_ ##suffix,                       \
> +       combine_ ##cd## joint_in_part_ ##suffix,                        \
> +       combine_joint_mask_ ##suffix                                    \
> +    };
> +
> +typedef  uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t io_flag);
> +
> +DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0)
> +DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0)
> +
> +DEF_FUNC_COMBINE_JOINT_U(dis, in);
> +DEF_FUNC_COMBINE_JOINT_U(dis, out);
> +DEF_COMB_FUNC_ARR(dis,U,u)
> +
> +DEF_FUNC_COMBINE_JOINT_U(con, in);
> +DEF_FUNC_COMBINE_JOINT_U(con, out);
> +DEF_COMB_FUNC_ARR(con, U, u)
> +/* Common worker through which both the 'conjoint' and 'disjoint' combiners are implemented. */
> +static void
> +mmx_combine_joint_general_u (uint32_t * dest,
> +                        const uint32_t *src,
> +                        const uint32_t *mask,
> +                        int            width,
> +                        uint8_t        comb,
> +                        COMBINE_JOINT_FUNC_U *cjf)
> +{
> +    COMBINE_JOINT_FUNC_U combine_joint_u[2];
> +    combine_joint_u[0] = cjf[comb & COMBINE_A]; /* in_part */
> +    combine_joint_u[1] = cjf[(comb & COMBINE_B)>>2]; /* out_part */
> +
> +    uint32_t *end = dest + width;
> +    while (dest < end)
> +       {
> +           __m64 s64 = combine (src, mask);
> +           __m64 d64,sa64,da64;
> +           uint8_t sa, da;
> +           uint32_t tmp;
> +           uint64_t Fa, Fb;
> +
> +           /* Because these functions contain division, for which there are
> +            * no multimedia instructions, they are left as scalar code.
> +            */
> +           store8888(&tmp, s64);
> +           sa = tmp >> A_SHIFT;
> +           da = *dest >> A_SHIFT;
> +
> +           Fa = combine_joint_u[0](sa, da, 0);
> +           Fb = combine_joint_u[1](sa, da, 1);
> +
> +           d64 = load8888(dest);
> +           sa64 = expand_alpha_rev (*(__m64*)&Fa);
> +           da64 = expand_alpha_rev (*(__m64*)&Fb);
> +
> +           d64 = pix_add_mul (s64, sa64, d64, da64);
> +
> +           store8888 (dest, d64);
> +
> +           ++dest;
> +           ++src;
> +           if (mask)
> +               ++mask;
> +       }
> +}
> +
> +
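
To make the dispatch concrete, one worked example using the COMBINE_* values defined earlier in the patch — take COMBINE_A_OVER = A_OUT | B_OUT | A_IN = 1 | 4 | 2 = 7:

    /* Table order per DEF_COMB_FUNC_ARR: { zero, out, in, mask }.
     * combine_joint_u[0] = cjf[7 & COMBINE_A]        = cjf[3] -> mask (Fa = 1)
     * combine_joint_u[1] = cjf[(7 & COMBINE_B) >> 2] = cjf[1] -> out  (Fb = out part)
     * i.e. OVER: the source contributes fully and the destination is
     * scaled by the out factor computed from the two alphas. */
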
> +static void
> +mmx_combine_disjoint_general_u (uint32_t * dest,
> +                               const uint32_t *src,
> +                               const uint32_t *mask,
> +                               int            width,
> +                               uint8_t        comb)
> +{
> +    mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_disjoint_u);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
> +                          pixman_op_t              op,
> +                          uint32_t *                dest,
> +                          const uint32_t *          src,
> +                          const uint32_t *          mask,
> +                          int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
> +                                  pixman_op_t              op,
> +                                  uint32_t *                dest,
> +                                  const uint32_t *          src,
> +                                  const uint32_t *          mask,
> +                                  int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
> +                           pixman_op_t              op,
> +                           uint32_t *                dest,
> +                           const uint32_t *          src,
> +                           const uint32_t *          mask,
> +                           int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
> +                                   pixman_op_t              op,
> +                                   uint32_t *                dest,
> +                                   const uint32_t *          src,
> +                                   const uint32_t *          mask,
> +                                   int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
> +                            pixman_op_t              op,
> +                            uint32_t *                dest,
> +                            const uint32_t *          src,
> +                            const uint32_t *          mask,
> +                            int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
> +}
> +
> +static void
> +mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
> +                                    pixman_op_t              op,
> +                                    uint32_t *                dest,
> +                                    const uint32_t *          src,
> +                                    const uint32_t *          mask,
> +                                    int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
> +}
> +
> +static void
> +mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
> +                           pixman_op_t              op,
> +                           uint32_t *                dest,
> +                           const uint32_t *          src,
> +                           const uint32_t *          mask,
> +                           int                      width)
> +{
> +    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
> +}
> +
> +/* Conjoint */
> +static void
> +mmx_combine_conjoint_general_u(uint32_t * dest,
> +                              const uint32_t *src,
> +                              const uint32_t *mask,
> +                              int            width,
> +                              uint8_t        comb)
> +{
> +    mmx_combine_joint_general_u (dest, src, mask, width, comb, combine_conjoint_u);
> +}
> +
> +static void
> +mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
> +                            pixman_op_t              op,
> +                            uint32_t *                dest,
> +                            const uint32_t *          src,
> +                            const uint32_t *          mask,
> +                            int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
> +}
> +
> +static void
> +mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
> +                                    pixman_op_t              op,
> +                                    uint32_t *                dest,
> +                                    const uint32_t *          src,
> +                                    const uint32_t *          mask,
> +                                    int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
> +}
> +
> +static void
> +mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
> +                          pixman_op_t              op,
> +                          uint32_t *                dest,
> +                          const uint32_t *          src,
> +                          const uint32_t *          mask,
> +                          int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
> +}
> +
> +static void
> +mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
> +                                  pixman_op_t              op,
> +                                  uint32_t *                dest,
> +                                  const uint32_t *          src,
> +                                  const uint32_t *          mask,
> +                                  int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
> +}
> +
> +static void
> +mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
> +                           pixman_op_t              op,
> +                           uint32_t *                dest,
> +                           const uint32_t *          src,
> +                           const uint32_t *          mask,
> +                           int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
> +}
> +
> +static void
> +mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
> +                                   pixman_op_t              op,
> +                                   uint32_t *                dest,
> +                                   const uint32_t *          src,
> +                                   const uint32_t *          mask,
> +                                   int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
> +}
> +
> +static void
> +mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
> +                            pixman_op_t              op,
> +                            uint32_t *                dest,
> +                            const uint32_t *          src,
> +                            const uint32_t *          mask,
> +                            int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
> +}
> +
> +static void
> +mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
> +                                    pixman_op_t              op,
> +                                    uint32_t *                dest,
> +                                    const uint32_t *          src,
> +                                    const uint32_t *          mask,
> +                                    int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
> +}
> +
> +static void
> +mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
> +                           pixman_op_t              op,
> +                           uint32_t *                dest,
> +                           const uint32_t *          src,
> +                           const uint32_t *          mask,
> +                           int                      width)
> +{
> +    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
> +}
>
> +/* Component alpha combiners */
>  static void
>  mmx_combine_src_ca (pixman_implementation_t *imp,
>                      pixman_op_t              op,
> @@ -1089,6 +1503,410 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
>  }
>
>  static void
> +mmx_combine_saturate_ca (pixman_implementation_t *imp,
> +                        pixman_op_t              op,
> +                        uint32_t *                dest,
> +                        const uint32_t *          src,
> +                        const uint32_t *          mask,
> +                        int                      width)
> +{
> +    uint32_t *end = dest + width;
> +    while (dest < end)
> +       {
> +           uint16_t sa, sr, sg, sb;
> +           uint32_t sa32, m32;
> +           __m64 m64, s64, d64, sa64, da64, cmpf, res;
> +
> +           mmx_combine_mask_ca (src, mask, &s64, &m64);
> +
> +           d64 = load8888 (dest);
> +           da64 = expand_alpha (negate(d64));
> +           cmpf = _mm_cmpgt_pi16 (m64, da64);
> +           if (cmpf)
> +               {
> +                   store8888 (&m32, m64);
> +                   sa = (m32 >> (A_SHIFT));
> +                   sr = (m32 >> (R_SHIFT)) & MASK;
> +                   sg = (m32 >> (G_SHIFT)) & MASK;
> +                   sb =  m32               & MASK;
> +                   sa32 = (~(*dest) >> A_SHIFT) & MASK;
> +
> +                   sa = (sa) ? sa : 0x1;
> +                   sr = (sr) ? sr : 0x1;
> +                   sg = (sg) ? sg : 0x1;
> +                   sb = (sb) ? sb : 0x1;
> +
> +                   sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
> +                       ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
> +                       ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
> +                       ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
> +                   sa64 = load8888 (&sa32);
> +                   da64 = MC (4x00ff);
> +                   res = pix_multiply (s64, sa64);
> +                   s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (s64, negate (cmpf)));
> +                   res = pix_multiply (d64, da64);
> +                   d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 (d64, negate (cmpf)));
> +               }
> +           res = _mm_adds_pu8 (s64, d64);
> +           store8888 (dest, res);
> +
> +           ++dest;
> +           ++src;
> +           if (mask)
> +               ++mask;
> +       }
> +}
> +
> +#define DEF_FUNC_COMBINE_JOINT_CA(cd, io)                               \
> +    static inline uint32_t combine_ ##cd## joint_ ##io## _part_ca (uint32_t sa, uint32_t da, uint32_t io_flag) \
> +    {                                                                   \
> +        uint8_t da8 = da >> A_SHIFT;                                    \
> +        uint32_t m, n, o, p, res;                                       \
> +        uint8_t i, parm[2][4], shift = 0;                               \
> +                                                                        \
> +        for (i = 0; i < 4; i++)                                         \
> +        {                                                               \
> +            /* io_flag == 0: (channel of sa, da); io_flag == 1: swapped */ \
> +            parm[0][i] = (uint8_t)(sa >> shift) * (io_flag ^ 0x1) + da8 * io_flag; \
> +            parm[1][i] = (uint8_t)(sa >> shift) * io_flag + da8 * (io_flag ^ 0x1); \
> +            shift += G_SHIFT;                                           \
> +        }                                                               \
> +        m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], parm[1][0]); \
> +        n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], parm[1][1]) << G_SHIFT; \
> +        o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], parm[1][2]) << R_SHIFT; \
> +        p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], parm[1][3]) << A_SHIFT; \
> +        res = m | n | o | p;                                            \
> +        return res;                                                     \
> +    }
> +
> +typedef uint32_t (*COMBINE_JOINT_FUNC_CA) (uint32_t sa, uint32_t da, uint32_t io_flag);
> +
> +DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
> +DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
> +
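> +/*
> + * DEF_FUNC_COMBINE_JOINT_CA (dis, in), for example, expands to
> + * combine_disjoint_in_part_ca (), which applies
> + * combine_disjoint_in_part () to each of the four channels.
> + */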
> +DEF_FUNC_COMBINE_JOINT_CA(dis, in)
> +DEF_FUNC_COMBINE_JOINT_CA(dis, out)
> +DEF_COMB_FUNC_ARR(dis, CA, ca)
> +
> +DEF_FUNC_COMBINE_JOINT_CA(con, in)
> +DEF_FUNC_COMBINE_JOINT_CA(con, out)
> +DEF_COMB_FUNC_ARR(con, CA, ca)
> +
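> +/*
> + * comb carries the Render COMBINE_* bits: the low two bits (COMBINE_A)
> + * select the source factor Fa and the next two (COMBINE_B) select the
> + * destination factor Fb, indexing cjf, a four-entry table generated by
> + * DEF_COMB_FUNC_ARR above; index 0 is the constant-zero factor and
> + * index 3 the constant ~0 factor from DEF_FUNC_ZERO_MASK, with the
> + * out/in helpers in between.
> + */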
> +static void
> +mmx_combine_joint_general_ca (uint32_t * dest,
> +                             const uint32_t *src,
> +                             const uint32_t *mask,
> +                             int            width,
> +                             uint8_t        comb,
> +                             COMBINE_JOINT_FUNC_CA *cjf)
> +{
> +    COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
> +    uint32_t *end = dest + width;
> +
> +    /* The low two bits of comb select Fa, the next two select Fb. */
> +    combine_joint_ca[0] = cjf[comb & COMBINE_A];
> +    combine_joint_ca[1] = cjf[(comb & COMBINE_B) >> 2];
> +
> +    while (dest < end)
> +    {
> +        __m64 m64, s64, sa64, da64, d64;
> +        uint32_t m32, Fa, Fb;
> +
> +        mmx_combine_mask_ca (src, mask, &s64, &m64);
> +        store8888 (&m32, m64);
> +
> +        /* Per-channel factors for the source (Fa) and destination (Fb). */
> +        Fa = combine_joint_ca[0] (m32, *dest, 0);
> +        Fb = combine_joint_ca[1] (m32, *dest, 1);
> +
> +        sa64 = load8888 (&Fa);
> +        da64 = load8888 (&Fb);
> +
> +        /* dest = src * Fa + dest * Fb */
> +        d64 = load8888 (dest);
> +        d64 = pix_add_mul (s64, sa64, d64, da64);
> +
> +        store8888 (dest, d64);
> +
> +        ++dest;
> +        ++src;
> +        if (mask)
> +            ++mask;
> +    }
> +}
> +
> +static void
> +mmx_combine_disjoint_general_ca (uint32_t * dest,
> +                                const uint32_t *src,
> +                                const uint32_t *mask,
> +                                int            width,
> +                                uint8_t        comb)
> +{
> +    mmx_combine_joint_general_ca (dest, src, mask, width, comb, combine_disjoint_ca);
> +}
> +
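> +/*
> + * Operator-specific entry points: each just selects the (Fa, Fb) pair
> + * for its Render operator.  Disjoint OVER, for instance, is
> + * COMBINE_A_OVER, i.e. Fa = 1 with the disjoint out factor as Fb.
> + */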
> +static void
> +mmx_combine_disjoint_over_ca (pixman_implementation_t *imp,
> +                             pixman_op_t              op,
> +                             uint32_t *                dest,
> +                             const uint32_t *          src,
> +                             const uint32_t *          mask,
> +                             int                      width)
> +{
> +    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_ca (pixman_implementation_t *imp,
> +                           pixman_op_t              op,
> +                           uint32_t *                dest,
> +                           const uint32_t *          src,
> +                           const uint32_t *          mask,
> +                           int                      width)
> +{
> +    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
> +                                   pixman_op_t              op,
> +                                   uint32_t *                dest,
> +                                   const uint32_t *          src,
> +                                   const uint32_t *          mask,
> +                                   int                      width)
> +{
> +    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_ca (pixman_implementation_t *imp,
> +                            pixman_op_t              op,
> +                            uint32_t *                dest,
> +                            const uint32_t *          src,
> +                            const uint32_t *          mask,
> +                            int                      width)
> +{
> +    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
> +                                    pixman_op_t              op,
> +                                    uint32_t *                dest,
> +                                    const uint32_t *          src,
> +                                    const uint32_t *          mask,
> +                                    int                      width)
> +{
> +    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
> +}
> +
> +static void
> +mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp,
> +                             pixman_op_t              op,
> +                             uint32_t *                dest,
> +                             const uint32_t *          src,
> +                             const uint32_t *          mask,
> +