changeset 9556:800470abb9f7

asm-avx2: calcResidual code for 8bpp and 16bpp. 8bpp: [16x16](8.99x), [32x32](9.88x); 16bpp: [16x16](5.31x), [32x32](3.61x)
author Rajesh Paulraj <rajesh@multicorewareinc.com>
date Fri, 20 Feb 2015 12:39:44 +0530
parents bcd32fad1690
children f75011092766
files source/common/x86/asm-primitives.cpp source/common/x86/pixel-util.h source/common/x86/pixel-util8.asm
diffstat 3 files changed, 137 insertions(+), 11 deletions(-)
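The new primitive computes the block residual, residual = fenc - pred, widened to int16_t; `pixel` is uint8_t in 8bpp builds and uint16_t in HIGH_BIT_DEPTH (16bpp) builds. A minimal scalar sketch of the 16x16 case, matching the prototype added to pixel-util.h below (an illustrative reference only, not code from this change):

    #include <stdint.h>

    typedef uint8_t pixel;   // uint16_t in HIGH_BIT_DEPTH builds

    // Scalar reference for the 16x16 case; the AVX2 kernels in this
    // changeset are the vectorized equivalent of this loop.
    static void getResidual16_ref(const pixel* fenc, const pixel* pred,
                                  int16_t* residual, intptr_t stride)
    {
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
                residual[x] = (int16_t)(fenc[x] - pred[x]);
            fenc     += stride;
            pred     += stride;
            residual += stride;
        }
    }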
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 20 10:59:34 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 20 12:39:44 2015 +0530
@@ -1066,6 +1066,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
         // p.weight_pp = x265_weight_pp_avx2; fails tests
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
 
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
@@ -1550,6 +1552,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_avx2;
         p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_avx2;
         p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_avx2;
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
 
         p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
         p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
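The two hunks above register the kernels in both bit-depth configurations of setupAssemblyPrimitives, so callers reach the code only through the primitive table, never the AVX2 symbols directly. A minimal stand-in for that dispatch is sketched below; CUPrimitives, encodeResidual and cu16 are illustrative names, not x265 types:

    #include <stdint.h>

    typedef uint8_t pixel;   // uint16_t in HIGH_BIT_DEPTH builds
    typedef void (*calcresidual_t)(const pixel*, const pixel*, int16_t*, intptr_t);

    // Stand-in for the table entry: setup code stores the best available
    // kernel, callers just invoke the function pointer for their block size.
    struct CUPrimitives { calcresidual_t calcresidual; };

    void encodeResidual(const CUPrimitives& cu16, const pixel* fenc, const pixel* pred,
                        int16_t* residual, intptr_t stride)
    {
        cu16.calcresidual(fenc, pred, residual, stride);   // -> x265_getResidual16_avx2 on AVX2 CPUs
    }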
--- a/source/common/x86/pixel-util.h	Fri Feb 20 10:59:34 2015 +0530
+++ b/source/common/x86/pixel-util.h	Fri Feb 20 12:39:44 2015 +0530
@@ -30,6 +30,8 @@ void x265_getResidual16_sse2(const pixel
 void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 
 void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
 void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
--- a/source/common/x86/pixel-util8.asm	Fri Feb 20 10:59:34 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Feb 20 12:39:44 2015 +0530
@@ -102,9 +102,9 @@ cglobal getResidual4, 4,4,4
     punpcklqdq   m0, m1
     punpcklqdq   m2, m3
     psubw        m0, m2
-
     movh        [r2], m0
     movhps      [r2 + r3], m0
+    RET
 %else
 cglobal getResidual4, 4,4,5
     pxor        m0, m0
@@ -137,8 +137,8 @@ cglobal getResidual4, 4,4,5
     psubw       m1, m3
     movh        [r2], m1
     movhps      [r2 + r3 * 2], m1
+    RET
 %endif
-    RET
 
 
 INIT_XMM sse2
@@ -164,6 +164,7 @@ cglobal getResidual8, 4,4,4
     lea         r2, [r2 + r3 * 2]
 %endif
 %endrep
+    RET
 %else
 cglobal getResidual8, 4,4,5
     pxor        m0, m0
@@ -190,8 +191,9 @@ cglobal getResidual8, 4,4,5
     lea         r2, [r2 + r3 * 4]
 %endif
 %endrep
+    RET
 %endif
-    RET
+
 
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
@@ -245,10 +247,9 @@ cglobal getResidual16, 4,5,6
     lea         r0, [r0 + r3 * 2]
     lea         r1, [r1 + r3 * 2]
     lea         r2, [r2 + r3 * 2]
-
     jnz        .loop
+    RET
 %else
-
 INIT_XMM sse4
 cglobal getResidual16, 4,5,8
     mov         r4d, 16/4
@@ -309,11 +310,70 @@ cglobal getResidual16, 4,5,8
     lea         r0, [r0 + r3 * 2]
     lea         r1, [r1 + r3 * 2]
     lea         r2, [r2 + r3 * 4]
-
     jnz        .loop
+    RET
 %endif
 
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal getResidual16, 4,4,5
+    add         r3, r3
+    pxor        m0, m0
+
+%assign x 0
+%rep 16/2
+    movu        m1, [r0]
+    movu        m2, [r0 + r3]
+    movu        m3, [r1]
+    movu        m4, [r1 + r3]
+
+    psubw       m1, m3
+    psubw       m2, m4
+    movu        [r2], m1
+    movu        [r2 + r3], m2
+%assign x x+1
+%if (x != 8)
+    lea         r0, [r0 + r3 * 2]
+    lea         r1, [r1 + r3 * 2]
+    lea         r2, [r2 + r3 * 2]
+%endif
+%endrep
     RET
+%else
+INIT_YMM avx2
+cglobal getResidual16, 4,4,9
+    pxor        m0, m0
+    lea         r4, [r3 * 2]
+    add         r4d, r3d
+
+%assign x 0
+%rep 4
+    pmovzxbw    m1, [r0]
+    pmovzxbw    m2, [r0 + r3]
+    pmovzxbw    m3, [r0 + r3 * 2]
+    pmovzxbw    m4, [r0 + r4]
+    pmovzxbw    m5, [r1]
+    pmovzxbw    m6, [r1 + r3]
+    pmovzxbw    m7, [r1 + r3 * 2]
+    pmovzxbw    m8, [r1 + r4]
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+    psubw       m4, m8
+    movu        [r2], m1
+    movu        [r2 + r3 * 2], m2
+    movu        [r2 + r3 * 2 * 2], m3
+    movu        [r2 + r4 * 2], m4
+
+%assign x x+1
+%if (x != 4)
+    lea         r0, [r0 + r3 * 2 * 2]
+    lea         r1, [r1 + r3 * 2 * 2]
+    lea         r2, [r2 + r3 * 4 * 2]
+%endif
+%endrep
+    RET
+%endif
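In the 8bpp path just added, each pmovzxbw loads 16 source bytes and zero-extends them to sixteen 16-bit lanes of a ymm register, psubw forms the difference, and movu writes the 32-byte row at the int16_t residual stride; the HIGH_BIT_DEPTH path loads 16-bit pixels directly and subtracts without widening. A rough intrinsics rendering of one row of the 8bpp case (a sketch for reference, not the generated code; the asm above additionally unrolls four rows per step):

    #include <immintrin.h>
    #include <stdint.h>

    static void getResidual16_avx2_sketch(const uint8_t* fenc, const uint8_t* pred,
                                          int16_t* residual, intptr_t stride)
    {
        for (int y = 0; y < 16; y++)
        {
            // widen 16 source and 16 prediction bytes to 16-bit lanes
            __m256i f = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)fenc));
            __m256i p = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)pred));
            // residual = fenc - pred, stored as int16_t
            _mm256_storeu_si256((__m256i*)residual, _mm256_sub_epi16(f, p));
            fenc     += stride;
            pred     += stride;
            residual += stride;
        }
    }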
 
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
@@ -364,9 +424,8 @@ cglobal getResidual32, 4,5,6
     lea         r0, [r0 + r3 * 2]
     lea         r1, [r1 + r3 * 2]
     lea         r2, [r2 + r3 * 2]
-
     jnz        .loop
-
+    RET
 %else
 INIT_XMM sse4
 cglobal getResidual32, 4,5,7
@@ -422,12 +481,73 @@ cglobal getResidual32, 4,5,7
     lea         r0, [r0 + r3 * 2]
     lea         r1, [r1 + r3 * 2]
     lea         r2, [r2 + r3 * 4]
-
     jnz        .loop
+    RET
 %endif
+
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal getResidual32, 4,4,5
+    add         r3, r3
+    pxor        m0, m0
+
+%assign x 0
+%rep 32
+    movu        m1, [r0]
+    movu        m2, [r0 + 32]
+    movu        m3, [r1]
+    movu        m4, [r1 + 32]
+
+    psubw       m1, m3
+    psubw       m2, m4
+    movu        [r2], m1
+    movu        [r2 + 32], m2
+%assign x x+1
+%if (x != 32)
+    lea         r0, [r0 + r3]
+    lea         r1, [r1 + r3]
+    lea         r2, [r2 + r3]
+%endif
+%endrep
     RET
-
-
+%else
+INIT_YMM avx2
+cglobal getResidual32, 4,4,9
+    pxor        m0, m0
+    lea         r4, [r3 * 2]
+
+%assign x 0
+%rep 16
+    pmovzxbw    m1, [r0]
+    pmovzxbw    m2, [r0 + 16]
+    pmovzxbw    m3, [r0 + r3]
+    pmovzxbw    m4, [r0 + r3 + 16]
+
+    pmovzxbw    m5, [r1]
+    pmovzxbw    m6, [r1 + 16]
+    pmovzxbw    m7, [r1 + r3]
+    pmovzxbw    m8, [r1 + r3 + 16]
+
+    psubw       m1, m5
+    psubw       m2, m6
+    psubw       m3, m7
+    psubw       m4, m8
+
+    movu        [r2 + 0 ], m1
+    movu        [r2 + 32], m2
+    movu        [r2 + r4 + 0], m3
+    movu        [r2 + r4 + 32], m4
+
+%assign x x+1
+%if (x != 16)
+    lea         r0, [r0 + r3 * 2]
+    lea         r1, [r1 + r3 * 2]
+    lea         r2, [r2 + r3 * 4]
+%endif
+%endrep
+    RET
+%endif
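getResidual32 follows the same pattern at twice the width: every 32-pixel row is handled as two ymm halves. A sketch of the HIGH_BIT_DEPTH variant, where `pixel` is uint16_t so each half is a plain 32-byte load and a 16-bit subtract with no widening (illustrative only; the asm above unrolls the loop and works with the byte stride doubled in r3):

    #include <immintrin.h>
    #include <stdint.h>

    static void getResidual32_hbd_avx2_sketch(const uint16_t* fenc, const uint16_t* pred,
                                              int16_t* residual, intptr_t stride)
    {
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x += 16)   // two 16-pixel (32-byte) halves per row
            {
                __m256i f = _mm256_loadu_si256((const __m256i*)(fenc + x));
                __m256i p = _mm256_loadu_si256((const __m256i*)(pred + x));
                _mm256_storeu_si256((__m256i*)(residual + x), _mm256_sub_epi16(f, p));
            }
            fenc     += stride;
            pred     += stride;
            residual += stride;
        }
    }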
 ;-----------------------------------------------------------------------------
 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------