changeset 9527:4f36bb90fbb4 draft

asm-avx2: cpy2Dto1D_shr_32 cpy2Dto1D_shr[32x32] 20.55x 1050.20 21585.50
author Praveen Tiwari <praveen@multicorewareinc.com>
date Tue, 17 Feb 2015 14:11:09 +0530
parents 757333cafbe9
children 66f0864f5b87
files source/common/x86/asm-primitives.cpp source/common/x86/blockcopy8.asm source/common/x86/blockcopy8.h
diffstat 3 files changed, 58 insertions(+-), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 13:47:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 14:11:09 2015 +0530
@@ -1456,6 +1456,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_avx2;
 
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
--- a/source/common/x86/blockcopy8.asm	Tue Feb 17 13:47:54 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Feb 17 14:11:09 2015 +0530
@@ -4653,6 +4653,62 @@ cglobal cpy2Dto1D_shr_32, 3, 4, 6
     jnz            .loop
     RET
 
+INIT_YMM avx2                           ; AVX2 32x32 variant: dst[i] = (src[i] + round) >> shift, dst written packed
+cglobal cpy2Dto1D_shr_32, 4, 5, 4       ; r0 = dst, r1 = src, r2 = srcStride, r3 = shift; r4 and m0-m3 are scratch
+    add        r2, r2                    ; stride doubled to bytes (int16_t src); full-width add keeps the intptr_t value intact on x86-64
+    movd       xm0, r3d                  ; xm0 = shift count used by the variable shifts below
+    pcmpeqw    m1, m1                    ; m1 = -1 in every 16-bit lane
+    psllw      m1, xm0                   ; m1 = -(1 << shift)
+    psraw      m1, 1                     ; m1 = -(1 << (shift - 1)) = -round; NOTE(review): shift == 0 would add 1, callers presumably pass shift >= 1
+    lea        r3, [r2 * 3]              ; r3 = 3 * stride in bytes (shift already saved in xm0, so r3 is free)
+    mov        r4d, 32/4                 ; 32 rows, 4 rows handled per iteration

+.loop:                                  ; r0 = next packed dst position, r1 = current src row, r4d = iterations left
+    ; Row 0: two mmsize-byte loads cover the 32 int16_t values of the row
+    movu       m2, [r1]
+    movu       m3, [r1 + mmsize]
+    psubw      m2, m1                    ; subtracting -round adds the rounding term
+    psraw      m2, xm0                   ; arithmetic shift keeps the sign of each coefficient
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 0 * mmsize], m2
+    movu       [r0 + 1 * mmsize], m3

+    ; Row 1: src + stride, stored right after row 0 in dst
+    movu       m2, [r1 + r2]
+    movu       m3, [r1 + r2 + mmsize]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 2 * mmsize], m2
+    movu       [r0 + 3 * mmsize], m3

+    ; Row 2: src + 2 * stride
+    movu       m2, [r1 + 2 * r2]
+    movu       m3, [r1 + 2 * r2 + mmsize]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 4 * mmsize], m2
+    movu       [r0 + 5 * mmsize], m3

+    ; Row 3: src + 3 * stride (r3 holds the precomputed 3 * stride)
+    movu       m2, [r1 + r3]
+    movu       m3, [r1 + r3 + mmsize]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 6 * mmsize], m2
+    movu       [r0 + 7 * mmsize], m3

+    add        r0, 8 * mmsize            ; dst advances by the 4 packed rows just written (8 vectors)
+    lea        r1, [r1 + 4 * r2]         ; src advances by 4 strided rows
+    dec        r4d
+    jnz        .loop                     ; repeat until all 32 rows are done
+    RET
 
 ;--------------------------------------------------------------------------------------
 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
--- a/source/common/x86/blockcopy8.h	Tue Feb 17 13:47:54 2015 +0530
+++ b/source/common/x86/blockcopy8.h	Tue Feb 17 14:11:09 2015 +0530
@@ -53,6 +53,7 @@ void x265_cpy2Dto1D_shl_16_avx2(int16_t*
 void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);