changeset 9526:757333cafbe9 draft

asm-avx2: cpy2Dto1D_shr_16 cpy2Dto1D_shr[16x16] 25.39x 305.42 7755.48
author Praveen Tiwari <praveen@multicorewareinc.com>
date Tue, 17 Feb 2015 13:47:54 +0530
parents d2fdee36c259
children 4f36bb90fbb4
files source/common/x86/asm-primitives.cpp source/common/x86/blockcopy8.asm source/common/x86/blockcopy8.h
diffstat 3 files changed, 60 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 12:19:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 13:47:54 2015 +0530
@@ -1455,6 +1455,7 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
 
         p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2;
 
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
--- a/source/common/x86/blockcopy8.asm	Tue Feb 17 12:19:55 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Feb 17 13:47:54 2015 +0530
@@ -4549,6 +4549,64 @@ cglobal cpy2Dto1D_shr_16, 3, 4, 4
     jnz            .loop
     RET
 
+INIT_YMM avx2                          ; x86inc: from here m# are ymm registers and mmsize is 32 bytes
+cglobal cpy2Dto1D_shr_16, 4, 5, 4      ; 16x16 rounded right-shift copy; r0 = dst, r1 = src, r2 = srcStride, r3 = shift; 5 GPRs, 4 vector regs
+    add        r2d, r2d                ; srcStride: int16_t units -> bytes. NOTE(review): 32-bit add zero-extends r2, so this presumes a non-negative stride - confirm with callers
+    movd       xm0, r3d                ; xm0 = shift, used as the count for the variable word shifts below
+    pcmpeqw    m1, m1                  ; m1 = -1 (0xFFFF) in every word
+    psllw      m1, xm0                 ; m1 = -(1 << shift) in every word
+    psraw      m1, 1                   ; m1 = -(1 << (shift - 1)): the negated rounding offset (presumes shift >= 1 - TODO confirm)
+    lea        r3, [r2 * 3]            ; shift now lives in xm0, so r3 is reused as 3 * row stride in bytes
+    mov        r4d, 16/8               ; loop counter: 16 rows handled 8 per iteration
+
+.loop:
+    ; Rows 0-1 of the current 8-row batch
+    movu       m2, [r1]                ; one row = 16 int16_t = 32 bytes = one ymm load
+    movu       m3, [r1 + r2]           ; next row, one stride further
+    psubw      m2, m1                  ; subtracting the negated offset adds the rounding offset (wrapping word arithmetic)
+    psraw      m2, xm0                 ; arithmetic right shift of every word by shift
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 0 * mmsize], m2   ; dst rows are written back to back, mmsize bytes apart (no dst stride)
+    movu       [r0 + 1 * mmsize], m3
+
+    ; Rows 2-3 of the current batch
+    movu       m2, [r1 + 2 * r2]
+    movu       m3, [r1 + r3]           ; r3 = 3 * stride -> row 3
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 2 * mmsize], m2
+    movu       [r0 + 3 * mmsize], m3
+
+    ; Rows 4-5 of the current batch (src pointer moved down 4 rows first)
+    lea        r1, [r1 + 4 * r2]       ; r1 -> row 4 of the batch
+    movu       m2, [r1]
+    movu       m3, [r1 + r2]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 4 * mmsize], m2
+    movu       [r0 + 5 * mmsize], m3
+
+    ; Rows 6-7 of the current batch
+    movu       m2, [r1 + 2 * r2]
+    movu       m3, [r1 + r3]
+    psubw      m2, m1
+    psraw      m2, xm0
+    psubw      m3, m1
+    psraw      m3, xm0
+    movu       [r0 + 6 * mmsize], m2
+    movu       [r0 + 7 * mmsize], m3
+
+    add        r0, 8 * mmsize          ; dst advances past the 8 rows just stored
+    lea        r1, [r1 + 4 * r2]       ; src advances to the first row of the next batch
+    dec        r4d
+    jnz        .loop                   ; run until both 8-row batches are done
+    RET                                ; x86inc return macro
+
 
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
--- a/source/common/x86/blockcopy8.h	Tue Feb 17 12:19:55 2015 +0530
+++ b/source/common/x86/blockcopy8.h	Tue Feb 17 13:47:54 2015 +0530
@@ -52,6 +52,7 @@ void x265_cpy2Dto1D_shl_8_avx2(int16_t* 
 void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);