changeset 9525:d2fdee36c259 draft

asm-avx2: cpy2Dto1D_shr_8 cpy2Dto1D_shr[8x8] 8.04x 132.45 1065.37
author Praveen Tiwari <praveen@multicorewareinc.com>
date Tue, 17 Feb 2015 12:19:55 +0530
parents ae80a972b770
children 757333cafbe9
files source/common/x86/asm-primitives.cpp source/common/x86/blockcopy8.asm source/common/x86/blockcopy8.h
diffstat 3 files changed, 38 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 11:59:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 12:19:55 2015 +0530
@@ -1454,6 +1454,8 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
 
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
+
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
--- a/source/common/x86/blockcopy8.asm	Tue Feb 17 11:59:10 2015 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Feb 17 12:19:55 2015 +0530
@@ -4466,6 +4466,41 @@ cglobal cpy2Dto1D_shr_8, 3, 5, 4
     jnz            .loop
     RET
 
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_8, 3, 4, 4        ; 8x8 int16 copy, 2D src -> packed 1D dst: dst = (src + round) >> shift; r0 = dst, r1 = src, r2 = srcStride
+    add        r2d, r2d                 ; stride: int16 elements -> bytes
+    movd       xm0, r3m                 ; xm0 = shift (4th argument), used as the vector shift count
+    pcmpeqw    m1, m1                   ; every word = 0xFFFF (-1)
+    psllw      m1, xm0                  ; every word = -1 << shift
+    psraw      m1, 1                    ; every word = -(1 << (shift - 1)): negated rounding offset (assumes shift >= 1)
+    lea        r3, [r2 * 3]             ; r3 = 3 * byte stride; safe to reuse r3 now that shift lives in xm0
+
+    ; Row 0-3: each ymm register carries two 8-sample rows (low lane = even row, high lane = odd row)
+    movu           xm2, [r1]                    ; row 0 -> low 128 bits of m2
+    vinserti128    m2, m2, [r1 + r2], 1         ; row 1 -> high 128 bits of m2
+    movu           xm3, [r1 + 2 * r2]           ; row 2 -> low 128 bits of m3
+    vinserti128    m3, m3, [r1 + r3], 1         ; row 3 -> high 128 bits of m3
+    psubw          m2, m1                       ; subtracting the negated offset adds the rounding term
+    psraw          m2, xm0                      ; arithmetic right shift by 'shift'
+    psubw          m3, m1                       ; same rounding for rows 2-3
+    psraw          m3, xm0
+    movu           [r0], m2                     ; dst bytes 0..31   (rows 0-1, stored back to back)
+    movu           [r0 + 32], m3                ; dst bytes 32..63  (rows 2-3)
+
+    ; Row 4-7: same sequence after advancing the source pointer by four rows
+    lea            r1, [r1 + 4 * r2]            ; r1 -> row 4
+    movu           xm2, [r1]                    ; row 4 -> low 128 bits of m2
+    vinserti128    m2, m2, [r1 + r2], 1         ; row 5 -> high 128 bits of m2
+    movu           xm3, [r1 + 2 * r2]           ; row 6 -> low 128 bits of m3
+    vinserti128    m3, m3, [r1 + r3], 1         ; row 7 -> high 128 bits of m3
+    psubw          m2, m1                       ; add rounding term
+    psraw          m2, xm0                      ; arithmetic right shift by 'shift'
+    psubw          m3, m1
+    psraw          m3, xm0
+    movu           [r0 + 64], m2                ; dst bytes 64..95  (rows 4-5)
+    movu           [r0 + 96], m3                ; dst bytes 96..127 (rows 6-7)
+    RET
+
 
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
--- a/source/common/x86/blockcopy8.h	Tue Feb 17 11:59:10 2015 +0530
+++ b/source/common/x86/blockcopy8.h	Tue Feb 17 12:19:55 2015 +0530
@@ -51,6 +51,7 @@ void x265_cpy1Dto2D_shr_32_sse2(int16_t*
 void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);