Mercurial > x265
changeset 9527:4f36bb90fbb4 draft
asm-avx2: cpy2Dto1D_shr_32
cpy2Dto1D_shr[32x32] 20.55x 1050.20 21585.50
author | Praveen Tiwari <praveen@multicorewareinc.com> |
---|---|
date | Tue, 17 Feb 2015 14:11:09 +0530 |
parents | 757333cafbe9 |
children | 66f0864f5b87 |
files | source/common/x86/asm-primitives.cpp source/common/x86/blockcopy8.asm source/common/x86/blockcopy8.h |
diffstat | 3 files changed, 58 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Tue Feb 17 13:47:54 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Feb 17 14:11:09 2015 +0530
@@ -1456,6 +1456,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2;
+        p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_avx2;
 
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
--- a/source/common/x86/blockcopy8.asm Tue Feb 17 13:47:54 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Feb 17 14:11:09 2015 +0530
@@ -4653,6 +4653,62 @@ cglobal cpy2Dto1D_shr_32, 3, 4, 6
     jnz .loop
     RET
 
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_32, 4, 5, 4
+    add r2d, r2d
+    movd xm0, r3d
+    pcmpeqw m1, m1
+    psllw m1, xm0
+    psraw m1, 1
+    lea r3, [r2 * 3]
+    mov r4d, 32/4
+
+.loop:
+    ; Row 0
+    movu m2, [r1]
+    movu m3, [r1 + 32]
+    psubw m2, m1
+    psraw m2, xm0
+    psubw m3, m1
+    psraw m3, xm0
+    movu [r0 + 0 * mmsize], m2
+    movu [r0 + 1 * mmsize], m3
+
+    ; Row 1
+    movu m2, [r1 + r2]
+    movu m3, [r1 + r2 + 32]
+    psubw m2, m1
+    psraw m2, xm0
+    psubw m3, m1
+    psraw m3, xm0
+    movu [r0 + 2 * mmsize], m2
+    movu [r0 + 3 * mmsize], m3
+
+    ; Row 2
+    movu m2, [r1 + 2 * r2]
+    movu m3, [r1 + 2 * r2 + 32]
+    psubw m2, m1
+    psraw m2, xm0
+    psubw m3, m1
+    psraw m3, xm0
+    movu [r0 + 4 * mmsize], m2
+    movu [r0 + 5 * mmsize], m3
+
+    ; Row 3
+    movu m2, [r1 + r3]
+    movu m3, [r1 + r3 + 32]
+    psubw m2, m1
+    psraw m2, xm0
+    psubw m3, m1
+    psraw m3, xm0
+    movu [r0 + 6 * mmsize], m2
+    movu [r0 + 7 * mmsize], m3
+
+    add r0, 8 * mmsize
+    lea r1, [r1 + 4 * r2]
+    dec r4d
+    jnz .loop
+    RET
 
 ;--------------------------------------------------------------------------------------
 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
--- a/source/common/x86/blockcopy8.h Tue Feb 17 13:47:54 2015 +0530
+++ b/source/common/x86/blockcopy8.h Tue Feb 17 14:11:09 2015 +0530
@@ -53,6 +53,7 @@ void x265_cpy2Dto1D_shl_16_avx2(int16_t*
 void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);