Mercurial > x265
changeset 9526:757333cafbe9 draft
asm-avx2: cpy2Dto1D_shr_16
cpy2Dto1D_shr[16x16] 25.39x 305.42 7755.48
author | Praveen Tiwari <praveen@multicorewareinc.com> |
---|---|
date | Tue, 17 Feb 2015 13:47:54 +0530 |
parents | d2fdee36c259 |
children | 4f36bb90fbb4 |
files | source/common/x86/asm-primitives.cpp source/common/x86/blockcopy8.asm source/common/x86/blockcopy8.h |
diffstat | 3 files changed, 60 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Tue Feb 17 12:19:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Feb 17 13:47:54 2015 +0530
@@ -1455,6 +1455,7 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
         p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
+        p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2;
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
--- a/source/common/x86/blockcopy8.asm Tue Feb 17 12:19:55 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Feb 17 13:47:54 2015 +0530
@@ -4549,6 +4549,64 @@ cglobal cpy2Dto1D_shr_16, 3, 4, 4
     jnz         .loop
     RET
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_16, 4, 5, 4
+    add         r2d, r2d
+    movd        xm0, r3d
+    pcmpeqw     m1, m1
+    psllw       m1, xm0
+    psraw       m1, 1
+    lea         r3, [r2 * 3]
+    mov         r4d, 16/8
+
+.loop:
+    ; Row 0-1
+    movu        m2, [r1]
+    movu        m3, [r1 + r2]
+    psubw       m2, m1
+    psraw       m2, xm0
+    psubw       m3, m1
+    psraw       m3, xm0
+    movu        [r0 + 0 * mmsize], m2
+    movu        [r0 + 1 * mmsize], m3
+
+    ; Row 2-3
+    movu        m2, [r1 + 2 * r2]
+    movu        m3, [r1 + r3]
+    psubw       m2, m1
+    psraw       m2, xm0
+    psubw       m3, m1
+    psraw       m3, xm0
+    movu        [r0 + 2 * mmsize], m2
+    movu        [r0 + 3 * mmsize], m3
+
+    ; Row 4-5
+    lea         r1, [r1 + 4 * r2]
+    movu        m2, [r1]
+    movu        m3, [r1 + r2]
+    psubw       m2, m1
+    psraw       m2, xm0
+    psubw       m3, m1
+    psraw       m3, xm0
+    movu        [r0 + 4 * mmsize], m2
+    movu        [r0 + 5 * mmsize], m3
+
+    ; Row 6-7
+    movu        m2, [r1 + 2 * r2]
+    movu        m3, [r1 + r3]
+    psubw       m2, m1
+    psraw       m2, xm0
+    psubw       m3, m1
+    psraw       m3, xm0
+    movu        [r0 + 6 * mmsize], m2
+    movu        [r0 + 7 * mmsize], m3
+
+    add         r0, 8 * mmsize
+    lea         r1, [r1 + 4 * r2]
+    dec         r4d
+    jnz         .loop
+    RET
+
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
--- a/source/common/x86/blockcopy8.h Tue Feb 17 12:19:55 2015 +0530
+++ b/source/common/x86/blockcopy8.h Tue Feb 17 13:47:54 2015 +0530
@@ -52,6 +52,7 @@ void x265_cpy2Dto1D_shl_8_avx2(int16_t*
 void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);