Mercurial > x265
changeset 9525:d2fdee36c259 draft
asm-avx2: cpy2Dto1D_shr_8
cpy2Dto1D_shr[8x8] 8.04x 132.45 1065.37
author | Praveen Tiwari <praveen@multicorewareinc.com> |
---|---|
date | Tue, 17 Feb 2015 12:19:55 +0530 |
parents | ae80a972b770 |
children | 757333cafbe9 |
files | source/common/x86/asm-primitives.cpp source/common/x86/blockcopy8.asm source/common/x86/blockcopy8.h |
diffstat | 3 files changed, 38 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/source/common/x86/asm-primitives.cpp Tue Feb 17 11:59:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Feb 17 12:19:55 2015 +0530
@@ -1454,6 +1454,8 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
 
+        p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2;
+
         p.denoiseDct = x265_denoise_dct_avx2;
         p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
```
```diff
--- a/source/common/x86/blockcopy8.asm Tue Feb 17 11:59:10 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Feb 17 12:19:55 2015 +0530
@@ -4466,6 +4466,41 @@ cglobal cpy2Dto1D_shr_8, 3, 5, 4
     jnz .loop
     RET
 
+INIT_YMM avx2
+cglobal cpy2Dto1D_shr_8, 3, 4, 4
+    add r2d, r2d
+    movd xm0, r3m
+    pcmpeqw m1, m1
+    psllw m1, xm0
+    psraw m1, 1
+    lea r3, [r2 * 3]
+
+    ; Row 0-3
+    movu xm2, [r1]
+    vinserti128 m2, m2, [r1 + r2], 1
+    movu xm3, [r1 + 2 * r2]
+    vinserti128 m3, m3, [r1 + r3], 1
+    psubw m2, m1
+    psraw m2, xm0
+    psubw m3, m1
+    psraw m3, xm0
+    movu [r0], m2
+    movu [r0 + 32], m3
+
+    ; Row 4-7
+    lea r1, [r1 + 4 * r2]
+    movu xm2, [r1]
+    vinserti128 m2, m2, [r1 + r2], 1
+    movu xm3, [r1 + 2 * r2]
+    vinserti128 m3, m3, [r1 + r3], 1
+    psubw m2, m1
+    psraw m2, xm0
+    psubw m3, m1
+    psraw m3, xm0
+    movu [r0 + 64], m2
+    movu [r0 + 96], m3
+    RET
+
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
```
```diff
--- a/source/common/x86/blockcopy8.h Tue Feb 17 11:59:10 2015 +0530
+++ b/source/common/x86/blockcopy8.h Tue Feb 17 12:19:55 2015 +0530
@@ -51,6 +51,7 @@ void x265_cpy1Dto2D_shr_32_sse2(int16_t*
 void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
+void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
 uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
```