x265: changeset 9556:800470abb9f7 (draft)
asm-avx2: calcResidual code for 8bpp and 16bpp
8bpp: [16x16](8.99x), [32x32](9.88x)
16bpp: [16x16](5.31x), [32x32](3.61x)
author    Rajesh Paulraj <rajesh@multicorewareinc.com>
date      Fri, 20 Feb 2015 12:39:44 +0530
parents   bcd32fad1690
children  f75011092766
files     source/common/x86/asm-primitives.cpp source/common/x86/pixel-util.h source/common/x86/pixel-util8.asm
diffstat  3 files changed, 138 insertions(+), 12 deletions(-)
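For context, calcresidual computes the element-wise difference between the source block (fenc) and its prediction (pred) and stores it as 16-bit residuals. A minimal scalar sketch of that operation, using the prototype added in pixel-util.h below; the explicit blockSize parameter is added here only for illustration (x265 bakes the block size into each primitive):

#include <cstdint>

typedef uint8_t pixel; // 8bpp builds; 16bpp (HIGH_BIT_DEPTH) builds use uint16_t

// Scalar reference of what the getResidual kernels compute:
// residual[x] = fenc[x] - pred[x] for every row of a blockSize x blockSize block.
static void getResidual_ref(const pixel* fenc, const pixel* pred,
                            int16_t* residual, intptr_t stride, int blockSize)
{
    for (int y = 0; y < blockSize; y++)
    {
        for (int x = 0; x < blockSize; x++)
            residual[x] = (int16_t)(fenc[x] - pred[x]);

        fenc     += stride;
        pred     += stride;
        residual += stride;
    }
}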
--- a/source/common/x86/asm-primitives.cpp Fri Feb 20 10:59:34 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Feb 20 12:39:44 2015 +0530
@@ -1066,6 +1066,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_avx2;
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
         // p.weight_pp = x265_weight_pp_avx2; fails tests
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
 
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2);
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2);
@@ -1550,6 +1552,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_avx2;
         p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_avx2;
         p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_avx2;
+        p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
+        p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
 
         p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx;
         p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
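These assignments only populate the EncoderPrimitives function-pointer table when the AVX2 CPU flag is detected; callers reach the new kernels through that table rather than by name. A hedged sketch of such a call site, assuming x265's internal primitives header (buffer names are placeholders, not taken from this changeset):

#include "primitives.h" // x265 internal header declaring EncoderPrimitives (include path depends on build setup)

// Illustrative only: compute the residual of a 16x16 CU through the primitive
// table. When setupAssemblyPrimitives ran with AVX2 detected, this dispatches
// to x265_getResidual16_avx2; otherwise to an SSE or C fallback.
static void computeResidual16(const EncoderPrimitives& p,
                              const pixel* fenc, const pixel* pred,
                              int16_t* residual, intptr_t stride)
{
    p.cu[BLOCK_16x16].calcresidual(fenc, pred, residual, stride);
}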
--- a/source/common/x86/pixel-util.h Fri Feb 20 10:59:34 2015 +0530
+++ b/source/common/x86/pixel-util.h Fri Feb 20 12:39:44 2015 +0530
@@ -30,6 +30,8 @@ void x265_getResidual16_sse2(const pixel
 void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual16_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void x265_getResidual32_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
 
 void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride);
 void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride);
--- a/source/common/x86/pixel-util8.asm Fri Feb 20 10:59:34 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Feb 20 12:39:44 2015 +0530
@@ -102,9 +102,9 @@ cglobal getResidual4, 4,4,4
     punpcklqdq m0, m1
     punpcklqdq m2, m3
     psubw      m0, m2
-
     movh       [r2], m0
     movhps     [r2 + r3], m0
+    RET
 %else
 cglobal getResidual4, 4,4,5
     pxor       m0, m0
@@ -137,8 +137,8 @@ cglobal getResidual4, 4,4,5
     psubw      m1, m3
     movh       [r2], m1
     movhps     [r2 + r3 * 2], m1
+    RET
 %endif
-    RET
 
 
 INIT_XMM sse2
@@ -164,6 +164,7 @@ cglobal getResidual8, 4,4,4
     lea        r2, [r2 + r3 * 2]
 %endif
 %endrep
+    RET
 %else
 cglobal getResidual8, 4,4,5
     pxor       m0, m0
@@ -190,8 +191,9 @@ cglobal getResidual8, 4,4,5
     lea        r2, [r2 + r3 * 4]
 %endif
 %endrep
+    RET
 %endif
-    RET
+
 
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
@@ -245,10 +247,9 @@ cglobal getResidual16, 4,5,6
     lea        r0, [r0 + r3 * 2]
     lea        r1, [r1 + r3 * 2]
     lea        r2, [r2 + r3 * 2]
-
     jnz        .loop
+    RET
 %else
-
 INIT_XMM sse4
 cglobal getResidual16, 4,5,8
     mov        r4d, 16/4
@@ -309,11 +310,70 @@ cglobal getResidual16, 4,5,8
     lea        r0, [r0 + r3 * 2]
     lea        r1, [r1 + r3 * 2]
     lea        r2, [r2 + r3 * 4]
-
     jnz        .loop
+    RET
+%endif
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal getResidual16, 4,4,5
+    add        r3, r3
+    pxor       m0, m0
+
+%assign x 0
+%rep 16/2
+    movu       m1, [r0]
+    movu       m2, [r0 + r3]
+    movu       m3, [r1]
+    movu       m4, [r1 + r3]
+
+    psubw      m1, m3
+    psubw      m2, m4
+    movu       [r2], m1
+    movu       [r2 + r3], m2
+%assign x x+1
+%if (x != 8)
+    lea        r0, [r0 + r3 * 2]
+    lea        r1, [r1 + r3 * 2]
+    lea        r2, [r2 + r3 * 2]
 %endif
-
+%endrep
     RET
+%else
+INIT_YMM avx2
+cglobal getResidual16, 4,4,9
+    pxor       m0, m0
+    lea        r4, [r3 * 2]
+    add        r4d, r3d
+
+%assign x 0
+%rep 4
+    pmovzxbw   m1, [r0]
+    pmovzxbw   m2, [r0 + r3]
+    pmovzxbw   m3, [r0 + r3 * 2]
+    pmovzxbw   m4, [r0 + r4]
+    pmovzxbw   m5, [r1]
+    pmovzxbw   m6, [r1 + r3]
+    pmovzxbw   m7, [r1 + r3 * 2]
+    pmovzxbw   m8, [r1 + r4]
+    psubw      m1, m5
+    psubw      m2, m6
+    psubw      m3, m7
+    psubw      m4, m8
+    movu       [r2], m1
+    movu       [r2 + r3 * 2], m2
+    movu       [r2 + r3 * 2 * 2], m3
+    movu       [r2 + r4 * 2], m4
+
+%assign x x+1
+%if (x != 4)
+    lea        r0, [r0 + r3 * 2 * 2]
+    lea        r1, [r1 + r3 * 2 * 2]
+    lea        r2, [r2 + r3 * 4 * 2]
+%endif
+%endrep
+    RET
+%endif
 
 %if HIGH_BIT_DEPTH
 INIT_XMM sse2
@@ -364,9 +424,8 @@ cglobal getResidual32, 4,5,6
     lea        r0, [r0 + r3 * 2]
     lea        r1, [r1 + r3 * 2]
     lea        r2, [r2 + r3 * 2]
-
     jnz        .loop
-
+    RET
 %else
 INIT_XMM sse4
 cglobal getResidual32, 4,5,7
@@ -422,12 +481,73 @@ cglobal getResidual32, 4,5,7
     lea        r0, [r0 + r3 * 2]
     lea        r1, [r1 + r3 * 2]
     lea        r2, [r2 + r3 * 4]
-
     jnz        .loop
+    RET
+%endif
+
+
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal getResidual32, 4,4,5
+    add        r3, r3
+    pxor       m0, m0
+
+%assign x 0
+%rep 32
+    movu       m1, [r0]
+    movu       m2, [r0 + 32]
+    movu       m3, [r1]
+    movu       m4, [r1 + 32]
+
+    psubw      m1, m3
+    psubw      m2, m4
+    movu       [r2], m1
+    movu       [r2 + 32], m2
+%assign x x+1
+%if (x != 32)
+    lea        r0, [r0 + r3]
+    lea        r1, [r1 + r3]
+    lea        r2, [r2 + r3]
 %endif
+%endrep
     RET
-
-
+%else
+INIT_YMM avx2
+cglobal getResidual32, 4,4,9
+    pxor       m0, m0
+    lea        r4, [r3 * 2]
+
+%assign x 0
+%rep 16
+    pmovzxbw   m1, [r0]
+    pmovzxbw   m2, [r0 + 16]
+    pmovzxbw   m3, [r0 + r3]
+    pmovzxbw   m4, [r0 + r3 + 16]
+
+    pmovzxbw   m5, [r1]
+    pmovzxbw   m6, [r1 + 16]
+    pmovzxbw   m7, [r1 + r3]
+    pmovzxbw   m8, [r1 + r3 + 16]
+
+    psubw      m1, m5
+    psubw      m2, m6
+    psubw      m3, m7
+    psubw      m4, m8
+
+    movu       [r2 + 0 ], m1
+    movu       [r2 + 32], m2
+    movu       [r2 + r4 + 0], m3
+    movu       [r2 + r4 + 32], m4
+
+%assign x x+1
+%if (x != 16)
+    lea        r0, [r0 + r3 * 2]
+    lea        r1, [r1 + r3 * 2]
+    lea        r2, [r2 + r3 * 4]
+%endif
+%endrep
+    RET
+%endif
 ;-----------------------------------------------------------------------------
 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------
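In the 8bpp kernels above, pmovzxbw widens each group of 16 source and prediction bytes to 16-bit lanes before psubw, since residuals are stored as int16_t; the 16bpp kernels subtract the loaded words directly. A rough intrinsics rendering of one 16-pixel row of the 8bpp getResidual16 path, written here with standard AVX2 intrinsics for illustration (not code from the changeset):

#include <immintrin.h>
#include <cstdint>

// One 16-pixel row of the 8bpp case: load 16 fenc and 16 pred bytes, zero-extend
// to 16-bit lanes (pmovzxbw), subtract (psubw), store 16 int16_t residuals (movu).
static inline void residualRow16_avx2(const uint8_t* fenc, const uint8_t* pred,
                                      int16_t* residual)
{
    __m256i f = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)fenc));
    __m256i p = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)pred));
    __m256i r = _mm256_sub_epi16(f, p);
    _mm256_storeu_si256((__m256i*)residual, r);
}

The assembly version additionally folds the byte load into pmovzxbw's memory operand and fully unrolls the row loop with %rep.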