Mercurial > x265
changeset 9551:46de85c1be4d draft
asm-avx2: filter_vps[8x8]: improve 296c->263c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Thu, 19 Feb 2015 15:18:12 +0530 |
parents | dbce8036e0c4 |
children | 8575ce28b986 |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm |
diffstat | 2 files changed, 31 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Thu Feb 19 10:01:48 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Feb 19 15:18:12 2015 +0530 @@ -179,6 +179,7 @@ extern "C" { p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu #define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu) + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2; #define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \
--- a/source/common/x86/ipfilter8.asm Thu Feb 19 10:01:48 2015 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Feb 19 15:18:12 2015 +0530 @@ -3765,8 +3765,9 @@ FILTER_V4_W8_H8_H16_H32 8, 64 paddw m4, m0 %endmacro +%macro FILTER_VER_CHROMA_AVX2_8x8 1 INIT_YMM avx2 -cglobal interp_4tap_vert_pp_8x8, 4, 6, 7 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 mov r4d, r4m shl r4d, 6 @@ -3780,6 +3781,7 @@ cglobal interp_4tap_vert_pp_8x8, 4, 6, 7 lea r4, [r1 * 3] sub r0, r1 PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp lea r4, [r3 * 3] mova m3, [pw_512] pmulhrsw m5, m3 ; m5 = word: row 0, row 1 @@ -3799,7 +3801,33 @@ cglobal interp_4tap_vert_pp_8x8, 4, 6, 7 movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 movhps [r2 + r4], xm4 - RET +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_8x8 pp +FILTER_VER_CHROMA_AVX2_8x8 ps ;----------------------------------------------------------------------------- ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)