Mercurial > x265
changeset 9552:8575ce28b986 draft
asm-avx2: filter_vps[16x16]: improve 978c->790c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Thu, 19 Feb 2015 16:04:18 +0530 |
parents | 46de85c1be4d |
children | 35356968a48a |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm |
diffstat | 2 files changed, 59 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Thu Feb 19 15:18:12 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Feb 19 16:04:18 2015 +0530 @@ -1722,6 +1722,11 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_16x4].luma_hps = x265_interp_8tap_horiz_ps_16x4_avx2; p.pu[LUMA_16x32].luma_hps = x265_interp_8tap_horiz_ps_16x32_avx2; p.pu[LUMA_16x64].luma_hps = x265_interp_8tap_horiz_ps_16x64_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2; #endif p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2; @@ -1791,11 +1796,6 @@ void setupAssemblyPrimitives(EncoderPrim p.cu[BLOCK_8x8].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_8_avx2; p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_avx2; p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_avx2; - -#if X86_64 - p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2; - p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2; -#endif } } #endif // if HIGH_BIT_DEPTH
--- a/source/common/x86/ipfilter8.asm Thu Feb 19 15:18:12 2015 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Feb 19 16:04:18 2015 +0530 @@ -4110,9 +4110,10 @@ FILTER_V4_W16_H2 16, 32 FILTER_V4_W16_H2 16, 24 FILTER_V4_W16_H2 16, 64 +%macro FILTER_VER_CHROMA_AVX2_16x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_pp_16x16, 4, 6, 15 +cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 mov r4d, r4m shl r4d, 6 @@ -4127,8 +4128,13 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6, mova m13, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif lea r5, [r3 * 3] - mova m14, [pw_512] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 @@ -4200,6 +4206,7 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6, paddw m7, m11 pmaddubsw m9, m12 +%ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 @@ -4229,6 +4236,25 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6, movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 movu [r2 + r5], xm7 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 + movu [r2 + r3 * 2], m6 + movu [r2 + r5], m7 +%endif lea r2, [r2 + r3 * 4] movu xm11, [r0 + r4] ; m11 = row 11 @@ -4289,6 +4315,7 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6, pmaddubsw m3, m13 paddw m1, m3 +%ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 @@ -4318,8 +4345,31 @@ cglobal interp_4tap_vert_pp_16x16, 4, 6, movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm0 movu [r2 + r5], xm1 - RET -%endif +%else + psubw 
m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m6, m14 ; m6 = word: row 12 + psubw m7, m14 ; m7 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r5], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 +%endif + RET +%endif +%endmacro + +FILTER_VER_CHROMA_AVX2_16x16 pp +FILTER_VER_CHROMA_AVX2_16x16 ps ;----------------------------------------------------------------------------- ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)