Mercurial > x265
changeset 9514:c98d0dccbb36 draft
asm-avx2: luma_vpp[12x16], luma_vps[12x16]: improve 3466c->2182c, 3275c->2057c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Fri, 13 Feb 2015 18:28:52 +0530 |
parents | bc19307b799c |
children | e93986dd286c |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm |
diffstat | 2 files changed, 40 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Fri Feb 13 18:07:46 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Feb 13 18:28:52 2015 +0530 @@ -1083,6 +1083,8 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2; p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2; + p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2; + p.pu[LUMA_16x8].luma_vpp = x265_interp_8tap_vert_pp_16x8_avx2; p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2; p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2; @@ -1108,6 +1110,8 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_8x16].luma_vps = x265_interp_8tap_vert_ps_8x16_avx2; p.pu[LUMA_8x32].luma_vps = x265_interp_8tap_vert_ps_8x32_avx2; + p.pu[LUMA_12x16].luma_vps = x265_interp_8tap_vert_ps_12x16_avx2; + p.pu[LUMA_16x8].luma_vps = x265_interp_8tap_vert_ps_16x8_avx2; p.pu[LUMA_16x12].luma_vps = x265_interp_8tap_vert_ps_16x12_avx2; p.pu[LUMA_16x16].luma_vps = x265_interp_8tap_vert_ps_16x16_avx2;
--- a/source/common/x86/ipfilter16.asm Fri Feb 13 18:07:46 2015 +0530 +++ b/source/common/x86/ipfilter16.asm Fri Feb 13 18:28:52 2015 +0530 @@ -4815,6 +4815,42 @@ cglobal interp_8tap_vert_%1_4x16, 4, 7, FILTER_VER_LUMA_AVX2_4x16 pp FILTER_VER_LUMA_AVX2_4x16 ps +%macro FILTER_VER_LUMA_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%else + vbroadcasti128 m14, [pd_n32768] +%endif + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W8_16R %1 + add r2, 16 + add r0, 16 + mova m7, m14 + PROCESS_LUMA_AVX2_W4_16R %1 + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_12x16 pp +FILTER_VER_LUMA_AVX2_12x16 ps + ;--------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;---------------------------------------------------------------------------------------------------------------