Mercurial > x265
changeset 9516:369c6e503f92 draft
asm-avx2: luma_vsp[4x4], luma_vss[4x4]: improve 548c->275c, 510c->240c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Mon, 16 Feb 2015 12:01:02 +0530 |
parents | e93986dd286c |
children | 149c30d03da3 |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm source/common/x86/ipfilter8.h |
diffstat | 3 files changed, 126 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Mon Feb 16 10:09:11 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Feb 16 12:01:02 2015 +0530 @@ -1617,6 +1617,10 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_8x16].luma_vps = x265_interp_8tap_vert_ps_8x16_avx2; p.pu[LUMA_8x32].luma_vps = x265_interp_8tap_vert_ps_8x32_avx2; + p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2; + + p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2; + // color space i420 p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2;
--- a/source/common/x86/ipfilter8.asm Mon Feb 16 10:09:11 2015 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Feb 16 12:01:02 2015 +0530 @@ -51,6 +51,8 @@ tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, tab_c_526336: times 4 dd 8192*64+2048 +tab_c1_526336: times 8 dd 8192*64+2048 + tab_ChromaCoeff: db 0, 64, 0, 0 db -2, 58, 10, -2 db -4, 54, 16, -2 @@ -109,6 +111,27 @@ tab_LumaCoeffV: times 4 dw 0, 0 times 4 dw 58, -10 times 4 dw 4, -1 +ALIGN 32 +tab_LumaCoeffVer_w: times 8 dw 0, 0 + times 8 dw 0, 64 + times 8 dw 0, 0 + times 8 dw 0, 0 + + times 8 dw -1, 4 + times 8 dw -10, 58 + times 8 dw 17, -5 + times 8 dw 1, 0 + + times 8 dw -1, 4 + times 8 dw -11, 40 + times 8 dw 40, -11 + times 8 dw 4, -1 + + times 8 dw 0, 1 + times 8 dw -5, 17 + times 8 dw 58, -10 + times 8 dw 4, -1 + tab_LumaCoeffVer: times 8 db 0, 0 times 8 db 0, 64 times 8 db 0, 0 @@ -10450,3 +10473,100 @@ cglobal interp_8tap_vert_ss_%1x%2, 5, 7, FILTER_VER_LUMA_SS 48, 64 FILTER_VER_LUMA_SS 64, 16 FILTER_VER_LUMA_SS 16, 64 + +%macro FILTER_VER_LUMA_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_w] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_w + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,sp + mova m6, [tab_c1_526336] +%else + add r3d, r3d +%endif + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m0, m5 + paddd m2, m4 + movq xm3, [r0 + r4] + 
punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + pmaddwd m1, [r5 + 2 * mmsize] + paddd m0, m5 + paddd m2, m1 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + 2 * r1] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] + pmaddwd m4, [r5 + 3 * mmsize] + paddd m2, m4 + +%ifidn %1,sp + paddd m0, m6 + paddd m2, m6 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm0, 3 +%else + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 +%endif + RET +%endmacro + +FILTER_VER_LUMA_AVX2_4x4 sp +FILTER_VER_LUMA_AVX2_4x4 ss
--- a/source/common/x86/ipfilter8.h Mon Feb 16 10:09:11 2015 +0530 +++ b/source/common/x86/ipfilter8.h Mon Feb 16 12:01:02 2015 +0530 @@ -617,6 +617,8 @@ LUMA_FILTERS(_sse4); LUMA_SP_FILTERS(_sse4); LUMA_SS_FILTERS(_sse2); LUMA_FILTERS(_avx2); +LUMA_SP_FILTERS(_avx2); +LUMA_SS_FILTERS(_avx2); void x265_interp_8tap_hv_pp_8x8_sse4(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);