Mercurial > x265
changeset 9522:0c005021f270 draft
asm-avx2: luma_vsp[8x8], luma_vss[8x8]: improve 1422c->763c, 1238c->755c
author | Divya Manivannan <divya@multicorewareinc.com>
---|---
date | Tue, 17 Feb 2015 10:40:58 +0530
parents | c452279cb1c9
children | 74f975134412
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm
diffstat | 2 files changed, 256 insertions(+), 29 deletions(-)
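For reference, luma_vsp reads the 16-bit intermediate plane produced by a horizontal pass and writes 8-bit pixels, while luma_vss keeps the output in the 16-bit intermediate format. The arithmetic the new AVX2 kernels implement is visible in the diff below: both paths accumulate eight pmaddwd products per output sample against the pw_LumaCoeffVer taps; the sp path then adds pd_526336 (8192*64 + 2048) and shifts right by 12 before saturating to unsigned bytes, and the ss path shifts right by 6 and saturates to signed words. The following scalar sketch only illustrates that behaviour and is not x265's actual C reference primitive; the function names, signatures and the kLumaTaps table are hypothetical, though the tap values are copied from the pw_LumaCoeffVer rows added below.

```cpp
// Scalar sketch of the sp/ss vertical 8-tap filter paths (illustration only;
// not x265's C reference primitive). Assumes 8-bit pixels and the 16-bit
// intermediate format implied by the constants in the diff below.
#include <cstdint>
#include <algorithm>

// Hypothetical tap table; values match the pw_LumaCoeffVer rows added below,
// one row per quarter-pel fraction (coeffIdx 0..3), eight taps each.
static const int16_t kLumaTaps[4][8] = {
    {  0, 0,   0, 64,  0,   0, 0,  0 },
    { -1, 4, -10, 58, 17,  -5, 1,  0 },
    { -1, 4, -11, 40, 40, -11, 4, -1 },
    {  0, 1,  -5, 17, 58, -10, 4, -1 }
};

// luma_vsp path: 16-bit intermediate in, 8-bit pixels out.
// Mirrors the asm epilogue "paddd pd_526336 (8192*64 + 2048), psrad 12, packuswb".
static void vert_sp_8x8(const int16_t* src, intptr_t srcStride,
                        uint8_t* dst, intptr_t dstStride, int coeffIdx)
{
    const int16_t* c = kLumaTaps[coeffIdx];
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
        {
            int32_t sum = 0;
            for (int t = 0; t < 8; t++)        // rows y-3 .. y+4, as in "sub r0, r4"
                sum += c[t] * src[(y + t - 3) * srcStride + x];
            sum = (sum + 526336) >> 12;
            dst[y * dstStride + x] = (uint8_t)std::min(std::max(sum, 0), 255);
        }
}

// luma_vss path: 16-bit intermediate in, 16-bit intermediate out.
// Mirrors the asm epilogue "psrad 6, packssdw" (shift by 6, saturate to words).
static void vert_ss_8x8(const int16_t* src, intptr_t srcStride,
                        int16_t* dst, intptr_t dstStride, int coeffIdx)
{
    const int16_t* c = kLumaTaps[coeffIdx];
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
        {
            int32_t sum = 0;
            for (int t = 0; t < 8; t++)
                sum += c[t] * src[(y + t - 3) * srcStride + x];
            sum >>= 6;
            dst[y * dstStride + x] = (int16_t)std::min(std::max(sum, -32768), 32767);
        }
}
```

The two FILTER_VER_LUMA_S_AVX2_8x8 invocations at the end of the patch instantiate exactly these two variants: sp selects the pd_526336 / psrad 12 / packuswb epilogue, ss the psrad 6 / packssdw one.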
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 10:08:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 10:40:58 2015 +0530
@@ -1562,6 +1562,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x48].luma_vps = x265_interp_8tap_vert_ps_64x48_avx2;
         p.pu[LUMA_64x64].luma_vps = x265_interp_8tap_vert_ps_64x64_avx2;
 
+        p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
+
+        p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
+
         p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_avx2;
         p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
         p.pu[LUMA_4x16].luma_hps = x265_interp_8tap_horiz_ps_4x16_avx2;
--- a/source/common/x86/ipfilter8.asm	Tue Feb 17 10:08:56 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Feb 17 10:40:58 2015 +0530
@@ -51,7 +51,7 @@ tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0,
 
 tab_c_526336:   times 4 dd 8192*64+2048
 
-tab_c1_526336:  times 8 dd 8192*64+2048
+pd_526336:      times 8 dd 8192*64+2048
 
 tab_ChromaCoeff: db  0, 64,  0,  0
                  db -2, 58, 10, -2
@@ -112,25 +112,25 @@ tab_LumaCoeffV: times 4 dw 0, 0
                 times 4 dw 4, -1
 
 ALIGN 32
-tab_LumaCoeffVer_w: times 8 dw 0, 0
-                    times 8 dw 0, 64
-                    times 8 dw 0, 0
-                    times 8 dw 0, 0
-
-                    times 8 dw -1, 4
-                    times 8 dw -10, 58
-                    times 8 dw 17, -5
-                    times 8 dw 1, 0
-
-                    times 8 dw -1, 4
-                    times 8 dw -11, 40
-                    times 8 dw 40, -11
-                    times 8 dw 4, -1
-
-                    times 8 dw 0, 1
-                    times 8 dw -5, 17
-                    times 8 dw 58, -10
-                    times 8 dw 4, -1
+pw_LumaCoeffVer: times 8 dw 0, 0
+                 times 8 dw 0, 64
+                 times 8 dw 0, 0
+                 times 8 dw 0, 0
+
+                 times 8 dw -1, 4
+                 times 8 dw -10, 58
+                 times 8 dw 17, -5
+                 times 8 dw 1, 0
+
+                 times 8 dw -1, 4
+                 times 8 dw -11, 40
+                 times 8 dw 40, -11
+                 times 8 dw 4, -1
+
+                 times 8 dw 0, 1
+                 times 8 dw -5, 17
+                 times 8 dw 58, -10
+                 times 8 dw 4, -1
 
 tab_LumaCoeffVer: times 8 db 0, 0
                   times 8 db 0, 64
@@ -10482,17 +10482,17 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
     shl             r4d, 7
 
 %ifdef PIC
-    lea             r5, [tab_LumaCoeffVer_w]
+    lea             r5, [pw_LumaCoeffVer]
     add             r5, r4
 %else
-    lea             r5, [tab_LumaCoeffVer_w + r4]
+    lea             r5, [pw_LumaCoeffVer + r4]
 %endif
 
     lea             r4, [r1 * 3]
     sub             r0, r4
 
 %ifidn %1,sp
-    mova            m6, [tab_c1_526336]
+    mova            m6, [pd_526336]
 %else
     add             r3d, r3d
 %endif
@@ -10579,17 +10579,17 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
     add             r1d, r1d
 
 %ifdef PIC
-    lea             r5, [tab_LumaCoeffVer_w]
+    lea             r5, [pw_LumaCoeffVer]
     add             r5, r4
 %else
-    lea             r5, [tab_LumaCoeffVer_w + r4]
+    lea             r5, [pw_LumaCoeffVer + r4]
 %endif
 
     lea             r4, [r1 * 3]
     sub             r0, r4
 
 %ifidn %1,sp
-    mova            m7, [tab_c1_526336]
+    mova            m7, [pd_526336]
%else
     add             r3d, r3d
 %endif
@@ -10941,16 +10941,16 @@ cglobal interp_8tap_vert_%1_4x16, 4, 7,
     add             r1d, r1d
 %ifdef PIC
-    lea             r5, [tab_LumaCoeffVer_w]
+    lea             r5, [pw_LumaCoeffVer]
     add             r5, r4
 %else
-    lea             r5, [tab_LumaCoeffVer_w + r4]
+    lea             r5, [pw_LumaCoeffVer + r4]
 %endif
 
     lea             r4, [r1 * 3]
     sub             r0, r4
 
 %ifidn %1,sp
-    mova            m7, [tab_c1_526336]
+    mova            m7, [pd_526336]
 %else
     add             r3d, r3d
 %endif
@@ -10961,3 +10961,226 @@ cglobal interp_8tap_vert_%1_4x16, 4, 7,
 
 FILTER_VER_LUMA_AVX2_4x16 sp
 FILTER_VER_LUMA_AVX2_4x16 ss
+
+%macro FILTER_VER_LUMA_S_AVX2_8x8 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_8x8, 4, 6, 12
+    mov             r4d, r4m
+    shl             r4d, 7
+    add             r1d, r1d
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]
+    add             r5, r4
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea             r4, [r1 * 3]
+    sub             r0, r4
+
+%ifidn %1,sp
+    mova            m11, [pd_526336]
+%else
+    add             r3d, r3d
+%endif
+
+    movu            xm0, [r0]                       ; m0 = row 0
+    movu            xm1, [r0 + r1]                  ; m1 = row 1
+    punpckhwd       xm2, xm0, xm1
+    punpcklwd       xm0, xm1
+    vinserti128     m0, m0, xm2, 1
+    pmaddwd         m0, [r5]
+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
+    punpckhwd       xm3, xm1, xm2
+    punpcklwd       xm1, xm2
+    vinserti128     m1, m1, xm3, 1
+    pmaddwd         m1, [r5]
+    movu            xm3, [r0 + r4]                  ; m3 = row 3
+    punpckhwd       xm4, xm2, xm3
+    punpcklwd       xm2, xm3
+    vinserti128     m2, m2, xm4, 1
+    pmaddwd         m4, m2, [r5 + 1 * mmsize]
+    pmaddwd         m2, [r5]
+    paddd           m0, m4
+    lea             r0, [r0 + r1 * 4]
+    movu            xm4, [r0]                       ; m4 = row 4
+    punpckhwd       xm5, xm3, xm4
+    punpcklwd       xm3, xm4
+    vinserti128     m3, m3, xm5, 1
+    pmaddwd         m5, m3, [r5 + 1 * mmsize]
+    pmaddwd         m3, [r5]
+    paddd           m1, m5
+    movu            xm5, [r0 + r1]                  ; m5 = row 5
+    punpckhwd       xm6, xm4, xm5
+    punpcklwd       xm4, xm5
+    vinserti128     m4, m4, xm6, 1
+    pmaddwd         m6, m4, [r5 + 2 * mmsize]
+    paddd           m0, m6
+    pmaddwd         m6, m4, [r5 + 1 * mmsize]
+    paddd           m2, m6
+    pmaddwd         m4, [r5]
+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
+    punpckhwd       xm7, xm5, xm6
+    punpcklwd       xm5, xm6
+    vinserti128     m5, m5, xm7, 1
+    pmaddwd         m7, m5, [r5 + 2 * mmsize]
+    paddd           m1, m7
+    pmaddwd         m7, m5, [r5 + 1 * mmsize]
+    pmaddwd         m5, [r5]
+    paddd           m3, m7
+    movu            xm7, [r0 + r4]                  ; m7 = row 7
+    punpckhwd       xm8, xm6, xm7
+    punpcklwd       xm6, xm7
+    vinserti128     m6, m6, xm8, 1
+    pmaddwd         m8, m6, [r5 + 3 * mmsize]
+    paddd           m0, m8
+    pmaddwd         m8, m6, [r5 + 2 * mmsize]
+    paddd           m2, m8
+    pmaddwd         m8, m6, [r5 + 1 * mmsize]
+    pmaddwd         m6, [r5]
+    paddd           m4, m8
+    lea             r0, [r0 + r1 * 4]
+    movu            xm8, [r0]                       ; m8 = row 8
+    punpckhwd       xm9, xm7, xm8
+    punpcklwd       xm7, xm8
+    vinserti128     m7, m7, xm9, 1
+    pmaddwd         m9, m7, [r5 + 3 * mmsize]
+    paddd           m1, m9
+    pmaddwd         m9, m7, [r5 + 2 * mmsize]
+    paddd           m3, m9
+    pmaddwd         m9, m7, [r5 + 1 * mmsize]
+    pmaddwd         m7, [r5]
+    paddd           m5, m9
+    movu            xm9, [r0 + r1]                  ; m9 = row 9
+    punpckhwd       xm10, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm10, 1
+    pmaddwd         m10, m8, [r5 + 3 * mmsize]
+    paddd           m2, m10
+    pmaddwd         m10, m8, [r5 + 2 * mmsize]
+    pmaddwd         m8, [r5 + 1 * mmsize]
+    paddd           m4, m10
+    paddd           m6, m8
+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
+    punpckhwd       xm8, xm9, xm10
+    punpcklwd       xm9, xm10
+    vinserti128     m9, m9, xm8, 1
+    pmaddwd         m8, m9, [r5 + 3 * mmsize]
+    paddd           m3, m8
+    pmaddwd         m8, m9, [r5 + 2 * mmsize]
+    pmaddwd         m9, [r5 + 1 * mmsize]
+    paddd           m5, m8
+    paddd           m7, m9
+    movu            xm8, [r0 + r4]                  ; m8 = row 11
+    punpckhwd       xm9, xm10, xm8
+    punpcklwd       xm10, xm8
+    vinserti128     m10, m10, xm9, 1
+    pmaddwd         m9, m10, [r5 + 3 * mmsize]
+    pmaddwd         m10, [r5 + 2 * mmsize]
+    paddd           m4, m9
+    paddd           m6, m10
+
+    lea             r4, [r3 * 3]
+%ifidn %1,sp
+    paddd           m0, m11
+    paddd           m1, m11
+    paddd           m2, m11
+    paddd           m3, m11
+    psrad           m0, 12
+    psrad           m1, 12
+    psrad           m2, 12
+    psrad           m3, 12
+%else
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%endif
+    packssdw        m0, m1
+    packssdw        m2, m3
+%ifidn %1,sp
+    packuswb        m0, m2
+    mova            m1, [interp8_hps_shuf]
+    vpermd          m0, m1, m0
+    vextracti128    xm2, m0, 1
+    movq            [r2], xm0
+    movhps          [r2 + r3], xm0
+    movq            [r2 + r3 * 2], xm2
+    movhps          [r2 + r4], xm2
+%else
+    vpermq          m0, m0, 11011000b
+    vpermq          m2, m2, 11011000b
+    vextracti128    xm1, m0, 1
+    vextracti128    xm3, m2, 1
+    movu            [r2], xm0
+    movu            [r2 + r3], xm1
+    movu            [r2 + r3 * 2], xm2
+    movu            [r2 + r4], xm3
+%endif
+
+    lea             r0, [r0 + r1 * 4]
+    movu            xm9, [r0]                       ; m9 = row 12
+    punpckhwd       xm3, xm8, xm9
+    punpcklwd       xm8, xm9
+    vinserti128     m8, m8, xm3, 1
+    pmaddwd         m3, m8, [r5 + 3 * mmsize]
+    pmaddwd         m8, [r5 + 2 * mmsize]
+    paddd           m5, m3
+    paddd           m7, m8
+    movu            xm3, [r0 + r1]                  ; m3 = row 13
+    punpckhwd       xm0, xm9, xm3
+    punpcklwd       xm9, xm3
+    vinserti128     m9, m9, xm0, 1
+    pmaddwd         m9, [r5 + 3 * mmsize]
+    paddd           m6, m9
+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
+    punpckhwd       xm9, xm3, xm0
+    punpcklwd       xm3, xm0
+    vinserti128     m3, m3, xm9, 1
+    pmaddwd         m3, [r5 + 3 * mmsize]
+    paddd           m7, m3
+
+%ifidn %1,sp
+    paddd           m4, m11
+    paddd           m5, m11
+    paddd           m6, m11
+    paddd           m7, m11
+    psrad           m4, 12
+    psrad           m5, 12
+    psrad           m6, 12
+    psrad           m7, 12
+%else
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%endif
+    packssdw        m4, m5
+    packssdw        m6, m7
+    lea             r2, [r2 + r3 * 4]
+%ifidn %1,sp
+    packuswb        m4, m6
+    vpermd          m4, m1, m4
+    vextracti128    xm6, m4, 1
+    movq            [r2], xm4
+    movhps          [r2 + r3], xm4
+    movq            [r2 + r3 * 2], xm6
+    movhps          [r2 + r4], xm6
+%else
+    vpermq          m4, m4, 11011000b
+    vpermq          m6, m6, 11011000b
+    vextracti128    xm5, m4, 1
+    vextracti128    xm7, m6, 1
+    movu            [r2], xm4
+    movu            [r2 + r3], xm5
+    movu            [r2 + r3 * 2], xm6
+    movu            [r2 + r4], xm7
+%endif
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_8x8 sp
+FILTER_VER_LUMA_S_AVX2_8x8 ss