x265: changeset 9523:74f975134412 (draft)
asm-avx2: luma_vsp[8x16, 8x32], luma_vss[8x16, 8x32]: 6808c->1310c, 4869c->2505c, 5443c->1325c, 3873c->2595c
author      Divya Manivannan <divya@multicorewareinc.com>
date        Tue, 17 Feb 2015 11:48:16 +0530
parents     0c005021f270
children    ae80a972b770
files       source/common/x86/asm-primitives.cpp
            source/common/x86/ipfilter8.asm
diffstat    2 files changed, 396 insertions(+), 0 deletions(-)
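The new kernels are reached only through x265's primitives table: the asm-primitives.cpp hunk below registers them for the LUMA_8x16 and LUMA_8x32 partitions, and callers dispatch through the p.pu[...].luma_vsp / luma_vss function pointers. As a rough illustration (not part of this changeset), a caller-side dispatch looks like the sketch below; the parameter order (src, srcStride, dst, dstStride, coeffIdx) is assumed from the filter_sp_t typedef in primitives.h of this era and should be verified against the tree.

    // Hypothetical caller-side sketch; EncoderPrimitives, pixel and LUMA_8x16
    // come from x265's own headers, so this only builds inside the x265 tree.
    #include "primitives.h"

    using namespace x265;   // x265's internal namespace at this time (assumption)

    static void vertFilter8x16(const EncoderPrimitives& p,
                               const int16_t* src, intptr_t srcStride,
                               pixel* dst, intptr_t dstStride, int coeffIdx)
    {
        // coeffIdx selects one of the three fractional 8-tap luma phases (1..3);
        // after this changeset the call resolves to
        // x265_interp_8tap_vert_sp_8x16_avx2 on AVX2-capable CPUs.
        p.pu[LUMA_8x16].luma_vsp(src, srcStride, dst, dstStride, coeffIdx);
    }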
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 10:40:58 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 11:48:16 2015 +0530
@@ -1563,8 +1563,12 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x64].luma_vps = x265_interp_8tap_vert_ps_64x64_avx2;
 
         p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vsp = x265_interp_8tap_vert_sp_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
 
         p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
+        p.pu[LUMA_8x16].luma_vss = x265_interp_8tap_vert_ss_8x16_avx2;
+        p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
 
         p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_avx2;
         p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
--- a/source/common/x86/ipfilter8.asm	Tue Feb 17 10:40:58 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Feb 17 11:48:16 2015 +0530
@@ -11184,3 +11184,395 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
 
 FILTER_VER_LUMA_S_AVX2_8x8 sp
 FILTER_VER_LUMA_S_AVX2_8x8 ss
+
+%macro FILTER_VER_LUMA_S_AVX2_8xN 2
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15
+    mov r4d, r4m
+    shl r4d, 7
+    add r1d, r1d
+
+%ifdef PIC
+    lea r5, [pw_LumaCoeffVer]
+    add r5, r4
+%else
+    lea r5, [pw_LumaCoeffVer + r4]
+%endif
+
+    lea r4, [r1 * 3]
+    sub r0, r4
+%ifidn %1,sp
+    mova m14, [pd_526336]
+%else
+    add r3d, r3d
+%endif
+    lea r6, [r3 * 3]
+    lea r7, [r1 * 4]
+    mov r8d, %2 / 16
+.loopH:
+    movu xm0, [r0]                      ; m0 = row 0
+    movu xm1, [r0 + r1]                 ; m1 = row 1
+    punpckhwd xm2, xm0, xm1
+    punpcklwd xm0, xm1
+    vinserti128 m0, m0, xm2, 1
+    pmaddwd m0, [r5]
+    movu xm2, [r0 + r1 * 2]             ; m2 = row 2
+    punpckhwd xm3, xm1, xm2
+    punpcklwd xm1, xm2
+    vinserti128 m1, m1, xm3, 1
+    pmaddwd m1, [r5]
+    movu xm3, [r0 + r4]                 ; m3 = row 3
+    punpckhwd xm4, xm2, xm3
+    punpcklwd xm2, xm3
+    vinserti128 m2, m2, xm4, 1
+    pmaddwd m4, m2, [r5 + 1 * mmsize]
+    paddd m0, m4
+    pmaddwd m2, [r5]
+    lea r0, [r0 + r1 * 4]
+    movu xm4, [r0]                      ; m4 = row 4
+    punpckhwd xm5, xm3, xm4
+    punpcklwd xm3, xm4
+    vinserti128 m3, m3, xm5, 1
+    pmaddwd m5, m3, [r5 + 1 * mmsize]
+    paddd m1, m5
+    pmaddwd m3, [r5]
+    movu xm5, [r0 + r1]                 ; m5 = row 5
+    punpckhwd xm6, xm4, xm5
+    punpcklwd xm4, xm5
+    vinserti128 m4, m4, xm6, 1
+    pmaddwd m6, m4, [r5 + 2 * mmsize]
+    paddd m0, m6
+    pmaddwd m6, m4, [r5 + 1 * mmsize]
+    paddd m2, m6
+    pmaddwd m4, [r5]
+    movu xm6, [r0 + r1 * 2]             ; m6 = row 6
+    punpckhwd xm7, xm5, xm6
+    punpcklwd xm5, xm6
+    vinserti128 m5, m5, xm7, 1
+    pmaddwd m7, m5, [r5 + 2 * mmsize]
+    paddd m1, m7
+    pmaddwd m7, m5, [r5 + 1 * mmsize]
+    paddd m3, m7
+    pmaddwd m5, [r5]
+    movu xm7, [r0 + r4]                 ; m7 = row 7
+    punpckhwd xm8, xm6, xm7
+    punpcklwd xm6, xm7
+    vinserti128 m6, m6, xm8, 1
+    pmaddwd m8, m6, [r5 + 3 * mmsize]
+    paddd m0, m8
+    pmaddwd m8, m6, [r5 + 2 * mmsize]
+    paddd m2, m8
+    pmaddwd m8, m6, [r5 + 1 * mmsize]
+    paddd m4, m8
+    pmaddwd m6, [r5]
+    lea r0, [r0 + r1 * 4]
+    movu xm8, [r0]                      ; m8 = row 8
+    punpckhwd xm9, xm7, xm8
+    punpcklwd xm7, xm8
+    vinserti128 m7, m7, xm9, 1
+    pmaddwd m9, m7, [r5 + 3 * mmsize]
+    paddd m1, m9
+    pmaddwd m9, m7, [r5 + 2 * mmsize]
+    paddd m3, m9
+    pmaddwd m9, m7, [r5 + 1 * mmsize]
+    paddd m5, m9
+    pmaddwd m7, [r5]
+    movu xm9, [r0 + r1]                 ; m9 = row 9
+    punpckhwd xm10, xm8, xm9
+    punpcklwd xm8, xm9
+    vinserti128 m8, m8, xm10, 1
+    pmaddwd m10, m8, [r5 + 3 * mmsize]
+    paddd m2, m10
+    pmaddwd m10, m8, [r5 + 2 * mmsize]
+    paddd m4, m10
+    pmaddwd m10, m8, [r5 + 1 * mmsize]
+    paddd m6, m10
+    pmaddwd m8, [r5]
+    movu xm10, [r0 + r1 * 2]            ; m10 = row 10
+    punpckhwd xm11, xm9, xm10
+    punpcklwd xm9, xm10
+    vinserti128 m9, m9, xm11, 1
+    pmaddwd m11, m9, [r5 + 3 * mmsize]
+    paddd m3, m11
+    pmaddwd m11, m9, [r5 + 2 * mmsize]
+    paddd m5, m11
+    pmaddwd m11, m9, [r5 + 1 * mmsize]
+    paddd m7, m11
+    pmaddwd m9, [r5]
+    movu xm11, [r0 + r4]                ; m11 = row 11
+    punpckhwd xm12, xm10, xm11
+    punpcklwd xm10, xm11
+    vinserti128 m10, m10, xm12, 1
+    pmaddwd m12, m10, [r5 + 3 * mmsize]
+    paddd m4, m12
+    pmaddwd m12, m10, [r5 + 2 * mmsize]
+    paddd m6, m12
+    pmaddwd m12, m10, [r5 + 1 * mmsize]
+    paddd m8, m12
+    pmaddwd m10, [r5]
+    lea r0, [r0 + r1 * 4]
+    movu xm12, [r0]                     ; m12 = row 12
+    punpckhwd xm13, xm11, xm12
+    punpcklwd xm11, xm12
+    vinserti128 m11, m11, xm13, 1
+    pmaddwd m13, m11, [r5 + 3 * mmsize]
+    paddd m5, m13
+    pmaddwd m13, m11, [r5 + 2 * mmsize]
+    paddd m7, m13
+    pmaddwd m13, m11, [r5 + 1 * mmsize]
+    paddd m9, m13
+    pmaddwd m11, [r5]
+
+%ifidn %1,sp
+    paddd m0, m14
+    paddd m1, m14
+    paddd m2, m14
+    paddd m3, m14
+    paddd m4, m14
+    paddd m5, m14
+    psrad m0, 12
+    psrad m1, 12
+    psrad m2, 12
+    psrad m3, 12
+    psrad m4, 12
+    psrad m5, 12
+%else
+    psrad m0, 6
+    psrad m1, 6
+    psrad m2, 6
+    psrad m3, 6
+    psrad m4, 6
+    psrad m5, 6
+%endif
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+%ifidn %1,sp
+    packuswb m0, m2
+    mova m1, [interp8_hps_shuf]
+    vpermd m0, m1, m0
+    vextracti128 xm2, m0, 1
+    movq [r2], xm0
+    movhps [r2 + r3], xm0
+    movq [r2 + r3 * 2], xm2
+    movhps [r2 + r6], xm2
+%else
+    vpermq m0, m0, 11011000b
+    vpermq m2, m2, 11011000b
+    vextracti128 xm1, m0, 1
+    vextracti128 xm3, m2, 1
+    movu [r2], xm0
+    movu [r2 + r3], xm1
+    movu [r2 + r3 * 2], xm2
+    movu [r2 + r6], xm3
+%endif
+
+    movu xm13, [r0 + r1]                ; m13 = row 13
+    punpckhwd xm0, xm12, xm13
+    punpcklwd xm12, xm13
+    vinserti128 m12, m12, xm0, 1
+    pmaddwd m0, m12, [r5 + 3 * mmsize]
+    paddd m6, m0
+    pmaddwd m0, m12, [r5 + 2 * mmsize]
+    paddd m8, m0
+    pmaddwd m0, m12, [r5 + 1 * mmsize]
+    paddd m10, m0
+    pmaddwd m12, [r5]
+    movu xm0, [r0 + r1 * 2]             ; m0 = row 14
+    punpckhwd xm2, xm13, xm0
+    punpcklwd xm13, xm0
+    vinserti128 m13, m13, xm2, 1
+    pmaddwd m2, m13, [r5 + 3 * mmsize]
+    paddd m7, m2
+    pmaddwd m2, m13, [r5 + 2 * mmsize]
+    paddd m9, m2
+    pmaddwd m2, m13, [r5 + 1 * mmsize]
+    paddd m11, m2
+    pmaddwd m13, [r5]
+
+%ifidn %1,sp
+    paddd m6, m14
+    paddd m7, m14
+    psrad m6, 12
+    psrad m7, 12
+%else
+    psrad m6, 6
+    psrad m7, 6
+%endif
+    packssdw m6, m7
+    lea r2, [r2 + r3 * 4]
+
+%ifidn %1,sp
+    packuswb m4, m6
+    vpermd m4, m1, m4
+    vextracti128 xm6, m4, 1
+    movq [r2], xm4
+    movhps [r2 + r3], xm4
+    movq [r2 + r3 * 2], xm6
+    movhps [r2 + r6], xm6
+%else
+    vpermq m6, m6, 11011000b
+    vpermq m4, m4, 11011000b
+    vextracti128 xm1, m4, 1
+    vextracti128 xm7, m6, 1
+    movu [r2], xm4
+    movu [r2 + r3], xm1
+    movu [r2 + r3 * 2], xm6
+    movu [r2 + r6], xm7
+%endif
+
+    movu xm6, [r0 + r4]                 ; m6 = row 15
+    punpckhwd xm5, xm0, xm6
+    punpcklwd xm0, xm6
+    vinserti128 m0, m0, xm5, 1
+    pmaddwd m5, m0, [r5 + 3 * mmsize]
+    paddd m8, m5
+    pmaddwd m5, m0, [r5 + 2 * mmsize]
+    paddd m10, m5
+    pmaddwd m5, m0, [r5 + 1 * mmsize]
+    paddd m12, m5
+    pmaddwd m0, [r5]
+    lea r0, [r0 + r1 * 4]
+    movu xm2, [r0]                      ; m2 = row 16
+    punpckhwd xm3, xm6, xm2
+    punpcklwd xm6, xm2
+    vinserti128 m6, m6, xm3, 1
+    pmaddwd m3, m6, [r5 + 3 * mmsize]
+    paddd m9, m3
+    pmaddwd m3, m6, [r5 + 2 * mmsize]
+    paddd m11, m3
+    pmaddwd m3, m6, [r5 + 1 * mmsize]
+    paddd m13, m3
+    pmaddwd m6, [r5]
+    movu xm3, [r0 + r1]                 ; m3 = row 17
+    punpckhwd xm4, xm2, xm3
+    punpcklwd xm2, xm3
+    vinserti128 m2, m2, xm4, 1
+    pmaddwd m4, m2, [r5 + 3 * mmsize]
+    paddd m10, m4
+    pmaddwd m4, m2, [r5 + 2 * mmsize]
+    paddd m12, m4
+    pmaddwd m2, [r5 + 1 * mmsize]
+    paddd m0, m2
+    movu xm4, [r0 + r1 * 2]             ; m4 = row 18
+    punpckhwd xm2, xm3, xm4
+    punpcklwd xm3, xm4
+    vinserti128 m3, m3, xm2, 1
+    pmaddwd m2, m3, [r5 + 3 * mmsize]
+    paddd m11, m2
+    pmaddwd m2, m3, [r5 + 2 * mmsize]
+    paddd m13, m2
+    pmaddwd m3, [r5 + 1 * mmsize]
+    paddd m6, m3
+    movu xm2, [r0 + r4]                 ; m2 = row 19
+    punpckhwd xm7, xm4, xm2
+    punpcklwd xm4, xm2
+    vinserti128 m4, m4, xm7, 1
+    pmaddwd m7, m4, [r5 + 3 * mmsize]
+    paddd m12, m7
+    pmaddwd m4, [r5 + 2 * mmsize]
+    paddd m0, m4
+    lea r0, [r0 + r1 * 4]
+    movu xm7, [r0]                      ; m7 = row 20
+    punpckhwd xm3, xm2, xm7
+    punpcklwd xm2, xm7
+    vinserti128 m2, m2, xm3, 1
+    pmaddwd m3, m2, [r5 + 3 * mmsize]
+    paddd m13, m3
+    pmaddwd m2, [r5 + 2 * mmsize]
+    paddd m6, m2
+    movu xm3, [r0 + r1]                 ; m3 = row 21
+    punpckhwd xm2, xm7, xm3
+    punpcklwd xm7, xm3
+    vinserti128 m7, m7, xm2, 1
+    pmaddwd m7, [r5 + 3 * mmsize]
+    paddd m0, m7
+    movu xm2, [r0 + r1 * 2]             ; m2 = row 22
+    punpckhwd xm7, xm3, xm2
+    punpcklwd xm3, xm2
+    vinserti128 m3, m3, xm7, 1
+    pmaddwd m3, [r5 + 3 * mmsize]
+    paddd m6, m3
+
+%ifidn %1,sp
+    paddd m8, m14
+    paddd m9, m14
+    paddd m10, m14
+    paddd m11, m14
+    paddd m12, m14
+    paddd m13, m14
+    paddd m0, m14
+    paddd m6, m14
+    psrad m8, 12
+    psrad m9, 12
+    psrad m10, 12
+    psrad m11, 12
+    psrad m12, 12
+    psrad m13, 12
+    psrad m0, 12
+    psrad m6, 12
+%else
+    psrad m8, 6
+    psrad m9, 6
+    psrad m10, 6
+    psrad m11, 6
+    psrad m12, 6
+    psrad m13, 6
+    psrad m0, 6
+    psrad m6, 6
+%endif
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m0, m6
+    lea r2, [r2 + r3 * 4]
+
+%ifidn %1,sp
+    packuswb m8, m10
+    packuswb m12, m0
+    vpermd m8, m1, m8
+    vpermd m12, m1, m12
+    vextracti128 xm10, m8, 1
+    vextracti128 xm0, m12, 1
+    movq [r2], xm8
+    movhps [r2 + r3], xm8
+    movq [r2 + r3 * 2], xm10
+    movhps [r2 + r6], xm10
+    lea r2, [r2 + r3 * 4]
+    movq [r2], xm12
+    movhps [r2 + r3], xm12
+    movq [r2 + r3 * 2], xm0
+    movhps [r2 + r6], xm0
+%else
+    vpermq m8, m8, 11011000b
+    vpermq m10, m10, 11011000b
+    vpermq m12, m12, 11011000b
+    vpermq m0, m0, 11011000b
+    vextracti128 xm9, m8, 1
+    vextracti128 xm11, m10, 1
+    vextracti128 xm13, m12, 1
+    vextracti128 xm6, m0, 1
+    movu [r2], xm8
+    movu [r2 + r3], xm9
+    movu [r2 + r3 * 2], xm10
+    movu [r2 + r6], xm11
+    lea r2, [r2 + r3 * 4]
+    movu [r2], xm12
+    movu [r2 + r3], xm13
+    movu [r2 + r3 * 2], xm0
+    movu [r2 + r6], xm6
+%endif
+
+    lea r2, [r2 + r3 * 4]
+    sub r0, r7
+    dec r8d
+    jnz .loopH
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_8xN sp, 16
+FILTER_VER_LUMA_S_AVX2_8xN sp, 32
+FILTER_VER_LUMA_S_AVX2_8xN ss, 16
+FILTER_VER_LUMA_S_AVX2_8xN ss, 32
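In the macro above, the sp and ss variants finish their 32-bit accumulators differently: sp adds pd_526336 and arithmetic-shifts right by 12 before saturating to 8-bit pixels (packssdw + packuswb), while ss shifts right by 6 and saturates to 16-bit samples (packssdw). For an 8-bit build this corresponds to the scalar arithmetic sketched below; the breakdown of the constant follows x265's usual interpolation precision (filter precision 6 bits, 14-bit internal samples with an 8192 offset), which is stated here as an assumption to be checked against the C reference filters.

    // Scalar model (illustrative only) of the accumulator finishing performed by
    // the sp and ss paths of FILTER_VER_LUMA_S_AVX2_8xN in an 8-bit encoder build.
    #include <algorithm>
    #include <cstdint>

    static inline uint8_t finish_sp(int32_t sum)
    {
        // pd_526336 = (1 << 11) + (8192 << 6): rounding term for the >> 12 plus
        // compensation for the bias carried by the 16-bit intermediate samples.
        int32_t v = (sum + 526336) >> 12;                       // paddd m14 ; psrad 12
        return (uint8_t)std::min(255, std::max(0, v));          // packssdw + packuswb clamp
    }

    static inline int16_t finish_ss(int32_t sum)
    {
        int32_t v = sum >> 6;                                   // psrad 6
        return (int16_t)std::min(32767, std::max(-32768, v));   // packssdw clamp
    }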