Mercurial > x265
changeset 9543:0f04b4e829d1 draft
asm-avx2: luma_vsp[16x16, 32x16, 64x16], luma_vss[16x16, 32x16, 64x16] for 16bpp
luma_vsp[16x16, 32x16, 64x16]: 4551c->2746c, 8745c->5569c, 17086c->10753c
luma_vss[16x16, 32x16, 64x16]: 3860c->2472c, 7345c->5137c, 14255c->9829c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Wed, 18 Feb 2015 14:00:36 +0530 |
parents | f2c55dc1e7c0 |
children | f98e30b6f17b |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm |
diffstat | 2 files changed, 79 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Wed Feb 18 13:36:17 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Feb 18 14:00:36 2015 +0530 @@ -1146,7 +1146,19 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2; + p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2; + + p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2; + + p.pu[LUMA_64x16].luma_vsp = x265_interp_8tap_vert_sp_64x16_avx2; + p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2; + + p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2; + + p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2; + + p.pu[LUMA_64x16].luma_vss = x265_interp_8tap_vert_ss_64x16_avx2; #else p.cu[BLOCK_4x4].dct = x265_dct4_avx2; #endif
--- a/source/common/x86/ipfilter16.asm Wed Feb 18 13:36:17 2015 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Feb 18 14:00:36 2015 +0530 @@ -3054,6 +3054,14 @@ FILTER_VER_LUMA_AVX2_8x8 ss paddd m9, m13 pmaddwd m11, [r5] +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%else paddd m0, m14 paddd m1, m14 paddd m2, m14 @@ -3067,6 +3075,13 @@ FILTER_VER_LUMA_AVX2_8x8 ss psrad m3, 6 psrad m4, 6 psrad m5, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + psrad m4, 10 + psrad m5, 10 %else psrad m0, 2 psrad m1, 2 @@ -3075,14 +3090,20 @@ FILTER_VER_LUMA_AVX2_8x8 ss psrad m4, 2 psrad m5, 2 %endif +%endif + packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b vpermq m4, m4, 11011000b + pxor m5, m5 %ifidn %1,pp - pxor m5, m5 + CLIPW m0, m5, [pw_pixel_max] + CLIPW m2, m5, [pw_pixel_max] + CLIPW m4, m5, [pw_pixel_max] +%elifidn %1, sp CLIPW m0, m5, [pw_pixel_max] CLIPW m2, m5, [pw_pixel_max] CLIPW m4, m5, [pw_pixel_max] @@ -3122,19 +3143,30 @@ FILTER_VER_LUMA_AVX2_8x8 ss paddd m11, m1 pmaddwd m13, [r5] +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 +%else paddd m6, m14 paddd m7, m14 %ifidn %1,pp psrad m6, 6 psrad m7, 6 +%elifidn %1, sp + psrad m6, 10 + psrad m7, 10 %else psrad m6, 2 psrad m7, 2 %endif +%endif + packssdw m6, m7 vpermq m6, m6, 11011000b %ifidn %1,pp CLIPW m6, m5, [pw_pixel_max] +%elifidn %1, sp + CLIPW m6, m5, [pw_pixel_max] %endif vextracti128 xm7, m6, 1 movu [r8 + r3 * 2], xm6 @@ -3213,6 +3245,16 @@ FILTER_VER_LUMA_AVX2_8x8 ss pmaddwd m7, [r5 + 3 * mmsize] paddd m1, m7 +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%else paddd m8, m14 paddd m9, m14 paddd m10, m14 @@ -3230,6 +3272,15 @@ FILTER_VER_LUMA_AVX2_8x8 ss psrad m13, 6 psrad m0, 6 psrad m1, 6 +%elifidn %1, sp + psrad m8, 10 + psrad m9, 10 + psrad m10, 10 + psrad m11, 10 + psrad m12, 10 + psrad m13, 10 
+ psrad m0, 10 + psrad m1, 10 %else psrad m8, 2 psrad m9, 2 @@ -3240,6 +3291,8 @@ FILTER_VER_LUMA_AVX2_8x8 ss psrad m0, 2 psrad m1, 2 %endif +%endif + packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 @@ -3253,6 +3306,11 @@ FILTER_VER_LUMA_AVX2_8x8 ss CLIPW m10, m5, [pw_pixel_max] CLIPW m12, m5, [pw_pixel_max] CLIPW m0, m5, [pw_pixel_max] +%elifidn %1, sp + CLIPW m8, m5, [pw_pixel_max] + CLIPW m10, m5, [pw_pixel_max] + CLIPW m12, m5, [pw_pixel_max] + CLIPW m0, m5, [pw_pixel_max] %endif vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 @@ -3290,6 +3348,8 @@ cglobal interp_8tap_vert_%1_%2x16, 4, 10 sub r0, r4 %ifidn %1,pp vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + mova m14, [pd_524800] %else vbroadcasti128 m14, [pd_n32768] %endif @@ -3311,6 +3371,12 @@ FILTER_VER_LUMA_AVX2_Nx16 pp, 64 FILTER_VER_LUMA_AVX2_Nx16 ps, 16 FILTER_VER_LUMA_AVX2_Nx16 ps, 32 FILTER_VER_LUMA_AVX2_Nx16 ps, 64 +FILTER_VER_LUMA_AVX2_Nx16 sp, 16 +FILTER_VER_LUMA_AVX2_Nx16 sp, 32 +FILTER_VER_LUMA_AVX2_Nx16 sp, 64 +FILTER_VER_LUMA_AVX2_Nx16 ss, 16 +FILTER_VER_LUMA_AVX2_Nx16 ss, 32 +FILTER_VER_LUMA_AVX2_Nx16 ss, 64 %macro FILTER_VER_LUMA_AVX2_NxN 3 INIT_YMM avx2