Mercurial > x265
changeset 9547:6d868a8ff7f2 draft
asm-avx2: luma_vsp[8x4, 16x4, 32x24], luma_vss[8x4, 16x4, 32x24] for 16bpp
luma_vsp[8x4, 16x4, 32x24]: 819c->525c, 1364c->1041c, 13273c->8638c
luma_vss[8x4, 16x4, 32x24]: 791c->488c, 1204c->964c, 10837c->7785c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Wed, 18 Feb 2015 15:25:58 +0530 |
parents | 966a69aa9f69 |
children | ede163eb20e0 |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm |
diffstat | 2 files changed, 35 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/source/common/x86/asm-primitives.cpp Wed Feb 18 15:08:06 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Feb 18 15:25:58 2015 +0530
@@ -1157,6 +1157,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.pu[LUMA_32x8].luma_vsp = x265_interp_8tap_vert_sp_32x8_avx2;
         p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vsp = x265_interp_8tap_vert_sp_32x24_avx2;
         p.pu[LUMA_32x32].luma_vsp = x265_interp_8tap_vert_sp_32x32_avx2;
         p.pu[LUMA_32x64].luma_vsp = x265_interp_8tap_vert_sp_32x64_avx2;
 
@@ -1180,6 +1181,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.pu[LUMA_32x8].luma_vss = x265_interp_8tap_vert_ss_32x8_avx2;
         p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vss = x265_interp_8tap_vert_ss_32x24_avx2;
         p.pu[LUMA_32x32].luma_vss = x265_interp_8tap_vert_ss_32x32_avx2;
         p.pu[LUMA_32x64].luma_vss = x265_interp_8tap_vert_ss_32x64_avx2;
 
@@ -1216,8 +1218,12 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_16x4].luma_vps = x265_interp_8tap_vert_ps_16x4_avx2;
 
         p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2;
+        p.pu[LUMA_8x4].luma_vsp = x265_interp_8tap_vert_sp_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vsp = x265_interp_8tap_vert_sp_16x4_avx2;
 
         p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2;
+        p.pu[LUMA_8x4].luma_vss = x265_interp_8tap_vert_ss_8x4_avx2;
+        p.pu[LUMA_16x4].luma_vss = x265_interp_8tap_vert_ss_16x4_avx2;
     }
 }
 #else // if HIGH_BIT_DEPTH
```
```diff
--- a/source/common/x86/ipfilter16.asm Wed Feb 18 15:08:06 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Wed Feb 18 15:25:58 2015 +0530
@@ -4174,6 +4174,8 @@ cglobal interp_8tap_vert_%1_32x24, 4, 10
     sub r0, r4
 %ifidn %1,pp
     vbroadcasti128 m14, [pd_32]
+%elifidn %1, sp
+    mova m14, [pd_524800]
 %else
     vbroadcasti128 m14, [pd_n32768]
 %endif
@@ -4203,6 +4205,8 @@ cglobal interp_8tap_vert_%1_32x24, 4, 10
 
 FILTER_VER_LUMA_AVX2_32x24 pp
 FILTER_VER_LUMA_AVX2_32x24 ps
+FILTER_VER_LUMA_AVX2_32x24 sp
+FILTER_VER_LUMA_AVX2_32x24 ss
 
 %macro PROCESS_LUMA_AVX2_W8_4R 1
     movu xm0, [r0] ; m0 = row 0
@@ -4277,6 +4281,12 @@ FILTER_VER_LUMA_AVX2_32x24 ps
     pmaddwd m6, [r5 + 3 * mmsize]
     paddd m3, m6
 
+%ifidn %1,ss
+    psrad m0, 6
+    psrad m1, 6
+    psrad m2, 6
+    psrad m3, 6
+%else
     paddd m0, m7
     paddd m1, m7
     paddd m2, m7
@@ -4286,18 +4296,28 @@ FILTER_VER_LUMA_AVX2_32x24 ps
     psrad m1, 6
     psrad m2, 6
     psrad m3, 6
+%elifidn %1, sp
+    psrad m0, 10
+    psrad m1, 10
+    psrad m2, 10
+    psrad m3, 10
 %else
     psrad m0, 2
     psrad m1, 2
     psrad m2, 2
     psrad m3, 2
 %endif
+%endif
+
     packssdw m0, m1
     packssdw m2, m3
     vpermq m0, m0, 11011000b
     vpermq m2, m2, 11011000b
+    pxor m4, m4
 %ifidn %1,pp
-    pxor m4, m4
+    CLIPW m0, m4, [pw_pixel_max]
+    CLIPW m2, m4, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW m0, m4, [pw_pixel_max]
     CLIPW m2, m4, [pw_pixel_max]
 %endif
@@ -4325,6 +4345,8 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7,
     sub r0, r4
 %ifidn %1,pp
     vbroadcasti128 m7, [pd_32]
+%elifidn %1, sp
+    mova m7, [pd_524800]
 %else
     vbroadcasti128 m7, [pd_n32768]
 %endif
@@ -4346,6 +4368,8 @@ cglobal interp_8tap_vert_%1_16x4, 4, 7,
 
 FILTER_VER_LUMA_AVX2_16x4 pp
 FILTER_VER_LUMA_AVX2_16x4 ps
+FILTER_VER_LUMA_AVX2_16x4 sp
+FILTER_VER_LUMA_AVX2_16x4 ss
 
 %macro FILTER_VER_LUMA_AVX2_8x4 1
 INIT_YMM avx2
@@ -4366,6 +4390,8 @@ cglobal interp_8tap_vert_%1_8x4, 4, 6, 8
     sub r0, r4
 %ifidn %1,pp
     vbroadcasti128 m7, [pd_32]
+%elifidn %1, sp
+    mova m7, [pd_524800]
 %else
     vbroadcasti128 m7, [pd_n32768]
 %endif
@@ -4381,6 +4407,8 @@ cglobal interp_8tap_vert_%1_8x4, 4, 6, 8
 
 FILTER_VER_LUMA_AVX2_8x4 pp
 FILTER_VER_LUMA_AVX2_8x4 ps
+FILTER_VER_LUMA_AVX2_8x4 sp
+FILTER_VER_LUMA_AVX2_8x4 ss
 
 %macro FILTER_VER_LUMA_AVX2_16x12 1
 INIT_YMM avx2
```