Mercurial > x265
changeset 9542:f2c55dc1e7c0 draft
asm-avx2: luma_vsp[4x4, 8x8], luma_vss[4x4, 8x8] for 16bpp
luma_vsp[4x4, 8x8]: 557c->255c, 1380c->839c
luma_vss[4x4, 8x8]: 506c->240c, 1188c->756c
author | Divya Manivannan <divya@multicorewareinc.com> |
---|---|
date | Wed, 18 Feb 2015 13:36:17 +0530 |
parents | bcb623bffca4 |
children | 0f04b4e829d1 |
files | source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm |
diffstat | 2 files changed, 63 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp Wed Feb 18 11:32:52 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Feb 18 13:36:17 2015 +0530 @@ -1143,6 +1143,10 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_64x32].luma_vps = x265_interp_8tap_vert_ps_64x32_avx2; p.pu[LUMA_64x48].luma_vps = x265_interp_8tap_vert_ps_64x48_avx2; p.pu[LUMA_64x64].luma_vps = x265_interp_8tap_vert_ps_64x64_avx2; + + p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2; + + p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2; #else p.cu[BLOCK_4x4].dct = x265_dct4_avx2; #endif @@ -1168,6 +1172,10 @@ void setupAssemblyPrimitives(EncoderPrim p.pu[LUMA_4x16].luma_vps = x265_interp_8tap_vert_ps_4x16_avx2; p.pu[LUMA_8x4].luma_vps = x265_interp_8tap_vert_ps_8x4_avx2; p.pu[LUMA_16x4].luma_vps = x265_interp_8tap_vert_ps_16x4_avx2; + + p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2; + + p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2; } } #else // if HIGH_BIT_DEPTH
--- a/source/common/x86/ipfilter16.asm Wed Feb 18 11:32:52 2015 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Feb 18 13:36:17 2015 +0530 @@ -31,6 +31,7 @@ tab_c_32: times 4 dd 32 tab_c_n32768: times 4 dd -32768 tab_c_524800: times 4 dd 524800 tab_c_n8192: times 8 dw -8192 +pd_524800: times 8 dd 524800 tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 @@ -2580,6 +2581,7 @@ cglobal interp_8tap_vert_pp_%1x%2, 5, 7, FILTER_VER_LUMA_PP 48, 64 FILTER_VER_LUMA_PP 64, 16 FILTER_VER_LUMA_PP 16, 64 + %macro FILTER_VER_LUMA_AVX2_4x4 1 INIT_YMM avx2 cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 @@ -2600,6 +2602,8 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 %ifidn %1,pp vbroadcasti128 m6, [pd_32] +%elifidn %1, sp + mova m6, [pd_524800] %else vbroadcasti128 m6, [pd_n32768] %endif @@ -2647,18 +2651,29 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 pmaddwd m4, [r5 + 3 * mmsize] paddd m2, m4 +%ifidn %1,ss + psrad m0, 6 + psrad m2, 6 +%else paddd m0, m6 paddd m2, m6 %ifidn %1,pp psrad m0, 6 psrad m2, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m2, 10 %else psrad m0, 2 psrad m2, 2 %endif +%endif + packssdw m0, m2 + pxor m1, m1 %ifidn %1,pp - pxor m1, m1 + CLIPW m0, m1, [pw_pixel_max] +%elifidn %1, sp CLIPW m0, m1, [pw_pixel_max] %endif @@ -2673,6 +2688,8 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 FILTER_VER_LUMA_AVX2_4x4 pp FILTER_VER_LUMA_AVX2_4x4 ps +FILTER_VER_LUMA_AVX2_4x4 sp +FILTER_VER_LUMA_AVX2_4x4 ss %macro FILTER_VER_LUMA_AVX2_8x8 1 INIT_YMM avx2 @@ -2695,6 +2712,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 %ifidn %1,pp vbroadcasti128 m11, [pd_32] +%elifidn %1, sp + mova m11, [pd_524800] %else vbroadcasti128 m11, [pd_n32768] %endif @@ -2796,6 +2815,12 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 paddd m6, m10 lea r4, [r3 * 3] +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%else paddd m0, m11 paddd m1, m11 paddd m2, m11 @@ -2805,18 +2830,28 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 psrad m1, 6 psrad m2, 6 psrad m3, 6 +%elifidn %1, sp + psrad m0, 
10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 %else psrad m0, 2 psrad m1, 2 psrad m2, 2 psrad m3, 2 %endif +%endif + packssdw m0, m1 packssdw m2, m3 vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b + pxor m10, m10 %ifidn %1,pp - pxor m10, m10 + CLIPW m0, m10, [pw_pixel_max] + CLIPW m2, m10, [pw_pixel_max] +%elifidn %1, sp CLIPW m0, m10, [pw_pixel_max] CLIPW m2, m10, [pw_pixel_max] %endif @@ -2850,6 +2885,12 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 pmaddwd m3, [r5 + 3 * mmsize] paddd m7, m3 +%ifidn %1,ss + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%else paddd m4, m11 paddd m5, m11 paddd m6, m11 @@ -2859,12 +2900,19 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 psrad m5, 6 psrad m6, 6 psrad m7, 6 +%elifidn %1, sp + psrad m4, 10 + psrad m5, 10 + psrad m6, 10 + psrad m7, 10 %else psrad m4, 2 psrad m5, 2 psrad m6, 2 psrad m7, 2 %endif +%endif + packssdw m4, m5 packssdw m6, m7 vpermq m4, m4, 11011000b @@ -2872,6 +2920,9 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 %ifidn %1,pp CLIPW m4, m10, [pw_pixel_max] CLIPW m6, m10, [pw_pixel_max] +%elifidn %1, sp + CLIPW m4, m10, [pw_pixel_max] + CLIPW m6, m10, [pw_pixel_max] %endif vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 @@ -2886,6 +2937,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1 FILTER_VER_LUMA_AVX2_8x8 pp FILTER_VER_LUMA_AVX2_8x8 ps +FILTER_VER_LUMA_AVX2_8x8 sp +FILTER_VER_LUMA_AVX2_8x8 ss %macro PROCESS_LUMA_AVX2_W8_16R 1 movu xm0, [r0] ; m0 = row 0