changeset 9539:7d20d3a40bba draft

asm-avx2: luma_vsp[32x24], luma_vss[32x24]: improve 13898c->7904c, 10802c->7850c
author Divya Manivannan <divya@multicorewareinc.com>
date Tue, 17 Feb 2015 17:46:00 +0530
parents 5a5a973ff619
children 8a9989b5b1ed
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm
diffstat 2 files changed, 63 insertions(+-), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 17:25:52 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 17:46:00 2015 +0530
@@ -1595,6 +1595,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.pu[LUMA_32x8].luma_vsp = x265_interp_8tap_vert_sp_32x8_avx2;
         p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vsp = x265_interp_8tap_vert_sp_32x24_avx2;
         p.pu[LUMA_32x32].luma_vsp = x265_interp_8tap_vert_sp_32x32_avx2;
         p.pu[LUMA_32x64].luma_vsp = x265_interp_8tap_vert_sp_32x64_avx2;
 
@@ -1621,6 +1622,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.pu[LUMA_32x8].luma_vss = x265_interp_8tap_vert_ss_32x8_avx2;
         p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+        p.pu[LUMA_32x24].luma_vss = x265_interp_8tap_vert_ss_32x24_avx2;
         p.pu[LUMA_32x32].luma_vss = x265_interp_8tap_vert_ss_32x32_avx2;
         p.pu[LUMA_32x64].luma_vss = x265_interp_8tap_vert_ss_32x64_avx2;
 
--- a/source/common/x86/ipfilter8.asm	Tue Feb 17 17:25:52 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Feb 17 17:46:00 2015 +0530
@@ -12831,3 +12831,64 @@ FILTER_VER_LUMA_AVX2_Nx8 sp, 32
 FILTER_VER_LUMA_AVX2_Nx8 sp, 16
 FILTER_VER_LUMA_AVX2_Nx8 ss, 32
 FILTER_VER_LUMA_AVX2_Nx8 ss, 16
+
+%macro FILTER_VER_LUMA_S_AVX2_32x24 1                  ; arg 1 = sp | ss; emits interp_8tap_vert_<arg>_32x24 (AVX2 build, x86-64 only)
+INIT_YMM avx2                                          ; x86inc: following code uses ymm registers; symbol gets the _avx2 suffix
+%if ARCH_X86_64 == 1                                   ; body uses r7-r9 and m11/m14, so it is only assembled for x86-64
+cglobal interp_8tap_vert_%1_32x24, 4, 10, 15           ; x86inc convention: 4 args loaded into r0-r3, 10 GPRs, 15 vector regs
+    mov             r4d, r4m                           ; fetch the 5th argument (presumably coeffIdx - confirm against the C prototype)
+    shl             r4d, 7                             ; r4 *= 128: byte offset of the selected entry in pw_LumaCoeffVer
+    add             r1d, r1d                           ; double r1 (presumably src stride: 16-bit elements -> bytes)
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]              ; PIC: RIP-relative address cannot take an index, so load the base first
+    add             r5, r4                             ; r5 = &pw_LumaCoeffVer[r4 bytes]
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]         ; non-PIC: base + index in a single lea
+%endif
+
+    lea             r4, [r1 * 3]                       ; r4 = 3 * r1
+    sub             r0, r4                             ; r0 -= 3 * r1 (presumably start 3 rows above for the 8-tap window)
+%ifidn %1,sp
+    mova            m14, [pd_526336]                   ; sp only: load constant (presumably the rounding/offset term used by the PROCESS macros - confirm)
+%else
+    add             r3d, r3d                           ; ss only: double r3 (presumably dst stride: 16-bit elements -> bytes)
+%endif
+    lea             r6, [r3 * 3]                       ; r6 = 3 * r3
+    mov             r9d, 4                             ; first pass: 4 iterations of .loopW
+.loopW:
+    PROCESS_LUMA_AVX2_W8_16R %1                        ; defined elsewhere; by its name, filters 8 columns x 16 rows per call
+%ifidn %1,sp
+    add             r2, 8                              ; sp: advance r2 by 8 bytes
+%else
+    add             r2, 16                             ; ss: advance r2 by 16 bytes
+%endif
+    add             r0, 16                             ; advance r0 by 16 bytes
+    dec             r9d
+    jnz             .loopW
+    lea             r9, [r1 * 4]                       ; r9 = 4 * r1
+    sub             r7, r9                             ; NOTE(review): r7/r8 are never written in this block; they must be left by PROCESS_LUMA_AVX2_W8_16R - confirm
+    lea             r0, [r7 - 48]                      ; r0 = r7 - 48 (48 = 3 * 16, presumably rewinding the per-iteration advance to the first column)
+%ifidn %1,sp
+    lea             r2, [r8 + r3 * 4 - 24]             ; sp: r2 = r8 + 4 * r3 - 24 (24 = 3 * 8)
+%else
+    lea             r2, [r8 + r3 * 4 - 48]             ; ss: r2 = r8 + 4 * r3 - 48 (48 = 3 * 16)
+%endif
+    mova            m11, m14                           ; NOTE(review): m14 is loaded only for sp above; for ss this copies an uninitialized register (presumably unused there - confirm)
+    mov             r9d, 4                             ; second pass: 4 iterations of .loop
+.loop:
+    PROCESS_LUMA_S_AVX2_W8_8R %1                       ; defined elsewhere; by its name, 8 columns x 8 rows per call (16 + 8 = 24 rows)
+%ifidn %1,sp
+    add             r2, 8                              ; sp: advance r2 by 8 bytes
+%else
+    add             r2, 16                             ; ss: advance r2 by 16 bytes
+%endif
+    add             r0, 16                             ; advance r0 by 16 bytes
+    dec             r9d
+    jnz             .loop
+    RET                                                ; x86inc epilogue + return
+%endif                                                 ; ARCH_X86_64
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_32x24 sp                        ; instantiates interp_8tap_vert_sp_32x24 (registered as luma_vsp for LUMA_32x24)
+FILTER_VER_LUMA_S_AVX2_32x24 ss                        ; instantiates interp_8tap_vert_ss_32x24 (registered as luma_vss for LUMA_32x24)