changeset 9534:fb9a01bb8b3d draft

asm-avx2: luma_vsp[12x16], luma_vss[12x16]: improve 3753c->2097c, 2957c->1950c
author Divya Manivannan <divya@multicorewareinc.com>
date Tue, 17 Feb 2015 15:34:48 +0530
parents f0455f10dccb
children 73d2a949ddd5
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm
diffstat 2 files changed, 43 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 15:19:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 15:34:48 2015 +0530
@@ -1578,6 +1578,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].luma_vsp = x265_interp_8tap_vert_sp_8x16_avx2;
         p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
 
+        p.pu[LUMA_12x16].luma_vsp = x265_interp_8tap_vert_sp_12x16_avx2;
+
         p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
         p.pu[LUMA_16x32].luma_vsp = x265_interp_8tap_vert_sp_16x32_avx2;
         p.pu[LUMA_16x64].luma_vsp = x265_interp_8tap_vert_sp_16x64_avx2;
@@ -1599,6 +1601,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].luma_vss = x265_interp_8tap_vert_ss_8x16_avx2;
         p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
 
+        p.pu[LUMA_12x16].luma_vss = x265_interp_8tap_vert_ss_12x16_avx2;
+
         p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
         p.pu[LUMA_16x32].luma_vss = x265_interp_8tap_vert_ss_16x32_avx2;
         p.pu[LUMA_16x64].luma_vss = x265_interp_8tap_vert_ss_16x64_avx2;
--- a/source/common/x86/ipfilter8.asm	Tue Feb 17 15:19:01 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Feb 17 15:34:48 2015 +0530
@@ -12191,3 +12191,42 @@ FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
 FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
 FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
 FILTER_VER_LUMA_AVX2_NxN 64, 64, ss
+
+%macro FILTER_VER_LUMA_S_AVX2_12x16 1 ; %1 = variant suffix (instantiated below with sp and ss); emits interp_8tap_vert_%1_12x16 as an 8-wide helper pass followed by a 4-wide helper pass
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_vert_%1_12x16, 4, 9, 15
+    mov             r4d, r4m                ; load the 5th argument (presumably the coefficient index -- confirm against the C prototype)
+    shl             r4d, 7                  ; index * 128 = byte offset of the selected entry in pw_LumaCoeffVer
+    add             r1d, r1d                ; double r1 (presumably source stride: int16 elements -> bytes)
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]   ; PIC: take the table base first, then add the offset separately
+    add             r5, r4                  ; r5 = pw_LumaCoeffVer + offset
+%else
+    lea             r5, [pw_LumaCoeffVer + r4] ; r5 = pw_LumaCoeffVer + offset
+%endif
+
+    lea             r4, [r1 * 3]            ; r4 = 3 * r1
+    sub             r0, r4                  ; move r0 back by 3 * r1 (presumably: start 3 source rows above the block for the 8-tap window)
+%ifidn %1,sp
+    mova            m14, [pd_526336]        ; sp only: load the pd_526336 constant (presumably the rounding offset -- verify in the helpers)
+%else
+    add             r3d, r3d                ; ss only: double r3 (presumably destination stride: int16 elements -> bytes)
+%endif
+    lea             r6, [r3 * 3]            ; r6 = 3 * r3, set up before the 8-wide helper invoked on the next line
+    PROCESS_LUMA_AVX2_W8_16R %1
+%ifidn %1,sp
+    add             r2, 8                   ; sp: advance r2 by 8 bytes (presumably 8 one-byte output pixels)
+%else
+    add             r2, 16                  ; ss: advance r2 by 16 bytes (presumably 8 int16 outputs)
+%endif
+    add             r0, 16                  ; advance r0 by 16 bytes (presumably 8 int16 inputs); NOTE(review): relies on the 8-wide helper leaving r0/r2 at the block origin -- confirm
+    mova            m7, m14                 ; copy m14 into m7 before the 4-wide helper (presumably where it expects the offset); in the ss variant m14 is not loaded by this macro
+    PROCESS_LUMA_AVX2_W4_16R %1
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_S_AVX2_12x16 sp
+FILTER_VER_LUMA_S_AVX2_12x16 ss