changeset 9514:c98d0dccbb36 draft

asm-avx2: luma_vpp[12x16], luma_vps[12x16]: improve 3466c->2182c, 3275c->2057c
author Divya Manivannan <divya@multicorewareinc.com>
date Fri, 13 Feb 2015 18:28:52 +0530
parents bc19307b799c
children e93986dd286c
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm
diffstat 2 files changed, 40 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 13 18:07:46 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 13 18:28:52 2015 +0530
@@ -1083,6 +1083,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2;
         p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2;
 
+        p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2;
+
         p.pu[LUMA_16x8].luma_vpp = x265_interp_8tap_vert_pp_16x8_avx2;
         p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2;
         p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2;
@@ -1108,6 +1110,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].luma_vps = x265_interp_8tap_vert_ps_8x16_avx2;
         p.pu[LUMA_8x32].luma_vps = x265_interp_8tap_vert_ps_8x32_avx2;
 
+        p.pu[LUMA_12x16].luma_vps = x265_interp_8tap_vert_ps_12x16_avx2;
+
         p.pu[LUMA_16x8].luma_vps = x265_interp_8tap_vert_ps_16x8_avx2;
         p.pu[LUMA_16x12].luma_vps = x265_interp_8tap_vert_ps_16x12_avx2;
         p.pu[LUMA_16x16].luma_vps = x265_interp_8tap_vert_ps_16x16_avx2;
--- a/source/common/x86/ipfilter16.asm	Fri Feb 13 18:07:46 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Fri Feb 13 18:28:52 2015 +0530
@@ -4815,6 +4815,42 @@ cglobal interp_8tap_vert_%1_4x16, 4, 7, 
 FILTER_VER_LUMA_AVX2_4x16 pp
 FILTER_VER_LUMA_AVX2_4x16 ps
 
+%macro FILTER_VER_LUMA_AVX2_12x16 1 ; %1 = pp or ps; emits interp_8tap_vert_%1_12x16 (AVX2)
+INIT_YMM avx2
+%if ARCH_X86_64 == 1 ; body is only emitted for x86-64 builds
+cglobal interp_8tap_vert_%1_12x16, 4, 9, 15
+    mov             r4d, r4m                    ; r4 = fifth argument (presumably coeffIdx -- confirm against the prototype)
+    shl             r4d, 7                      ; r4 *= 128: byte offset added to tab_LumaCoeffVer below
+    add             r1d, r1d                    ; r1 *= 2 (presumably srcStride in samples -> bytes for 16-bit pixels)
+    add             r3d, r3d                    ; r3 *= 2 (presumably dstStride in samples -> bytes)
+
+%ifdef PIC
+    lea             r5, [tab_LumaCoeffVer]      ; PIC: take the table base first, index cannot be folded in
+    add             r5, r4                      ; r5 = tab_LumaCoeffVer + r4
+%else
+    lea             r5, [tab_LumaCoeffVer + r4] ; r5 = tab_LumaCoeffVer + r4 in one step
+%endif
+
+    lea             r4, [r1 * 3]                ; r4 = 3 * r1
+    sub             r0, r4                      ; r0 -= 3 * r1 (presumably rewinds src three rows for the 8-tap window)
+%ifidn %1,pp
+    vbroadcasti128  m14, [pd_32]                ; pp variant: m14 = pd_32 replicated to both 128-bit lanes
+%else
+    vbroadcasti128  m14, [pd_n32768]            ; ps variant: m14 = pd_n32768 replicated to both 128-bit lanes
+%endif
+    lea             r6, [r3 * 3]                ; r6 = 3 * r3; not used again in this body, presumably read by the PROCESS_* macros
+    PROCESS_LUMA_AVX2_W8_16R %1
+    add             r2, 16                      ; advance r2 by 16 bytes after the W8 pass (presumably 8 16-bit samples)
+    add             r0, 16                      ; advance r0 by the same 16 bytes
+    mova            m7, m14                     ; copy the constant into m7 (presumably where the W4 macro expects it -- confirm)
+    PROCESS_LUMA_AVX2_W4_16R %1
+    RET
+%endif
+%endmacro
+
+FILTER_VER_LUMA_AVX2_12x16 pp
+FILTER_VER_LUMA_AVX2_12x16 ps
+
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------