changeset 9533:f0455f10dccb draft

asm-avx2: luma_vsp[64xN, 48x64, 24x32, 32x32, 32x64, 16x32, 16x64] luma_vss[64xN, 48x64, 24x32, 32x32, 32x64, 16x32, 16x64] luma_vsp[16x32, 16x64]: 9857c->4982c, 18312c->10032c luma_vsp[32x32, 32x64]: 18179c->10524c, 37362c->19950c luma_vsp[24x32, 48x64]: 13964c->7384c, 56729c->29613c luma_vsp[64x32, 64x48, 64x64]: 35624c->20026c, 55695c->30272c, 76489c->40257c luma_vss[16x32, 16x64]: 7855c->4974c, 14714c->9910c luma_vss[32x32, 32x64]: 14365c->10428c, 28707c->19744c luma_vss[24x32, 48x64]: 11216c->7417c, 43614c->29375c luma_vss[64x32, 64x48, 64x64]: 28662c->19587c, 42873c->29366c, 59281c->39065c
author Divya Manivannan <divya@multicorewareinc.com>
date Tue, 17 Feb 2015 15:19:01 +0530
parents ece4853da305
children fb9a01bb8b3d
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm
diffstat 2 files changed, 93 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Tue Feb 17 14:35:38 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Feb 17 15:19:01 2015 +0530
@@ -1579,20 +1579,42 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
 
         p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vsp = x265_interp_8tap_vert_sp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vsp = x265_interp_8tap_vert_sp_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vsp = x265_interp_8tap_vert_sp_24x32_avx2;
 
         p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+        p.pu[LUMA_32x32].luma_vsp = x265_interp_8tap_vert_sp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vsp = x265_interp_8tap_vert_sp_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vsp = x265_interp_8tap_vert_sp_48x64_avx2;
 
         p.pu[LUMA_64x16].luma_vsp = x265_interp_8tap_vert_sp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vsp = x265_interp_8tap_vert_sp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vsp = x265_interp_8tap_vert_sp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vsp = x265_interp_8tap_vert_sp_64x64_avx2;
 
         p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
         p.pu[LUMA_8x16].luma_vss = x265_interp_8tap_vert_ss_8x16_avx2;
         p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
 
         p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vss = x265_interp_8tap_vert_ss_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vss = x265_interp_8tap_vert_ss_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vss = x265_interp_8tap_vert_ss_24x32_avx2;
 
         p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+        p.pu[LUMA_32x32].luma_vss = x265_interp_8tap_vert_ss_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vss = x265_interp_8tap_vert_ss_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vss = x265_interp_8tap_vert_ss_48x64_avx2;
 
         p.pu[LUMA_64x16].luma_vss = x265_interp_8tap_vert_ss_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vss = x265_interp_8tap_vert_ss_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vss = x265_interp_8tap_vert_ss_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vss = x265_interp_8tap_vert_ss_64x64_avx2;
 
         p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_avx2;
         p.pu[LUMA_4x8].luma_hps = x265_interp_8tap_horiz_ps_4x8_avx2;
--- a/source/common/x86/ipfilter8.asm	Tue Feb 17 14:35:38 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Feb 17 15:19:01 2015 +0530
@@ -12120,3 +12120,74 @@ FILTER_VER_LUMA_AVX2_Nx16 sp, 64
 FILTER_VER_LUMA_AVX2_Nx16 ss, 16
 FILTER_VER_LUMA_AVX2_Nx16 ss, 32
 FILTER_VER_LUMA_AVX2_Nx16 ss, 64
+
+%macro FILTER_VER_LUMA_AVX2_NxN 3          ; %1 = block width, %2 = block height, %3 = variant (sp or ss); emits interp_8tap_vert_%3_%1x%2
+INIT_YMM avx2                              ; x86inc: m# names map to ymm registers, AVX2 code path
+%if ARCH_X86_64 == 1                       ; body uses r7-r11 and m14, so it is only emitted for x86-64
+cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15   ; x86inc prologue: 4 args in r0-r3, 12 GPRs, 15 vector registers
+    mov             r4d, r4m               ; r4 = 5th argument; used below to select a coefficient set
+    shl             r4d, 7                 ; * 128 -> byte offset into pw_LumaCoeffVer
+    add             r1d, r1d               ; r1 *= 2 (presumably source stride: int16 elements -> bytes; confirm with caller)
+
+%ifdef PIC
+    lea             r5, [pw_LumaCoeffVer]  ; PIC build: take the table address first ...
+    add             r5, r4                 ; ... then add the offset separately
+%else
+    lea             r5, [pw_LumaCoeffVer + r4]     ; r5 = address of the selected coefficient set
+%endif
+
+    lea             r4, [r1 * 3]           ; r4 = 3 * r1
+    sub             r0, r4                 ; move r0 back by three strides before filtering
+
+%ifidn %3,sp
+    mova            m14, [pd_526336]       ; sp only: load the pd_526336 constant (presumably consumed by the row macro; not visible here)
+%else
+    add             r3d, r3d               ; ss only: r3 *= 2 (presumably dest stride: int16 elements -> bytes)
+%endif
+
+    lea             r6, [r3 * 3]           ; r6 = 3 * r3
+    lea             r11, [r1 * 4]          ; r11 = 4 * r1, subtracted from r7 after each band
+    mov             r9d, %2 / 16           ; r9 = outer loop count: height / 16 bands
+.loopH:
+    mov             r10d, %1 / 8           ; r10 = inner loop count: width / 8 strips
+.loopW:
+    PROCESS_LUMA_AVX2_W8_16R %3            ; defined elsewhere; by its name, filters one 8-wide, 16-row strip
+%ifidn %3,sp
+    add             r2, 8                  ; sp: r2 advances 8 bytes per strip
+%else
+    add             r2, 16                 ; ss: r2 advances 16 bytes per strip
+%endif
+    add             r0, 16                 ; r0 advances 16 bytes per strip
+    dec             r10d
+    jnz             .loopW                 ; repeat until every strip of this band is done
+    sub             r7, r11                ; r7 is not written in this macro - presumably left by the row macro; step it back 4 strides
+    lea             r0, [r7 - 2 * %1 + 16] ; r0 for the next band: r7 rewound by (%1/8 - 1) * 16 bytes, i.e. back to the first strip
+%ifidn %3,sp
+    lea             r2, [r8 + r3 * 4 - %1 + 8]     ; sp: r2 = r8 (presumably left by the row macro) + 4 strides, rewound (%1/8 - 1) * 8 bytes
+%else
+    lea             r2, [r8 + r3 * 4 - 2 * %1 + 16]    ; ss: same as sp, but rewound (%1/8 - 1) * 16 bytes
+%endif
+    dec             r9d
+    jnz             .loopH                 ; repeat until every band is done
+    RET                                    ; x86inc epilogue matching the cglobal prologue
+%endif                                     ; ARCH_X86_64 == 1
+%endmacro                                  ; FILTER_VER_LUMA_AVX2_NxN
+
+FILTER_VER_LUMA_AVX2_NxN 16, 32, sp        ; args: width, height, variant -> interp_8tap_vert_sp_16x32
+FILTER_VER_LUMA_AVX2_NxN 16, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 24, 32, sp        ; 24 / 8 = 3 strips per band
+FILTER_VER_LUMA_AVX2_NxN 32, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 32, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 48, 64, sp        ; 48 / 8 = 6 strips per band
+FILTER_VER_LUMA_AVX2_NxN 64, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 48, sp        ; 48 / 16 = 3 bands
+FILTER_VER_LUMA_AVX2_NxN 64, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 16, 32, ss        ; same nine sizes again for the ss variant
+FILTER_VER_LUMA_AVX2_NxN 16, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 24, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 32, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 32, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 64, ss