changeset 9544:f98e30b6f17b draft

asm-avx2: 16bpp: luma_vsp[64xN, 32x32, 32x64, 48x64, 24x32, 16x32, 16x64] luma_vss[64xN, 32x32, 32x64, 48x64, 24x32, 16x32, 16x64] luma_vsp[24x32, 16x32, 16x64]: 12997c->8150c, 8831c->5609c, 17442c->10827c luma_vsp[32x32, 32x64, 48x64]: 17223c->10861c, 34436c->21621c, 53712c->33384c luma_vsp[64x32, 64x48, 64x64]: 34520c->22015c, 51128c->32388c, 73260c->45347c luma_vss[24x32, 16x32, 16x64]: 11269c->7360c, 7376c->5051c, 14541c->9787c luma_vss[32x32, 32x64, 48x64]: 14352c->9778c, 29244c->19424c, 43161c->29127c luma_vss[64x32, 64x48, 64x64]: 29201c->19438c, 46448c->29505c, 60554c->41126c
author Divya Manivannan <divya@multicorewareinc.com>
date Wed, 18 Feb 2015 14:27:37 +0530
parents 0f04b4e829d1
children cb7f17d881d7
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm
diffstat 2 files changed, 42 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 14:00:36 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 18 14:27:37 2015 +0530
@@ -1147,18 +1147,40 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
 
         p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vsp = x265_interp_8tap_vert_sp_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vsp = x265_interp_8tap_vert_sp_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vsp = x265_interp_8tap_vert_sp_24x32_avx2;
 
         p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+        p.pu[LUMA_32x32].luma_vsp = x265_interp_8tap_vert_sp_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vsp = x265_interp_8tap_vert_sp_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vsp = x265_interp_8tap_vert_sp_48x64_avx2;
 
         p.pu[LUMA_64x16].luma_vsp = x265_interp_8tap_vert_sp_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vsp = x265_interp_8tap_vert_sp_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vsp = x265_interp_8tap_vert_sp_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vsp = x265_interp_8tap_vert_sp_64x64_avx2;
 
         p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
 
         p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
+        p.pu[LUMA_16x32].luma_vss = x265_interp_8tap_vert_ss_16x32_avx2;
+        p.pu[LUMA_16x64].luma_vss = x265_interp_8tap_vert_ss_16x64_avx2;
+
+        p.pu[LUMA_24x32].luma_vss = x265_interp_8tap_vert_ss_24x32_avx2;
 
         p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+        p.pu[LUMA_32x32].luma_vss = x265_interp_8tap_vert_ss_32x32_avx2;
+        p.pu[LUMA_32x64].luma_vss = x265_interp_8tap_vert_ss_32x64_avx2;
+
+        p.pu[LUMA_48x64].luma_vss = x265_interp_8tap_vert_ss_48x64_avx2;
 
         p.pu[LUMA_64x16].luma_vss = x265_interp_8tap_vert_ss_64x16_avx2;
+        p.pu[LUMA_64x32].luma_vss = x265_interp_8tap_vert_ss_64x32_avx2;
+        p.pu[LUMA_64x48].luma_vss = x265_interp_8tap_vert_ss_64x48_avx2;
+        p.pu[LUMA_64x64].luma_vss = x265_interp_8tap_vert_ss_64x64_avx2;
 #else
         p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
 #endif
--- a/source/common/x86/ipfilter16.asm	Wed Feb 18 14:00:36 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Feb 18 14:27:37 2015 +0530
@@ -3399,6 +3399,8 @@ cglobal interp_8tap_vert_%3_%1x%2, 4, 12
 
 %ifidn %3,pp
     vbroadcasti128  m14, [pd_32]
+%elifidn %3, sp
+    mova            m14, [pd_524800]
 %else
     vbroadcasti128  m14, [pd_n32768]
 %endif
@@ -3441,6 +3443,24 @@ FILTER_VER_LUMA_AVX2_NxN 48, 64, ps
 FILTER_VER_LUMA_AVX2_NxN 64, 32, ps
 FILTER_VER_LUMA_AVX2_NxN 64, 48, ps
 FILTER_VER_LUMA_AVX2_NxN 64, 64, ps
+FILTER_VER_LUMA_AVX2_NxN 16, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 16, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 24, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 32, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 32, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 48, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 32, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 48, sp
+FILTER_VER_LUMA_AVX2_NxN 64, 64, sp
+FILTER_VER_LUMA_AVX2_NxN 16, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 16, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 24, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 32, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 32, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 48, 64, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 32, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 48, ss
+FILTER_VER_LUMA_AVX2_NxN 64, 64, ss
 
 %macro FILTER_VER_LUMA_AVX2_8xN 2
 INIT_YMM avx2