changeset 9549:998c5d235ee7 draft

asm-avx2: luma_vsp[12x16, 4x16], luma_vss[12x16, 4x16] for 16bpp. luma_vsp[12x16, 4x16]: 3472c->2192c, 1465c->787c; luma_vss[12x16, 4x16]: 2970c->2037c, 1220c->701c
author Divya Manivannan <divya@multicorewareinc.com>
date Wed, 18 Feb 2015 17:03:02 +0530
parents ede163eb20e0
children dbce8036e0c4
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm
diffstat 2 files changed, 61 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 16:05:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 18 17:03:02 2015 +0530
@@ -1148,6 +1148,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].luma_vsp = x265_interp_8tap_vert_sp_8x16_avx2;
         p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
 
+        p.pu[LUMA_12x16].luma_vsp = x265_interp_8tap_vert_sp_12x16_avx2;
+
         p.pu[LUMA_16x8].luma_vsp = x265_interp_8tap_vert_sp_16x8_avx2;
         p.pu[LUMA_16x12].luma_vsp = x265_interp_8tap_vert_sp_16x12_avx2;
         p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
@@ -1173,6 +1175,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x16].luma_vss = x265_interp_8tap_vert_ss_8x16_avx2;
         p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
 
+        p.pu[LUMA_12x16].luma_vss = x265_interp_8tap_vert_ss_12x16_avx2;
+
         p.pu[LUMA_16x8].luma_vss = x265_interp_8tap_vert_ss_16x8_avx2;
         p.pu[LUMA_16x12].luma_vss = x265_interp_8tap_vert_ss_16x12_avx2;
         p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
@@ -1221,11 +1225,13 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2;
         p.pu[LUMA_4x8].luma_vsp = x265_interp_8tap_vert_sp_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vsp = x265_interp_8tap_vert_sp_4x16_avx2;
         p.pu[LUMA_8x4].luma_vsp = x265_interp_8tap_vert_sp_8x4_avx2;
         p.pu[LUMA_16x4].luma_vsp = x265_interp_8tap_vert_sp_16x4_avx2;
 
         p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2;
         p.pu[LUMA_4x8].luma_vss = x265_interp_8tap_vert_ss_4x8_avx2;
+        p.pu[LUMA_4x16].luma_vss = x265_interp_8tap_vert_ss_4x16_avx2;
         p.pu[LUMA_8x4].luma_vss = x265_interp_8tap_vert_ss_8x4_avx2;
         p.pu[LUMA_16x4].luma_vss = x265_interp_8tap_vert_ss_16x4_avx2;
     }
--- a/source/common/x86/ipfilter16.asm	Wed Feb 18 16:05:33 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Feb 18 17:03:02 2015 +0530
@@ -4976,18 +4976,29 @@ FILTER_VER_LUMA_AVX2_4x8 ss
     paddd           m1, m3
     pmaddwd         m6, [r5]
 
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m2, 6
+%else
     paddd           m0, m7
     paddd           m2, m7
 %ifidn %1,pp
     psrad           m0, 6
     psrad           m2, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m2, 10
 %else
     psrad           m0, 2
     psrad           m2, 2
 %endif
+%endif
+
     packssdw        m0, m2
+    pxor            m3, m3
 %ifidn %1,pp
-    pxor            m3, m3
+    CLIPW           m0, m3, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m0, m3, [pw_pixel_max]
 %endif
 
@@ -5023,18 +5034,29 @@ FILTER_VER_LUMA_AVX2_4x8 ss
     paddd           m5, m2
     pmaddwd         m0, [r5]
 
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m1, 6
+%else
     paddd           m4, m7
     paddd           m1, m7
 %ifidn %1,pp
     psrad           m4, 6
     psrad           m1, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m1, 10
 %else
     psrad           m4, 2
     psrad           m1, 2
 %endif
+%endif
+
     packssdw        m4, m1
+    pxor            m2, m2
 %ifidn %1,pp
-    pxor            m2, m2
+    CLIPW           m4, m2, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m4, m2, [pw_pixel_max]
 %endif
 
@@ -5070,18 +5092,29 @@ FILTER_VER_LUMA_AVX2_4x8 ss
     pmaddwd         m1, [r5 + 1 * mmsize]
     paddd           m3, m1
 
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m5, 6
+%else
     paddd           m6, m7
     paddd           m5, m7
 %ifidn %1,pp
     psrad           m6, 6
     psrad           m5, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m5, 10
 %else
     psrad           m6, 2
     psrad           m5, 2
 %endif
+%endif
+
     packssdw        m6, m5
+    pxor            m1, m1
 %ifidn %1,pp
-    pxor            m1, m1
+    CLIPW           m6, m1, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m6, m1, [pw_pixel_max]
 %endif
 
@@ -5110,18 +5143,29 @@ FILTER_VER_LUMA_AVX2_4x8 ss
     pmaddwd         m6, [r5 + 3 * mmsize]
     paddd           m3, m6
 
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m3, 6
+%else
     paddd           m0, m7
     paddd           m3, m7
 %ifidn %1,pp
     psrad           m0, 6
     psrad           m3, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m3, 10
 %else
     psrad           m0, 2
     psrad           m3, 2
 %endif
+%endif
+  
     packssdw        m0, m3
 %ifidn %1,pp
     CLIPW           m0, m1, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m0, m1, [pw_pixel_max]
 %endif
 
     vextracti128    xm3, m0, 1
@@ -5151,6 +5195,8 @@ cglobal interp_8tap_vert_%1_4x16, 4, 7, 
     sub             r0, r4
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
+%elifidn %1, sp
+    mova            m7, [pd_524800]
 %else
     vbroadcasti128  m7, [pd_n32768]
 %endif
@@ -5161,6 +5207,8 @@ cglobal interp_8tap_vert_%1_4x16, 4, 7, 
 
 FILTER_VER_LUMA_AVX2_4x16 pp
 FILTER_VER_LUMA_AVX2_4x16 ps
+FILTER_VER_LUMA_AVX2_4x16 sp
+FILTER_VER_LUMA_AVX2_4x16 ss
 
 %macro FILTER_VER_LUMA_AVX2_12x16 1
 INIT_YMM avx2
@@ -5182,6 +5230,8 @@ cglobal interp_8tap_vert_%1_12x16, 4, 9,
     sub             r0, r4
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
 %else
     vbroadcasti128  m14, [pd_n32768]
 %endif
@@ -5197,6 +5247,8 @@ cglobal interp_8tap_vert_%1_12x16, 4, 9,
 
 FILTER_VER_LUMA_AVX2_12x16 pp
 FILTER_VER_LUMA_AVX2_12x16 ps
+FILTER_VER_LUMA_AVX2_12x16 sp
+FILTER_VER_LUMA_AVX2_12x16 ss
 
 ;---------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)