changeset 9543:0f04b4e829d1 draft

asm-avx2: luma_vsp[16x16, 32x16, 64x16], luma_vss[16x16, 32x16, 64x16] for 16bpp. luma_vsp[16x16, 32x16, 64x16]: 4551c->2746c, 8745c->5569c, 17086c->10753c; luma_vss[16x16, 32x16, 64x16]: 3860c->2472c, 7345c->5137c, 14255c->9829c
author Divya Manivannan <divya@multicorewareinc.com>
date Wed, 18 Feb 2015 14:00:36 +0530
parents f2c55dc1e7c0
children f98e30b6f17b
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm
diffstat 2 files changed, 79 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 13:36:17 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 18 14:00:36 2015 +0530
@@ -1146,7 +1146,19 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
 
+        p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
+
+        p.pu[LUMA_32x16].luma_vsp = x265_interp_8tap_vert_sp_32x16_avx2;
+
+        p.pu[LUMA_64x16].luma_vsp = x265_interp_8tap_vert_sp_64x16_avx2;
+
         p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
+
+        p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
+
+        p.pu[LUMA_32x16].luma_vss = x265_interp_8tap_vert_ss_32x16_avx2;
+
+        p.pu[LUMA_64x16].luma_vss = x265_interp_8tap_vert_ss_64x16_avx2;
 #else
         p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
 #endif
--- a/source/common/x86/ipfilter16.asm	Wed Feb 18 13:36:17 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Feb 18 14:00:36 2015 +0530
@@ -3054,6 +3054,14 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     paddd           m9, m13
     pmaddwd         m11, [r5]
 
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%else
     paddd           m0, m14
     paddd           m1, m14
     paddd           m2, m14
@@ -3067,6 +3075,13 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     psrad           m3, 6
     psrad           m4, 6
     psrad           m5, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+    psrad           m4, 10
+    psrad           m5, 10
 %else
     psrad           m0, 2
     psrad           m1, 2
@@ -3075,14 +3090,20 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     psrad           m4, 2
     psrad           m5, 2
 %endif
+%endif
+
     packssdw        m0, m1
     packssdw        m2, m3
     packssdw        m4, m5
     vpermq          m0, m0, 11011000b
     vpermq          m2, m2, 11011000b
     vpermq          m4, m4, 11011000b
+    pxor            m5, m5
 %ifidn %1,pp
-    pxor            m5, m5
+    CLIPW           m0, m5, [pw_pixel_max]
+    CLIPW           m2, m5, [pw_pixel_max]
+    CLIPW           m4, m5, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m0, m5, [pw_pixel_max]
     CLIPW           m2, m5, [pw_pixel_max]
     CLIPW           m4, m5, [pw_pixel_max]
@@ -3122,19 +3143,30 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     paddd           m11, m1
     pmaddwd         m13, [r5]
 
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m7, 6
+%else
     paddd           m6, m14
     paddd           m7, m14
 %ifidn %1,pp
     psrad           m6, 6
     psrad           m7, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m7, 10
 %else
     psrad           m6, 2
     psrad           m7, 2
 %endif
+%endif
+
     packssdw        m6, m7
     vpermq          m6, m6, 11011000b
 %ifidn %1,pp
     CLIPW           m6, m5, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m6, m5, [pw_pixel_max]
 %endif
     vextracti128    xm7, m6, 1
     movu            [r8 + r3 * 2], xm6
@@ -3213,6 +3245,16 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     pmaddwd         m7, [r5 + 3 * mmsize]
     paddd           m1, m7
 
+%ifidn %1,ss
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+    psrad           m12, 6
+    psrad           m13, 6
+    psrad           m0, 6
+    psrad           m1, 6
+%else
     paddd           m8, m14
     paddd           m9, m14
     paddd           m10, m14
@@ -3230,6 +3272,15 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     psrad           m13, 6
     psrad           m0, 6
     psrad           m1, 6
+%elifidn %1, sp
+    psrad           m8, 10
+    psrad           m9, 10
+    psrad           m10, 10
+    psrad           m11, 10
+    psrad           m12, 10
+    psrad           m13, 10
+    psrad           m0, 10
+    psrad           m1, 10
 %else
     psrad           m8, 2
     psrad           m9, 2
@@ -3240,6 +3291,8 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     psrad           m0, 2
     psrad           m1, 2
 %endif
+%endif
+
     packssdw        m8, m9
     packssdw        m10, m11
     packssdw        m12, m13
@@ -3253,6 +3306,11 @@ FILTER_VER_LUMA_AVX2_8x8 ss
     CLIPW           m10, m5, [pw_pixel_max]
     CLIPW           m12, m5, [pw_pixel_max]
     CLIPW           m0, m5, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m8, m5, [pw_pixel_max]
+    CLIPW           m10, m5, [pw_pixel_max]
+    CLIPW           m12, m5, [pw_pixel_max]
+    CLIPW           m0, m5, [pw_pixel_max]
 %endif
     vextracti128    xm9, m8, 1
     vextracti128    xm11, m10, 1
@@ -3290,6 +3348,8 @@ cglobal interp_8tap_vert_%1_%2x16, 4, 10
     sub             r0, r4
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
 %else
     vbroadcasti128  m14, [pd_n32768]
 %endif
@@ -3311,6 +3371,12 @@ FILTER_VER_LUMA_AVX2_Nx16 pp, 64
 FILTER_VER_LUMA_AVX2_Nx16 ps, 16
 FILTER_VER_LUMA_AVX2_Nx16 ps, 32
 FILTER_VER_LUMA_AVX2_Nx16 ps, 64
+FILTER_VER_LUMA_AVX2_Nx16 sp, 16
+FILTER_VER_LUMA_AVX2_Nx16 sp, 32
+FILTER_VER_LUMA_AVX2_Nx16 sp, 64
+FILTER_VER_LUMA_AVX2_Nx16 ss, 16
+FILTER_VER_LUMA_AVX2_Nx16 ss, 32
+FILTER_VER_LUMA_AVX2_Nx16 ss, 64
 
 %macro FILTER_VER_LUMA_AVX2_NxN 3
 INIT_YMM avx2