changeset 9548:ede163eb20e0 draft

asm-avx2: luma_vsp[4x8, 16x12], luma_vss[4x8, 16x12] for 16bpp; luma_vsp[4x8, 16x12]: 859c->431c, 3515c->2231c; luma_vss[4x8, 16x12]: 833c->383c, 2942c->2089c
author Divya Manivannan <divya@multicorewareinc.com>
date Wed, 18 Feb 2015 16:05:33 +0530
parents 6d868a8ff7f2
children 998c5d235ee7
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm
diffstat 2 files changed, 84 insertions(+-), 2 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 15:25:58 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 18 16:05:33 2015 +0530
@@ -1149,6 +1149,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x32].luma_vsp = x265_interp_8tap_vert_sp_8x32_avx2;
 
         p.pu[LUMA_16x8].luma_vsp = x265_interp_8tap_vert_sp_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vsp = x265_interp_8tap_vert_sp_16x12_avx2;
         p.pu[LUMA_16x16].luma_vsp = x265_interp_8tap_vert_sp_16x16_avx2;
         p.pu[LUMA_16x32].luma_vsp = x265_interp_8tap_vert_sp_16x32_avx2;
         p.pu[LUMA_16x64].luma_vsp = x265_interp_8tap_vert_sp_16x64_avx2;
@@ -1173,6 +1174,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x32].luma_vss = x265_interp_8tap_vert_ss_8x32_avx2;
 
         p.pu[LUMA_16x8].luma_vss = x265_interp_8tap_vert_ss_16x8_avx2;
+        p.pu[LUMA_16x12].luma_vss = x265_interp_8tap_vert_ss_16x12_avx2;
         p.pu[LUMA_16x16].luma_vss = x265_interp_8tap_vert_ss_16x16_avx2;
         p.pu[LUMA_16x32].luma_vss = x265_interp_8tap_vert_ss_16x32_avx2;
         p.pu[LUMA_16x64].luma_vss = x265_interp_8tap_vert_ss_16x64_avx2;
@@ -1218,10 +1220,12 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_16x4].luma_vps = x265_interp_8tap_vert_ps_16x4_avx2;
 
         p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vsp = x265_interp_8tap_vert_sp_4x8_avx2;
         p.pu[LUMA_8x4].luma_vsp = x265_interp_8tap_vert_sp_8x4_avx2;
         p.pu[LUMA_16x4].luma_vsp = x265_interp_8tap_vert_sp_16x4_avx2;
 
         p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2;
+        p.pu[LUMA_4x8].luma_vss = x265_interp_8tap_vert_ss_4x8_avx2;
         p.pu[LUMA_8x4].luma_vss = x265_interp_8tap_vert_ss_8x4_avx2;
         p.pu[LUMA_16x4].luma_vss = x265_interp_8tap_vert_ss_16x4_avx2;
     }
--- a/source/common/x86/ipfilter16.asm	Wed Feb 18 15:25:58 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Feb 18 16:05:33 2015 +0530
@@ -4430,6 +4430,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     sub             r0, r4
 %ifidn %1,pp
     vbroadcasti128  m14, [pd_32]
+%elifidn %1, sp
+    mova            m14, [pd_524800]
 %else
     vbroadcasti128  m14, [pd_n32768]
 %endif
@@ -4549,6 +4551,14 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     paddd           m9, m13
     pmaddwd         m11, [r5]
 
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+    psrad           m4, 6
+    psrad           m5, 6
+%else
     paddd           m0, m14
     paddd           m1, m14
     paddd           m2, m14
@@ -4562,6 +4572,13 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     psrad           m3, 6
     psrad           m4, 6
     psrad           m5, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
+    psrad           m4, 10
+    psrad           m5, 10
 %else
     psrad           m0, 2
     psrad           m1, 2
@@ -4570,14 +4587,20 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     psrad           m4, 2
     psrad           m5, 2
 %endif
+%endif
+
     packssdw        m0, m1
     packssdw        m2, m3
     packssdw        m4, m5
     vpermq          m0, m0, 11011000b
     vpermq          m2, m2, 11011000b
     vpermq          m4, m4, 11011000b
+    pxor            m5, m5
 %ifidn %1,pp
-    pxor            m5, m5
+    CLIPW           m0, m5, [pw_pixel_max]
+    CLIPW           m2, m5, [pw_pixel_max]
+    CLIPW           m4, m5, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m0, m5, [pw_pixel_max]
     CLIPW           m2, m5, [pw_pixel_max]
     CLIPW           m4, m5, [pw_pixel_max]
@@ -4615,19 +4638,30 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     pmaddwd         m13, [r5 + 1 * mmsize]
     paddd           m11, m13
 
+%ifidn %1,ss
+    psrad           m6, 6
+    psrad           m7, 6
+%else
     paddd           m6, m14
     paddd           m7, m14
 %ifidn %1,pp
     psrad           m6, 6
     psrad           m7, 6
+%elifidn %1, sp
+    psrad           m6, 10
+    psrad           m7, 10
 %else
     psrad           m6, 2
     psrad           m7, 2
 %endif
+%endif
+
     packssdw        m6, m7
     vpermq          m6, m6, 11011000b
 %ifidn %1,pp
     CLIPW           m6, m5, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m6, m5, [pw_pixel_max]
 %endif
     vextracti128    xm7, m6, 1
     movu            [r8 + r3 * 2], xm6
@@ -4663,6 +4697,12 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     pmaddwd         m3, [r5 + 3 * mmsize]
     paddd           m11, m3
 
+%ifidn %1,ss
+    psrad           m8, 6
+    psrad           m9, 6
+    psrad           m10, 6
+    psrad           m11, 6
+%else
     paddd           m8, m14
     paddd           m9, m14
     paddd           m10, m14
@@ -4672,12 +4712,19 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
     psrad           m9, 6
     psrad           m10, 6
     psrad           m11, 6
+%elifidn %1, sp
+    psrad           m8, 10
+    psrad           m9, 10
+    psrad           m10, 10
+    psrad           m11, 10
 %else
     psrad           m8, 2
     psrad           m9, 2
     psrad           m10, 2
     psrad           m11, 2
 %endif
+%endif
+
     packssdw        m8, m9
     packssdw        m10, m11
     vpermq          m8, m8, 11011000b
@@ -4685,6 +4732,9 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
 %ifidn %1,pp
     CLIPW           m8, m5, [pw_pixel_max]
     CLIPW           m10, m5, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m8, m5, [pw_pixel_max]
+    CLIPW           m10, m5, [pw_pixel_max]
 %endif
     vextracti128    xm9, m8, 1
     vextracti128    xm11, m10, 1
@@ -4703,6 +4753,8 @@ cglobal interp_8tap_vert_%1_16x12, 4, 10
 
 FILTER_VER_LUMA_AVX2_16x12 pp
 FILTER_VER_LUMA_AVX2_16x12 ps
+FILTER_VER_LUMA_AVX2_16x12 sp
+FILTER_VER_LUMA_AVX2_16x12 ss
 
 %macro FILTER_VER_LUMA_AVX2_4x8 1
 INIT_YMM avx2
@@ -4724,6 +4776,8 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
 
 %ifidn %1,pp
     vbroadcasti128  m7, [pd_32]
+%elifidn %1, sp
+    mova            m7, [pd_524800]
 %else
     vbroadcasti128  m7, [pd_n32768]
 %endif
@@ -4780,18 +4834,29 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
     pmaddwd         m6, [r5 + 1 * mmsize]
     paddd           m1, m6
 
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m2, 6
+%else
     paddd           m0, m7
     paddd           m2, m7
 %ifidn %1,pp
     psrad           m0, 6
     psrad           m2, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m2, 10
 %else
     psrad           m0, 2
     psrad           m2, 2
 %endif
+%endif
+
     packssdw        m0, m2
+    pxor            m6, m6
 %ifidn %1,pp
-    pxor            m6, m6
+    CLIPW           m0, m6, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m0, m6, [pw_pixel_max]
 %endif
 
@@ -4819,18 +4884,29 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
     pmaddwd         m0, [r5 + 3 * mmsize]
     paddd           m1, m0
 
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m1, 6
+%else
     paddd           m4, m7
     paddd           m1, m7
 %ifidn %1,pp
     psrad           m4, 6
     psrad           m1, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m1, 10
 %else
     psrad           m4, 2
     psrad           m1, 2
 %endif
+%endif
+
     packssdw        m4, m1
 %ifidn %1,pp
     CLIPW           m4, m6, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m4, m6, [pw_pixel_max]
 %endif
 
     vextracti128    xm1, m4, 1
@@ -4844,6 +4920,8 @@ cglobal interp_8tap_vert_%1_4x8, 4, 7, 8
 
 FILTER_VER_LUMA_AVX2_4x8 pp
 FILTER_VER_LUMA_AVX2_4x8 ps
+FILTER_VER_LUMA_AVX2_4x8 sp
+FILTER_VER_LUMA_AVX2_4x8 ss
 
 %macro PROCESS_LUMA_AVX2_W4_16R 1
     movq            xm0, [r0]