changeset 9542:f2c55dc1e7c0 draft

asm-avx2: luma_vsp[4x4, 8x8], luma_vss[4x4, 8x8] for 16bpp. luma_vsp[4x4, 8x8]: 557c->255c, 1380c->839c; luma_vss[4x4, 8x8]: 506c->240c, 1188c->756c
author Divya Manivannan <divya@multicorewareinc.com>
date Wed, 18 Feb 2015 13:36:17 +0530
parents bcb623bffca4
children 0f04b4e829d1
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter16.asm
diffstat 2 files changed, 63 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 11:32:52 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 18 13:36:17 2015 +0530
@@ -1143,6 +1143,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x32].luma_vps = x265_interp_8tap_vert_ps_64x32_avx2;
         p.pu[LUMA_64x48].luma_vps = x265_interp_8tap_vert_ps_64x48_avx2;
         p.pu[LUMA_64x64].luma_vps = x265_interp_8tap_vert_ps_64x64_avx2;
+
+        p.pu[LUMA_8x8].luma_vsp = x265_interp_8tap_vert_sp_8x8_avx2;
+
+        p.pu[LUMA_8x8].luma_vss = x265_interp_8tap_vert_ss_8x8_avx2;
 #else
         p.cu[BLOCK_4x4].dct = x265_dct4_avx2;
 #endif
@@ -1168,6 +1172,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_4x16].luma_vps = x265_interp_8tap_vert_ps_4x16_avx2;
         p.pu[LUMA_8x4].luma_vps = x265_interp_8tap_vert_ps_8x4_avx2;
         p.pu[LUMA_16x4].luma_vps = x265_interp_8tap_vert_ps_16x4_avx2;
+
+        p.pu[LUMA_4x4].luma_vsp = x265_interp_8tap_vert_sp_4x4_avx2;
+
+        p.pu[LUMA_4x4].luma_vss = x265_interp_8tap_vert_ss_4x4_avx2;
     }
 }
 #else // if HIGH_BIT_DEPTH
--- a/source/common/x86/ipfilter16.asm	Wed Feb 18 11:32:52 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Feb 18 13:36:17 2015 +0530
@@ -31,6 +31,7 @@ tab_c_32:         times 4 dd 32
 tab_c_n32768:     times 4 dd -32768
 tab_c_524800:     times 4 dd 524800
 tab_c_n8192:      times 8 dw -8192
+pd_524800:        times 8 dd 524800
 
 tab_Tm16:         db 0, 1, 2, 3, 4,  5,  6, 7, 2, 3, 4,  5, 6, 7, 8, 9
 
@@ -2580,6 +2581,7 @@ cglobal interp_8tap_vert_pp_%1x%2, 5, 7,
     FILTER_VER_LUMA_PP 48, 64
     FILTER_VER_LUMA_PP 64, 16
     FILTER_VER_LUMA_PP 16, 64
+
 %macro FILTER_VER_LUMA_AVX2_4x4 1
 INIT_YMM avx2
 cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
@@ -2600,6 +2602,8 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
 
 %ifidn %1,pp
     vbroadcasti128  m6, [pd_32]
+%elifidn %1, sp
+    mova            m6, [pd_524800]
 %else
     vbroadcasti128  m6, [pd_n32768]
 %endif
@@ -2647,18 +2651,29 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
     pmaddwd         m4, [r5 + 3 * mmsize]
     paddd           m2, m4
 
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m2, 6
+%else
     paddd           m0, m6
     paddd           m2, m6
 %ifidn %1,pp
     psrad           m0, 6
     psrad           m2, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m2, 10
 %else
     psrad           m0, 2
     psrad           m2, 2
 %endif
+%endif
+
     packssdw        m0, m2
+    pxor            m1, m1
 %ifidn %1,pp
-    pxor            m1, m1
+    CLIPW           m0, m1, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m0, m1, [pw_pixel_max]
 %endif
 
@@ -2673,6 +2688,8 @@ cglobal interp_8tap_vert_%1_4x4, 4, 6, 7
 
 FILTER_VER_LUMA_AVX2_4x4 pp
 FILTER_VER_LUMA_AVX2_4x4 ps
+FILTER_VER_LUMA_AVX2_4x4 sp
+FILTER_VER_LUMA_AVX2_4x4 ss
 
 %macro FILTER_VER_LUMA_AVX2_8x8 1
 INIT_YMM avx2
@@ -2695,6 +2712,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
 
 %ifidn %1,pp
     vbroadcasti128  m11, [pd_32]
+%elifidn %1, sp
+    mova            m11, [pd_524800]
 %else
     vbroadcasti128  m11, [pd_n32768]
 %endif
@@ -2796,6 +2815,12 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
     paddd           m6, m10
 
     lea             r4, [r3 * 3]
+%ifidn %1,ss
+    psrad           m0, 6
+    psrad           m1, 6
+    psrad           m2, 6
+    psrad           m3, 6
+%else
     paddd           m0, m11
     paddd           m1, m11
     paddd           m2, m11
@@ -2805,18 +2830,28 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
     psrad           m1, 6
     psrad           m2, 6
     psrad           m3, 6
+%elifidn %1, sp
+    psrad           m0, 10
+    psrad           m1, 10
+    psrad           m2, 10
+    psrad           m3, 10
 %else
     psrad           m0, 2
     psrad           m1, 2
     psrad           m2, 2
     psrad           m3, 2
 %endif
+%endif
+
     packssdw        m0, m1
     packssdw        m2, m3
     vpermq          m0, m0, 11011000b
     vpermq          m2, m2, 11011000b
+    pxor            m10, m10
 %ifidn %1,pp
-    pxor            m10, m10
+    CLIPW           m0, m10, [pw_pixel_max]
+    CLIPW           m2, m10, [pw_pixel_max]
+%elifidn %1, sp
     CLIPW           m0, m10, [pw_pixel_max]
     CLIPW           m2, m10, [pw_pixel_max]
 %endif
@@ -2850,6 +2885,12 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
     pmaddwd         m3, [r5 + 3 * mmsize]
     paddd           m7, m3
 
+%ifidn %1,ss
+    psrad           m4, 6
+    psrad           m5, 6
+    psrad           m6, 6
+    psrad           m7, 6
+%else
     paddd           m4, m11
     paddd           m5, m11
     paddd           m6, m11
@@ -2859,12 +2900,19 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
     psrad           m5, 6
     psrad           m6, 6
     psrad           m7, 6
+%elifidn %1, sp
+    psrad           m4, 10
+    psrad           m5, 10
+    psrad           m6, 10
+    psrad           m7, 10
 %else
     psrad           m4, 2
     psrad           m5, 2
     psrad           m6, 2
     psrad           m7, 2
 %endif
+%endif
+
     packssdw        m4, m5
     packssdw        m6, m7
     vpermq          m4, m4, 11011000b
@@ -2872,6 +2920,9 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
 %ifidn %1,pp
     CLIPW           m4, m10, [pw_pixel_max]
     CLIPW           m6, m10, [pw_pixel_max]
+%elifidn %1, sp
+    CLIPW           m4, m10, [pw_pixel_max]
+    CLIPW           m6, m10, [pw_pixel_max]
 %endif
     vextracti128    xm5, m4, 1
     vextracti128    xm7, m6, 1
@@ -2886,6 +2937,8 @@ cglobal interp_8tap_vert_%1_8x8, 4, 6, 1
 
 FILTER_VER_LUMA_AVX2_8x8 pp
 FILTER_VER_LUMA_AVX2_8x8 ps
+FILTER_VER_LUMA_AVX2_8x8 sp
+FILTER_VER_LUMA_AVX2_8x8 ss
 
 %macro PROCESS_LUMA_AVX2_W8_16R 1
     movu            xm0, [r0]                       ; m0 = row 0