changeset 9578:4257bfeb9b87

asm-ssse3: filter_p2s 8bpp code for [4xN],[8xN],[16xN],[32xN]
author Rajesh Paulraj<rajesh@multicorewareinc.com>
date Wed, 25 Feb 2015 20:17:05 +0530
parents 69f8b94e4eb7
children 2cac9b74c41c
files source/common/x86/asm-primitives.cpp source/common/x86/ipfilter8.asm source/common/x86/ipfilter8.h
diffstat 3 files changed, 308 insertions(+-), 18 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 25 17:39:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 25 20:17:05 2015 +0530
@@ -1349,11 +1349,31 @@ void setupAssemblyPrimitives(EncoderPrim
         ASSIGN_SSE_PP(ssse3);
         p.cu[BLOCK_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3;
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = x265_pixel_ssd_4x8_ssse3;
+        p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_ssse3;
+        p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_ssse3;
+        p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_ssse3;
+        p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_ssse3;
+        p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_ssse3;
+        p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_ssse3;
+        p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_ssse3;
+        p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_ssse3;
+        p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_ssse3;
+        p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_ssse3;
+        p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_ssse3;
+        p.pu[LUMA_16x32].filter_p2s = x265_pixelToShort_16x32_ssse3;
+        p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_ssse3;
+        p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_ssse3;
+        p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_ssse3;
+        p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_ssse3;
+        p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_ssse3;
+        p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_ssse3;
+        p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_ssse3;
+        p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3;
+        p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3;
+        p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3;
 
-        p.luma_p2s = x265_luma_p2s_ssse3;
         p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
         p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
-
         p.dst4x4 = x265_dst4_ssse3;
         p.cu[BLOCK_8x8].idct = x265_idct8_ssse3;
         p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3;
--- a/source/common/x86/ipfilter8.asm	Wed Feb 25 17:39:51 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Feb 25 20:17:05 2015 +0530
@@ -5416,28 +5416,24 @@ FILTER_V4_W16n_H2 64, 32
 FILTER_V4_W16n_H2 64, 48
 FILTER_V4_W16n_H2 48, 64
 FILTER_V4_W16n_H2 64, 16
-
-
 ;-----------------------------------------------------------------------------
-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+; void pixelToShort_4xN(const pixel *src, intptr_t srcStride, int16_t *dst)
 ;-----------------------------------------------------------------------------
+%macro PIXEL_WH_4xN 2
 INIT_XMM ssse3
-cglobal luma_p2s, 3, 7, 6
+cglobal pixelToShort_%1x%2, 3, 7, 6
 
     ; load width and height
-    mov         r3d, r3m
-    mov         r4d, r4m
-
+    mov         r3d, %1
+    mov         r4d, %2
     ; load constant
     mova        m4, [pb_128]
     mova        m5, [tab_c_64_n64]
-
 .loopH:
-
     xor         r5d, r5d
+
 .loopW:
-    lea         r6, [r0 + r5]
-
+    mov         r6, r0
     movh        m0, [r6]
     punpcklbw   m0, m4
     pmaddubsw   m0, m5
@@ -5477,8 +5473,263 @@ cglobal luma_p2s, 3, 7, 6
 
     sub         r4d, 4
     jnz         .loopH
-
-    RET
+    RET
+%endmacro
+PIXEL_WH_4xN 4, 4
+PIXEL_WH_4xN 4, 8
+PIXEL_WH_4xN 4, 16
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_8xN(const pixel *src, intptr_t srcStride, int16_t *dst)
+; r0 = src, r1 = srcStride, r2 = dst (dst rows are FENC_STRIDE int16 entries apart)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_8xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; block size is fixed at assembly time: r3 = width (%1), r4 = rows left (%2)
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; NOTE(review): names suggest m4 = 128 bytes, m5 = (64, -64) pairs, i.e. pixel * 64 - 8192; confirm
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:                                 ; every pass converts four source rows
+    xor         r5d, r5d                ; r5 = column offset within the row
+.loopW:
+    lea         r6, [r0 + r5]
+    ; per row: load 8 pixels, interleave with m4 bytes, pmaddubsw by m5 -> 8 words
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]       ; r6 -> row 2, so [r6 + r1] is row 3
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+    ; the stores ignore r5, so this macro is only valid for %1 == 8 (single pass)
+    movu        [r2 + FENC_STRIDE * 0], m0
+    movu        [r2 + FENC_STRIDE * 2], m1
+    movu        [r2 + FENC_STRIDE * 4], m2
+    movu        [r2 + FENC_STRIDE * 6], m3
+
+    jne         .loopW                  ; movu leaves the flags set by cmp intact
+
+    ; step src down four rows and dst down four FENC_STRIDE-wide int16 rows
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+    RET
+%endmacro
+PIXEL_WH_8xN 8, 8
+PIXEL_WH_8xN 8, 4
+PIXEL_WH_8xN 8, 16
+PIXEL_WH_8xN 8, 32
+
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_16xN(const pixel *src, intptr_t srcStride, int16_t *dst)
+; r0 = src, r1 = srcStride, r2 = dst (dst rows are FENC_STRIDE int16 entries apart)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_16xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; width and height are macro constants, not arguments: r3 = %1, r4 = %2
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; NOTE(review): names suggest m4 = 128 bytes, m5 = (64, -64) pairs, i.e. pixel * 64 - 8192; confirm
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:                                 ; outer loop: four rows per iteration
+    xor         r5d, r5d                ; r5 = pixel column, restarts on each row group
+.loopW:                                 ; inner loop: 8 columns x 4 rows per iteration
+    lea         r6, [r0 + r5]
+    ; per row: load 8 pixels, interleave with m4 bytes, pmaddubsw by m5 -> 8 words
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]       ; r6 -> row 2, so [r6 + r1] is row 3
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+    ; r5 is already past this group: r5 * 2 - 16 is the byte offset of its 8 words
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    jne         .loopW                  ; movu leaves the flags set by cmp intact
+
+    ; row group finished: move src down 4 rows, dst down 4 int16 rows
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+
+    RET
+%endmacro
+PIXEL_WH_16xN 16, 16
+PIXEL_WH_16xN 16, 8
+PIXEL_WH_16xN 16, 4
+PIXEL_WH_16xN 16, 12
+PIXEL_WH_16xN 16, 32
+PIXEL_WH_16xN 16, 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_32xN(const pixel *src, intptr_t srcStride, int16_t *dst)
+; r0 = src, r1 = srcStride, r2 = dst (dst rows are FENC_STRIDE int16 entries apart)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_32xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; the block size comes from the macro arguments: r3 = width, r4 = rows to go
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; NOTE(review): names suggest m4 = 128 bytes, m5 = (64, -64) pairs, i.e. pixel * 64 - 8192; confirm
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:                                 ; handles rows four at a time
+    xor         r5d, r5d                ; r5 = horizontal position in pixels
+.loopW:                                 ; handles an 8-pixel-wide strip of those rows
+    lea         r6, [r0 + r5]
+    ; per row: load 8 pixels, interleave with m4 bytes, pmaddubsw by m5 -> 8 words
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]       ; r6 -> row 2, so [r6 + r1] is row 3
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+    ; r5 was bumped before the stores, hence the -16 bytes (8 int16) correction
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    jne         .loopW                  ; movu leaves the flags set by cmp intact
+
+    ; all columns done: advance src by 4 rows and dst by 4 int16 rows
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+
+    RET
+%endmacro
+PIXEL_WH_32xN 32, 32
+PIXEL_WH_32xN 32, 8
+PIXEL_WH_32xN 32, 16
+PIXEL_WH_32xN 32, 24
+PIXEL_WH_32xN 32, 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_64xN(const pixel *src, intptr_t srcStride, int16_t *dst)
+; r0 = src, r1 = srcStride, r2 = dst (dst rows are FENC_STRIDE int16 entries apart)
+;-----------------------------------------------------------------------------
+%macro PIXEL_WH_64xN 2
+INIT_XMM ssse3
+cglobal pixelToShort_%1x%2, 3, 7, 6
+
+    ; dimensions are baked in per instantiation: r3 = width (%1), r4 = height (%2)
+    mov         r3d, %1
+    mov         r4d, %2
+
+    ; NOTE(review): names suggest m4 = 128 bytes, m5 = (64, -64) pairs, i.e. pixel * 64 - 8192; confirm
+    mova        m4, [pb_128]
+    mova        m5, [tab_c_64_n64]
+
+.loopH:                                 ; r4 counts down by 4 rows per pass
+    xor         r5d, r5d                ; r5 = column index, counts up by 8 to r3
+.loopW:
+    lea         r6, [r0 + r5]           ; r6 = first of the four rows at column r5
+    ; per row: load 8 pixels, interleave with m4 bytes, pmaddubsw by m5 -> 8 words
+    movh        m0, [r6]
+    punpcklbw   m0, m4
+    pmaddubsw   m0, m5
+
+    movh        m1, [r6 + r1]
+    punpcklbw   m1, m4
+    pmaddubsw   m1, m5
+
+    movh        m2, [r6 + r1 * 2]
+    punpcklbw   m2, m4
+    pmaddubsw   m2, m5
+
+    lea         r6, [r6 + r1 * 2]       ; r6 -> row 2, so [r6 + r1] is row 3
+    movh        m3, [r6 + r1]
+    punpcklbw   m3, m4
+    pmaddubsw   m3, m5
+
+    add         r5, 8
+    cmp         r5, r3
+    ; dst offset = (r5 - 8) int16 entries: r5 has already been advanced above
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+    jne         .loopW                  ; movu leaves the flags set by cmp intact
+
+    ; end of the row group: src += 4 rows, dst += 4 rows of FENC_STRIDE int16
+.nextH:
+    lea         r0, [r0 + r1 * 4]
+    add         r2, FENC_STRIDE * 8
+
+    sub         r4d, 4
+    jnz         .loopH
+
+    RET
+%endmacro
+PIXEL_WH_64xN 64, 64
+PIXEL_WH_64xN 64, 16
+PIXEL_WH_64xN 64, 32
+PIXEL_WH_64xN 64, 48
 
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
--- a/source/common/x86/ipfilter8.h	Wed Feb 25 17:39:51 2015 +0530
+++ b/source/common/x86/ipfilter8.h	Wed Feb 25 20:17:05 2015 +0530
@@ -619,10 +619,29 @@ LUMA_SS_FILTERS(_sse2);
 LUMA_FILTERS(_avx2);
 LUMA_SP_FILTERS(_avx2);
 LUMA_SS_FILTERS(_avx2);
-
 void x265_interp_8tap_hv_pp_8x8_sse4(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
-void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
-
+void x265_pixelToShort_4x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x12_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x24_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
 #undef LUMA_SS_FILTERS