changeset 9560:35fc79c5f769

asm: avx2 code for pixel_add_ps[64x64] - 152.57x

add_ps[64x64]  152.57x   9348.06   1426265.25
author Sumalatha Polureddy<sumalatha@multicorewareinc.com>
date Fri, 20 Feb 2015 16:48:00 +0530
parents c1221e72da80
children f317939a5b33
files source/common/x86/asm-primitives.cpp source/common/x86/pixel.h source/common/x86/pixeladd8.asm
diffstat 3 files changed, 56 insertions(+), 0 deletions(-)
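For context: add_ps reconstructs a pixel block by adding an int16_t residual (b1) to an 8-bit prediction (b0) and clipping the sum back to pixel range; the line in the commit message is presumably the test-bench report (speedup, avx2 cycles per call, C cycles per call). Below is a minimal C sketch of the scalar behaviour the new AVX2 routine implements, assuming an 8-bit build (pixel = uint8_t); the function name is illustrative, but the signature matches the prototype added to pixel.h in this changeset:

    #include <stdint.h>

    typedef uint8_t pixel;

    /* Scalar sketch of add_ps for a 64x64 block: per pixel,
       dst = clip(pred + residual, 0, 255); the AVX2 routine gets the
       clip for free from packuswb's unsigned saturation. */
    static void pixel_add_ps_64x64_ref(pixel* a, intptr_t dstride,
                                       const pixel* b0, const int16_t* b1,
                                       intptr_t sstride0, intptr_t sstride1)
    {
        for (int y = 0; y < 64; y++)
        {
            for (int x = 0; x < 64; x++)
            {
                int sum = b0[x] + b1[x];
                a[x] = (pixel)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
            }
            b0 += sstride0;
            b1 += sstride1;
            a  += dstride;
        }
    }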
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 20 17:55:46 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 20 16:48:00 2015 +0530
@@ -1501,6 +1501,7 @@ void setupAssemblyPrimitives(EncoderPrim
     {
         p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
         p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
+        p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2;
 
         p.pu[LUMA_16x4].pixelavg_pp = x265_pixel_avg_16x4_avx2;
         p.pu[LUMA_16x8].pixelavg_pp = x265_pixel_avg_16x8_avx2;
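Once this table entry is set, 64x64 CU reconstruction picks up the AVX2 routine on capable CPUs with no change to call sites. A hypothetical call (buffer and stride names are illustrative, not from this changeset):

    // recon = clip(pred + residual) over a 64x64 block
    p.cu[BLOCK_64x64].add_ps(recon, reconStride, pred, residual,
                             predStride, residStride);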
--- a/source/common/x86/pixel.h	Fri Feb 20 17:55:46 2015 +0530
+++ b/source/common/x86/pixel.h	Fri Feb 20 16:48:00 2015 +0530
@@ -253,6 +253,7 @@ void x265_pixel_avg_64x16_avx2(pixel* ds
 
 void x265_pixel_add_ps_16x16_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_add_ps_32x32_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_64x64_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
 
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
--- a/source/common/x86/pixeladd8.asm	Fri Feb 20 17:55:46 2015 +0530
+++ b/source/common/x86/pixeladd8.asm	Fri Feb 20 16:48:00 2015 +0530
@@ -841,6 +841,60 @@ cglobal pixel_add_ps_64x%2, 6, 7, 8, des
 
     jnz         .loop
     RET
+
+INIT_YMM avx2
+cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, src1, srcStride0, srcStride1
+    mov         r6d,        %2/2                ; two rows per iteration
+    add         r5,         r5                  ; srcStride1 counts int16 elements; convert to bytes
+.loop:
+    pmovzxbw    m0,         [r2]                ; first 16 of row 0 of src0
+    pmovzxbw    m1,         [r2 + 16]           ; second 16 of row 0 of src0
+    pmovzxbw    m2,         [r2 + 32]           ; third 16 of row 0 of src0
+    pmovzxbw    m3,         [r2 + 48]           ; fourth 16 of row 0 of src0
+    movu        m4,         [r3]                ; first 16 of row 0 of src1
+    movu        m5,         [r3 + 32]           ; second 16 of row 0 of src1
+    movu        m6,         [r3 + 64]           ; third 16 of row 0 of src1
+    movu        m7,         [r3 + 96]           ; fourth 16 of row 0 of src1
+
+    paddw       m0,         m4
+    paddw       m1,         m5
+    paddw       m2,         m6
+    paddw       m3,         m7
+    packuswb    m0,         m1
+    packuswb    m2,         m3
+    vpermq      m0,         m0,         11011000b   ; packuswb packs per 128-bit lane; restore qword order
+    movu        [r0],       m0                      ; first 32 of row 0 of dst
+    vpermq      m2,         m2,         11011000b
+    movu        [r0 + 32],  m2                      ; second 32 of row 0 of dst
+
+    pmovzxbw    m0,         [r2 + r4]           ; first 16 of row 1 of src0
+    pmovzxbw    m1,         [r2 + r4 + 16]      ; second 16 of row 1 of src0
+    pmovzxbw    m2,         [r2 + r4 + 32]      ; third 16 of row 1 of src0
+    pmovzxbw    m3,         [r2 + r4 + 48]      ; fourth 16 of row 1 of src0
+    movu        m4,         [r3 + r5]           ; first 16 of row 1 of src1
+    movu        m5,         [r3 + r5 + 32]      ; second 16 of row 1 of src1
+    movu        m6,         [r3 + r5 + 64]      ; third 16 of row 1 of src1
+    movu        m7,         [r3 + r5 + 96]      ; fourth 16 of row 1 of src1
+
+    paddw       m0,         m4
+    paddw       m1,         m5
+    paddw       m2,         m6
+    paddw       m3,         m7
+    packuswb    m0,         m1
+    packuswb    m2,         m3
+    vpermq      m0,         m0,         11011000b
+    movu        [r0 + r1],  m0                      ; first 32 of row 1 of dst
+    vpermq      m2,         m2,         11011000b
+    movu        [r0 + r1 + 32], m2                  ; second 32 of row 1 of dst
+
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+
+    dec         r6d
+    jnz         .loop
+    RET
+
 %endif
 %endmacro
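A note on the vpermq fix-up in the loop above: AVX2's packuswb packs each 128-bit lane independently, so packing two widened registers leaves the four output qwords holding the pixel groups in 0, 2, 1, 3 order; vpermq with 11011000b (0xD8) restores sequential order. A sketch of one 32-pixel chunk of the loop body using intrinsics (the helper name is illustrative, not part of this changeset):

    #include <immintrin.h>
    #include <stdint.h>

    /* One 32-pixel chunk: widen 32 bytes of prediction to words (pmovzxbw),
       add the int16 residual (paddw), pack back to bytes with unsigned
       saturation (packuswb), then repair the lane order (vpermq). */
    static inline void add_ps_chunk32(uint8_t* dst, const uint8_t* src0,
                                      const int16_t* src1)
    {
        __m256i lo = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src0));
        __m256i hi = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(src0 + 16)));
        lo = _mm256_add_epi16(lo, _mm256_loadu_si256((const __m256i*)src1));
        hi = _mm256_add_epi16(hi, _mm256_loadu_si256((const __m256i*)(src1 + 16)));
        __m256i packed = _mm256_packus_epi16(lo, hi);    /* qwords: 0,2,1,3 */
        packed = _mm256_permute4x64_epi64(packed, 0xD8); /* back to 0,1,2,3 */
        _mm256_storeu_si256((__m256i*)dst, packed);
    }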