changeset 9541:bcb623bffca4 draft

asm: avx2 assembly code for 8bpp avg_pixel_pp[16xN]

avg_pp[ 16x4]  30.98x  170.89   5293.98
avg_pp[ 16x8]  40.95x  255.57   10466.62
avg_pp[16x12]  49.12x  314.92   15468.01
avg_pp[16x16]  53.68x  368.70   19790.18
avg_pp[16x32]  65.08x  609.88   39689.32
avg_pp[16x64]  70.57x  1089.58  76886.48
author Sumalatha Polureddy <sumalatha@multicorewareinc.com>
date Wed, 18 Feb 2015 11:32:52 +0530
parents 8a9989b5b1ed
children f2c55dc1e7c0
files source/common/x86/asm-primitives.cpp source/common/x86/mc-a.asm source/common/x86/pixel.h
diffstat 3 files changed, 19 insertions(+), 1 deletions(-)
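For context: pixelavg_pp produces the rounded average of two prediction blocks, and this patch wires up AVX2 versions for the 16xN luma partitions in 8-bit builds. The three numeric columns in the commit message presumably follow the usual x265 test-bench layout: speedup over the C primitive, then the optimized and C cycle counts. Below is a minimal scalar sketch of the operation the new kernels implement; it is a stand-in, not x265's actual C reference in pixel.cpp. It assumes pixel is uint8_t (8bpp) and ignores the trailing int, which the x264-derived asm treats as a bi-prediction weight.

#include <stdint.h>

typedef uint8_t pixel; // 8bpp build; HIGH_BIT_DEPTH uses uint16_t

// Scalar model of pixelavg_pp for an lx-by-ly block: each output
// pixel is the rounded average of the two source predictions,
// dst = (src0 + src1 + 1) >> 1.  The last argument (a weight in the
// asm path) is ignored here.
template<int lx, int ly>
void pixelavg_pp_c(pixel* dst, intptr_t dstride,
                   const pixel* src0, intptr_t sstride0,
                   const pixel* src1, intptr_t sstride1, int /*weight*/)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (pixel)((src0[x] + src1[x] + 1) >> 1);

        src0 += sstride0;
        src1 += sstride1;
        dst  += dstride;
    }
}

A call such as pixelavg_pp_c<16, 8>(dst, stride, ref0, stride, ref1, stride, 32) corresponds to what p.pu[LUMA_16x8].pixelavg_pp now dispatches to x265_pixel_avg_16x8_avx2 when AVX2 is detected.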
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 18 10:31:48 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 18 11:32:52 2015 +0530
@@ -1432,6 +1432,13 @@ void setupAssemblyPrimitives(EncoderPrim
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.pu[LUMA_16x4].pixelavg_pp = x265_pixel_avg_16x4_avx2;
+        p.pu[LUMA_16x8].pixelavg_pp = x265_pixel_avg_16x8_avx2;
+        p.pu[LUMA_16x12].pixelavg_pp = x265_pixel_avg_16x12_avx2;
+        p.pu[LUMA_16x16].pixelavg_pp = x265_pixel_avg_16x16_avx2;
+        p.pu[LUMA_16x32].pixelavg_pp = x265_pixel_avg_16x32_avx2;
+        p.pu[LUMA_16x64].pixelavg_pp = x265_pixel_avg_16x64_avx2;
+
         p.pu[LUMA_32x64].pixelavg_pp = x265_pixel_avg_32x64_avx2;
         p.pu[LUMA_32x32].pixelavg_pp = x265_pixel_avg_32x32_avx2;
         p.pu[LUMA_32x24].pixelavg_pp = x265_pixel_avg_32x24_avx2;
--- a/source/common/x86/mc-a.asm	Wed Feb 18 10:31:48 2015 +0530
+++ b/source/common/x86/mc-a.asm	Wed Feb 18 11:32:52 2015 +0530
@@ -3034,9 +3034,14 @@ AVGH 32, 32
 AVGH 32, 24
 AVGH 32, 16
 AVGH 32, 8
+
 AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 64
+AVGH 16, 32
 AVGH 16, 16
-AVGH 16,  8
+AVGH 16, 12
+AVGH 16, 8
+AVGH 16, 4
 
 %endif ;HIGH_BIT_DEPTH
 
--- a/source/common/x86/pixel.h	Wed Feb 18 10:31:48 2015 +0530
+++ b/source/common/x86/pixel.h	Wed Feb 18 11:32:52 2015 +0530
@@ -235,6 +235,12 @@ int x265_psyCost_ss_8x8_sse4(const int16
 int x265_psyCost_ss_16x16_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
 int x265_psyCost_ss_32x32_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
 int x265_psyCost_ss_64x64_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+void x265_pixel_avg_16x4_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_16x8_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_16x12_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_16x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_16x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_16x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
 void x265_pixel_avg_32x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
 void x265_pixel_avg_32x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
 void x265_pixel_avg_32x24_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);