x265 changeset 9558:d77824ea76c9 (draft)
asm: avx2 code for pixel_add_ps[32x32] - 80x
    add_ps[32x32]    79.84x    1659.94    132523.33
author      Sumalatha Polureddy <sumalatha@multicorewareinc.com>
date        Fri, 20 Feb 2015 14:20:27 +0530
parents     f75011092766
children    c1221e72da80
files       source/common/x86/asm-primitives.cpp
            source/common/x86/pixel.h
            source/common/x86/pixeladd8.asm
diffstat    3 files changed, 63 insertions(+), 0 deletions(-)
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 20 13:39:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 20 14:20:27 2015 +0530
@@ -1500,6 +1500,7 @@ void setupAssemblyPrimitives(EncoderPrim
     if (cpuMask & X265_CPU_AVX2)
     {
         p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
+        p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
 
         p.pu[LUMA_16x4].pixelavg_pp = x265_pixel_avg_16x4_avx2;
         p.pu[LUMA_16x8].pixelavg_pp = x265_pixel_avg_16x8_avx2;
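The hunk above registers the new kernel in the AVX2 branch of setupAssemblyPrimitives(), so callers reach it through the primitives function table instead of by name. The following is a minimal standalone sketch of that dispatch pattern; the struct, enum, and stub functions are simplified stand-ins invented here, not x265's actual EncoderPrimitives definitions.

// Illustrative sketch only: a simplified stand-in for x265's primitive table.
// Only the function-pointer signature matches the prototype added in pixel.h.
#include <cstdint>

typedef uint8_t pixel;

typedef void (*pixel_add_ps_t)(pixel* a, intptr_t dstride, const pixel* b0,
                               const int16_t* b1, intptr_t sstride0, intptr_t sstride1);

struct CUPrimitives { pixel_add_ps_t add_ps; };
enum { BLOCK_4x4, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32, BLOCK_64x64, NUM_CU_SIZES };

// stand-ins for the portable C fallback and the assembly kernel
void add_ps_32x32_c(pixel*, intptr_t, const pixel*, const int16_t*, intptr_t, intptr_t) {}
void add_ps_32x32_avx2(pixel*, intptr_t, const pixel*, const int16_t*, intptr_t, intptr_t) {}

void setupPrimitives(CUPrimitives (&cu)[NUM_CU_SIZES], bool haveAVX2)
{
    cu[BLOCK_32x32].add_ps = add_ps_32x32_c;        // portable default
    if (haveAVX2)
        cu[BLOCK_32x32].add_ps = add_ps_32x32_avx2; // override when the CPU reports AVX2
}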
--- a/source/common/x86/pixel.h	Fri Feb 20 13:39:51 2015 +0530
+++ b/source/common/x86/pixel.h	Fri Feb 20 14:20:27 2015 +0530
@@ -252,6 +252,7 @@ void x265_pixel_avg_64x32_avx2(pixel* ds
 void x265_pixel_avg_64x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
 void x265_pixel_add_ps_16x16_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+void x265_pixel_add_ps_32x32_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
 
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD
--- a/source/common/x86/pixeladd8.asm	Fri Feb 20 13:39:51 2015 +0530
+++ b/source/common/x86/pixeladd8.asm	Fri Feb 20 14:20:27 2015 +0530
@@ -569,6 +569,67 @@ cglobal pixel_add_ps_32x%2, 6, 7, 8, des
     jnz         .loop
     RET
 
+
+INIT_YMM avx2
+cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+    mov         r6d,        %2/4
+    add         r5,         r5
+.loop:
+    pmovzxbw    m0,         [r2]                ; first half of row 0 of src0
+    pmovzxbw    m1,         [r2 + 16]           ; second half of row 0 of src0
+    movu        m2,         [r3]                ; first half of row 0 of src1
+    movu        m3,         [r3 + 32]           ; second half of row 0 of src1
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m0, 11011000b
+    movu        [r0],       m0                  ; row 0 of dst
+
+    pmovzxbw    m0,         [r2 + r4]           ; first half of row 1 of src0
+    pmovzxbw    m1,         [r2 + r4 + 16]      ; second half of row 1 of src0
+    movu        m2,         [r3 + r5]           ; first half of row 1 of src1
+    movu        m3,         [r3 + r5 + 32]      ; second half of row 1 of src1
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m0, 11011000b
+    movu        [r0 + r1],  m0                  ; row 1 of dst
+
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+
+    pmovzxbw    m0,         [r2]                ; first half of row 2 of src0
+    pmovzxbw    m1,         [r2 + 16]           ; second half of row 2 of src0
+    movu        m2,         [r3]                ; first half of row 2 of src1
+    movu        m3,         [r3 + 32]           ; second half of row 2 of src1
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m0, 11011000b
+    movu        [r0],       m0                  ; row 2 of dst
+
+    pmovzxbw    m0,         [r2 + r4]           ; first half of row 3 of src0
+    pmovzxbw    m1,         [r2 + r4 + 16]      ; second half of row 3 of src0
+    movu        m2,         [r3 + r5]           ; first half of row 3 of src1
+    movu        m3,         [r3 + r5 + 32]      ; second half of row 3 of src1
+
+    paddw       m0,         m2
+    paddw       m1,         m3
+    packuswb    m0,         m1
+    vpermq      m0,         m0, 11011000b
+    movu        [r0 + r1],  m0                  ; row 3 of dst
+
+    lea         r2,         [r2 + r4 * 2]
+    lea         r3,         [r3 + r5 * 2]
+    lea         r0,         [r0 + r1 * 2]
+
+    dec         r6d
+    jnz         .loop
+    RET
 %endif
 %endmacro
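For reference (not part of the patch), the loop above handles four rows per iteration: pmovzxbw zero-extends 16 prediction bytes to words, movu loads the matching 16-bit residuals, paddw adds them, packuswb saturates back to unsigned bytes, and vpermq with 11011000b undoes packuswb's per-128-bit-lane interleaving before the 32-byte store. A scalar C++ sketch of the same add_ps semantics, with an invented function name and a small self-test, would look like this:

// Minimal scalar sketch of what the AVX2 kernel computes:
// dst = clip_to_uint8(src0 + src1), src0 = 8-bit pixels, src1 = 16-bit residuals.
#include <cstdint>
#include <cstdio>

typedef uint8_t pixel;   // 8-bit build, as implied by pmovzxbw/packuswb

static void add_ps_c(pixel* dst, intptr_t dstStride,
                     const pixel* src0, const int16_t* src1,
                     intptr_t stride0, intptr_t stride1,
                     int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int v = src0[x] + src1[x];
            // packuswb saturates to the unsigned 8-bit range [0, 255]
            dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
        dst  += dstStride;
        src0 += stride0;
        src1 += stride1;
    }
}

int main()
{
    pixel   src0[32 * 32];
    int16_t src1[32 * 32];
    pixel   dst[32 * 32];

    for (int i = 0; i < 32 * 32; i++)
    {
        src0[i] = (pixel)(i & 0xff);
        src1[i] = (int16_t)((i % 64) - 32);   // small signed residuals
    }
    add_ps_c(dst, 32, src0, src1, 32, 32, 32, 32);
    printf("dst[0]=%d dst[33]=%d\n", dst[0], dst[33]);
    return 0;
}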