changeset 279:7e51f92d13e8

Merged in deepthidevaki/xhevc_deepthid (pull request #32) Added vectorized satd_8x8 to pixel.inc
author nandaku2 <deepthi@multicorewareinc.com>
date Thu, 04 Apr 2013 14:54:51 +0530
parents 4abb704ac3ba (current diff) 349f2249ec7d (diff)
children 4449194c1d02
files
diffstat 1 files changed, 146 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/encoder/vec/pixel.inc	Thu Apr 04 14:51:40 2013 +0530
+++ b/source/encoder/vec/pixel.inc	Thu Apr 04 14:54:51 2013 +0530
@@ -112,8 +112,154 @@ int CDECL satd_4x4(pixel * piCur, intptr
     return satd;
 }
 
+int CDECL satd_8x8(pixel * piCur, intptr_t iStrideCur, pixel * piOrg, intptr_t iStrideOrg) // 8x8 SATD: sum of absolute 2-D Hadamard-transformed differences
+{
+    int  j, satd = 0;
+
+    ALIGN_VAR_16(short, m2[8][8]); // 16-byte-aligned scratch holding the horizontally transformed rows
+
+    Vec8s diff_v1, diff_v2, piOrg_v1, piOrg_v2, piCur_v1, piCur_v2;
+    Vec8s v1, v2, t1, t2;
+	
+    for (j = 0; j < 8; j += 2) // horizontal pass: two residual rows per iteration
+    {
+        piOrg_v1.load(piOrg); // NOTE(review): unaligned loads — assumes pixel rows need not be 16-byte aligned
+        piCur_v1.load(piCur);        
+        piCur += iStrideCur;
+        piOrg += iStrideOrg;
+
+        piOrg_v2.load(piOrg);
+        piCur_v2.load(piCur);
+        piCur += iStrideCur;
+        piOrg += iStrideOrg;
+
+        diff_v1 = piOrg_v1 - piCur_v1; // residual for row j
+        diff_v2 = piOrg_v2 - piCur_v2; // residual for row j+1
+
+        v1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(diff_v1, diff_v2); // interleave the two rows so butterfly partners sit in matching lanes
+        v2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(diff_v1, diff_v2);
+
+        t1 = v1 + v2; // butterfly stage 1 of the 8-point Hadamard (comment "m2" kept from stage naming)
+        t2 = v1 - v2;
+
+        v1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(t1, t2); // reshuffle for the next butterfly distance
+        v2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(t1, t2);
+
+        t1 = v1 + v2; // butterfly stage 2
+        t2 = v1 - v2;
+
+        v1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(t1, t2);
+        v2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(t1, t2);
+
+        t1 = v1 + v2; // butterfly stage 3 — rows now fully transformed
+        t2 = v1 - v2;
+
+        v1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(t1, t2);		// de-interleave: m2[j][0...7]
+        v2 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(t1, t2);	// de-interleave: m2[j+1][0..7]
+
+        v1.store_a(m2[j]); // aligned stores into the scratch array
+        v2.store_a(m2[j + 1]);
+
+    }
+
+    //vertical pass: 8-point Hadamard down each column, folding in abs() at the last stage
+    {
+        Vec8s v0, v1, v2, v3, v4, v5, v6, v7, t1, t2;
+
+        v0.load_a(m2[0]); // stage 1: butterflies between row i and row i+4
+        v4.load_a(m2[4]);
+        t1 = v0 + v4;
+        t2 = v0 - v4;
+        v0 = t1;
+        v4 = t2;
+
+        v1.load_a(m2[1]);
+        v5.load_a(m2[5]);
+        t1 = v1 + v5;
+        t2 = v1 - v5;
+        v1 = t1;
+        v5 = t2;
+
+        v2.load_a(m2[2]);
+        v6.load_a(m2[6]);
+        t1 = v2 + v6;
+        t2 = v2 - v6;
+        v2 = t1;
+        v6 = t2;
+
+        v3.load_a(m2[3]);
+        v7.load_a(m2[7]);
+        t1 = v3 + v7;
+        t2 = v3 - v7;
+        v3 = t1;
+        v7 = t2;
+
+        //Stages 2-3 for the upper half (m2[0][] - m2[3][]), accumulating |coeff| into v0
+
+        t1 = v0 + v2; // stage 2: distance-2 butterflies
+        t2 = v0 - v2;
+        v0 = t1;
+        v2 = t2;
+
+        t1 = v1 + v3;
+        t2 = v1 - v3;
+        v1 = t1;
+        v3 = t2;
+
+        t1 = v0 + v1; // stage 3: distance-1 butterflies, then take magnitudes
+        t2 = v0 - v1;
+        v0 = abs(t1);
+        v1 = abs(t2);
+        v0 = v0 + v1;
+
+        t1 = v2 + v3;
+        t2 = v2 - v3;
+        v2 = abs(t1);
+        v3 = abs(t2);
+        v2 = v2 + v3;
+
+        v0 = v0 + v2; // partial per-lane SATD for the upper half
+
+        //Same stages 2-3 for the lower half (m2[4][] - m2[7][])
+
+        t1 = v4 + v6;
+        t2 = v4 - v6;
+        v4 = t1;
+        v6 = t2;
+
+
+        t1 = v5 + v7;
+        t2 = v5 - v7;
+        v5 = t1;
+        v7 = t2;
+
+        t1 = v4 + v5;
+        t2 = v4 - v5;
+        v4 = abs(t1);
+        v5 = abs(t2);
+        v4 = v4 + v5;
+
+        t1 = v6 + v7;
+        t2 = v6 - v7;
+        v6 = abs(t1);
+        v7 = abs(t2);
+        v6 = v6 + v7;
+
+        v4 = v4 + v6;
+        v0 = v0 + v4; // combine halves: per-lane totals for all 64 coefficients
+
+        satd = horizontal_add_x(v0); // sum the 8 lanes into a scalar (widening add — avoids 16-bit overflow)
+
+    }
+
+    satd = ((satd + 2) >> 2); // normalize: divide by 4 with rounding, matching the scalar satd convention
+
+    return satd;
+}
+
 void Setup_Vec_PixelPrimitives(EncoderPrimitives &p)
 {
     p.sad[PARTITION_8x8] = sad_8x8;
     p.satd[PARTITION_4x4] = satd_4x4;
+    p.satd[PARTITION_8x8] = satd_8x8; // register the new vectorized 8x8 SATD added above
 }