changeset 131:ffc0604b89a9

Modified vector - xCalcHADs4x4
author deepthidevaki
date Wed, 27 Mar 2013 13:42:43 +0530
parents c4df9c66f793
children 329defb33896
files source/encoder/TComRdCost_SSE.cpp
diffstat 1 files changed, 50 insertions(+), 139 deletions(-) [+]
line wrap: on
line diff
--- a/source/encoder/TComRdCost_SSE.cpp	Wed Mar 27 10:36:25 2013 +0530
+++ b/source/encoder/TComRdCost_SSE.cpp	Wed Mar 27 13:42:43 2013 +0530
@@ -593,8 +593,7 @@ UInt TComRdCost::xCalcHADs8x8( Pel *piOr
 
 UInt TComRdCost::xCalcHADs4x4( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
 {
-	Int k, satd = 0;
-	__declspec(align(16)) Int m[16],d[16];
+	Int satd = 0;
 
 	assert( iStep == 1 );
 
@@ -602,76 +601,41 @@ UInt TComRdCost::xCalcHADs4x4( Pel *piOr
 	Vec4i v1,v2,v3,v4,m0,m4,m8,m12,diff_v,piOrg_v,piCur_v;
 	Int satd1,satd2,satd3,satd4;
 
-		temp1.load(piOrg);
-		temp2.load(piCur);
-		piOrg_v=extend_low(temp1);		
-		piCur_v=extend_low(temp2);
-		v1=piOrg_v-piCur_v;
-
-		piCur += iStrideCur;
-		piOrg += iStrideOrg;
-
-		temp1.load(piOrg);
-		temp2.load(piCur);
-		piOrg_v=extend_low(temp1);		
-		piCur_v=extend_low(temp2);
-		
-		v2=piOrg_v-piCur_v;
-
-		piCur += iStrideCur;
-		piOrg += iStrideOrg;
-
-		temp1.load(piOrg);
-		temp2.load(piCur);
-		piOrg_v=extend_low(temp1);
-		piCur_v=extend_low(temp2);
-		
-		v3=piOrg_v-piCur_v;
-	    
-		piCur += iStrideCur;
-		piOrg += iStrideOrg;
+	temp1.load(piOrg);
+	temp2.load(piCur);
+	piOrg_v=extend_low(temp1);		
+	piCur_v=extend_low(temp2);
+	v1=piOrg_v-piCur_v;
 
-		temp1.load(piOrg);
-		temp2.load(piCur);
-		piOrg_v=extend_low(temp1);		
-		piCur_v=extend_low(temp2);
-		
-		v4=piOrg_v-piCur_v;
+	piCur += iStrideCur;
+	piOrg += iStrideOrg;
 
-	/*===== hadamard transform =====*/
-	/*m[ 0] = diff[ 0] + diff[12];
-	m[ 1] = diff[ 1] + diff[13];
-	m[ 2] = diff[ 2] + diff[14];
-	m[ 3] = diff[ 3] + diff[15];
-	m[ 4] = diff[ 4] + diff[ 8];
-	m[ 5] = diff[ 5] + diff[ 9];
-	m[ 6] = diff[ 6] + diff[10];
-	m[ 7] = diff[ 7] + diff[11];
-	m[ 8] = diff[ 4] - diff[ 8];
-	m[ 9] = diff[ 5] - diff[ 9];
-	m[10] = diff[ 6] - diff[10];
-	m[11] = diff[ 7] - diff[11];
-	m[12] = diff[ 0] - diff[12];
-	m[13] = diff[ 1] - diff[13];
-	m[14] = diff[ 2] - diff[14];
-	m[15] = diff[ 3] - diff[15];
+	temp1.load(piOrg);
+	temp2.load(piCur);
+	piOrg_v=extend_low(temp1);		
+	piCur_v=extend_low(temp2);
 
-	d[ 0] = m[ 0] + m[ 4];
-	d[ 1] = m[ 1] + m[ 5];
-	d[ 2] = m[ 2] + m[ 6];
-	d[ 3] = m[ 3] + m[ 7];
-	d[ 4] = m[ 8] + m[12];
-	d[ 5] = m[ 9] + m[13];
-	d[ 6] = m[10] + m[14];
-	d[ 7] = m[11] + m[15];
-	d[ 8] = m[ 0] - m[ 4];
-	d[ 9] = m[ 1] - m[ 5];
-	d[10] = m[ 2] - m[ 6];
-	d[11] = m[ 3] - m[ 7];
-	d[12] = m[12] - m[ 8];
-	d[13] = m[13] - m[ 9];
-	d[14] = m[14] - m[10];
-	d[15] = m[15] - m[11];*/
+	v2=piOrg_v-piCur_v;
+
+	piCur += iStrideCur;
+	piOrg += iStrideOrg;
+
+	temp1.load(piOrg);
+	temp2.load(piCur);
+	piOrg_v=extend_low(temp1);
+	piCur_v=extend_low(temp2);
+
+	v3=piOrg_v-piCur_v;
+
+	piCur += iStrideCur;
+	piOrg += iStrideOrg;
+
+	temp1.load(piOrg);
+	temp2.load(piCur);
+	piOrg_v=extend_low(temp1);		
+	piCur_v=extend_low(temp2);
+
+	v4=piOrg_v-piCur_v;
 
 	m4=v2+v3;
 	m8=v2-v3;
@@ -684,85 +648,32 @@ UInt TComRdCost::xCalcHADs4x4( Pel *piOr
 	v3=m0-m4;
 	v4=m12-m8;	
 
-	v1.store(d);
-	v2.store(d+4);
-	v3.store(d+8);
-	v4.store(d+12);
-	
-	m[ 0] = d[ 0] + d[ 3];
-	m[ 1] = d[ 1] + d[ 2];
-	m[ 2] = d[ 1] - d[ 2];
-	m[ 3] = d[ 0] - d[ 3];
-	m[ 4] = d[ 4] + d[ 7];
-	m[ 5] = d[ 5] + d[ 6];
-	m[ 6] = d[ 5] - d[ 6];
-	m[ 7] = d[ 4] - d[ 7];
-	m[ 8] = d[ 8] + d[11];
-	m[ 9] = d[ 9] + d[10];
-	m[10] = d[ 9] - d[10];
-	m[11] = d[ 8] - d[11];
-	m[12] = d[12] + d[15];
-	m[13] = d[13] + d[14];
-	m[14] = d[13] - d[14];
-	m[15] = d[12] - d[15];
-
-	/*m0=blend4i<0,1,4,5>(v1,v2);
-	m4=blend4i<3,2,7,6>(v1,v2);
-	v1=m0+m4;
-	v2=m0-m4;
+	Vec4i tv1(v1[0],v1[1],v2[0],v2[1]);
+	Vec4i tv2(v1[3],v1[2],v2[3],v2[2]);
+	v1=tv1+tv2;
+	v2=tv1-tv2;
 
-	m8=blend4i<0,1,4,5>(v3,v4);
-	m12=blend4i<3,2,7,6>(v3,v4);
-	v3=m8+m12;
-	v4=m8-m12;
-    */
-	/*m0=blend4i<0,1,5,4>(v1,v2);
-	m0.store(m);
-	m4=blend4i<2,3,7,6>(v1,v2);
-	m4.store(m+4);
-
-	m0=blend4i<0,1,5,4>(v3,v4);
-	m0.store(m+8);
-	m4=blend4i<2,3,7,6>(v3,v4);
-	m4.store(m+12);*/
+	Vec4i tv3(v3[0],v3[1],v4[0],v4[1]);
+	Vec4i tv4(v3[3],v3[2],v4[3],v4[2]);
+	v3=tv3+tv4;
+	v4=tv3-tv4;
 
-	d[ 0] = m[ 0] + m[ 1];
-	d[ 1] = m[ 0] - m[ 1];
-	d[ 2] = m[ 2] + m[ 3];
-	d[ 3] = m[ 3] - m[ 2];
-	d[ 4] = m[ 4] + m[ 5];
-	d[ 5] = m[ 4] - m[ 5];
-	d[ 6] = m[ 6] + m[ 7];
-	d[ 7] = m[ 7] - m[ 6];
-	d[ 8] = m[ 8] + m[ 9];
-	d[ 9] = m[ 8] - m[ 9];
-	d[10] = m[10] + m[11];
-	d[11] = m[11] - m[10];
-	d[12] = m[12] + m[13];
-	d[13] = m[12] - m[13];
-	d[14] = m[14] + m[15];
-	d[15] = m[15] - m[14];
-
-	/*m0=blend4i<0,5,2,7>(v1,v2);
-	m4=blend4i<1,4,3,6>(v1,v2);
-	v1=abs(m0+m4);
-	v2=abs(m0-m4);
+	Vec4i tm1(v1[0],v2[1],v1[2],v2[3]);
+	Vec4i tm2(v1[1],v2[0],v1[3],v2[2]);
+	v1=abs(tm1+tm2);
+	v2=abs(tm1-tm2);
 	satd1=horizontal_add_x(v1);
 	satd2=horizontal_add_x(v2);
 
-	m8=blend4i<0,5,2,7>(v3,v4);
-	m12=blend4i<1,4,3,6>(v3,v4);
-	v3=abs(m8+m12);
-	v4=abs(m8-m12);
+	Vec4i tm3(v3[0],v4[1],v3[2],v4[3]);
+	Vec4i tm4(v3[1],v4[0],v3[3],v4[2]);
+	v3=abs(tm3+tm4);
+	v4=abs(tm3-tm4);
 	satd3=horizontal_add_x(v3);
 	satd4=horizontal_add_x(v4);
 
-	satd=satd1+satd2+satd3+satd4;*/
+	satd=satd1+satd2+satd3+satd4;
 
-	for (k=0; k<16; ++k)
-	{
-		satd += abs(d[k]);
-	}
 	satd = ((satd+1)>>1);
 
 	return satd;