changeset 127:cdddf736f4a1

Modified vectorized xCalcHADs* for aligned data access
author deepthidevaki
date Mon, 25 Mar 2013 17:44:57 +0530
parents e4511f7ffd67
children a1206301f004 6b9e69b013e8
files source/encoder/TComRdCost_SSE.cpp
diffstat 1 files changed, 14 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/source/encoder/TComRdCost_SSE.cpp	Mon Mar 25 15:23:32 2013 +0530
+++ b/source/encoder/TComRdCost_SSE.cpp	Mon Mar 25 17:44:57 2013 +0530
@@ -490,9 +490,8 @@ UInt TComRdCost::xGetSAD64( DistParam* p
 UInt TComRdCost::xCalcHADs8x8( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
 {
   Int  i, j, k, jj, sad=0;
-  Int  m1[8][8], m2[8][8], m3[8][8];
-  /*__declspec(align(16))*/
-  Short diff[64];
+  __declspec(align(16)) Int  m1[8][8], m2[8][8], m3[8][8];
+  __declspec(align(16)) Short diff[64];
 
   Vec8s diff_v1, piOrg_v, piCur_v;
   Vec4i v1, v2;
@@ -576,10 +575,10 @@ UInt TComRdCost::xCalcHADs8x8( Pel *piOr
   
   for (i = 0; i < 8; i++)
   {
-      v1.load(m2[i]);	  
+      v1.load_a(m2[i]);	  
 	  v1=abs(v1);
 	  sad+=horizontal_add_x(v1);
-	  v1.load(m2[i]+4);
+	  v1.load_a(m2[i]+4);
 	  v1=abs(v1);
 	  sad+=horizontal_add_x(v1);
   }
@@ -591,7 +590,8 @@ UInt TComRdCost::xCalcHADs8x8( Pel *piOr
 
 UInt TComRdCost::xCalcHADs4x4( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
 {
-	Int k, diff[16], satd = 0, m[16], d[16];
+	Int k, satd = 0;
+	__declspec(align(16)) Int diff[16],m[16],d[16];
 
 	assert( iStep == 1 );
 
@@ -643,13 +643,13 @@ UInt TComRdCost::xCalcHADs4x4( Pel *piOr
 	d[14] = m[14] - m[10];
 	d[15] = m[15] - m[11];*/
 
-	v1.load(diff);
-	v2.load(diff+12);
+	v1.load_a(diff);
+	v2.load_a(diff+12);
 	m0=v1+v2;
 	m12=v1-v2;
 
-	v3.load(diff+4);
-	v4.load(diff+8);
+	v3.load_a(diff+4);
+	v4.load_a(diff+8);
 	m4=v3+v4;
 	m8=v3-v4;
 
@@ -658,10 +658,10 @@ UInt TComRdCost::xCalcHADs4x4( Pel *piOr
 	v3=m0-m4;
 	v4=m12-m8;
 
-	v1.store(m);
-	v2.store(m+4);
-	v3.store(m+8);
-	v4.store(m+12);
+	v1.store_a(m);
+	v2.store_a(m+4);
+	v3.store_a(m+8);
+	v4.store_a(m+12);
 
 	m[ 0] = d[ 0] + d[ 3];
 	m[ 1] = d[ 1] + d[ 2];