changeset 2309:d2a8a011a13e

Merged in deepthidevaki/xhevc_deepthid (pull request #199) Added vector Filter Horizontal Multiplane, and support in testbench.
author Steve Borho <steve@borho.org>
date Fri, 14 Jun 2013 12:06:02 -0500
parents 73b00711e83d (current diff) 575b19e5a035 (diff)
children 83756d7ec230
files source/Lib/TLibEncoder/TEncSearch.cpp
diffstat 11 files changed, 512 insertions(+), 218 deletions(-) [+]
line wrap: on
line diff
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Fri Jun 14 12:06:02 2013 -0500
@@ -959,7 +959,7 @@ Void TEncSearch::xIntraCodingLumaBlk(TCo
 
     //===== get residual signal =====
 
-    primitives.getResidue((pixel*)piOrg,(pixel*)piPred,piResi, uiHeight, uiWidth, uiStride);
+    primitives.getResidue[uiWidth/8]((pixel*)piOrg, (pixel*)piPred, piResi, uiStride);
 
     //===== transform and quantization =====
     //--- init rate estimation arrays for RDOQ ---
@@ -999,7 +999,7 @@ Void TEncSearch::xIntraCodingLumaBlk(TCo
 
     //===== reconstruction =====  
 
-    primitives.calcRecons((pixel*)piPred, piResi, (pixel*)piReco, piRecQt, (pixel*)piRecIPred, uiStride, uiRecQtStride, uiRecIPredStride, uiHeight, uiWidth);
+    primitives.calcRecons[uiWidth/8]((pixel*)piPred, piResi, (pixel*)piReco, piRecQt, (pixel*)piRecIPred, uiStride, uiRecQtStride, uiRecIPredStride);
 
     //===== update distortion =====
     int Part = PartitionFromSizes(uiWidth, uiHeight);
@@ -1092,7 +1092,7 @@ Void TEncSearch::xIntraCodingChromaBlk(T
     }
     //===== get residual signal =====    
 
-    primitives.getResidue((pixel*)piOrg,(pixel*)piPred,piResi, uiHeight, uiWidth, uiStride);
+    primitives.getResidue[uiWidth/8]((pixel*)piOrg,(pixel*)piPred,piResi, uiStride);
 
     //===== transform and quantization =====
     {
@@ -1141,7 +1141,7 @@ Void TEncSearch::xIntraCodingChromaBlk(T
 
     //===== reconstruction =====
     
-    primitives.calcRecons((pixel*)piPred, piResi, (pixel*)piReco, piRecQt, (pixel*)piRecIPred, uiStride, uiRecQtStride, uiRecIPredStride, uiHeight, uiWidth);
+    primitives.calcRecons[uiWidth/8]((pixel*)piPred, piResi, (pixel*)piReco, piRecQt, (pixel*)piRecIPred, uiStride, uiRecQtStride, uiRecIPredStride);
 
     //===== update distortion =====
     int Part = x265::PartitionFromSizes(uiWidth, uiHeight);
--- a/source/common/ipfilter.cpp	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/ipfilter.cpp	Fri Jun 14 12:06:02 2013 -0500
@@ -315,6 +315,14 @@ void CDECL filterVertical_short_pel_mult
     filterVertical_short_pel<8>(bitDepth, src, srcStride, dstP, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[3]);
 }
 
+void CDECL filterHorizontalMultiplane(int bitDepth, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, int block_width, int block_height)
+{
+    filterConvertPelToShort(bitDepth, src, srcStride, dstF, dstStride, block_width, block_height);
+    filterHorizontal_pel_short<8>(bitDepth, src, srcStride, dstB, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[2]);
+    filterHorizontal_pel_short<8>(bitDepth, src, srcStride, dstA, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[1]);
+    filterHorizontal_pel_short<8>(bitDepth, src, srcStride, dstC, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[3]);
+}
+
 }
 
 #if _MSC_VER
@@ -340,5 +348,6 @@ void Setup_C_IPFilterPrimitives(EncoderP
     p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
 
     p.filterVmulti = filterVertical_short_pel_multiplane;
+    p.filterHmulti = filterHorizontalMultiplane;
 }
 }
--- a/source/common/pixel.cpp	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/pixel.cpp	Fri Jun 14 12:06:02 2013 -0500
@@ -493,15 +493,16 @@ void CDECL convert32to16_shr(short *piDs
     }
 }
 
-void CDECL getResidual(pixel *piOrg, pixel *piPred, short *piResi, int height, int width, int stride)
+template <int blockSize>
+void CDECL getResidual(pixel *piOrg, pixel *piPred, short *piResi, int stride)
 {
     pixel* pOrg   = piOrg;
     pixel* pPred  = piPred;
     short* pResi  = piResi;
 
-    for (int uiY = 0; uiY < height; uiY++)
+    for (int uiY = 0; uiY < blockSize; uiY++)
     {
-        for (int uiX = 0; uiX < width; uiX++)
+        for (int uiX = 0; uiX < blockSize; uiX++)
         {
             pResi[uiX] = static_cast<short>(pOrg[uiX]) - static_cast<short>(pPred[uiX]);
         }
@@ -512,8 +513,8 @@ void CDECL getResidual(pixel *piOrg, pix
     }
 }
 
-
-void CDECL calcRecons(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth)
+template <int blockSize>
+void CDECL calcRecons(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride)
 {
     pixel* pPred      = piPred;
     short* pResi      = piResi;
@@ -521,9 +522,9 @@ void CDECL calcRecons(pixel* piPred, sho
     short* pRecQt     = piRecQt;
     pixel* pRecIPred  = piRecIPred;
 
-    for (int uiY = 0; uiY < uiHeight; uiY++)
+    for (int uiY = 0; uiY < blockSize; uiY++)
     {
-        for (int uiX = 0; uiX < uiWidth; uiX++)
+        for (int uiX = 0; uiX < blockSize; uiX++)
         {
             pReco[uiX] = (pixel) ClipY(static_cast<short>(pPred[uiX]) + pResi[uiX]);
             pRecQt[uiX] = (short)pReco[uiX];
@@ -864,7 +865,15 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sa8d_32x32 = pixel_sa8d_32x32;
     p.sa8d_64x64 = pixel_sa8d_64x64;
 
-    p.getResidue = getResidual;
-    p.calcRecons = calcRecons;
+    p.getResidue[BLOCK_4x4] = getResidual<4>;
+    p.getResidue[BLOCK_8x8] = getResidual<8>;
+    p.getResidue[BLOCK_16x16] = getResidual<16>;
+    p.getResidue[BLOCK_32x32] = getResidual<32>;
+    p.getResidue[BLOCK_64x64] = getResidual<64>;
+    p.calcRecons[BLOCK_4x4] = calcRecons<4>;
+    p.calcRecons[BLOCK_8x8] = calcRecons<8>;
+    p.calcRecons[BLOCK_16x16] = calcRecons<16>;
+    p.calcRecons[BLOCK_32x32] = calcRecons<32>;
+    p.calcRecons[BLOCK_64x64] = calcRecons<64>;
 }
 }
--- a/source/common/primitives.h	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/primitives.h	Fri Jun 14 12:06:02 2013 -0500
@@ -94,6 +94,16 @@ enum Partitions
     NUM_PARTITIONS
 };
 
+enum SquareBlocks   // Tables of square-block routines; index with (blockSize / 8), giving indices 0, 1, 2, 4, 8
+{
+    BLOCK_4x4,
+    BLOCK_8x8,
+    BLOCK_16x16,
+    BLOCK_32x32=4,
+    BLOCK_64x64=8,
+    NUM_BLOCKS
+};
+
 enum FilterConf
 {
     //Naming convention used is - FILTER_isVertical_N_isFirst_isLast
@@ -201,10 +211,10 @@ typedef void (CDECL * cvt16to32_shl_t)(i
 typedef void (CDECL * cvt32to16_t)(int *psOrg, short *piDst, int);
 typedef void (CDECL * cvt32to16_shr_t)(short *piDst, int *psOrg, int, int);
 typedef void (CDECL * dct_t)(short *pSrc, short *pDst, intptr_t stride);
-typedef void (CDECL * getResidue_t)(pixel *piOrig, pixel *piPred, short *piRes, int height, int width, int stride);
-typedef void (CDECL * calcRecons_t)(pixel* piPred, short* piResi,pixel*  piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth);
+typedef void (CDECL * getResidue_t)(pixel *piOrig, pixel *piPred, short *piRes, int stride);
+typedef void (CDECL * calcRecons_t)(pixel* piPred, short* piResi,pixel*  piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride);
 typedef void (CDECL * filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstA, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height);
-
+typedef void (CDECL * filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, int block_width, int block_height);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -244,9 +254,10 @@ struct EncoderPrimitives
     cvt16to32_shl_t cvt16to32_shl;
     cvt32to16_t cvt32to16;
     cvt32to16_shr_t cvt32to16_shr;
-    getResidue_t getResidue;
-    calcRecons_t calcRecons;
+    getResidue_t getResidue[NUM_BLOCKS];
+    calcRecons_t calcRecons[NUM_BLOCKS];
     filterVmulti_t filterVmulti;
+    filterHmulti_t filterHmulti;
 };
 
 /* This copy of the table is what gets used by all by the encoder.
--- a/source/common/vec/ipfilter.inc	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/vec/ipfilter.inc	Fri Jun 14 12:06:02 2013 -0500
@@ -42,6 +42,7 @@ void NAME(Setup_Vec_IPFilterPrimitives)(
 
 #if !HIGH_BIT_DEPTH
      p.filterVmulti = filterVertical_short_pel_multiplane;
+     p.filterHmulti = filterHorizontalMultiplane;
 #endif 
 }
 }
--- a/source/common/vec/ipfilter8.inc	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/vec/ipfilter8.inc	Fri Jun 14 12:06:02 2013 -0500
@@ -611,6 +611,172 @@ void CDECL filterHorizontal_pel_pel(int 
     }
 }
 
+void CDECL filterHorizontalMultiplane(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, int block_width, int block_height)
+{
+    int row, col;
+
+    src -= (8 / 2 - 1);
+    int offset;
+    int headRoom = IF_INTERNAL_PREC - 8;
+    int shift = IF_FILTER_PREC;
+    shift -= headRoom;
+    offset = -IF_INTERNAL_OFFS << shift;
+
+    Vec8s vec_sum_low, vec_sum_high;
+    Vec16uc vec_src0;
+    Vec8s vec_offset(offset);
+    Vec8s sumaL, sumaH, sumbL, sumbH, sumcL, sumcH, tmp, exp1;
+
+    // Load Ai, ai += Ai*coefi
+
+    for (row = 0; row < block_height; row++)
+    {
+        col = 0;
+
+        for (; col + 16 <= (block_width); col += 16)               // Iterations multiple of 8
+        {
+            vec_src0.load(src + col);
+            sumbL = -(extend_low(vec_src0));
+            sumbH = -(extend_high(vec_src0));
+
+            // a = b+=4*a1,  c+=1*a1
+            vec_src0.load(src + col + 1);                       // Load the 8 elements
+            sumcL = extend_low(vec_src0);
+            sumbL += (sumcL << 2);
+            sumaL = sumbL;
+            sumcH = extend_high(vec_src0);
+            sumbH += (sumcH << 2);
+            sumaH = sumbH;
+
+            // a +=-10*a2    b+=-11*a2      c+=-5*a2
+            vec_src0.load(src + col + 2);
+            tmp = extend_low(vec_src0);
+            sumbL -= tmp;
+            tmp *= (-5);
+            sumcL += tmp;
+            tmp <<= 1;
+            sumaL += tmp;
+            sumbL += tmp;
+            tmp = extend_high(vec_src0);
+            sumbH -= tmp;
+            tmp *= (-5);
+            sumcH += tmp;
+            tmp <<= 1;
+            sumaH += tmp;
+            sumbH += tmp;
+
+            // a +=58*a3    b+=40*a3      c+=17*a3
+            vec_src0.load(src + col + 3);
+            tmp = extend_low(vec_src0);
+            ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + col);    // storing A as short into intermediate buffer
+            exp1 = (tmp << 4) + tmp;
+            sumcL += exp1;
+            sumaL += tmp;
+            tmp *= 40;
+            sumbL += tmp;
+            sumaL += (tmp + exp1);
+            tmp = extend_high(vec_src0);
+            ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + col + 8);    // storing A as short into intermediate buffer
+            exp1 = (tmp << 4) + tmp;
+            sumcH += exp1;
+            sumaH += tmp;
+            tmp *= 40;
+            sumbH += tmp;
+            sumaH += (tmp + exp1);
+
+            // a +=17*a4    b+=40*a4      c+=58*a4
+            vec_src0.load(src + col + 4);
+            tmp = extend_low(vec_src0);
+            exp1 = (tmp << 4) + tmp;
+            sumaL += exp1;
+            sumcL += tmp;
+            tmp *= 40;
+            sumbL += tmp;
+            sumcL += (tmp + exp1);
+            tmp = extend_high(vec_src0);
+            exp1 = (tmp << 4) + tmp;
+            sumaH += exp1;
+            sumcH += tmp;
+            tmp *= 40;
+            sumbH += tmp;
+            sumcH += (tmp + exp1);
+
+            // a +=-5*a5    b+=-11*a5      c+=-10*a5
+            vec_src0.load(src + col + 5);
+            tmp = extend_low(vec_src0);
+            sumbL -= tmp;
+            tmp *= (-5);
+            sumaL += tmp;
+            tmp <<= 1;
+            sumcL += tmp;
+            sumbL += tmp;
+            tmp = extend_high(vec_src0);
+            sumbH -= tmp;
+            tmp *= (-5);
+            sumaH += tmp;
+            tmp <<= 1;
+            sumcH += tmp;
+            sumbH += tmp;
+
+            // a +=1*a6    b+=4*a6      c+=4*a6
+            vec_src0.load(src + col + 6);
+            tmp = extend_low(vec_src0);
+            sumaL += tmp;
+            tmp <<= 2;
+            sumbL += tmp;
+            sumcL += tmp;
+            tmp = extend_high(vec_src0);
+            sumaH += tmp;
+            tmp <<= 2;
+            sumbH += tmp;
+            sumcH += tmp;
+
+            // a +=0*a7    b+=-1*a7      c+=-1*a7
+            vec_src0.load(src + col + 7);
+            tmp = extend_low(vec_src0);
+            sumbL -= tmp;
+            sumcL -= tmp;
+            sumaL = (sumaL + vec_offset);               // Add offset to sum_low
+            sumbL = (sumbL + vec_offset);
+            sumcL = (sumcL + vec_offset);
+            tmp = extend_high(vec_src0);
+            sumbH -= tmp;
+            sumcH -= tmp;
+            sumaH = (sumaH + vec_offset);
+            sumbH = (sumbH + vec_offset);
+            sumcH = (sumcH + vec_offset);
+
+            sumaL.store(dstA + col);                             // Store vector
+            sumbL.store(dstB + col);
+            sumcL.store(dstC + col);
+            sumaH.store(dstA + col + 8);                             // Store vector
+            sumbH.store(dstB + col + 8);
+            sumcH.store(dstC + col + 8);
+        }
+
+        for (; col < block_width; col++)                           // Remaining iterations
+        {
+            vec_src0.load(src + col);
+            tmp = extend_low(vec_src0);                        // Assuming that there is no overflow (Everywhere in this function!)
+            int isuma = horizontal_add(tmp * Vec8s(-1, 4, -10, 58, 17,  -5, 1,  0));
+            int isumb = horizontal_add(tmp * Vec8s(-1, 4, -11, 40, 40, -11, 4, -1));
+            int isumc = horizontal_add(tmp * Vec8s(0, 1,  -5, 17, 58, -10, 4, -1));
+            short vala = (short)((isuma + offset) >> shift);
+            short valb = (short)((isumb + offset) >> shift);
+            short valc = (short)((isumc + offset) >> shift);
+            dstF[col] = (short)((src[col + 3] << 6) - IF_INTERNAL_OFFS); // full-pel plane, matching the vectorized path
+            dstA[col] = vala;
+            dstB[col] = valb;
+            dstC[col] = valc;
+        }
+
+        src += srcStride;
+        dstF += dstStride;
+        dstA += dstStride;
+        dstB += dstStride;
+        dstC += dstStride;
+    }
+}
+
 template<int N>
 void CDECL filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int block_width, int block_height, short const *coeff)
 {
--- a/source/common/vec/pixel.inc	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/vec/pixel.inc	Fri Jun 14 12:06:02 2013 -0500
@@ -236,8 +236,16 @@ void NAME(Setup_Vec_PixelPrimitives)(Enc
     p.cvt32to16_shr = convert32to16_shr;
 
 #if !HIGH_BIT_DEPTH
-    p.getResidue = getResidual;
-    p.calcRecons = calcRecons;
+    p.getResidue[BLOCK_4x4] = getResidual4;
+    p.getResidue[BLOCK_8x8] = getResidual8;
+    p.getResidue[BLOCK_16x16] = getResidual<16>;
+    p.getResidue[BLOCK_32x32] = getResidual<32>;
+    p.getResidue[BLOCK_64x64] = getResidual<64>;
+    p.calcRecons[BLOCK_4x4] = calcRecons4;
+    p.calcRecons[BLOCK_8x8] = calcRecons8;
+    p.calcRecons[BLOCK_16x16] = calcRecons<16>;
+    p.calcRecons[BLOCK_32x32] = calcRecons<32>;
+    p.calcRecons[BLOCK_64x64] = calcRecons<64>;
 #endif 
 }
 }
--- a/source/common/vec/pixel8.inc	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/common/vec/pixel8.inc	Fri Jun 14 12:06:02 2013 -0500
@@ -83,6 +83,7 @@ template<>
 ALWAYSINLINE void unrollFunc_8<1>(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur, Vec8us& sad)
 {
     Vec16uc m1, n1;
+
     m1.load_a(piOrg);
     n1.load(piCur);
     sad.addSumAbsDiff(m1, n1);
@@ -111,6 +112,7 @@ int CDECL sad_8(pixel * piOrg, intptr_t 
         piOrg += strideOrg * 16;
         piCur += strideCur * 16;
     }
+
     if (ly & 8)
     {
         sad = 0;
@@ -213,7 +215,6 @@ int CDECL sad_16(pixel * piOrg, intptr_t
     return horizontal_add(sum);
 }
 
-
 template<int ly>
 int CDECL sad_24(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur)
 {
@@ -1715,80 +1716,89 @@ void CDECL sad_x4_64(pixel *piOrg, pixel
     res[3] = horizontal_add(sum4);
 }
 
-void CDECL getResidual(pixel *piOrg, pixel *piPred, short *piResi, int height, int width, int stride)
+void CDECL getResidual4(pixel *piOrg, pixel *piPred, short *piResi, int stride)
 {
     pixel*  pOrg    = piOrg;
     pixel*  pPred   = piPred;
     short* pResi  = piResi;
-
     Vec8s v_pOrg, v_pPred, v_pResilo, v_pResihi;
     Vec16uc v_temp1, v_temp2;
-    if(width == 4)
-    {
-        for (int uiY = 0; uiY < height; uiY++)
-        {
-            v_temp1 = load_partial(const_int(4), pOrg);
-
-            v_pOrg = extend_low(v_temp1);
-            v_temp2 = load_partial(const_int(4), pPred);
-            v_pPred = extend_low(v_temp2);
-
-            v_pResilo = v_pOrg - v_pPred;
-            store_partial(const_int(8), pResi, v_pResilo);
 
-            pOrg  += stride;
-            pResi += stride;
-            pPred += stride;
-        }
+    for (int uiY = 0; uiY < 4; uiY++)
+    {
+        v_temp1 = load_partial(const_int(4), pOrg);
+
+        v_pOrg = extend_low(v_temp1);
+        v_temp2 = load_partial(const_int(4), pPred);
+        v_pPred = extend_low(v_temp2);
+
+        v_pResilo = v_pOrg - v_pPred;
+        store_partial(const_int(8), pResi, v_pResilo);
+
+        pOrg  += stride;
+        pResi += stride;
+        pPred += stride;
     }
-    else if(width == 8)
+}
+
+void CDECL getResidual8(pixel *piOrg, pixel *piPred, short *piResi, int stride)
+{
+    pixel*  pOrg    = piOrg;
+    pixel*  pPred   = piPred;
+    short* pResi  = piResi;
+    Vec8s v_pOrg, v_pPred, v_pResilo, v_pResihi;
+    Vec16uc v_temp1, v_temp2;
+
+    for (int uiY = 0; uiY < 8; uiY++)
     {
-        for (int uiY = 0; uiY < height; uiY++)
+        v_temp1.load(pOrg);
+        v_pOrg = extend_low(v_temp1);
+        v_temp2.load(pPred);
+        v_pPred = extend_low(v_temp2);
+
+        v_pResilo = v_pOrg - v_pPred;
+        v_pResilo.store(pResi);
+
+        pOrg  += stride;
+        pResi += stride;
+        pPred += stride;
+    }
+}
+
+template<int blockSize>
+void CDECL getResidual(pixel *piOrg, pixel *piPred, short *piResi, int stride)
+{
+    pixel*  pOrg    = piOrg;
+    pixel*  pPred   = piPred;
+    short* pResi  = piResi;
+    Vec8s v_pOrg, v_pPred, v_pResilo, v_pResihi;
+    Vec16uc v_temp1, v_temp2;
+
+    for (int uiY = 0; uiY < blockSize; uiY++)
+    {
+        for (int uiX = 0; uiX < blockSize; uiX += 16)
         {
-            //pResi[uiX] = static_cast<short>(pOrg[uiX]) - static_cast<short>(pPred[uiX]);
-            v_temp1.load(pOrg);
+            v_temp1.load(pOrg + uiX);
             v_pOrg = extend_low(v_temp1);
-            v_temp2.load(pPred);
+            v_temp2.load(pPred + uiX);
             v_pPred = extend_low(v_temp2);
 
             v_pResilo = v_pOrg - v_pPred;
-            v_pResilo.store(pResi);
+            v_pResilo.store(pResi + uiX);
 
-            pOrg  += stride;
-            pResi += stride;
-            pPred += stride;
+            v_pOrg = extend_high(v_temp1);
+            v_pPred = extend_high(v_temp2);
+            v_pResihi = v_pOrg - v_pPred;
+            v_pResihi.store(pResi + uiX + 8);
         }
 
-    }
-    else 
-    {
-        for (int uiY = 0; uiY < height; uiY++)
-        {
-            for (int uiX = 0; uiX < width; uiX+=16)
-            {
-                v_temp1.load(pOrg+uiX);
-                v_pOrg = extend_low(v_temp1);
-                v_temp2.load(pPred+uiX);
-                v_pPred = extend_low(v_temp2);
-
-                v_pResilo = v_pOrg - v_pPred;
-                v_pResilo.store(pResi+uiX);
-
-                v_pOrg = extend_high(v_temp1);
-                v_pPred = extend_high(v_temp2);
-                v_pResihi = v_pOrg - v_pPred;
-                v_pResihi.store(pResi+uiX+8);
-
-            }
-
-            pOrg  += stride;
-            pResi += stride;
-            pPred += stride;
-        }
+        pOrg  += stride;
+        pResi += stride;
+        pPred += stride;
     }
 }
 
-void CDECL calcRecons(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth)
+void CDECL calcRecons4(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride)
 {
     pixel* pPred      = piPred;
     short* pResi      = piResi;
@@ -1796,97 +1806,109 @@ void CDECL calcRecons(pixel* piPred, sho
     short* pRecQt     = piRecQt;
     pixel* pRecIPred  = piRecIPred;
 
-    if (uiWidth == 4)
+    for (int uiY = 0; uiY < 4; uiY++)
     {
-        for (int uiY = 0; uiY < uiHeight; uiY++)
-        {
-            Vec8s vresi, vpred, vres, vsum;
-            Vec16uc tmp;
-
-            tmp = load_partial(const_int(4), pPred );
-            vpred = extend_low(tmp);
-
-            vresi = load_partial(const_int(8), pResi);
-            vsum = vpred + vresi;
-
-            vsum = min(255, max(vsum, 0));
-
-            store_partial(const_int(8), pRecQt, vsum);
+        Vec8s vresi, vpred, vres, vsum;
+        Vec16uc tmp;
 
-            tmp = compress(vsum, vsum);
-
-            store_partial(const_int(4), pReco, tmp);
-            store_partial(const_int(4), pRecIPred, tmp);
+        tmp = load_partial(const_int(4), pPred);
+        vpred = extend_low(tmp);
 
-            pPred     += uiStride;
-            pResi     += uiStride;
-            pReco     += uiStride;
-            pRecQt    += uiRecQtStride;
-            pRecIPred += uiRecIPredStride;
-        }
+        vresi = load_partial(const_int(8), pResi);
+        vsum = vpred + vresi;
+
+        vsum = min(255, max(vsum, 0));
+
+        store_partial(const_int(8), pRecQt, vsum);
+
+        tmp = compress(vsum, vsum);
+
+        store_partial(const_int(4), pReco, tmp);
+        store_partial(const_int(4), pRecIPred, tmp);
+
+        pPred     += uiStride;
+        pResi     += uiStride;
+        pReco     += uiStride;
+        pRecQt    += uiRecQtStride;
+        pRecIPred += uiRecIPredStride;
     }
-    else if (uiWidth == 8)
+}
+
+void CDECL calcRecons8(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride)
+{
+    pixel* pPred      = piPred;
+    short* pResi      = piResi;
+    pixel* pReco      = piReco;
+    short* pRecQt     = piRecQt;
+    pixel* pRecIPred  = piRecIPred;
+
+    for (int uiY = 0; uiY < 8; uiY++)
     {
-        for (int uiY = 0; uiY < uiHeight; uiY++)
+        Vec8s vresi, vpred, vres, vsum;
+        Vec16uc tmp;
+
+        tmp.load(pPred);
+        vpred = extend_low(tmp);
+
+        vresi.load(pResi);
+        vsum = vpred + vresi;
+
+        vsum = min(255, max(vsum, 0));
+
+        vsum.store(pRecQt);
+
+        tmp = compress(vsum, vsum);
+
+        store_partial(const_int(8), pReco, tmp);
+        store_partial(const_int(8), pRecIPred, tmp);
+
+        pPred     += uiStride;
+        pResi     += uiStride;
+        pReco     += uiStride;
+        pRecQt    += uiRecQtStride;
+        pRecIPred += uiRecIPredStride;
+    }
+}
+
+template<int blockSize>
+void CDECL calcRecons(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride)
+{
+    pixel* pPred      = piPred;
+    short* pResi      = piResi;
+    pixel* pReco      = piReco;
+    short* pRecQt     = piRecQt;
+    pixel* pRecIPred  = piRecIPred;
+
+    for (int uiY = 0; uiY < blockSize; uiY++)
+    {
+        for (int uiX = 0; uiX < blockSize; uiX += 16)
         {
-            Vec8s vresi, vpred, vres, vsum;
+            Vec8s vresi, vpred, vres, vsum1, vsum2;
             Vec16uc tmp;
 
-            tmp.load(pPred);
-            vpred = extend_low(tmp);
-
-            vresi.load(pResi);
-            vsum = vpred + vresi;
-
-            vsum = min(255, max(vsum, 0));
-
-            vsum.store(pRecQt);
+            tmp.load(pPred + uiX);
 
-            tmp = compress(vsum, vsum);
-
-            store_partial(const_int(8), pReco, tmp);
-            store_partial(const_int(8), pRecIPred, tmp);
+            vpred = extend_low(tmp);
+            vresi.load(pResi + uiX);
+            vsum1 = vpred + vresi;
+            vsum1 = min(255, max(vsum1, 0));
+            vsum1.store(pRecQt + uiX);
 
-            pPred     += uiStride;
-            pResi     += uiStride;
-            pReco     += uiStride;
-            pRecQt    += uiRecQtStride;
-            pRecIPred += uiRecIPredStride;
+            vpred = extend_high(tmp);
+            vresi.load(pResi + uiX + 8);
+            vsum2 = vpred + vresi;
+            vsum2 = min(255, max(vsum2, 0));
+            vsum2.store(pRecQt + uiX + 8);
+
+            tmp = compress(vsum1, vsum2);
+            tmp.store(pReco + uiX);
+            tmp.store(pRecIPred + uiX);
         }
-    }
-    else
-    {
-        for (int uiY = 0; uiY < uiHeight; uiY++)
-        {
-            for (int uiX = 0; uiX < uiWidth; uiX += 16)
-            {
-                Vec8s vresi, vpred, vres, vsum1, vsum2;
-                Vec16uc tmp;
-
-                tmp.load(pPred + uiX);
 
-                vpred = extend_low(tmp);
-                vresi.load(pResi+uiX);
-                vsum1 = vpred + vresi;
-                vsum1 = min(255, max(vsum1, 0));
-                vsum1.store(pRecQt + uiX);
-
-                vpred = extend_high(tmp);
-                vresi.load(pResi+uiX + 8);
-                vsum2 = vpred + vresi;
-                vsum2 = min(255, max(vsum2, 0));
-                vsum2.store(pRecQt + uiX + 8);
-
-                tmp = compress(vsum1, vsum2);
-                tmp.store(pReco + uiX);
-                tmp.store(pRecIPred + uiX);
-            }
-
-            pPred     += uiStride;
-            pResi     += uiStride;
-            pReco     += uiStride;
-            pRecQt    += uiRecQtStride;
-            pRecIPred += uiRecIPredStride;
-        }
+        pPred     += uiStride;
+        pResi     += uiStride;
+        pReco     += uiStride;
+        pRecQt    += uiRecQtStride;
+        pRecIPred += uiRecIPredStride;
     }
 }
--- a/source/test/ipfilterharness.cpp	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/test/ipfilterharness.cpp	Fri Jun 14 12:06:02 2013 -0500
@@ -266,29 +266,29 @@ bool IPFilterHarness::check_filterVMulti
     short rand_width = 32;                  // Can be randomly generated Width
     short rand_srcStride, rand_dstStride;
 
-    pixel dstAvec[100*100];
-    pixel dstEvec[100*100];
-    pixel dstIvec[100*100];
-    pixel dstPvec[100*100];
+    pixel dstAvec[100 * 100];
+    pixel dstEvec[100 * 100];
+    pixel dstIvec[100 * 100];
+    pixel dstPvec[100 * 100];
 
-    pixel dstAref[100*100];
-    pixel dstEref[100*100];
-    pixel dstIref[100*100];
-    pixel dstPref[100*100];
+    pixel dstAref[100 * 100];
+    pixel dstEref[100 * 100];
+    pixel dstIref[100 * 100];
+    pixel dstPref[100 * 100];
 
     for (int i = 0; i <= 100; i++)
-    {        
-        rand_srcStride = 64;               // Can be randomly generated 
-        rand_dstStride = 64;               
-        memset(dstAref, 0, 10000*sizeof(pixel));
-        memset(dstEref, 0, 10000*sizeof(pixel));
-        memset(dstIref, 0, 10000*sizeof(pixel));
-        memset(dstPref, 0, 10000*sizeof(pixel));
-        memset(dstAvec, 0, 10000*sizeof(pixel));
-        memset(dstEvec, 0, 10000*sizeof(pixel));
-        memset(dstIvec, 0, 10000*sizeof(pixel));
-        memset(dstPvec, 0, 10000*sizeof(pixel));
-        opt(8, short_buff + 8*rand_srcStride,
+    {
+        rand_srcStride = 64;               // Can be randomly generated
+        rand_dstStride = 64;
+        memset(dstAref, 0, 10000 * sizeof(pixel));
+        memset(dstEref, 0, 10000 * sizeof(pixel));
+        memset(dstIref, 0, 10000 * sizeof(pixel));
+        memset(dstPref, 0, 10000 * sizeof(pixel));
+        memset(dstAvec, 0, 10000 * sizeof(pixel));
+        memset(dstEvec, 0, 10000 * sizeof(pixel));
+        memset(dstIvec, 0, 10000 * sizeof(pixel));
+        memset(dstPvec, 0, 10000 * sizeof(pixel));
+        opt(8, short_buff + 8 * rand_srcStride,
             rand_srcStride,
             dstAvec, dstEvec, dstIvec, dstPvec,
             rand_dstStride,
@@ -301,8 +301,8 @@ bool IPFilterHarness::check_filterVMulti
             rand_width,
             rand_height);
 
-        if (memcmp(dstAvec,dstAref, 100*100 * sizeof(pixel)) || memcmp(dstEvec,dstEref, 100*100 * sizeof(pixel)) 
-            || memcmp(dstIvec,dstIref, 100*100 * sizeof(pixel)) || memcmp(dstPvec,dstPref, 100*100 * sizeof(pixel)))
+        if (memcmp(dstAvec, dstAref, 100 * 100 * sizeof(pixel)) || memcmp(dstEvec, dstEref, 100 * 100 * sizeof(pixel))
+            || memcmp(dstIvec, dstIref, 100 * 100 * sizeof(pixel)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(pixel)))
         {
             return false;
         }
@@ -311,6 +311,56 @@ bool IPFilterHarness::check_filterVMulti
     return true;
 }
 
+bool IPFilterHarness::check_filterHMultiplane(x265::filterHmulti_t ref, x265::filterHmulti_t opt)
+{
+    short rand_height = 32;                 // Can be randomly generated Height
+    short rand_width = 32;                  // Can be randomly generated Width
+    short rand_srcStride, rand_dstStride;
+
+    short dstAvec[100 * 100];
+    short dstEvec[100 * 100];
+    short dstIvec[100 * 100];
+    short dstPvec[100 * 100];
+
+    short dstAref[100 * 100];
+    short dstEref[100 * 100];
+    short dstIref[100 * 100];
+    short dstPref[100 * 100];
+
+    for (int i = 0; i <= 100; i++)
+    {
+        rand_srcStride = 64;               // Can be randomly generated
+        rand_dstStride = 64;
+        memset(dstAref, 0, 10000 * sizeof(short));
+        memset(dstEref, 0, 10000 * sizeof(short));
+        memset(dstIref, 0, 10000 * sizeof(short));
+        memset(dstPref, 0, 10000 * sizeof(short));
+        memset(dstAvec, 0, 10000 * sizeof(short));
+        memset(dstEvec, 0, 10000 * sizeof(short));
+        memset(dstIvec, 0, 10000 * sizeof(short));
+        memset(dstPvec, 0, 10000 * sizeof(short));
+        opt(8, pixel_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            dstAvec, dstEvec, dstIvec, dstPvec,
+            rand_dstStride,
+            rand_width,
+            rand_height);
+        ref(8, pixel_buff + 3 * rand_srcStride,
+            rand_srcStride,
+            dstAref, dstEref, dstIref, dstPref,
+            rand_dstStride,
+            rand_width,
+            rand_height);
+
+        if (memcmp(dstAvec, dstAref, 100 * 100 * sizeof(short)) || memcmp(dstEvec, dstEref, 100 * 100 * sizeof(short))
+            || memcmp(dstIvec, dstIref, 100 * 100 * sizeof(short)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(short)))
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
 
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
@@ -367,15 +417,25 @@ bool IPFilterHarness::testCorrectness(co
             return false;
         }
     }
-
+    
     if (opt.filterVmulti)
     {
         if (!check_filterVMultiplane(ref.filterVmulti, opt.filterVmulti))
         {
-            printf("\nFilter-multiplane failed\n");
+            printf("\nFilter-V-multiplane failed\n");
             return false;
         }
     }
+
+    if (opt.filterHmulti)
+    {
+        if (!check_filterHMultiplane(ref.filterHmulti, opt.filterHmulti))
+        {
+            printf("\nFilter-H-multiplane failed\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -433,8 +493,15 @@ void IPFilterHarness::measureSpeed(const
 
     if (opt.filterVmulti)
     {
-        printf("Filter-multiplane\t");
+        printf("Filter-V-multiplane\t");
         REPORT_SPEEDUP(opt.filterVmulti, ref.filterVmulti,
                        8, short_buff + 8 * srcStride, srcStride, IPF_vec_output_p, IPF_C_output_p, IPF_vec_output_p, IPF_C_output_p, dstStride, width, height);
     }
+
+    if (opt.filterHmulti) // guard must test the H primitive actually invoked below (was copy-pasted opt.filterVmulti)
+    {
+        printf("Filter-H-multiplane\t");
+        REPORT_SPEEDUP(opt.filterHmulti, ref.filterHmulti,
+                       8, pixel_buff + 8 * srcStride, srcStride, IPF_vec_output_s, IPF_C_output_s, IPF_vec_output_s, IPF_C_output_s, dstStride, width, height);
+    }
 }
--- a/source/test/ipfilterharness.h	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/test/ipfilterharness.h	Fri Jun 14 12:06:02 2013 -0500
@@ -46,6 +46,7 @@ protected:
     bool check_IPFilter_primitive(x265::IPFilterConvert_p_s ref, x265::IPFilterConvert_p_s opt);
     bool check_IPFilter_primitive(x265::IPFilterConvert_s_p ref, x265::IPFilterConvert_s_p opt);
     bool check_filterVMultiplane(x265::filterVmulti_t ref, x265::filterVmulti_t opt);
+    bool check_filterHMultiplane(x265::filterHmulti_t ref, x265::filterHmulti_t opt);
 
 public:
 
--- a/source/test/pixelharness.cpp	Fri Jun 14 11:57:35 2013 -0500
+++ b/source/test/pixelharness.cpp	Fri Jun 14 12:06:02 2013 -0500
@@ -269,32 +269,29 @@ bool PixelHarness::check_block_copy_p_s(
 }
 
 bool PixelHarness::check_getResidue(x265::getResidue_t ref, x265::getResidue_t opt)
-{     
+{
     ALIGN_VAR_16(short, ref_dest[64 * 64]);
-    ALIGN_VAR_16(short, opt_dest[64 * 64]); 
-    memset(ref_dest,0,64*64*sizeof(short));
-    memset(opt_dest,0,64*64*sizeof(short));
+    ALIGN_VAR_16(short, opt_dest[64 * 64]);
+    memset(ref_dest, 0, 64 * 64 * sizeof(short));
+    memset(opt_dest, 0, 64 * 64 * sizeof(short));
     int j = 0;
     for (int i = 0; i <= 100; i++)
     {
-        int height = 4<<(i%4); // rand()%64 + 1;
-        int width = 4<<(i%4); //rand()%64 + 1;
         int stride = 64;
-        opt(pbuf1 + j, pbuf2+ j, opt_dest, height,width, stride);
-        ref(pbuf1 + j, pbuf2+ j, ref_dest, height,width, stride);
+        opt(pbuf1 + j, pbuf2 + j, opt_dest, stride);
+        ref(pbuf1 + j, pbuf2 + j, ref_dest, stride);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
             return false;
 
         j += 100;
-        
     }
 
     return true;
 }
 
 bool PixelHarness::check_calcRecons(x265::calcRecons_t ref, x265::calcRecons_t opt)
-{     
+{
     ALIGN_VAR_16(short, ref_recq[64 * 64]);
     ALIGN_VAR_16(short, opt_recq[64 * 64]);
 
@@ -304,33 +301,29 @@ bool PixelHarness::check_calcRecons(x265
     ALIGN_VAR_16(pixel, ref_pred[64 * 64]);
     ALIGN_VAR_16(pixel, opt_pred[64 * 64]);
 
-    memset(ref_recq,0,64*64*sizeof(short));
-    memset(opt_recq,0,64*64*sizeof(short));
-    memset(ref_reco,0,64*64*sizeof(pixel));
-    memset(opt_reco,0,64*64*sizeof(pixel));
-    memset(ref_pred,0,64*64*sizeof(pixel));
-    memset(opt_pred,0,64*64*sizeof(pixel));
+    memset(ref_recq, 0, 64 * 64 * sizeof(short));
+    memset(opt_recq, 0, 64 * 64 * sizeof(short));
+    memset(ref_reco, 0, 64 * 64 * sizeof(pixel));
+    memset(opt_reco, 0, 64 * 64 * sizeof(pixel));
+    memset(ref_pred, 0, 64 * 64 * sizeof(pixel));
+    memset(opt_pred, 0, 64 * 64 * sizeof(pixel));
 
     int j = 0;
     for (int i = 0; i <= 100; i++)
     {
-        int height = 4<<(i%4); // rand()%64 + 1;
-        int width = 4<<(i%4); //rand()%64 + 1;
         int stride = 64;
-        opt(pbuf1 + j, sbuf1+ j, opt_reco, opt_recq, opt_pred, stride, stride, stride, height, width );
-        ref(pbuf1 + j, sbuf1+ j, ref_reco, ref_recq, ref_pred, stride, stride, stride, height, width );
+        opt(pbuf1 + j, sbuf1 + j, opt_reco, opt_recq, opt_pred, stride, stride, stride);
+        ref(pbuf1 + j, sbuf1 + j, ref_reco, ref_recq, ref_pred, stride, stride, stride);
 
         if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(short)) || memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)) || memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
             return false;
 
         j += 100;
-        
     }
 
     return true;
 }
 
-
 bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
@@ -471,23 +464,30 @@ bool PixelHarness::testCorrectness(const
         }
     }
 
-    if (opt.getResidue)
+    for (int i = 0; i <= 8; (i ? i <<= 1 : i++))
     {
-        if (!check_getResidue(ref.getResidue, opt.getResidue))
+        if (opt.getResidue[i])
         {
-            printf("getResidue failed!\n");
-            return false;
+            if (!check_getResidue(ref.getResidue[i], opt.getResidue[i]))
+            {
+                printf("getResidue width:%d failed!\n", i ? i * 8 : 4);
+                return false;
+            }
         }
     }
 
-    if (opt.calcRecons)
+    for (int i = 0; i <= 8; (i ? i <<= 1 : i++))
     {
-        if (!check_calcRecons(ref.calcRecons, opt.calcRecons))
+        if (opt.calcRecons[i])
         {
-            printf("calcRecon failed!\n");
-            return false;
+            if (!check_calcRecons(ref.calcRecons[i], opt.calcRecons[i]))
+            {
+                printf("calcRecon width:%d failed!\n", i ? i * 8 : 4);
+                return false;
+            }
         }
     }
+
     return true;
 }
 
@@ -589,21 +589,21 @@ void PixelHarness::measureSpeed(const En
         REPORT_SPEEDUP(opt.cpyblock_s_c, ref.cpyblock_s_c, 64, 64, (short*)pbuf1, FENC_STRIDE, (uint8_t*)pbuf2, STRIDE);
     }
 
-    if (opt.getResidue)
+    for (int i = 0; i <= 8; i ? i <<= 1 : i++)
     {
-        for(int i =4; i<=32; i<<=1)
+        if (opt.getResidue[i])
         {
-            printf("getResidue%dx%d", i,i);
-            REPORT_SPEEDUP(opt.getResidue, ref.getResidue, pbuf1 , pbuf2, sbuf1, i, i, 64);
+            printf("getResidue%dx%d", (i ? i * 8 : 4), (i ? i * 8 : 4));
+            REPORT_SPEEDUP(opt.getResidue[i], ref.getResidue[i], pbuf1, pbuf2, sbuf1, 64);
         }
     }
 
-    if (opt.calcRecons)
+    for (int i = 0; i <= 8; i ? i <<= 1 : i++)
     {
-        for(int i =4; i<=32; i<<=1)
+        if (opt.calcRecons[i])
         {
-            printf("calcRecons%dx%d", i,i);
-            REPORT_SPEEDUP(opt.calcRecons, ref.calcRecons, pbuf1 , sbuf1, pbuf2, sbuf1, pbuf1, 64,64,64, i, i);
+            printf("calcRecons%dx%d", (i ? i * 8 : 4), (i ? i * 8 : 4));
+            REPORT_SPEEDUP(opt.calcRecons[i], ref.calcRecons[i], pbuf1, sbuf1, pbuf2, sbuf1, pbuf1, 64, 64, 64);
         }
     }
 }