changeset 1193:a9d4a7ce337c

Merged multicoreware/xhevc into default
author Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
date Mon, 06 May 2013 14:34:00 +0530
parents 260dd458563c (current diff) 81577757e572 (diff)
children 5e8ea012ff63
files source/Lib/TLibCommon/TComPicYuv.h source/encoder/vec/interpolationfilter.inc
diffstat 36 files changed, 4951 insertions(+), 5135 deletions(-)
--- a/source/CMakeLists.txt	Fri May 03 17:54:14 2013 +0530
+++ b/source/CMakeLists.txt	Mon May 06 14:34:00 2013 +0530
@@ -20,12 +20,8 @@ if(MSVC)
     add_definitions(/W4 /WX /D_CRT_SECURE_NO_WARNINGS)
     add_definitions(/Ob2) # always inline
     add_definitions(/Oi)  # enable intrinsics
+    add_definitions(/MP)  # multithreaded build
     include_directories(compat/msvc)
-    # Add some multithreaded build support
-    option(MULTITHREADED_BUILD "Number of threads to use for build" ON)
-    if(MULTITHREADED_BUILD)
-        add_definitions(/MP)
-    endif()
 endif(MSVC)
 
 if("$ENV{CXX}" STREQUAL "icpc")
--- a/source/Lib/TLibCommon/TComDataCU.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComDataCU.h	Mon May 06 14:34:00 2013 +0530
@@ -242,15 +242,16 @@ public:
     // member functions for CU description
     // -------------------------------------------------------------------------------------------------------------------
 
-    TComPic*      getPic()                        { return m_pcPic; }
+    TComPic*      getPic()                         { return m_pcPic; }
 
-    TComSlice*    getSlice()                        { return m_pcSlice; }
+    TComSlice*    getSlice()                       { return m_pcSlice; }
 
     UInt&         getAddr()                        { return m_uiCUAddr; }
 
-    UInt&         getZorderIdxInCU()                        { return m_uiAbsIdxInLCU; }
+    UInt&         getZorderIdxInCU()               { return m_uiAbsIdxInLCU; }
 
     UInt          getSCUAddr();
+
     UInt          getCUPelX()                        { return m_uiCUPelX; }
 
     UInt          getCUPelY()                        { return m_uiCUPelY; }
--- a/source/Lib/TLibCommon/TComPattern.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComPattern.cpp	Mon May 06 14:34:00 2013 +0530
@@ -171,10 +171,10 @@ Void TComPattern::initPattern(TComDataCU
     m_cPatternCr.setPatternParamCU(pcCU, 2, uiWidth >> 1, uiHeight >> 1, uiOffsetLeft, uiOffsetAbove, uiAbsPartIdx);
 }
 
-Void TComPattern::initAdiPattern(TComDataCU* pcCU, UInt uiZorderIdxInPart, UInt uiPartDepth, Int* piAdiBuf, Int iOrgBufStride, Int iOrgBufHeight, Bool& bAbove, Bool& bLeft, Bool bLMmode)
+Void TComPattern::initAdiPattern(TComDataCU* pcCU, UInt uiZorderIdxInPart, UInt uiPartDepth, Pel* piAdiBuf, Int iOrgBufStride, Int iOrgBufHeight, Bool& bAbove, Bool& bLeft)
 {
     Pel*  piRoiOrigin;
-    Int*  piAdiTemp;
+    Pel*  piAdiTemp;
     UInt  uiCuWidth   = pcCU->getWidth(0) >> uiPartDepth;
     UInt  uiCuHeight  = pcCU->getHeight(0) >> uiPartDepth;
     UInt  uiCuWidth2  = uiCuWidth << 1;
@@ -218,33 +218,31 @@ Void TComPattern::initAdiPattern(TComDat
     piRoiOrigin = pcCU->getPic()->getPicYuvRec()->getLumaAddr(pcCU->getAddr(), pcCU->getZorderIdxInCU() + uiZorderIdxInPart);
     piAdiTemp   = piAdiBuf;
 
-    fillReferenceSamples(g_bitDepthY, piRoiOrigin, piAdiTemp, bNeighborFlags, iNumIntraNeighbor, iUnitSize, iNumUnitsInCu, iTotalUnits, uiCuWidth, uiCuHeight, uiWidth, uiHeight, iPicStride, bLMmode);
+    fillReferenceSamples(g_bitDepthY, piRoiOrigin, piAdiTemp, bNeighborFlags, iNumIntraNeighbor, iUnitSize, iNumUnitsInCu, iTotalUnits, uiCuWidth, uiCuHeight, uiWidth, uiHeight, iPicStride);
 
     Int   i;
     // generate filtered intra prediction samples
     Int iBufSize = uiCuHeight2 + uiCuWidth2 + 1; // left and left above border + above and above right border + top left corner = length of 3. filter buffer
 
-    UInt uiWH = uiWidth * uiHeight;             // number of elements in one buffer
+    UInt uiWH = ADI_BUF_STRIDE * uiHeight;       // number of elements in one buffer
 
-    Int* piFilteredBuf1 = piAdiBuf + uiWH;      // 1. filter buffer
-    Int* piFilteredBuf2 = piFilteredBuf1 + uiWH; // 2. filter buffer
-    Int* piFilterBuf = piFilteredBuf2 + uiWH;   // buffer for 2. filtering (sequential)
-    Int* piFilterBufN = piFilterBuf + iBufSize; // buffer for 1. filtering (sequential)
+    Pel* piFilteredBuf1 = piAdiBuf + uiWH;      // 1. filter buffer
+    Pel* piFilteredBuf2 = piFilteredBuf1 + uiWH; // 2. filter buffer
+    Pel* piFilterBuf = piFilteredBuf2 + uiWH;   // buffer for 2. filtering (sequential)
+    Pel* piFilterBufN = piFilterBuf + iBufSize; // buffer for 1. filtering (sequential)
 
     Int l = 0;
     // left border from bottom to top
     for (i = 0; i < uiCuHeight2; i++)
     {
-        piFilterBuf[l++] = piAdiTemp[uiWidth * (uiCuHeight2 - i)];
+        piFilterBuf[l++] = piAdiTemp[ADI_BUF_STRIDE * (uiCuHeight2 - i)];
     }
 
     // top left corner
     piFilterBuf[l++] = piAdiTemp[0];
+
     // above border from left to right
-    for (i = 0; i < uiCuWidth2; i++)
-    {
-        piFilterBuf[l++] = piAdiTemp[1 + i];
-    }
+    memcpy(&piFilterBuf[l], &piAdiTemp[1], uiCuWidth2*sizeof(*piFilterBuf));
 
     if (pcCU->getSlice()->getSPS()->getUseStrongIntraSmoothing())
     {
@@ -298,20 +296,17 @@ Void TComPattern::initAdiPattern(TComDat
     l = 0;
     for (i = 0; i < uiCuHeight2; i++)
     {
-        piFilteredBuf1[uiWidth * (uiCuHeight2 - i)] = piFilterBufN[l++];
+        piFilteredBuf1[ADI_BUF_STRIDE * (uiCuHeight2 - i)] = piFilterBufN[l++];
     }
 
     piFilteredBuf1[0] = piFilterBufN[l++];
-    for (i = 0; i < uiCuWidth2; i++)
-    {
-        piFilteredBuf1[1 + i] = piFilterBufN[l++];
-    }
+    memcpy(&piFilteredBuf1[1], &piFilterBufN[l], uiCuWidth2 * sizeof(*piFilteredBuf1));
 }
 
-Void TComPattern::initAdiPatternChroma(TComDataCU* pcCU, UInt uiZorderIdxInPart, UInt uiPartDepth, Int* piAdiBuf, Int iOrgBufStride, Int iOrgBufHeight, Bool& bAbove, Bool& bLeft)
+Void TComPattern::initAdiPatternChroma(TComDataCU* pcCU, UInt uiZorderIdxInPart, UInt uiPartDepth, Pel* piAdiBuf, Int iOrgBufStride, Int iOrgBufHeight, Bool& bAbove, Bool& bLeft)
 {
     Pel*  piRoiOrigin;
-    Int*  piAdiTemp;
+    Pel*  piAdiTemp;
     UInt  uiCuWidth  = pcCU->getWidth(0) >> uiPartDepth;
     UInt  uiCuHeight = pcCU->getHeight(0) >> uiPartDepth;
     UInt  uiWidth;
@@ -362,12 +357,12 @@ Void TComPattern::initAdiPatternChroma(T
 
     // get Cr pattern
     piRoiOrigin = pcCU->getPic()->getPicYuvRec()->getCrAddr(pcCU->getAddr(), pcCU->getZorderIdxInCU() + uiZorderIdxInPart);
-    piAdiTemp   = piAdiBuf + uiWidth * uiHeight;
+    piAdiTemp   = piAdiBuf + ADI_BUF_STRIDE * uiHeight;
 
     fillReferenceSamples(g_bitDepthC, piRoiOrigin, piAdiTemp, bNeighborFlags, iNumIntraNeighbor, iUnitSize, iNumUnitsInCu, iTotalUnits, uiCuWidth, uiCuHeight, uiWidth, uiHeight, iPicStride);
 }
 
-Void TComPattern::fillReferenceSamples(Int bitDepth, Pel* piRoiOrigin, Int* piAdiTemp, Bool* bNeighborFlags, Int iNumIntraNeighbor, Int iUnitSize, Int iNumUnitsInCu, Int iTotalUnits, UInt uiCuWidth, UInt uiCuHeight, UInt uiWidth, UInt uiHeight, Int iPicStride, Bool bLMmode)
+Void TComPattern::fillReferenceSamples(Int bitDepth, Pel* piRoiOrigin, Pel* piAdiTemp, Bool* bNeighborFlags, Int iNumIntraNeighbor, Int iUnitSize, Int iNumUnitsInCu, Int iTotalUnits, UInt uiCuWidth, UInt uiCuHeight, UInt uiWidth, UInt uiHeight, Int iPicStride)
 {
     Pel* piRoiTemp;
     Int  i, j;
@@ -383,7 +378,7 @@ Void TComPattern::fillReferenceSamples(I
 
         for (i = 1; i < uiHeight; i++)
         {
-            piAdiTemp[i * uiWidth] = iDCValue;
+            piAdiTemp[i * ADI_BUF_STRIDE] = iDCValue;
         }
     }
     else if (iNumIntraNeighbor == iTotalUnits)
@@ -393,39 +388,19 @@ Void TComPattern::fillReferenceSamples(I
         piAdiTemp[0] = piRoiTemp[0];
 
         // Fill left border with rec. samples
+        // Fill below left border with rec. samples
         piRoiTemp = piRoiOrigin - 1;
 
-        if (bLMmode)
-        {
-            piRoiTemp--; // move to the second left column
-        }
-
-        for (i = 0; i < uiCuHeight; i++)
+        for (i = 0; i < 2 * uiCuHeight; i++)
         {
-            piAdiTemp[(1 + i) * uiWidth] = piRoiTemp[0];
-            piRoiTemp += iPicStride;
-        }
-
-        // Fill below left border with rec. samples
-        for (i = 0; i < uiCuHeight; i++)
-        {
-            piAdiTemp[(1 + uiCuHeight + i) * uiWidth] = piRoiTemp[0];
+            piAdiTemp[(1 + i) * ADI_BUF_STRIDE] = piRoiTemp[0];
             piRoiTemp += iPicStride;
         }
 
         // Fill top border with rec. samples
+        // Fill top right border with rec. samples
         piRoiTemp = piRoiOrigin - iPicStride;
-        for (i = 0; i < uiCuWidth; i++)
-        {
-            piAdiTemp[1 + i] = piRoiTemp[i];
-        }
-
-        // Fill top right border with rec. samples
-        piRoiTemp = piRoiOrigin - iPicStride + uiCuWidth;
-        for (i = 0; i < uiCuWidth; i++)
-        {
-            piAdiTemp[1 + uiCuWidth + i] = piRoiTemp[i];
-        }
+        memcpy(&piAdiTemp[1], piRoiTemp, 2*uiCuWidth * sizeof(*piAdiTemp));
     }
     else // reference samples are partially available
     {
@@ -458,10 +433,6 @@ Void TComPattern::fillReferenceSamples(I
 
         // Fill left & below-left samples
         piRoiTemp += iPicStride;
-        if (bLMmode)
-        {
-            piRoiTemp--; // move the second left column
-        }
         piAdiLineTemp--;
         pbNeighborFlags--;
         for (j = 0; j < iNumUnits2; j++)
@@ -486,10 +457,7 @@ Void TComPattern::fillReferenceSamples(I
         {
             if (*pbNeighborFlags)
             {
-                for (i = 0; i < iUnitSize; i++)
-                {
-                    piAdiLineTemp[i] = piRoiTemp[i];
-                }
+                memcpy(piAdiLineTemp, piRoiTemp, iUnitSize * sizeof(*piAdiTemp));
             }
             piRoiTemp += iUnitSize;
             piAdiLineTemp += iUnitSize;
@@ -545,32 +513,29 @@ Void TComPattern::fillReferenceSamples(I
 
         // Copy processed samples
         piAdiLineTemp = piAdiLine + uiHeight + iUnitSize - 2;
-        for (i = 0; i < uiWidth; i++)
-        {
-            piAdiTemp[i] = piAdiLineTemp[i];
-        }
+        memcpy(piAdiTemp, piAdiLineTemp, uiWidth * sizeof(*piAdiTemp));
 
         piAdiLineTemp = piAdiLine + uiHeight - 1;
         for (i = 1; i < uiHeight; i++)
         {
-            piAdiTemp[i * uiWidth] = piAdiLineTemp[-i];
+            piAdiTemp[i * ADI_BUF_STRIDE] = piAdiLineTemp[-i];
         }
     }
 }
 
-Int* TComPattern::getAdiOrgBuf(Int /*iCuWidth*/, Int /*iCuHeight*/, Int* piAdiBuf)
+Pel* TComPattern::getAdiOrgBuf(Int /*iCuWidth*/, Int /*iCuHeight*/, Pel* piAdiBuf)
 {
     return piAdiBuf;
 }
 
-Int* TComPattern::getAdiCbBuf(Int /*iCuWidth*/, Int /*iCuHeight*/, Int* piAdiBuf)
+Pel* TComPattern::getAdiCbBuf(Int /*iCuWidth*/, Int /*iCuHeight*/, Pel* piAdiBuf)
 {
     return piAdiBuf;
 }
 
-Int* TComPattern::getAdiCrBuf(Int iCuWidth, Int iCuHeight, Int* piAdiBuf)
+Pel* TComPattern::getAdiCrBuf(Int iCuWidth, Int iCuHeight, Pel* piAdiBuf)
 {
-    return piAdiBuf + (iCuWidth * 2 + 1) * (iCuHeight * 2 + 1);
+    return piAdiBuf + ADI_BUF_STRIDE * (iCuHeight * 2 + 1);
 }
 
 /** Get pointer to reference samples for intra prediction
@@ -581,9 +546,9 @@ Int* TComPattern::getAdiCrBuf(Int iCuWid
  *
  * The prediction mode index is used to determine whether a smoothed reference sample buffer is returned.
  */
-Int* TComPattern::getPredictorPtr(UInt uiDirMode, UInt log2BlkSize, Int* piAdiBuf)
+Pel* TComPattern::getPredictorPtr(UInt uiDirMode, UInt log2BlkSize, Pel* piAdiBuf)
 {
-    Int* piSrc;
+    Pel* piSrc;
 
     assert(log2BlkSize >= 2 && log2BlkSize < 7);
     Int diff = min<Int>(abs((Int)uiDirMode - HOR_IDX), abs((Int)uiDirMode - VER_IDX));
@@ -602,7 +567,7 @@ Int* TComPattern::getPredictorPtr(UInt u
 
     if (ucFiltIdx)
     {
-        piSrc += (2 * width + 1) * (2 * height + 1);
+        piSrc += ADI_BUF_STRIDE * (2 * height + 1);
     }
 
     return piSrc;
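
The TComPattern.cpp hunks above switch the ADI reference-sample buffers from Int to Pel and index rows by the fixed ADI_BUF_STRIDE instead of the per-block uiWidth, which makes the above-border rows contiguous in memory so the element-by-element copies collapse into memcpy. A minimal sketch of the addressing scheme, assuming Pel is a 16-bit sample and ADI_BUF_STRIDE is the constant added in TComRom.h (the helper name is hypothetical):

    #include <cstring>

    typedef short Pel;
    static const int ADI_BUF_STRIDE = 2 * 64 + 1 + 15; // 144, as in TComRom.h

    // Fill the left border (one sample per row, so the stride multiply stays)
    // and the above border (contiguous, so a single memcpy replaces the loop).
    void fillBorders(Pel* adiBuf, const Pel* leftCol, const Pel* topRow, int cuSize2)
    {
        for (int i = 0; i < cuSize2; i++)
            adiBuf[(1 + i) * ADI_BUF_STRIDE] = leftCol[i];

        memcpy(&adiBuf[1], topRow, cuSize2 * sizeof(Pel));
    }
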
--- a/source/Lib/TLibCommon/TComPattern.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComPattern.h	Mon May 06 14:34:00 2013 +0530
@@ -116,11 +116,11 @@ public:
     Int   getPatternLStride()       { return m_cPatternY.m_iPatternStride; }
 
     // access functions of ADI buffers
-    Int*  getAdiOrgBuf(Int iCuWidth, Int iCuHeight, Int* piAdiBuf);
-    Int*  getAdiCbBuf(Int iCuWidth, Int iCuHeight, Int* piAdiBuf);
-    Int*  getAdiCrBuf(Int iCuWidth, Int iCuHeight, Int* piAdiBuf);
+    Pel*  getAdiOrgBuf(Int iCuWidth, Int iCuHeight, Pel* piAdiBuf);
+    Pel*  getAdiCbBuf(Int iCuWidth, Int iCuHeight, Pel* piAdiBuf);
+    Pel*  getAdiCrBuf(Int iCuWidth, Int iCuHeight, Pel* piAdiBuf);
 
-    Int*  getPredictorPtr(UInt uiDirMode, UInt uiWidthBits, Int* piAdiBuf);
+    Pel*  getPredictorPtr(UInt uiDirMode, UInt uiWidthBits, Pel* piAdiBuf);
     // -------------------------------------------------------------------------------------------------------------------
     // initialization functions
     // -------------------------------------------------------------------------------------------------------------------
@@ -144,19 +144,18 @@ public:
     Void  initAdiPattern(TComDataCU* pcCU,
                          UInt uiZorderIdxInPart,
                          UInt uiPartDepth,
-                         Int* piAdiBuf,
+                         Pel* piAdiBuf,
                          Int iOrgBufStride,
                          Int iOrgBufHeight,
                          Bool& bAbove,
                          Bool& bLeft
-                         , Bool        bLMmode = false     // using for LM chroma or not
                          );
 
     /// set chroma parameters from CU data for accessing ADI data
     Void  initAdiPatternChroma(TComDataCU* pcCU,
                                UInt        uiZorderIdxInPart,
                                UInt        uiPartDepth,
-                               Int*        piAdiBuf,
+                               Pel*        piAdiBuf,
                                Int         iOrgBufStride,
                                Int         iOrgBufHeight,
                                Bool&       bAbove,
@@ -165,7 +164,7 @@ public:
 private:
 
     /// padding of unavailable reference samples for intra prediction
-    Void  fillReferenceSamples(Int bitDepth, Pel* piRoiOrigin, Int* piAdiTemp, Bool* bNeighborFlags, Int iNumIntraNeighbor, Int iUnitSize, Int iNumUnitsInCu, Int iTotalUnits, UInt uiCuWidth, UInt uiCuHeight, UInt uiWidth, UInt uiHeight, Int iPicStride, Bool bLMmode = false);
+    Void  fillReferenceSamples(Int bitDepth, Pel* piRoiOrigin, Pel* piAdiTemp, Bool* bNeighborFlags, Int iNumIntraNeighbor, Int iUnitSize, Int iNumUnitsInCu, Int iTotalUnits, UInt uiCuWidth, UInt uiCuHeight, UInt uiWidth, UInt uiHeight, Int iPicStride);
 
     /// constrained intra prediction
     Bool  isAboveLeftAvailable(TComDataCU* pcCU, UInt uiPartIdxLT);
--- a/source/Lib/TLibCommon/TComPicYuv.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComPicYuv.h	Mon May 06 14:34:00 2013 +0530
@@ -145,7 +145,7 @@ public:
     Pel*  getBufV()     { return m_apiPicBufV; }
 
     //  Access starting position of original picture
-    Pel*  getLumaAddr()     { return m_piPicOrgY; }
+    Pel*  getLumaAddr()   { return m_piPicOrgY; }
 
     Pel*  getCbAddr()     { return m_piPicOrgU; }
 
--- a/source/Lib/TLibCommon/TComPrediction.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Mon May 06 14:34:00 2013 +0530
@@ -53,12 +53,12 @@ TComPrediction::TComPrediction()
     : m_pLumaRecBuffer(0)
     , m_iLumaRecStride(0)
 {
-    m_piYuvExt = NULL;
+    m_piPredBuf = NULL;
 }
 
 TComPrediction::~TComPrediction()
 {
-    delete[] m_piYuvExt;
+    delete[] m_piPredBuf;
 
     m_acYuvPred[0].destroy();
     m_acYuvPred[1].destroy();
@@ -84,7 +84,7 @@ TComPrediction::~TComPrediction()
 
 Void TComPrediction::initTempBuff()
 {
-    if (m_piYuvExt == NULL)
+    if (m_piPredBuf == NULL)
     {
         Int extWidth  = MAX_CU_SIZE + 16;
         Int extHeight = MAX_CU_SIZE + 1;
@@ -98,9 +98,9 @@ Void TComPrediction::initTempBuff()
             }
         }
 
-        m_iYuvExtHeight  = ((MAX_CU_SIZE + 2) << 4);
-        m_iYuvExtStride = ((MAX_CU_SIZE  + 8) << 4);
-        m_piYuvExt = new Int[m_iYuvExtStride * m_iYuvExtHeight];
+        m_iPredBufHeight  = ((MAX_CU_SIZE + 2) << 4);
+        m_iPredBufStride = ((MAX_CU_SIZE  + 8) << 4);
+        m_piPredBuf = new Pel[m_iPredBufStride * m_iPredBufHeight];
 
         // new structure
         m_acYuvPred[0].create(MAX_CU_SIZE, MAX_CU_SIZE);
@@ -122,9 +122,12 @@ Void TComPrediction::initTempBuff()
 // ====================================================================================================================
 // Public member functions
 // ====================================================================================================================
+Void xPredIntraPlanar(Pel* pSrc, Int srcStride, Pel* rpDst, Int dstStride, UInt width, UInt height);
+Void xDCPredFiltering(Pel* pSrc, Int iSrcStride, Pel*& rpDst, Int iDstStride, Int iWidth, Int iHeight);
 
 // Function for calculating DC value of the reference samples used in Intra prediction
-Pel TComPrediction::predIntraGetPredValDC(Int* pSrc, Int iSrcStride, UInt iWidth, UInt iHeight, Bool bAbove, Bool bLeft)
+#if !ENABLE_PRIMITIVES
+Pel CDECL predIntraGetPredValDC(Pel* pSrc, intptr_t iSrcStride, intptr_t iWidth, intptr_t iHeight, int bAbove, int bLeft)
 {
     Int iInd, iSum = 0;
     Pel pDcVal;
@@ -163,6 +166,7 @@ Pel TComPrediction::predIntraGetPredValD
 
     return pDcVal;
 }
+#endif
 
 // Function for deriving the angular Intra predictions
 
@@ -185,17 +189,16 @@ Pel TComPrediction::predIntraGetPredValD
  * the predicted value for the pixel is linearly interpolated from the reference samples. All reference samples are taken
  * from the extended main reference.
  */
-Void TComPrediction::xPredIntraAng(Int bitDepth, Int* pSrc, Int srcStride, Pel*& rpDst, Int dstStride, UInt width, UInt height, UInt dirMode, Bool blkAboveAvailable, Bool blkLeftAvailable, Bool bFilter)
+Void xPredIntraAng(Int bitDepth, Pel* pSrc, Int srcStride, Pel*& rpDst, Int dstStride, UInt width, UInt height, UInt dirMode, Bool blkAboveAvailable, Bool blkLeftAvailable, Bool bFilter)
 {
     Int k, l;
     Int blkSize        = width;
     Pel* pDst          = rpDst;
 
     // Map the mode index to main prediction direction and angle
-    assert(dirMode > 0); //no planar
-    Bool modeDC        = dirMode < 2;
-    Bool modeHor       = !modeDC && (dirMode < 18);
-    Bool modeVer       = !modeDC && !modeHor;
+    assert(dirMode > 1); //no planar and dc
+    Bool modeHor       = (dirMode < 18);
+    Bool modeVer       = !modeHor;
     Int intraPredAngle = modeVer ? (Int)dirMode - VER_IDX : modeHor ? -((Int)dirMode - HOR_IDX) : 0;
     Int absAng         = abs(intraPredAngle);
     Int signAng        = intraPredAngle < 0 ? -1 : 1;
@@ -207,22 +210,7 @@ Void TComPrediction::xPredIntraAng(Int b
     absAng             = angTable[absAng];
     intraPredAngle     = signAng * absAng;
 
-    // Do the DC prediction
-    if (modeDC)
-    {
-        UChar dcval = (UChar)predIntraGetPredValDC(pSrc, srcStride, width, height, blkAboveAvailable, blkLeftAvailable);
-
-        for (k = 0; k < blkSize; k++)
-        {
-            for (l = 0; l < blkSize; l++)
-            {
-                pDst[k * dstStride + l] = dcval;
-            }
-        }
-    }
-
     // Do angular predictions
-    else
     {
         Pel* refMain;
         Pel* refSide;
@@ -337,56 +325,79 @@ Void TComPrediction::xPredIntraAng(Int b
     }
 }
 
+Void xPredIntraDC(Pel* pSrc, Int srcStride, Pel*& rpDst, Int dstStride, UInt width, UInt height, Bool blkAboveAvailable, Bool blkLeftAvailable, Bool bFilter)
+{
+    Int k, l;
+    Int blkSize        = width;
+    Pel* pDst          = rpDst;
+
+    // Do the DC prediction
+#if ENABLE_PRIMITIVES
+    Pel dcval = (Pel) primitives.getdcval_p((pixel*)pSrc, srcStride, width, height, (blkAboveAvailable ? 1 : 0), (blkLeftAvailable ? 1 : 0));
+#else
+    UChar dcval = (UChar) predIntraGetPredValDC(pSrc, srcStride, width, height, blkAboveAvailable, blkLeftAvailable);
+#endif
+
+    for (k = 0; k < blkSize; k++)
+    {
+        for (l = 0; l < blkSize; l++)
+        {
+            pDst[k * dstStride + l] = dcval;
+        }
+    }
+    if (bFilter && blkAboveAvailable && blkLeftAvailable)
+    {
+        xDCPredFiltering(pSrc, srcStride, pDst, dstStride, width, height);
+    }
+}
+
 Void TComPrediction::predIntraLumaAng(TComPattern* pcTComPattern, UInt uiDirMode, Pel* piPred, UInt uiStride, Int iWidth, Int iHeight, Bool bAbove, Bool bLeft)
 {
     Pel *pDst = piPred;
-    Int *ptrSrc;
+    Pel *ptrSrc;
 
     assert(g_aucConvertToBit[iWidth] >= 0);   //   4x  4
     assert(g_aucConvertToBit[iWidth] <= 5);   // 128x128
     assert(iWidth == iHeight);
 
-    ptrSrc = pcTComPattern->getPredictorPtr(uiDirMode, g_aucConvertToBit[iWidth] + 2, m_piYuvExt);
+    ptrSrc = pcTComPattern->getPredictorPtr(uiDirMode, g_aucConvertToBit[iWidth] + 2, m_piPredBuf);
 
     // get starting pixel in block
-    Int sw = 2 * iWidth + 1;
+    Int sw = ADI_BUF_STRIDE;
+    Bool bFilter = ((iWidth <= 16) && (iHeight <= 16));
 
     // Create the prediction
     if (uiDirMode == PLANAR_IDX)
     {
         xPredIntraPlanar(ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight);
     }
+    else if (uiDirMode == DC_IDX)
+    {
+        xPredIntraDC(ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight, bAbove, bLeft, bFilter);
+    }
     else
     {
-        if ((iWidth > 16) || (iHeight > 16))
-        {
-            xPredIntraAng(g_bitDepthY, ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight, uiDirMode, bAbove, bLeft, false);
-        }
-        else
-        {
-            xPredIntraAng(g_bitDepthY, ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight, uiDirMode, bAbove, bLeft, true);
-
-            if ((uiDirMode == DC_IDX) && bAbove && bLeft)
-            {
-                xDCPredFiltering(ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight);
-            }
-        }
+        xPredIntraAng(g_bitDepthY, ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight, uiDirMode, bAbove, bLeft, bFilter);
     }
 }
 
 // Angular chroma
-Void TComPrediction::predIntraChromaAng(Int* piSrc, UInt uiDirMode, Pel* piPred, UInt uiStride, Int iWidth, Int iHeight, Bool bAbove, Bool bLeft)
+Void TComPrediction::predIntraChromaAng(Pel* piSrc, UInt uiDirMode, Pel* piPred, UInt uiStride, Int iWidth, Int iHeight, Bool bAbove, Bool bLeft)
 {
     Pel *pDst = piPred;
-    Int *ptrSrc = piSrc;
+    Pel *ptrSrc = piSrc;
 
     // get starting pixel in block
-    Int sw = 2 * iWidth + 1;
+    Int sw = ADI_BUF_STRIDE;
 
     if (uiDirMode == PLANAR_IDX)
     {
         xPredIntraPlanar(ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight);
     }
+    else if (uiDirMode == DC_IDX)
+    {
+        xPredIntraDC(ptrSrc + sw + 1, sw, pDst, uiStride, iWidth, iHeight, bAbove, bLeft, false);
+    }
     else
     {
         // Create the prediction
@@ -838,13 +849,13 @@ Void TComPrediction::getMvPredAMVP(TComD
  *
  * This function derives the prediction samples for planar mode (intra coding).
  */
-Void TComPrediction::xPredIntraPlanar(Int* pSrc, Int srcStride, Pel* rpDst, Int dstStride, UInt width, UInt height)
+Void xPredIntraPlanar(Pel* pSrc, Int srcStride, Pel* rpDst, Int dstStride, UInt width, UInt height)
 {
     assert(width == height);
 
     Int k, l, bottomLeft, topRight;
     Int horPred;
-    Int leftColumn[MAX_CU_SIZE], topRow[MAX_CU_SIZE], bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
+    Int leftColumn[MAX_CU_SIZE+1], topRow[MAX_CU_SIZE+1], bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
     UInt blkSize = width;
     UInt offset2D = width;
     UInt shift1D = g_aucConvertToBit[width] + 2;
@@ -891,7 +902,7 @@ Void TComPrediction::xPredIntraPlanar(In
  *
  * This function performs filtering left and top edges of the prediction samples for DC mode (intra coding).
  */
-Void TComPrediction::xDCPredFiltering(Int* pSrc, Int iSrcStride, Pel*& rpDst, Int iDstStride, Int iWidth, Int iHeight)
+Void xDCPredFiltering(Pel* pSrc, Int iSrcStride, Pel*& rpDst, Int iDstStride, Int iWidth, Int iHeight)
 {
     Pel* pDst = rpDst;
     Int x, y, iDstStride2, iSrcStride2;
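
The TComPrediction.cpp hunks above split DC prediction out of xPredIntraAng into a standalone xPredIntraDC, which fills the block with the mean of the available reference samples (via primitives.getdcval_p when ENABLE_PRIMITIVES is set) and applies the edge filter only for small blocks. A sketch of the reference averaging this relies on, assuming the standard HEVC DC rule; the function name and the 8-bit fallback value are illustrative:

    // pSrc points at the top-left block sample; the row above it lives at
    // pSrc[-srcStride] and the left column at pSrc[-1], as laid out by
    // initAdiPattern.
    short getDCVal(const short* pSrc, int srcStride, int width, int height,
                   bool above, bool left)
    {
        int sum = 0;
        if (above)
            for (int x = 0; x < width; x++)  sum += pSrc[x - srcStride];
        if (left)
            for (int y = 0; y < height; y++) sum += pSrc[y * srcStride - 1];

        int count = (above ? width : 0) + (left ? height : 0);
        return count ? (short)((sum + count / 2) / count)
                     : (short)(1 << 7); // mid-gray fallback, assuming 8-bit depth
    }
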
--- a/source/Lib/TLibCommon/TComPrediction.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.h	Mon May 06 14:34:00 2013 +0530
@@ -59,9 +59,9 @@ class TComPrediction : public TComWeight
 {
 protected:
 
-    Int*      m_piYuvExt;
-    Int       m_iYuvExtStride;
-    Int       m_iYuvExtHeight;
+    Pel*      m_piPredBuf;
+    Int       m_iPredBufStride;
+    Int       m_iPredBufHeight;
 
     TComYuv   m_acYuvPred[2];
     TComYuv   m_cYuvPredTemp;
@@ -75,9 +75,6 @@ protected:
     Pel*   m_pLumaRecBuffer;     ///< array for downsampled reconstructed luma sample
     Int    m_iLumaRecStride;     ///< stride of #m_pLumaRecBuffer array
 
-    Void xPredIntraAng(Int bitDepth, Int* pSrc, Int srcStride, Pel*& rpDst, Int dstStride, UInt width, UInt height, UInt dirMode, Bool blkAboveAvailable, Bool blkLeftAvailable, Bool bFilter);
-    Void xPredIntraPlanar(Int* pSrc, Int srcStride, Pel* rpDst, Int dstStride, UInt width, UInt height);
-
     // motion compensation functions
     Void xPredInterUni(TComDataCU* pcCU,                          UInt uiPartAddr,               Int iWidth, Int iHeight, RefPicList eRefPicList, TComYuv*& rpcYuvPred, Bool bi = false);
     Void xPredInterBi(TComDataCU* pcCU,                          UInt uiPartAddr,               Int iWidth, Int iHeight,                         TComYuv*& rpcYuvPred);
@@ -87,7 +84,6 @@ protected:
 
     Void xGetLLSPrediction(TComPattern* pcPattern, Int* pSrc0, Int iSrcStride, Pel* pDst0, Int iDstStride, UInt uiWidth, UInt uiHeight, UInt uiExt0);
 
-    Void xDCPredFiltering(Int* pSrc, Int iSrcStride, Pel*& rpDst, Int iDstStride, Int iWidth, Int iHeight);
     Bool xCheckIdenticalMotion(TComDataCU* pcCU, UInt PartAddr);
 
 public:
@@ -105,15 +101,14 @@ public:
 
     // Angular Intra
     Void predIntraLumaAng(TComPattern* pcTComPattern, UInt uiDirMode, Pel* piPred, UInt uiStride, Int iWidth, Int iHeight, Bool bAbove, Bool bLeft);
-    Void predIntraChromaAng(Int* piSrc, UInt uiDirMode, Pel* piPred, UInt uiStride, Int iWidth, Int iHeight, Bool bAbove, Bool bLeft);
-
-    Pel  predIntraGetPredValDC(Int* pSrc, Int iSrcStride, UInt iWidth, UInt iHeight, Bool bAbove, Bool bLeft);
+    Void predIntraChromaAng(Pel* piSrc, UInt uiDirMode, Pel* piPred, UInt uiStride, Int iWidth, Int iHeight, Bool bAbove, Bool bLeft);
 
-    Int* getPredicBuf()             { return m_piYuvExt; }
 
-    Int  getPredicBufWidth()        { return m_iYuvExtStride; }
+    Pel* getPredicBuf()             { return m_piPredBuf; }
 
-    Int  getPredicBufHeight()       { return m_iYuvExtHeight; }
+    Int  getPredicBufWidth()        { return m_iPredBufStride; }
+
+    Int  getPredicBufHeight()       { return m_iPredBufHeight; }
 };
 
 //! \}
--- a/source/Lib/TLibCommon/TComRdCost.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComRdCost.h	Mon May 06 14:34:00 2013 +0530
@@ -103,10 +103,17 @@ public:
 class DistParamSSE : public DistParam
 {
 public:
+
     Short* ptr1;
     Short* ptr2;
 };
 
+#define CALCRDCOST(uiBits, uiDistortion, m_dLambda) \
+    (Double)floor((Double)uiDistortion + (Double)((uiBits * m_dLambda + .5))) \
+
+#define CALCRDCOST_SAD(uiBits, uiDistortion, m_dLambda) \
+    (Double)floor((Double)uiDistortion + (Double)((Int)(uiBits * m_dLambda + .5) >> 16)) \
+
 /// RD cost computation class
 class TComRdCost
     : public TComRdCostWeightPrediction
@@ -123,9 +130,8 @@ private:
     Double                  m_cbDistortionWeight;
     Double                  m_crDistortionWeight;
 #endif
-    Double                  m_dLambda;
+
     Double                  m_sqrtLambda;
-    UInt                    m_uiLambdaMotionSAD;
     UInt                    m_uiLambdaMotionSSE;
     Double                  m_dFrameLambda;
 
@@ -134,6 +140,8 @@ public:
     TComMv                  m_mvPredictor;
     UInt                    m_uiCost;
     Int                     m_iCostScale;
+    Double                  m_dLambda;
+    UInt                    m_uiLambdaMotionSAD;
 
     TComRdCost();
     virtual ~TComRdCost();
@@ -230,7 +238,7 @@ public:
     UInt   getDistPart(Int bitDepth, Pel* piCur, Int iCurStride,  Short* piOrg, Int iOrgStride, UInt uiBlkWidth, UInt uiBlkHeight, DFunc eDFunc = DF_SSE);
     UInt   getDistPart(Int bitDepth, Short* piCur, Int iCurStride,  Short* piOrg, Int iOrgStride, UInt uiBlkWidth, UInt uiBlkHeight, DFunc eDFunc = DF_SSE);
 #endif
-#endif
+#endif // if WEIGHTED_CHROMA_DISTORTION
 
     UInt   getSADPart(Int bitDepth, Pel* pelCur, Int curStride,  Pel* pelOrg, Int orgStride, UInt width, UInt height);
 }; // END CLASS DEFINITION TComRdCost
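
The CALCRDCOST macros added above inline the Lagrangian cost J = D + round(lambda * B) that TComRdCost::calcRdCost computes, which is why m_dLambda and m_uiLambdaMotionSAD move to the public section: callers in TEncCu and TEncSearch now evaluate the cost directly. A hedged usage sketch with illustrative values:

    UInt   bits       = 120;
    UInt   distortion = 5000;
    Double lambda     = 16.5;

    // Expands to floor(5000 + (120 * 16.5 + 0.5)) == 6980.0
    Double cost = CALCRDCOST(bits, distortion, lambda);

The _SAD variant applies the same formula with a 16.16 fixed-point lambda, shifting the rate term right by 16 after the multiply.
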
--- a/source/Lib/TLibCommon/TComRom.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComRom.h	Mon May 06 14:34:00 2013 +0530
@@ -50,10 +50,11 @@
 // Macros
 // ====================================================================================================================
 
-#define     MAX_CU_DEPTH            7                           // log2(LCUSize)
+#define     MAX_CU_DEPTH            6                           // log2(LCUSize)
 #define     MAX_CU_SIZE             (1 << (MAX_CU_DEPTH))         // maximum allowable size of CU
 #define     MIN_PU_SIZE             4
 #define     MAX_NUM_SPU_W           (MAX_CU_SIZE / MIN_PU_SIZE)   // maximum number of SPU in horizontal line
+#define     ADI_BUF_STRIDE          (2*MAX_CU_SIZE+1+15)        // alignment to 16 bytes
 
 // ====================================================================================================================
 // Initialize / destroy functions
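
With MAX_CU_DEPTH dropped to 6, MAX_CU_SIZE is 64 and ADI_BUF_STRIDE works out to 2*64 + 1 + 15 = 144 elements: the minimal 129-sample row padded up so each row's byte size stays a multiple of 16 for aligned SIMD access. A small sketch of that arithmetic, assuming a two-byte Pel:

    // MAX_CU_SIZE    = 1 << 6         = 64
    // ADI_BUF_STRIDE = 2*64 + 1 + 15  = 144 elements
    // 144 * sizeof(Pel) = 288 bytes, a multiple of 16
    static_assert(((2 * 64 + 1 + 15) * sizeof(short)) % 16 == 0,
                  "ADI rows must stay 16-byte aligned");
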
--- a/source/Lib/TLibCommon/TComYuv.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Mon May 06 14:34:00 2013 +0530
@@ -45,6 +45,7 @@
 #include "TComYuv.h"
 #include "TComInterpolationFilter.h"
 #include "TShortYUV.h"
+#include "primitives.h"
 
 //! \ingroup TLibCommon
 //! \{
@@ -99,40 +100,49 @@ Void TComYuv::copyToPicYuv(TComPicYuv* p
 
 Void TComYuv::copyToPicLuma(TComPicYuv* pcPicYuvDst, UInt iCuAddr, UInt uiAbsZorderIdx, UInt uiPartDepth, UInt uiPartIdx)
 {
-    Int  y, iWidth, iHeight;
+    Int iWidth, iHeight;
 
     iWidth  = m_iWidth >> uiPartDepth;
     iHeight = m_iHeight >> uiPartDepth;
 
-    Pel* pSrc     = getLumaAddr(uiPartIdx, iWidth);
-    Pel* pDst     = pcPicYuvDst->getLumaAddr(iCuAddr, uiAbsZorderIdx);
+    Pel* pSrc = getLumaAddr(uiPartIdx, iWidth);
+    Pel* pDst = pcPicYuvDst->getLumaAddr(iCuAddr, uiAbsZorderIdx);
 
-    UInt  iSrcStride  = getStride();
-    UInt  iDstStride  = pcPicYuvDst->getStride();
+    UInt  iSrcStride = getStride();
+    UInt  iDstStride = pcPicYuvDst->getStride();
 
-    for (y = iHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDst, iDstStride, (pixel*)pSrc, iSrcStride);
+#else
+    for (Int y = iHeight; y != 0; y--)
     {
         ::memcpy(pDst, pSrc, sizeof(Pel) * iWidth);
         pDst += iDstStride;
         pSrc += iSrcStride;
     }
+#endif
 }
 
 Void TComYuv::copyToPicChroma(TComPicYuv* pcPicYuvDst, UInt iCuAddr, UInt uiAbsZorderIdx, UInt uiPartDepth, UInt uiPartIdx)
 {
-    Int  y, iWidth, iHeight;
+    Int iWidth, iHeight;
 
     iWidth  = m_iCWidth >> uiPartDepth;
     iHeight = m_iCHeight >> uiPartDepth;
 
-    Pel* pSrcU      = getCbAddr(uiPartIdx, iWidth);
-    Pel* pSrcV      = getCrAddr(uiPartIdx, iWidth);
-    Pel* pDstU      = pcPicYuvDst->getCbAddr(iCuAddr, uiAbsZorderIdx);
-    Pel* pDstV      = pcPicYuvDst->getCrAddr(iCuAddr, uiAbsZorderIdx);
+    Pel* pSrcU = getCbAddr(uiPartIdx, iWidth);
+    Pel* pSrcV = getCrAddr(uiPartIdx, iWidth);
+    Pel* pDstU = pcPicYuvDst->getCbAddr(iCuAddr, uiAbsZorderIdx);
+    Pel* pDstV = pcPicYuvDst->getCrAddr(iCuAddr, uiAbsZorderIdx);
 
-    UInt  iSrcStride = getCStride();
-    UInt  iDstStride = pcPicYuvDst->getCStride();
-    for (y = iHeight; y != 0; y--)
+    UInt iSrcStride = getCStride();
+    UInt iDstStride = pcPicYuvDst->getCStride();
+
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+    x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
+    for (Int y = iHeight; y != 0; y--)
     {
         ::memcpy(pDstU, pSrcU, sizeof(Pel) * (iWidth));
         ::memcpy(pDstV, pSrcV, sizeof(Pel) * (iWidth));
@@ -141,6 +151,7 @@ Void TComYuv::copyToPicChroma(TComPicYuv
         pDstU += iDstStride;
         pDstV += iDstStride;
     }
+#endif
 }
 
 Void TComYuv::copyFromPicYuv(TComPicYuv* pcPicYuvSrc, UInt iCuAddr, UInt uiAbsZorderIdx)
@@ -151,35 +162,39 @@ Void TComYuv::copyFromPicYuv(TComPicYuv*
 
 Void TComYuv::copyFromPicLuma(TComPicYuv* pcPicYuvSrc, UInt iCuAddr, UInt uiAbsZorderIdx)
 {
-    Int  y;
+    Pel* pDst = m_apiBufY;
+    Pel* pSrc = pcPicYuvSrc->getLumaAddr(iCuAddr, uiAbsZorderIdx);
 
-    Pel* pDst     = m_apiBufY;
-    Pel* pSrc     = pcPicYuvSrc->getLumaAddr(iCuAddr, uiAbsZorderIdx);
+    UInt  iDstStride = getStride();
+    UInt  iSrcStride = pcPicYuvSrc->getStride();
 
-    UInt  iDstStride  = getStride();
-    UInt  iSrcStride  = pcPicYuvSrc->getStride();
-
-    for (y = m_iHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(m_iWidth, m_iHeight, (pixel*)pDst, iDstStride, (pixel*)pSrc, iSrcStride);
+#else
+    for (Int y = m_iHeight; y != 0; y--)
     {
         ::memcpy(pDst, pSrc, sizeof(Pel) * m_iWidth);
         pDst += iDstStride;
         pSrc += iSrcStride;
     }
+#endif
 }
 
 Void TComYuv::copyFromPicChroma(TComPicYuv* pcPicYuvSrc, UInt iCuAddr, UInt uiAbsZorderIdx)
 {
-    Int  y;
-
-    Pel* pDstU      = m_apiBufU;
-    Pel* pDstV      = m_apiBufV;
-    Pel* pSrcU      = pcPicYuvSrc->getCbAddr(iCuAddr, uiAbsZorderIdx);
-    Pel* pSrcV      = pcPicYuvSrc->getCrAddr(iCuAddr, uiAbsZorderIdx);
+    Pel* pDstU = m_apiBufU;
+    Pel* pDstV = m_apiBufV;
+    Pel* pSrcU = pcPicYuvSrc->getCbAddr(iCuAddr, uiAbsZorderIdx);
+    Pel* pSrcV = pcPicYuvSrc->getCrAddr(iCuAddr, uiAbsZorderIdx);
 
     UInt  iDstStride = getCStride();
     UInt  iSrcStride = pcPicYuvSrc->getCStride();
 
-    for (y = m_iCHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(m_iCWidth, m_iCHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+    x265::primitives.cpyblock(m_iCWidth, m_iCHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
+    for (Int y = m_iCHeight; y != 0; y--)
     {
         ::memcpy(pDstU, pSrcU, sizeof(Pel) * (m_iCWidth));
         ::memcpy(pDstV, pSrcV, sizeof(Pel) * (m_iCWidth));
@@ -188,6 +203,7 @@ Void TComYuv::copyFromPicChroma(TComPicY
         pDstU += iDstStride;
         pDstV += iDstStride;
     }
+#endif
 }
 
 Void TComYuv::copyToPartYuv(TComYuv* pcYuvDst, UInt uiDstPartIdx)
@@ -198,35 +214,39 @@ Void TComYuv::copyToPartYuv(TComYuv* pcY
 
 Void TComYuv::copyToPartLuma(TComYuv* pcYuvDst, UInt uiDstPartIdx)
 {
-    Int  y;
-
-    Pel* pSrc     = m_apiBufY;
-    Pel* pDst     = pcYuvDst->getLumaAddr(uiDstPartIdx);
+    Pel* pSrc = m_apiBufY;
+    Pel* pDst = pcYuvDst->getLumaAddr(uiDstPartIdx);
 
     UInt  iSrcStride  = getStride();
     UInt  iDstStride  = pcYuvDst->getStride();
 
-    for (y = m_iHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(m_iWidth, m_iHeight, (pixel*)pDst, iDstStride, (pixel*)pSrc, iSrcStride);
+#else
+    for (Int y = m_iHeight; y != 0; y--)
     {
         ::memcpy(pDst, pSrc, sizeof(Pel) * m_iWidth);
         pDst += iDstStride;
         pSrc += iSrcStride;
     }
+#endif
 }
 
 Void TComYuv::copyToPartChroma(TComYuv* pcYuvDst, UInt uiDstPartIdx)
 {
-    Int  y;
-
-    Pel* pSrcU      = m_apiBufU;
-    Pel* pSrcV      = m_apiBufV;
-    Pel* pDstU      = pcYuvDst->getCbAddr(uiDstPartIdx);
-    Pel* pDstV      = pcYuvDst->getCrAddr(uiDstPartIdx);
+    Pel* pSrcU = m_apiBufU;
+    Pel* pSrcV = m_apiBufV;
+    Pel* pDstU = pcYuvDst->getCbAddr(uiDstPartIdx);
+    Pel* pDstV = pcYuvDst->getCrAddr(uiDstPartIdx);
 
     UInt  iSrcStride = getCStride();
     UInt  iDstStride = pcYuvDst->getCStride();
 
-    for (y = m_iCHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(m_iCWidth, m_iCHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+    x265::primitives.cpyblock(m_iCWidth, m_iCHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
+    for (Int y = m_iCHeight; y != 0; y--)
     {
         ::memcpy(pDstU, pSrcU, sizeof(Pel) * (m_iCWidth));
         ::memcpy(pDstV, pSrcV, sizeof(Pel) * (m_iCWidth));
@@ -235,6 +255,7 @@ Void TComYuv::copyToPartChroma(TComYuv* 
         pDstU += iDstStride;
         pDstV += iDstStride;
     }
+#endif
 }
 
 Void TComYuv::copyPartToYuv(TComYuv* pcYuvDst, UInt uiSrcPartIdx)
@@ -245,41 +266,45 @@ Void TComYuv::copyPartToYuv(TComYuv* pcY
 
 Void TComYuv::copyPartToLuma(TComYuv* pcYuvDst, UInt uiSrcPartIdx)
 {
-    Int  y;
+    Pel* pSrc = getLumaAddr(uiSrcPartIdx);
+    Pel* pDst = pcYuvDst->getLumaAddr(0);
 
-    Pel* pSrc     = getLumaAddr(uiSrcPartIdx);
-    Pel* pDst     = pcYuvDst->getLumaAddr(0);
-
-    UInt  iSrcStride  = getStride();
-    UInt  iDstStride  = pcYuvDst->getStride();
+    UInt iSrcStride = getStride();
+    UInt iDstStride = pcYuvDst->getStride();
 
     UInt uiHeight = pcYuvDst->getHeight();
     UInt uiWidth = pcYuvDst->getWidth();
 
-    for (y = uiHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(uiWidth, uiHeight, (pixel*)pDst, iDstStride, (pixel*)pSrc, iSrcStride);
+#else
+    for (Int y = uiHeight; y != 0; y--)
     {
         ::memcpy(pDst, pSrc, sizeof(Pel) * uiWidth);
         pDst += iDstStride;
         pSrc += iSrcStride;
     }
+#endif
 }
 
 Void TComYuv::copyPartToChroma(TComYuv* pcYuvDst, UInt uiSrcPartIdx)
 {
-    Int  y;
+    Pel* pSrcU = getCbAddr(uiSrcPartIdx);
+    Pel* pSrcV = getCrAddr(uiSrcPartIdx);
+    Pel* pDstU = pcYuvDst->getCbAddr(0);
+    Pel* pDstV = pcYuvDst->getCrAddr(0);
 
-    Pel* pSrcU      = getCbAddr(uiSrcPartIdx);
-    Pel* pSrcV      = getCrAddr(uiSrcPartIdx);
-    Pel* pDstU      = pcYuvDst->getCbAddr(0);
-    Pel* pDstV      = pcYuvDst->getCrAddr(0);
-
-    UInt  iSrcStride = getCStride();
-    UInt  iDstStride = pcYuvDst->getCStride();
+    UInt iSrcStride = getCStride();
+    UInt iDstStride = pcYuvDst->getCStride();
 
     UInt uiCHeight = pcYuvDst->getCHeight();
     UInt uiCWidth = pcYuvDst->getCWidth();
 
-    for (y = uiCHeight; y != 0; y--)
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(uiCWidth, uiCHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+    x265::primitives.cpyblock(uiCWidth, uiCHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
+    for (Int y = uiCHeight; y != 0; y--)
     {
         ::memcpy(pDstU, pSrcU, sizeof(Pel) * (uiCWidth));
         ::memcpy(pDstV, pSrcV, sizeof(Pel) * (uiCWidth));
@@ -288,6 +313,7 @@ Void TComYuv::copyPartToChroma(TComYuv* 
         pDstU += iDstStride;
         pDstV += iDstStride;
     }
+#endif
 }
 
 Void TComYuv::copyPartToPartYuv(TComYuv* pcYuvDst, UInt uiPartIdx, UInt iWidth, UInt iHeight)
@@ -315,24 +341,32 @@ Void TComYuv::copyPartToPartLuma(TComYuv
         return;
     }
 
-    UInt  iSrcStride = getStride();
-    UInt  iDstStride = pcYuvDst->getStride();
+    UInt iSrcStride = getStride();
+    UInt iDstStride = pcYuvDst->getStride();
+
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDst, iDstStride, (pixel*)pSrc, iSrcStride);
+#else
     for (UInt y = iHeight; y != 0; y--)
     {
         ::memcpy(pDst, pSrc, iWidth * sizeof(Pel));
         pSrc += iSrcStride;
         pDst += iDstStride;
     }
+#endif
 }
 
-
 Void TComYuv::copyPartToPartLuma(TShortYUV* pcYuvDst, UInt uiPartIdx, UInt iWidth, UInt iHeight)
 {
-    Pel* pSrc =           getLumaAddr(uiPartIdx);
+    Pel*   pSrc =           getLumaAddr(uiPartIdx);
     Short* pDst = pcYuvDst->getLumaAddr(uiPartIdx);
 
     UInt  iSrcStride = getStride();
     UInt  iDstStride = pcYuvDst->getStride();
+
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock_s_p(iWidth, iHeight, pDst, iDstStride, (pixel*)pSrc, iSrcStride);
+#else
     for (UInt y = iHeight; y != 0; y--)
     {
         for(UInt x = 0; x < iWidth; x++)
@@ -341,6 +375,7 @@ Void TComYuv::copyPartToPartLuma(TShortY
         pSrc += iSrcStride;
         pDst += iDstStride;
     }
+#endif
 }
 
 
@@ -360,6 +395,11 @@ Void TComYuv::copyPartToPartChroma(TComY
 
     UInt   iSrcStride = getCStride();
     UInt   iDstStride = pcYuvDst->getCStride();
+
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+    x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
     for (UInt y = iHeight; y != 0; y--)
     {
         ::memcpy(pDstU, pSrcU, iWidth * sizeof(Pel));
@@ -369,17 +409,23 @@ Void TComYuv::copyPartToPartChroma(TComY
         pDstU += iDstStride;
         pDstV += iDstStride;
     }
+#endif
 }
 
 Void TComYuv::copyPartToPartChroma(TShortYUV* pcYuvDst, UInt uiPartIdx, UInt iWidth, UInt iHeight)
 {
-    Pel*  pSrcU =           getCbAddr(uiPartIdx);
-    Pel*  pSrcV =           getCrAddr(uiPartIdx);
+    Pel*    pSrcU =           getCbAddr(uiPartIdx);
+    Pel*    pSrcV =           getCrAddr(uiPartIdx);
     Short*  pDstU = pcYuvDst->getCbAddr(uiPartIdx);
     Short*  pDstV = pcYuvDst->getCrAddr(uiPartIdx);
 
     UInt   iSrcStride = getCStride();
     UInt   iDstStride = pcYuvDst->getCStride();
+
+#if ENABLE_PRIMITIVES
+    x265::primitives.cpyblock_s_p(iWidth, iHeight, pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+    x265::primitives.cpyblock_s_p(iWidth, iHeight, pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
     for (UInt y = iHeight; y != 0; y--)
     {
         for(UInt x = 0; x < iWidth; x++)
@@ -392,6 +438,7 @@ Void TComYuv::copyPartToPartChroma(TShor
         pDstU += iDstStride;
         pDstV += iDstStride;
     }
+#endif
 }
 
 
@@ -407,12 +454,16 @@ Void TComYuv::copyPartToPartChroma(TComY
         }
         UInt   iSrcStride = getCStride();
         UInt   iDstStride = pcYuvDst->getCStride();
+#if ENABLE_PRIMITIVES
+        x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+#else
         for (UInt y = iHeight; y != 0; y--)
         {
             ::memcpy(pDstU, pSrcU, iWidth * sizeof(Pel));
             pSrcU += iSrcStride;
             pDstU += iDstStride;
         }
+#endif
     }
     else if (chromaId == 1)
     {
@@ -424,12 +475,16 @@ Void TComYuv::copyPartToPartChroma(TComY
         }
         UInt   iSrcStride = getCStride();
         UInt   iDstStride = pcYuvDst->getCStride();
+#if ENABLE_PRIMITIVES
+        x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
         for (UInt y = iHeight; y != 0; y--)
         {
             ::memcpy(pDstV, pSrcV, iWidth * sizeof(Pel));
             pSrcV += iSrcStride;
             pDstV += iDstStride;
         }
+#endif
     }
     else
     {
@@ -446,6 +501,10 @@ Void TComYuv::copyPartToPartChroma(TComY
         }
         UInt   iSrcStride = getCStride();
         UInt   iDstStride = pcYuvDst->getCStride();
+#if ENABLE_PRIMITIVES
+        x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+        x265::primitives.cpyblock(iWidth, iHeight, (pixel*)pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
         for (UInt y = iHeight; y != 0; y--)
         {
             ::memcpy(pDstU, pSrcU, iWidth * sizeof(Pel));
@@ -455,6 +514,7 @@ Void TComYuv::copyPartToPartChroma(TComY
             pDstU += iDstStride;
             pDstV += iDstStride;
         }
+#endif
     }
 }
 
@@ -462,11 +522,15 @@ Void TComYuv::copyPartToPartChroma(TShor
 {
     if (chromaId == 0)
     {
-        Pel*  pSrcU =           getCbAddr(uiPartIdx);
+        Pel*    pSrcU =           getCbAddr(uiPartIdx);
         Short*  pDstU = pcYuvDst->getCbAddr(uiPartIdx);
         
         UInt   iSrcStride = getCStride();
         UInt   iDstStride = pcYuvDst->getCStride();
+
+#if ENABLE_PRIMITIVES
+        x265::primitives.cpyblock_s_p(iWidth, iHeight, pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+#else
         for (UInt y = iHeight; y != 0; y--)
         {
             for(int x = 0; x < iWidth; x++)
@@ -475,6 +539,7 @@ Void TComYuv::copyPartToPartChroma(TShor
             pSrcU += iSrcStride;
             pDstU += iDstStride;
         }
+#endif
     }
     else if (chromaId == 1)
     {
@@ -483,6 +548,10 @@ Void TComYuv::copyPartToPartChroma(TShor
        
         UInt   iSrcStride = getCStride();
         UInt   iDstStride = pcYuvDst->getCStride();
+
+#if ENABLE_PRIMITIVES
+        x265::primitives.cpyblock_s_p(iWidth, iHeight, pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
         for (UInt y = iHeight; y != 0; y--)
         {
             for(int x = 0; x < iWidth; x++)
@@ -491,16 +560,22 @@ Void TComYuv::copyPartToPartChroma(TShor
             pSrcV += iSrcStride;
             pDstV += iDstStride;
         }
+#endif
     }
     else
     {
-        Pel*  pSrcU =           getCbAddr(uiPartIdx);
-        Pel*  pSrcV =           getCrAddr(uiPartIdx);
+        Pel*    pSrcU =           getCbAddr(uiPartIdx);
+        Pel*    pSrcV =           getCrAddr(uiPartIdx);
         Short*  pDstU = pcYuvDst->getCbAddr(uiPartIdx);
         Short*  pDstV = pcYuvDst->getCrAddr(uiPartIdx);
 
         UInt   iSrcStride = getCStride();
         UInt   iDstStride = pcYuvDst->getCStride();
+
+#if ENABLE_PRIMITIVES
+        x265::primitives.cpyblock_s_p(iWidth, iHeight, pDstU, iDstStride, (pixel*)pSrcU, iSrcStride);
+        x265::primitives.cpyblock_s_p(iWidth, iHeight, pDstV, iDstStride, (pixel*)pSrcV, iSrcStride);
+#else
         for (UInt y = iHeight; y != 0; y--)
         {
             for(int x = 0; x < iWidth; x++)
@@ -513,6 +588,7 @@ Void TComYuv::copyPartToPartChroma(TShor
             pDstU += iDstStride;
             pDstV += iDstStride;
         }
+#endif
     }
 }
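
Every copy loop in TComYuv.cpp is now routed through x265::primitives.cpyblock / cpyblock_s_p when ENABLE_PRIMITIVES is defined, keeping the scalar memcpy loops as the reference fallback. A sketch of that dispatch pattern, assuming a function-pointer table like x265's primitives struct; everything here except the cpyblock name is illustrative:

    #include <cstdint>
    #include <cstring>

    typedef short pixel_t;
    typedef void (*cpyblock_t)(int w, int h, pixel_t* dst, intptr_t dstStride,
                               const pixel_t* src, intptr_t srcStride);

    // Reference C path: one memcpy per row, exactly like the #else branches.
    static void cpyblock_c(int w, int h, pixel_t* dst, intptr_t dstStride,
                           const pixel_t* src, intptr_t srcStride)
    {
        for (int y = 0; y < h; y++)
            memcpy(dst + y * dstStride, src + y * srcStride, w * sizeof(pixel_t));
    }

    struct Primitives { cpyblock_t cpyblock; };
    static Primitives primitives = { cpyblock_c }; // SIMD init may overwrite this
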
 
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Mon May 06 14:34:00 2013 +0530
@@ -678,7 +678,7 @@ Void TEncCu::xCompressCU(TComDataCU*& rp
                 {
                     UInt uiRawBits = (2 * g_bitDepthY + g_bitDepthC) * rpcBestCU->getWidth(0) * rpcBestCU->getHeight(0) / 2;
                     UInt uiBestBits = rpcBestCU->getTotalBits();
-                    if ((uiBestBits > uiRawBits) || (rpcBestCU->getTotalCost() > m_pcRdCost->calcRdCost(uiRawBits, 0)))
+                    if ((uiBestBits > uiRawBits) || (rpcBestCU->getTotalCost() > CALCRDCOST(uiRawBits, 0, m_pcRdCost->m_dLambda)))
                     {
                         xCheckIntraPCM(rpcBestCU, rpcTempCU);
                         rpcTempCU->initEstData(uiDepth, iQP);
@@ -698,7 +698,7 @@ Void TEncCu::xCompressCU(TComDataCU*& rp
         {
             rpcBestCU->getTotalBins() += ((TEncBinCABAC*)((TEncSbac*)m_pcEntropyCoder->m_pcEntropyCoderIf)->getEncBinIf())->getBinsCoded();
         }
-        rpcBestCU->getTotalCost()  = m_pcRdCost->calcRdCost(rpcBestCU->getTotalBits(), rpcBestCU->getTotalDistortion());
+        rpcBestCU->getTotalCost()  = CALCRDCOST(rpcBestCU->getTotalBits(), rpcBestCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 
         // accumulate statistics for early skip
         if (m_pcEncCfg->getUseFastEnc())
@@ -833,7 +833,7 @@ Void TEncCu::xCompressCU(TComDataCU*& rp
                     rpcTempCU->getTotalBins() += ((TEncBinCABAC*)((TEncSbac*)m_pcEntropyCoder->m_pcEntropyCoderIf)->getEncBinIf())->getBinsCoded();
                 }
             }
-            rpcTempCU->getTotalCost()  = m_pcRdCost->calcRdCost(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion());
+            rpcTempCU->getTotalCost()  = CALCRDCOST(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 
             if ((g_uiMaxCUWidth >> uiDepth) == rpcTempCU->getSlice()->getPPS()->getMinCuDQPSize() && rpcTempCU->getSlice()->getPPS()->getUseDQP())
             {
@@ -867,7 +867,7 @@ Void TEncCu::xCompressCU(TComDataCU*& rp
                     {
                         rpcTempCU->getTotalBins() += ((TEncBinCABAC*)((TEncSbac*)m_pcEntropyCoder->m_pcEntropyCoderIf)->getEncBinIf())->getBinsCoded();
                     }
-                    rpcTempCU->getTotalCost()  = m_pcRdCost->calcRdCost(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion());
+                    rpcTempCU->getTotalCost()  = CALCRDCOST(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 #endif
 
                     Bool foundNonZeroCbf = false;
@@ -1290,7 +1290,7 @@ Void TEncCu::xCheckRDCostInter(TComDataC
     }
 
     m_pcPredSearch->encodeResAndCalcRdInterCU(rpcTempCU, m_ppcOrigYuv[uhDepth], m_ppcPredYuvTemp[uhDepth], m_ppcResiYuvTemp[uhDepth], m_ppcResiYuvBest[uhDepth], m_ppcRecoYuvTemp[uhDepth], false);
-    rpcTempCU->getTotalCost()  = m_pcRdCost->calcRdCost(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion());
+    rpcTempCU->getTotalCost()  = CALCRDCOST(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 
     xCheckDQP(rpcTempCU);
     xCheckBestMode(rpcBestCU, rpcTempCU, uhDepth);
@@ -1299,6 +1299,7 @@ Void TEncCu::xCheckRDCostInter(TComDataC
 Void TEncCu::xCheckRDCostIntra(TComDataCU*& rpcBestCU, TComDataCU*& rpcTempCU, PartSize eSize)
 {
     UInt uiDepth = rpcTempCU->getDepth(0);
+
     PPAScopeEvent(TEncCU_xCheckRDCostIntra);
 
     rpcTempCU->setSkipFlagSubParts(false, 0, uiDepth);
@@ -1342,7 +1343,7 @@ Void TEncCu::xCheckRDCostIntra(TComDataC
     {
         rpcTempCU->getTotalBins() = ((TEncBinCABAC*)((TEncSbac*)m_pcEntropyCoder->m_pcEntropyCoderIf)->getEncBinIf())->getBinsCoded();
     }
-    rpcTempCU->getTotalCost() = m_pcRdCost->calcRdCost(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion());
+    rpcTempCU->getTotalCost() = CALCRDCOST(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 
     xCheckDQP(rpcTempCU);
     xCheckBestMode(rpcBestCU, rpcTempCU, uiDepth);
@@ -1358,6 +1359,7 @@ Void TEncCu::xCheckRDCostIntra(TComDataC
 Void TEncCu::xCheckIntraPCM(TComDataCU*& rpcBestCU, TComDataCU*& rpcTempCU)
 {
     UInt uiDepth = rpcTempCU->getDepth(0);
+
     PPAScopeEvent(TEncCU_xCheckIntraPCM);
 
     rpcTempCU->setSkipFlagSubParts(false, 0, uiDepth);
@@ -1390,7 +1392,7 @@ Void TEncCu::xCheckIntraPCM(TComDataCU*&
     {
         rpcTempCU->getTotalBins() = ((TEncBinCABAC*)((TEncSbac*)m_pcEntropyCoder->m_pcEntropyCoderIf)->getEncBinIf())->getBinsCoded();
     }
-    rpcTempCU->getTotalCost() = m_pcRdCost->calcRdCost(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion());
+    rpcTempCU->getTotalCost() = CALCRDCOST(rpcTempCU->getTotalBits(), rpcTempCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 
     xCheckDQP(rpcTempCU);
     xCheckBestMode(rpcBestCU, rpcTempCU, uiDepth);
@@ -1445,7 +1447,7 @@ Void TEncCu::xCheckDQP(TComDataCU* pcCU)
             {
                 pcCU->getTotalBins() += ((TEncBinCABAC*)((TEncSbac*)m_pcEntropyCoder->m_pcEntropyCoderIf)->getEncBinIf())->getBinsCoded();
             }
-            pcCU->getTotalCost() = m_pcRdCost->calcRdCost(pcCU->getTotalBits(), pcCU->getTotalDistortion());
+            pcCU->getTotalCost() = CALCRDCOST(pcCU->getTotalBits(), pcCU->getTotalDistortion(), m_pcRdCost->m_dLambda);
 #endif
         }
         else
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Mon May 06 14:34:00 2013 +0530
@@ -336,7 +336,7 @@ void TEncSearch::init(TEncCfg*     pcEnc
 
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(m_cDistParam.iCols, m_cDistParam.iRows >> iSubShift);
-    uiSad = (x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) << iSubShift) >>
+    uiSad = (x265::primitives.sad[part]((pixel*)m_fencbuf, FENC_STRIDE * iSubStep, (pixel*)piCur, iStrideCur) << iSubShift) >>
         DISTORTION_PRECISION_ADJUSTMENT(m_cDistParam.bitDepth - 8);
     x264_cpu_emms();
 #else
@@ -1061,7 +1061,7 @@ Void TEncSearch::xIntraCodingLumaBlk(TCo
     if (default0Save1Load2 != 2)
     {
         pcCU->getPattern()->initPattern(pcCU, uiTrDepth, uiAbsPartIdx);
-        pcCU->getPattern()->initAdiPattern(pcCU, uiAbsPartIdx, uiTrDepth, m_piYuvExt, m_iYuvExtStride, m_iYuvExtHeight, bAboveAvail, bLeftAvail);
+        pcCU->getPattern()->initAdiPattern(pcCU, uiAbsPartIdx, uiTrDepth, m_piPredBuf, m_iPredBufStride, m_iPredBufHeight, bAboveAvail, bLeftAvail);
         //===== get prediction signal =====
         predIntraLumaAng(pcCU->getPattern(), uiLumaPredMode, piPred, uiStride, uiWidth, uiHeight, bAboveAvail, bLeftAvail);
         // save prediction
@@ -1242,8 +1242,8 @@ Void TEncSearch::xIntraCodingChromaBlk(T
     {
         pcCU->getPattern()->initPattern(pcCU, uiTrDepth, uiAbsPartIdx);
 
-        pcCU->getPattern()->initAdiPatternChroma(pcCU, uiAbsPartIdx, uiTrDepth, m_piYuvExt, m_iYuvExtStride, m_iYuvExtHeight, bAboveAvail, bLeftAvail);
-        Int*  pPatChroma  = (uiChromaId > 0 ? pcCU->getPattern()->getAdiCrBuf(uiWidth, uiHeight, m_piYuvExt) : pcCU->getPattern()->getAdiCbBuf(uiWidth, uiHeight, m_piYuvExt));
+        pcCU->getPattern()->initAdiPatternChroma(pcCU, uiAbsPartIdx, uiTrDepth, m_piPredBuf, m_iPredBufStride, m_iPredBufHeight, bAboveAvail, bLeftAvail);
+        Pel*  pPatChroma  = (uiChromaId > 0 ? pcCU->getPattern()->getAdiCrBuf(uiWidth, uiHeight, m_piPredBuf) : pcCU->getPattern()->getAdiCbBuf(uiWidth, uiHeight, m_piPredBuf));
 
         //===== get prediction signal =====
         {
@@ -1504,7 +1504,7 @@ Void TEncSearch::xRecurIntraCodingQT(TCo
                 else
                 {
                     UInt uiSingleBits = xGetIntraBitsQT(pcCU, uiTrDepth, uiAbsPartIdx, true, !bLumaOnly, false);
-                    singleCostTmp     = m_pcRdCost->calcRdCost(uiSingleBits, singleDistYTmp + singleDistCTmp);
+                    singleCostTmp     = CALCRDCOST(uiSingleBits, singleDistYTmp + singleDistCTmp, m_pcRdCost->m_dLambda);
                 }
 
                 if (singleCostTmp < dSingleCost)
@@ -1604,7 +1604,7 @@ Void TEncSearch::xRecurIntraCodingQT(TCo
             {
                 uiSingleBits = uiSingleBits * 4;
             }
-            dSingleCost       = m_pcRdCost->calcRdCost(uiSingleBits, uiSingleDistY + uiSingleDistC);
+            dSingleCost       = CALCRDCOST(uiSingleBits, uiSingleDistY + uiSingleDistC, m_pcRdCost->m_dLambda);
         }
     }
 
@@ -1670,7 +1670,7 @@ Void TEncSearch::xRecurIntraCodingQT(TCo
         }
         //----- determine rate and r-d cost -----
         UInt uiSplitBits = xGetIntraBitsQT(pcCU, uiTrDepth, uiAbsPartIdx, true, !bLumaOnly, false);
-        dSplitCost       = m_pcRdCost->calcRdCost(uiSplitBits, uiSplitDistY + uiSplitDistC);
+        dSplitCost       = CALCRDCOST(uiSplitBits, uiSplitDistY + uiSplitDistC, m_pcRdCost->m_dLambda);
 
         //===== compare and set best =====
         if (dSplitCost < dSingleCost)
@@ -2238,7 +2238,7 @@ Void TEncSearch::xRecurIntraChromaCoding
                     else
                     {
                         UInt bitsTmp = xGetIntraBitsQTChroma(pcCU, uiTrDepth, uiAbsPartIdx, chromaId + 2, false);
-                        singleCostTmp  = m_pcRdCost->calcRdCost(bitsTmp, singleDistCTmp);
+                        singleCostTmp  = CALCRDCOST(bitsTmp, singleDistCTmp, m_pcRdCost->m_dLambda);
                     }
 
                     if (singleCostTmp < dSingleCost)
@@ -2389,9 +2389,9 @@ Void TEncSearch::preestChromaPredMode(TC
     Bool  bLeftAvail  = false;
 
     pcCU->getPattern()->initPattern(pcCU, 0, 0);
-    pcCU->getPattern()->initAdiPatternChroma(pcCU, 0, 0, m_piYuvExt, m_iYuvExtStride, m_iYuvExtHeight, bAboveAvail, bLeftAvail);
-    Int*  pPatChromaU = pcCU->getPattern()->getAdiCbBuf(uiWidth, uiHeight, m_piYuvExt);
-    Int*  pPatChromaV = pcCU->getPattern()->getAdiCrBuf(uiWidth, uiHeight, m_piYuvExt);
+    pcCU->getPattern()->initAdiPatternChroma(pcCU, 0, 0, m_piPredBuf, m_iPredBufStride, m_iPredBufHeight, bAboveAvail, bLeftAvail);
+    Pel*  pPatChromaU = pcCU->getPattern()->getAdiCbBuf(uiWidth, uiHeight, m_piPredBuf);
+    Pel*  pPatChromaV = pcCU->getPattern()->getAdiCrBuf(uiWidth, uiHeight, m_piPredBuf);
 
     //===== get best prediction modes (using SAD) =====
     UInt  uiMinMode   = 0;
@@ -2457,7 +2457,7 @@ Void TEncSearch::estIntraPredQT(TComData
         Bool bAboveAvail = false;
         Bool bLeftAvail  = false;
         pcCU->getPattern()->initPattern(pcCU, uiInitTrDepth, uiPartOffset);
-        pcCU->getPattern()->initAdiPattern(pcCU, uiPartOffset, uiInitTrDepth, m_piYuvExt, m_iYuvExtStride, m_iYuvExtHeight, bAboveAvail, bLeftAvail);
+        pcCU->getPattern()->initAdiPattern(pcCU, uiPartOffset, uiInitTrDepth, m_piPredBuf, m_iPredBufStride, m_iPredBufHeight, bAboveAvail, bLeftAvail);
 
         //===== determine set of modes to be tested (using prediction signal only) =====
         Int numModesAvailable     = 35; //total number of Intra modes
@@ -2518,6 +2518,7 @@ Void TEncSearch::estIntraPredQT(TComData
                     uiRdModeList[numModesForFullRD++] = mostProbableMode;
                 }
             }
+
 #endif // FAST_UDI_USE_MPM
         }
         else
@@ -2791,7 +2792,7 @@ Void TEncSearch::estIntraPredChromaQT(TC
             m_pcRDGoOnSbacCoder->load(m_pppcRDSbacCoder[uiDepth][CI_CURR_BEST]);
         }
         UInt    uiBits = xGetIntraBitsQT(pcCU,   0, 0, false, true, false);
-        Double  dCost  = m_pcRdCost->calcRdCost(uiBits, uiDist);
+        Double  dCost  = CALCRDCOST(uiBits, uiDist, m_pcRdCost->m_dLambda);
 
         //----- compare -----
         if (dCost < dBestCost)
@@ -2978,7 +2979,7 @@ Void TEncSearch::IPCMSearch(TComDataCU* 
     xEncIntraHeader(pcCU, uiDepth, uiAbsPartIdx, true, false);
     uiBits = m_pcEntropyCoder->getNumberOfWrittenBits();
 
-    dCost = m_pcRdCost->calcRdCost(uiBits, uiDistortion);
+    dCost = CALCRDCOST(uiBits, uiDistortion, m_pcRdCost->m_dLambda);
 
     if (m_bUseSBACRD)
     {
@@ -3167,16 +3168,16 @@ Void TEncSearch::predInterSearch(TComDat
     Int           bestBiPMvpL1 = 0;
     UInt          biPDistTemp = MAX_INT;
 
+    /* TODO: this setup could be hoisted as high up as TEncSlice::compressSlice() */
+#if ENABLE_PRIMITIVES
+    TComPicYuv *fenc = pcCU->getSlice()->getPic()->getPicYuvOrg();
+    m_me.setSourcePlanes((pixel*)fenc->getLumaAddr(), (pixel*)fenc->getCbAddr(), (pixel*)fenc->getCrAddr(), fenc->getStride(), fenc->getCStride());
+#endif
+
     TComMvField cMvFieldNeighbours[MRG_MAX_NUM_CANDS << 1]; // double length for mv of both lists
     UChar uhInterDirNeighbours[MRG_MAX_NUM_CANDS];
     Int numValidMergeCand = 0;
 
-    m_me.setSourcePlanes((pixel*)pcOrgYuv->getLumaAddr(),
-                         (pixel*)pcOrgYuv->getCbAddr(),
-                         (pixel*)pcOrgYuv->getCrAddr(),
-                         pcOrgYuv->getWidth(),
-                         pcOrgYuv->getCWidth());
-
     for (Int iPartIdx = 0; iPartIdx < iNumPart; iPartIdx++)
     {
         UInt          uiCost[2] = { MAX_UINT, MAX_UINT };
@@ -3203,8 +3204,11 @@ Void TEncSearch::predInterSearch(TComDat
         xGetBlkBits(ePartSize, pcCU->getSlice()->isInterP(), iPartIdx, uiLastMode, uiMbBits);
 
         pcCU->getPartIndexAndSize(iPartIdx, uiPartAddr, iRoiWidth, iRoiHeight);
+
 #if ENABLE_PRIMITIVES
-        m_me.setSourcePU(uiPartAddr, iRoiWidth, iRoiHeight);
+        Pel* PU = fenc->getLumaAddr(pcCU->getAddr(), pcCU->getZorderIdxInCU() + uiPartAddr);
+        m_me.setSourcePU(PU - fenc->getLumaAddr(), iRoiWidth, iRoiHeight);
+        x265::primitives.cpyblock(iRoiWidth, iRoiHeight, m_fencbuf, FENC_STRIDE, (pixel*)PU, fenc->getStride());
 #endif
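
The cpyblock call above dispatches through the primitives table using the blockcpy_p_p signature declared in primitives.h (see that file later in this changeset). A minimal sketch of the reference behaviour being assumed here, with the runtime dispatch free to substitute a vectorized routine:

    // Illustrative reference implementation of a blockcpy_p_p primitive:
    // copy a bx-by-by pixel block between planes of different stride.
    // (::memcpy requires <cstring>.)
    static void blockcopy_ref(int bx, int by, pixel *dst, intptr_t dstride,
                              pixel *src, intptr_t sstride)
    {
        for (int y = 0; y < by; y++)
        {
            ::memcpy(dst, src, bx * sizeof(pixel));  // one row per iteration
            dst += dstride;
            src += sstride;
        }
    }

Staging each source PU in m_fencbuf at the fixed FENC_STRIDE keeps the original pixels in one small, cache-friendly buffer for the many SAD/SATD evaluations that follow.
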
         Bool bTestNormalMC = true;
 
@@ -3832,7 +3836,7 @@ UInt TEncSearch::xGetTemplateCost(TComDa
 #else
     uiCost = m_pcRdCost->getDistPart(g_bitDepthY, pcTemplateCand->getLumaAddr(uiPartAddr), pcTemplateCand->getStride(), pcOrgYuv->getLumaAddr(uiPartAddr), pcOrgYuv->getStride(), iSizeX, iSizeY, DF_SAD);
 #endif
-    uiCost = (UInt)m_pcRdCost->calcRdCost(m_auiMVPIdxCost[iMVPIdx][iMVPNum], uiCost, false, DF_SAD);
+    uiCost = (UInt)CALCRDCOST_SAD(m_auiMVPIdxCost[iMVPIdx][iMVPNum], uiCost, (Double)m_pcRdCost->m_uiLambdaMotionSAD);
     return uiCost;
 }
 
@@ -4314,7 +4318,7 @@ Void TEncSearch::encodeResAndCalcRdInter
         uiBits = m_pcEntropyCoder->getNumberOfWrittenBits();
         pcCU->getTotalBits()       = uiBits;
         pcCU->getTotalDistortion() = uiDistortion;
-        pcCU->getTotalCost()       = m_pcRdCost->calcRdCost(uiBits, uiDistortion);
+        pcCU->getTotalCost()       = CALCRDCOST(uiBits, uiDistortion, m_pcRdCost->m_dLambda);
 
         if (m_bUseSBACRD)
             m_pcRDGoOnSbacCoder->store(m_pppcRDSbacCoder[pcCU->getDepth(0)][CI_TEMP_BEST]);
@@ -4365,7 +4369,7 @@ Void TEncSearch::encodeResAndCalcRdInter
         m_pcEntropyCoder->resetBits();
         m_pcEntropyCoder->encodeQtRootCbfZero(pcCU);
         UInt zeroResiBits = m_pcEntropyCoder->getNumberOfWrittenBits();
-        Double dZeroCost = m_pcRdCost->calcRdCost(zeroResiBits, uiZeroDistortion);
+        Double dZeroCost = CALCRDCOST(zeroResiBits, uiZeroDistortion, m_pcRdCost->m_dLambda);
         if (pcCU->isLosslessCoded(0))
         {
             dZeroCost = dCost + 1;
@@ -4414,7 +4418,7 @@ Void TEncSearch::encodeResAndCalcRdInter
             xAddSymbolBitsInter(pcCU, 0, 0, uiBits, pDummy, NULL, pDummy);
         }
 
-        Double dExactCost = m_pcRdCost->calcRdCost(uiBits, uiDistortion);
+        Double dExactCost = CALCRDCOST(uiBits, uiDistortion, m_pcRdCost->m_dLambda);
         dCost = dExactCost;
 
         if (dCost < dCostBest)
@@ -4493,7 +4497,7 @@ Void TEncSearch::encodeResAndCalcRdInter
         + m_pcRdCost->getDistPart(g_bitDepthC, rpcYuvRec->getCbAddr(),   rpcYuvRec->getCStride(), pcYuvOrg->getCbAddr(),   pcYuvOrg->getCStride(), uiWidth >> 1, uiHeight >> 1)
         + m_pcRdCost->getDistPart(g_bitDepthC, rpcYuvRec->getCrAddr(),   rpcYuvRec->getCStride(), pcYuvOrg->getCrAddr(),   pcYuvOrg->getCStride(), uiWidth >> 1, uiHeight >> 1);
 #endif
-    dCostBest = m_pcRdCost->calcRdCost(uiBitsBest, uiDistortionBest);
+    dCostBest = CALCRDCOST(uiBitsBest, uiDistortionBest, m_pcRdCost->m_dLambda);
 
     pcCU->getTotalBits()       = uiBitsBest;
     pcCU->getTotalDistortion() = uiDistortionBest;
@@ -4666,7 +4670,7 @@ Void TEncSearch::xEstimateResidualQT(TCo
         const UInt uiNumSamplesChro = 1 << (uiLog2TrSizeC << 1);
 
        ::memset(m_pTempPel, 0, sizeof(Pel) * uiNumSamplesLuma); // not necessarily needed inside the recursion (only at the beginning)
-        
+
        UInt uiDistY = m_pcRdCost->getDistPart(g_bitDepthY, m_pTempPel, trWidth, pcResi->getLumaAddr(absTUPartIdx), pcResi->getStride(), trWidth, trHeight); // initialized with zero residual distortion
 
         if (puiZeroDist)
@@ -4683,7 +4687,6 @@ Void TEncSearch::xEstimateResidualQT(TCo
             assert(scalingListType < 6);
             m_pcTrQuant->invtransformNxN(pcCU->getCUTransquantBypass(uiAbsPartIdx), TEXT_LUMA, REG_DCT, pcResiCurrY, m_pcQTTempTComYuv[uiQTTempAccessLayer].getStride(),  pcCoeffCurrY, trWidth, trHeight, scalingListType); //this is for inter mode only
 
-            
             const UInt uiNonzeroDistY = m_pcRdCost->getDistPart(g_bitDepthY, m_pcQTTempTComYuv[uiQTTempAccessLayer].getLumaAddr(absTUPartIdx), m_pcQTTempTComYuv[uiQTTempAccessLayer].getStride(),
                                                                 pcResi->getLumaAddr(absTUPartIdx), pcResi->getStride(), trWidth, trHeight);
 
@@ -4693,11 +4696,11 @@ Void TEncSearch::xEstimateResidualQT(TCo
             }
             else
             {
-                const Double singleCostY = m_pcRdCost->calcRdCost(uiSingleBitsY, uiNonzeroDistY);
+                const Double singleCostY = CALCRDCOST(uiSingleBitsY, uiNonzeroDistY, m_pcRdCost->m_dLambda);
                 m_pcEntropyCoder->resetBits();
                 m_pcEntropyCoder->encodeQtCbfZero(pcCU, TEXT_LUMA,     uiTrMode);
                 const UInt uiNullBitsY   = m_pcEntropyCoder->getNumberOfWrittenBits();
-                const Double nullCostY   = m_pcRdCost->calcRdCost(uiNullBitsY, uiDistY);
+                const Double nullCostY   = CALCRDCOST(uiNullBitsY, uiDistY, m_pcRdCost->m_dLambda);
                 if (nullCostY < singleCostY)
                 {
                     uiAbsSumY = 0;
@@ -4722,7 +4725,7 @@ Void TEncSearch::xEstimateResidualQT(TCo
             m_pcEntropyCoder->resetBits();
             m_pcEntropyCoder->encodeQtCbfZero(pcCU, TEXT_LUMA, uiTrMode);
             const UInt uiNullBitsY = m_pcEntropyCoder->getNumberOfWrittenBits();
-            minCostY = m_pcRdCost->calcRdCost(uiNullBitsY, uiDistY);
+            minCostY = CALCRDCOST(uiNullBitsY, uiDistY, m_pcRdCost->m_dLambda);
         }
 
         if (!uiAbsSumY)
@@ -4742,9 +4745,9 @@ Void TEncSearch::xEstimateResidualQT(TCo
         {
             uiDistU = m_pcRdCost->getDistPart(g_bitDepthC, m_pTempPel, trWidthC, pcResi->getCbAddr(absTUPartIdxC), pcResi->getCStride(), trWidthC, trHeightC
 #if WEIGHTED_CHROMA_DISTORTION
-                                                  , TEXT_CHROMA_U
+                                              , TEXT_CHROMA_U
 #endif
-                                                  ); // initialized with zero residual destortion
+                                              );     // initialized with zero residual distortion
             if (puiZeroDist)
             {
                 *puiZeroDist += uiDistU;
@@ -4772,11 +4775,11 @@ Void TEncSearch::xEstimateResidualQT(TCo
                 }
                 else
                 {
-                    const Double dSingleCostU = m_pcRdCost->calcRdCost(uiSingleBitsU, uiNonzeroDistU);
+                    const Double dSingleCostU = CALCRDCOST(uiSingleBitsU, uiNonzeroDistU, m_pcRdCost->m_dLambda);
                     m_pcEntropyCoder->resetBits();
                     m_pcEntropyCoder->encodeQtCbfZero(pcCU, TEXT_CHROMA_U,     uiTrMode);
                     const UInt uiNullBitsU    = m_pcEntropyCoder->getNumberOfWrittenBits();
-                    const Double dNullCostU   = m_pcRdCost->calcRdCost(uiNullBitsU, uiDistU);
+                    const Double dNullCostU   = CALCRDCOST(uiNullBitsU, uiDistU, m_pcRdCost->m_dLambda);
                     if (dNullCostU < dSingleCostU)
                     {
                         uiAbsSumU = 0;
@@ -4801,7 +4804,7 @@ Void TEncSearch::xEstimateResidualQT(TCo
                 m_pcEntropyCoder->resetBits();
                 m_pcEntropyCoder->encodeQtCbfZero(pcCU, TEXT_CHROMA_U, uiTrModeC);
                 const UInt uiNullBitsU = m_pcEntropyCoder->getNumberOfWrittenBits();
-                minCostU = m_pcRdCost->calcRdCost(uiNullBitsU, uiDistU);
+                minCostU = CALCRDCOST(uiNullBitsU, uiDistU, m_pcRdCost->m_dLambda);
             }
             if (!uiAbsSumU)
             {
@@ -4816,9 +4819,9 @@ Void TEncSearch::xEstimateResidualQT(TCo
 
             uiDistV = m_pcRdCost->getDistPart(g_bitDepthC, m_pTempPel, trWidthC, pcResi->getCrAddr(absTUPartIdxC), pcResi->getCStride(), trWidthC, trHeightC
 #if WEIGHTED_CHROMA_DISTORTION
-                                                  , TEXT_CHROMA_V
+                                              , TEXT_CHROMA_V
 #endif
-                                                  ); // initialized with zero residual destortion
+                                              );     // initialized with zero residual distortion
             if (puiZeroDist)
             {
                 *puiZeroDist += uiDistV;
@@ -4839,18 +4842,18 @@ Void TEncSearch::xEstimateResidualQT(TCo
                                                                     , TEXT_CHROMA_V
 #endif
                                                                     );
-                
+
                 if (pcCU->isLosslessCoded(0))
                 {
                     uiDistV = uiNonzeroDistV;
                 }
                 else
                 {
-                    const Double dSingleCostV = m_pcRdCost->calcRdCost(uiSingleBitsV, uiNonzeroDistV);
+                    const Double dSingleCostV = CALCRDCOST(uiSingleBitsV, uiNonzeroDistV, m_pcRdCost->m_dLambda);
                     m_pcEntropyCoder->resetBits();
                     m_pcEntropyCoder->encodeQtCbfZero(pcCU, TEXT_CHROMA_V,     uiTrMode);
                     const UInt uiNullBitsV    = m_pcEntropyCoder->getNumberOfWrittenBits();
-                    const Double dNullCostV   = m_pcRdCost->calcRdCost(uiNullBitsV, uiDistV);
+                    const Double dNullCostV   = CALCRDCOST(uiNullBitsV, uiDistV, m_pcRdCost->m_dLambda);
                     if (dNullCostV < dSingleCostV)
                     {
                         uiAbsSumV = 0;
@@ -4875,7 +4878,7 @@ Void TEncSearch::xEstimateResidualQT(TCo
                 m_pcEntropyCoder->resetBits();
                 m_pcEntropyCoder->encodeQtCbfZero(pcCU, TEXT_CHROMA_V, uiTrModeC);
                 const UInt uiNullBitsV = m_pcEntropyCoder->getNumberOfWrittenBits();
-                minCostV = m_pcRdCost->calcRdCost(uiNullBitsV, uiDistV);
+                minCostV = CALCRDCOST(uiNullBitsV, uiDistV, m_pcRdCost->m_dLambda);
             }
             if (!uiAbsSumV)
             {
@@ -4955,8 +4958,8 @@ Void TEncSearch::xEstimateResidualQT(TCo
 
                 uiNonzeroDistY = m_pcRdCost->getDistPart(g_bitDepthY, m_pcQTTempTComYuv[uiQTTempAccessLayer].getLumaAddr(absTUPartIdx), m_pcQTTempTComYuv[uiQTTempAccessLayer].getStride(),
                                                          pcResi->getLumaAddr(absTUPartIdx), pcResi->getStride(), trWidth, trHeight);
-                
-                dSingleCostY = m_pcRdCost->calcRdCost(uiTsSingleBitsY, uiNonzeroDistY);
+
+                dSingleCostY = CALCRDCOST(uiTsSingleBitsY, uiNonzeroDistY, m_pcRdCost->m_dLambda);
             }
 
             if (!uiAbsSumTransformSkipY || minCostY < dSingleCostY)
@@ -5054,10 +5057,10 @@ Void TEncSearch::xEstimateResidualQT(TCo
                 uiNonzeroDistU = m_pcRdCost->getDistPart(g_bitDepthC, m_pcQTTempTComYuv[uiQTTempAccessLayer].getCbAddr(absTUPartIdxC), m_pcQTTempTComYuv[uiQTTempAccessLayer].getCStride(),
                                                          pcResi->getCbAddr(absTUPartIdxC), pcResi->getCStride(), trWidthC, trHeightC
 #if WEIGHTED_CHROMA_DISTORTION
-                                                             , TEXT_CHROMA_U
+                                                         , TEXT_CHROMA_U
 #endif
-                                                            );
-                dSingleCostU = m_pcRdCost->calcRdCost(uiSingleBitsU, uiNonzeroDistU);
+                                                         );
+                dSingleCostU = CALCRDCOST(uiSingleBitsU, uiNonzeroDistU, m_pcRdCost->m_dLambda);
             }
 
             if (!uiAbsSumTransformSkipU || minCostU < dSingleCostU)
@@ -5091,14 +5094,14 @@ Void TEncSearch::xEstimateResidualQT(TCo
                 assert(scalingListType < 6);
 
                 m_pcTrQuant->invtransformNxN(pcCU->getCUTransquantBypass(uiAbsPartIdx), TEXT_CHROMA, REG_DCT, pcResiCurrV, m_pcQTTempTComYuv[uiQTTempAccessLayer].getCStride(), pcCoeffCurrV, trWidthC, trHeightC, scalingListType, true);
-               
+
                 uiNonzeroDistV = m_pcRdCost->getDistPart(g_bitDepthC, m_pcQTTempTComYuv[uiQTTempAccessLayer].getCrAddr(absTUPartIdxC), m_pcQTTempTComYuv[uiQTTempAccessLayer].getCStride(),
                                                          pcResi->getCrAddr(absTUPartIdxC), pcResi->getCStride(), trWidthC, trHeightC
 #if WEIGHTED_CHROMA_DISTORTION
-                                                             , TEXT_CHROMA_V
+                                                         , TEXT_CHROMA_V
 #endif
-                                                             );
-                dSingleCostV = m_pcRdCost->calcRdCost(uiSingleBitsV, uiNonzeroDistV);
+                                                         );
+                dSingleCostV = CALCRDCOST(uiSingleBitsV, uiNonzeroDistV, m_pcRdCost->m_dLambda);
             }
 
             if (!uiAbsSumTransformSkipV || minCostV < dSingleCostV)
@@ -5158,7 +5161,7 @@ Void TEncSearch::xEstimateResidualQT(TCo
         uiSingleBits = m_pcEntropyCoder->getNumberOfWrittenBits();
 
         uiSingleDist = uiDistY + uiDistU + uiDistV;
-        dSingleCost = m_pcRdCost->calcRdCost(uiSingleBits, uiSingleDist);
+        dSingleCost = CALCRDCOST(uiSingleBits, uiSingleDist, m_pcRdCost->m_dLambda);
     }
 
     // code sub-blocks
@@ -5211,7 +5214,7 @@ Void TEncSearch::xEstimateResidualQT(TCo
         }
 
         uiSubdivBits = m_pcEntropyCoder->getNumberOfWrittenBits();
-        dSubdivCost  = m_pcRdCost->calcRdCost(uiSubdivBits, uiSubdivDist);
+        dSubdivCost  = CALCRDCOST(uiSubdivBits, uiSubdivDist, m_pcRdCost->m_dLambda);
 
         if (uiYCbf || uiUCbf || uiVCbf || !bCheckFull)
         {
--- a/source/Lib/TLibEncoder/TEncSearch.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Mon May 06 14:34:00 2013 +0530
@@ -49,6 +49,7 @@
 #include "TEncSbac.h"
 #include "TEncCfg.h"
 
+#include "primitives.h"
 #include "bitcost.h"
 #include "motion.h"
 
@@ -95,6 +96,9 @@ private:
 
     x265::BitCost   m_bc;
     x265::MotionEstimate m_me;
+#if ENABLE_PRIMITIVES
+    ALIGN_VAR_32(pixel, m_fencbuf[64 * 64]);
+#endif
 
 protected:
 
--- a/source/Lib/TLibEncoder/TEncSlice.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSlice.cpp	Mon May 06 14:34:00 2013 +0530
@@ -599,7 +599,7 @@ Void TEncSlice::precompressSlice(TComPic
         m_pcGOPEncoder->preLoopFilterPicAll(rpcPic, uiPicDist, uiALFBits);
 
         // compute RD cost and choose the best
-        dPicRdCost = m_pcRdCost->calcRdCost64(m_uiPicTotalBits + uiALFBits, uiPicDist, true, DF_SSE_FRAME);
+        dPicRdCost = CALCRDCOST_SAD(m_uiPicTotalBits + uiALFBits, uiPicDist, m_pcRdCost->m_uiLambdaMotionSAD);
 
         if (dPicRdCost < dPicRdCostBest)
         {
--- a/source/VectorClass/vectori128.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/VectorClass/vectori128.h	Mon May 06 14:34:00 2013 +0530
@@ -307,6 +307,11 @@ public:
     void fromUint32(uint32_t i) {
         xmm = _mm_cvtsi32_si128(i);
     }
+#if _WIN64
+    void fromUint64(uint64_t i) {
+        xmm = _mm_cvtsi64_si128(i);
+    }
+#endif
     // Constructor to build from all elements:
     Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
         int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15) {
@@ -421,6 +426,21 @@ public:
 
 // Define operators for this class
 
+// return the lowest 32-bit element of the vector as int32 (maps to MOVD)
+static inline int32_t toInt32(__m128i const & x) {
+    return _mm_cvtsi128_si32(x);
+}
+
+// broadcast the low 64 bits of the vector, returning [LO LO] (maps to PUNPCKLQDQ)
+static inline __m128i extract_lo64(__m128i const & x) {
+    return _mm_unpacklo_epi64(x, x);
+}
+
+// broadcast the high 64 bits of the vector, returning [HI HI] (maps to PUNPCKHQDQ)
+static inline __m128i extract_hi64(__m128i const & x) {
+    return _mm_unpackhi_epi64(x, x);
+}
+
 // vector operator + : add element by element
 static inline Vec16c operator + (Vec16c const & a, Vec16c const & b) {
     return _mm_add_epi8(a, b);
@@ -975,6 +995,11 @@ public:
         xmm = _mm_load_si128((__m128i const*)p);
         return *this;
     }
+    // Partial load. Load the low 4 elements and zero the rest (maps to MOVQ)
+    Vec8s & load_partial4(void const * p) {
+        xmm = _mm_loadl_epi64((__m128i const*)p);
+        return *this;
+    }
     // Partial load. Load n elements and set the rest to 0
     Vec8s & load_partial(int n, void const * p) {
         if      (n >= 8) load(p);
@@ -992,6 +1017,10 @@ public:
         cutoff(n);
         return *this;
     }
+    // Partial store. Store the low 4 elements (maps to MOVQ)
+    void store_partial4(void * p) const {
+        _mm_storel_epi64((__m128i*)p, xmm);
+    }
     // Partial store. Store n elements
     void store_partial(int n, void * p) const {
         if (n >= 8) {
@@ -1385,6 +1414,10 @@ public:
         xmm = x;
         return *this;
     };
+    Vec8us & addSumAbsDiff(__m128i const & a, __m128i const & b) {
+        xmm = _mm_add_epi16(xmm, _mm_sad_epu8(a, b));
+        return *this;
+    }
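
An aside on the helper just added: _mm_sad_epu8 (PSADBW) produces two 64-bit lanes, each holding the sum of absolute differences of eight byte pairs, and addSumAbsDiff accumulates those lanes into the Vec8us. A hypothetical accumulation loop (fenc, fref and the strides are illustrative names, not from this patch):

    Vec8us acc(0);
    for (int y = 0; y < 16; y++)   // keep heights modest: the lanes are 16-bit
    {
        __m128i f = _mm_loadu_si128((__m128i const*)(fenc + y * fstride));
        __m128i r = _mm_loadu_si128((__m128i const*)(fref + y * rstride));
        acc.addSumAbsDiff(f, r);   // add both PSADBW partial sums
    }
    // combine the partial sums held in the low halves of the two 64-bit lanes
    int sad = toInt32(acc) + toInt32(extract_hi64(acc));
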
     // Member function to load from array (unaligned)
     Vec8us & load(void const * p) {
         xmm = _mm_loadu_si128((__m128i const*)p);
@@ -4363,11 +4396,21 @@ static inline Vec4ui extend_low (Vec8us 
     return    _mm_unpacklo_epi16(a,_mm_setzero_si128());   // interleave with zero extensions
 }
 
+// Function extend_low_unsafe : extends the low 4 elements to 32 bits, but
+// interleaves each element with itself rather than with zero; the caller
+// must shift or mask off the garbage left in the high halves
+static inline Vec4ui extend_low_unsafe (Vec8us const & a) {
+    return    _mm_unpacklo_epi16(a,a);   // interleave with self (no zero extension)
+}
+
 // Function extend_high : extends the high 4 elements to 32 bits with zero extension
 static inline Vec4ui extend_high (Vec8us const & a) {
     return    _mm_unpackhi_epi16(a,_mm_setzero_si128());   // interleave with zero extensions
 }
 
+// Function extend_high_unsafe : extends the high 4 elements to 32 bits, but
+// interleaves each element with itself rather than with zero; the caller
+// must shift or mask off the garbage left in the high halves
+static inline Vec4ui extend_high_unsafe (Vec8us const & a) {
+    return    _mm_unpackhi_epi16(a,a);   // interleave with self (no zero extension)
+}
+
 // Extend 32-bit integers to 64-bit integers, signed and unsigned
 
 // Function extend_low : extends the low 2 elements to 64 bits with sign extension
@@ -5324,6 +5367,26 @@ static inline Vec16uc & operator /= (Vec
     return a;
 }
 
+/*****************************************************************************
+*
+*          Vector shift: shift is a compile-time constant
+*
+*****************************************************************************/
+
+// Shift Vec4ui by compile-time constant
+template <int32_t d>
+static inline Vec4ui shift_right_by_i(Vec4ui const & x) {
+    const int n = int(d) / 8;
+    Static_error_check<((d%8) == 0)> shift_by_non_bytes;
+    return _mm_srli_si128(x, n);
+}
+
+// vector operator >> : shift right logical all elements with const bytes (map to PSRLDQ)
+template <int32_t d>
+static inline Vec4ui operator >> (Vec4ui const & a, Const_int_t<d>) {
+    return shift_right_by_i<d>(a);
+}
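
A hypothetical call site for the new operator, assuming the vectorclass const_int(n) helper is in scope to construct the Const_int_t tag:

    uint32_t sums[4] = { 10, 20, 30, 40 };  // illustrative data
    Vec4ui v;
    v.load(sums);              // four packed 32-bit values
    v = v >> const_int(64);    // whole-register shift by 8 bytes via PSRLDQ
    // v now holds { 30, 40, 0, 0 }
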
+
 #if _MSC_VER
 #pragma warning(pop)
 #endif
--- a/source/encoder/CMakeLists.txt	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/CMakeLists.txt	Mon May 06 14:34:00 2013 +0530
@@ -6,7 +6,7 @@ if(GCC)
 endif(GCC)
 
 if(ENABLE_PRIMITIVES)
-    set(CPRIMITIVES pixel.cpp macroblock.cpp ipfilter.cpp)
+    set(CPRIMITIVES pixel.cpp macroblock.cpp ipfilter.cpp IntraPred.cpp)
     if(ENABLE_PRIMITIVES_VEC)
         add_definitions(-DENABLE_VECTOR_PRIMITIVES=1)
     endif(ENABLE_PRIMITIVES_VEC)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/encoder/IntraPred.cpp	Mon May 06 14:34:00 2013 +0530
@@ -0,0 +1,77 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@multicorewareinc.com.
+ *****************************************************************************/
+
+#include "primitives.h"
+#include <cstring>
+#include <assert.h>
+
+namespace {
+pixel CDECL predIntraGetPredValDC(pixel* pSrc, intptr_t iSrcStride, intptr_t iWidth, intptr_t iHeight, int bAbove, int bLeft)
+{
+    int iInd, iSum = 0;
+    pixel pDcVal;
+
+    if (bAbove)
+    {
+        for (iInd = 0; iInd < iWidth; iInd++)
+        {
+            iSum += pSrc[iInd - iSrcStride];
+        }
+    }
+    if (bLeft)
+    {
+        for (iInd = 0; iInd < iHeight; iInd++)
+        {
+            iSum += pSrc[iInd * iSrcStride - 1];
+        }
+    }
+
+    if (bAbove && bLeft)
+    {
+        pDcVal = (pixel)((iSum + iWidth) / (iWidth + iHeight));
+    }
+    else if (bAbove)
+    {
+        pDcVal = (pixel)((iSum + iWidth / 2) / iWidth);
+    }
+    else if (bLeft)
+    {
+        pDcVal = (pixel)((iSum + iHeight / 2) / iHeight);
+    }
+    else
+    {
+        pDcVal = pSrc[-1]; // Default DC value already calculated and placed in the prediction array if no neighbors are available
+    }
+
+    return pDcVal;
+}
+}
+
+namespace x265 {
+// x265 private namespace
+
+void Setup_C_IPredPrimitives(EncoderPrimitives& p)
+{
+    p.getdcval_p = predIntraGetPredValDC;
+}
+}
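
A hypothetical call through the primitives table once Setup_C_Primitives() has run (see primitives.cpp later in this changeset). pSrc must point at the block origin inside the reference-sample buffer, so that pSrc[-iSrcStride] addresses the above row and pSrc[-1] the left column, matching the indexing in the loops above:

    pixel dc = x265::primitives.getdcval_p(pSrc, iSrcStride,
                                           uiWidth, uiHeight,
                                           bAboveAvail, bLeftAvail);
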
--- a/source/encoder/pixel.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/pixel.cpp	Mon May 06 14:34:00 2013 +0530
@@ -321,6 +321,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_4x16]  = sad<4, 16>;
     p.sad[PARTITION_4x24]  = sad<4, 24>;
     p.sad[PARTITION_4x32]  = sad<4, 32>;
+    p.sad[PARTITION_4x48]  = sad<4, 48>;
     p.sad[PARTITION_4x64]  = sad<4, 64>;
 
     p.sad[PARTITION_8x4]   = sad<8, 4>;
@@ -329,6 +330,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_8x16]  = sad<8, 16>;
     p.sad[PARTITION_8x24]  = sad<8, 24>;
     p.sad[PARTITION_8x32]  = sad<8, 32>;
+    p.sad[PARTITION_8x48]  = sad<8, 48>;
     p.sad[PARTITION_8x64]  = sad<8, 64>;
 
     p.sad[PARTITION_12x4]  = sad<12, 4>;
@@ -337,6 +339,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_12x16] = sad<12, 16>;
     p.sad[PARTITION_12x24] = sad<12, 24>;
     p.sad[PARTITION_12x32] = sad<12, 32>;
+    p.sad[PARTITION_12x48] = sad<12, 48>;
     p.sad[PARTITION_12x64] = sad<12, 64>;
 
     p.sad[PARTITION_16x4]  = sad<16, 4>;
@@ -345,6 +348,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_16x16] = sad<16, 16>;
     p.sad[PARTITION_16x24] = sad<16, 24>;
     p.sad[PARTITION_16x32] = sad<16, 32>;
+    p.sad[PARTITION_16x48] = sad<16, 48>;
     p.sad[PARTITION_16x64] = sad<16, 64>;
 
     p.sad[PARTITION_24x4]  = sad<24, 4>;
@@ -353,6 +357,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_24x16] = sad<24, 16>;
     p.sad[PARTITION_24x24] = sad<24, 24>;
     p.sad[PARTITION_24x32] = sad<24, 32>;
+    p.sad[PARTITION_24x48] = sad<24, 48>;
     p.sad[PARTITION_24x64] = sad<24, 64>;
 
     p.sad[PARTITION_32x4]  = sad<32, 4>;
@@ -361,14 +366,25 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_32x16] = sad<32, 16>;
     p.sad[PARTITION_32x24] = sad<32, 24>;
     p.sad[PARTITION_32x32] = sad<32, 32>;
+    p.sad[PARTITION_32x48] = sad<32, 48>;
     p.sad[PARTITION_32x64] = sad<32, 64>;
 
+    p.sad[PARTITION_48x4]  = sad<48, 4>;
+    p.sad[PARTITION_48x8]  = sad<48, 8>;
+    p.sad[PARTITION_48x12] = sad<48, 12>;
+    p.sad[PARTITION_48x16] = sad<48, 16>;
+    p.sad[PARTITION_48x24] = sad<48, 24>;
+    p.sad[PARTITION_48x32] = sad<48, 32>;
+    p.sad[PARTITION_48x48] = sad<48, 48>;
+    p.sad[PARTITION_48x64] = sad<48, 64>;
+
     p.sad[PARTITION_64x4]  = sad<64, 4>;
     p.sad[PARTITION_64x8]  = sad<64, 8>;
     p.sad[PARTITION_64x12] = sad<64, 12>;
     p.sad[PARTITION_64x16] = sad<64, 16>;
     p.sad[PARTITION_64x24] = sad<64, 24>;
     p.sad[PARTITION_64x32] = sad<64, 32>;
+    p.sad[PARTITION_64x48] = sad<64, 48>;
     p.sad[PARTITION_64x64] = sad<64, 64>;
 
     // satd
@@ -378,6 +394,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.satd[PARTITION_4x16]  = satd4<4, 16>;
     p.satd[PARTITION_4x24]  = satd4<4, 24>;
     p.satd[PARTITION_4x32]  = satd4<4, 32>;
+    p.satd[PARTITION_4x48]  = satd4<4, 48>;
     p.satd[PARTITION_4x64]  = satd4<4, 64>;
 
     p.satd[PARTITION_8x4]   = satd_8x4;
@@ -386,6 +403,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.satd[PARTITION_8x16]  = satd8<8, 16>;
     p.satd[PARTITION_8x24]  = satd8<8, 24>;
     p.satd[PARTITION_8x32]  = satd8<8, 32>;
+    p.satd[PARTITION_8x48]  = satd8<8, 48>;
     p.satd[PARTITION_8x64]  = satd8<8, 64>;
 
     p.satd[PARTITION_12x4]  = satd12<4>;
@@ -394,6 +412,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.satd[PARTITION_12x16] = satd12<16>;
     p.satd[PARTITION_12x24] = satd12<24>;
     p.satd[PARTITION_12x32] = satd12<32>;
+    p.satd[PARTITION_12x48] = satd12<48>;
     p.satd[PARTITION_12x64] = satd12<64>;
 
     p.satd[PARTITION_16x4]  = satd8<16, 4>;
@@ -402,6 +421,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.satd[PARTITION_16x16] = satd8<16, 16>;
     p.satd[PARTITION_16x24] = satd8<16, 24>;
     p.satd[PARTITION_16x32] = satd8<16, 32>;
+    p.satd[PARTITION_16x48] = satd8<16, 48>;
     p.satd[PARTITION_16x64] = satd8<16, 64>;
 
     p.satd[PARTITION_24x4]  = satd8<24, 4>;
@@ -410,6 +430,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.satd[PARTITION_24x16] = satd8<24, 16>;
     p.satd[PARTITION_24x24] = satd8<24, 24>;
     p.satd[PARTITION_24x32] = satd8<24, 32>;
+    p.satd[PARTITION_24x48] = satd8<24, 48>;
     p.satd[PARTITION_24x64] = satd8<24, 64>;
 
     p.satd[PARTITION_32x4]  = satd8<32, 4>;
@@ -418,14 +439,25 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.satd[PARTITION_32x16] = satd8<32, 16>;
     p.satd[PARTITION_32x24] = satd8<32, 24>;
     p.satd[PARTITION_32x32] = satd8<32, 32>;
+    p.satd[PARTITION_32x48] = satd8<32, 48>;
     p.satd[PARTITION_32x64] = satd8<32, 64>;
 
+    p.satd[PARTITION_48x4]  = satd8<48, 4>;
+    p.satd[PARTITION_48x8]  = satd8<48, 8>;
+    p.satd[PARTITION_48x12] = satd8<48, 12>;
+    p.satd[PARTITION_48x16] = satd8<48, 16>;
+    p.satd[PARTITION_48x24] = satd8<48, 24>;
+    p.satd[PARTITION_48x32] = satd8<48, 32>;
+    p.satd[PARTITION_48x48] = satd8<48, 48>;
+    p.satd[PARTITION_48x64] = satd8<48, 64>;
+
     p.satd[PARTITION_64x4]  = satd8<64, 4>;
     p.satd[PARTITION_64x8]  = satd8<64, 8>;
     p.satd[PARTITION_64x12] = satd8<64, 12>;
     p.satd[PARTITION_64x16] = satd8<64, 16>;
     p.satd[PARTITION_64x24] = satd8<64, 24>;
     p.satd[PARTITION_64x32] = satd8<64, 32>;
+    p.satd[PARTITION_64x48] = satd8<64, 48>;
     p.satd[PARTITION_64x64] = satd8<64, 64>;
 
     // sad_x3
@@ -435,6 +467,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x3[PARTITION_4x16]  = sad_x3<4, 16>;
     p.sad_x3[PARTITION_4x24]  = sad_x3<4, 24>;
     p.sad_x3[PARTITION_4x32]  = sad_x3<4, 32>;
+    p.sad_x3[PARTITION_4x48]  = sad_x3<4, 48>;
     p.sad_x3[PARTITION_4x64]  = sad_x3<4, 64>;
 
     p.sad_x3[PARTITION_8x4]   = sad_x3<8, 4>;
@@ -443,6 +476,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x3[PARTITION_8x16]  = sad_x3<8, 16>;
     p.sad_x3[PARTITION_8x24]  = sad_x3<8, 24>;
     p.sad_x3[PARTITION_8x32]  = sad_x3<8, 32>;
+    p.sad_x3[PARTITION_8x48]  = sad_x3<8, 48>;
     p.sad_x3[PARTITION_8x64]  = sad_x3<8, 64>;
 
     p.sad_x3[PARTITION_12x4]  = sad_x3<12, 4>;
@@ -451,6 +485,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x3[PARTITION_12x16] = sad_x3<12, 16>;
     p.sad_x3[PARTITION_12x24] = sad_x3<12, 24>;
     p.sad_x3[PARTITION_12x32] = sad_x3<12, 32>;
+    p.sad_x3[PARTITION_12x48] = sad_x3<12, 48>;
     p.sad_x3[PARTITION_12x64] = sad_x3<12, 64>;
 
     p.sad_x3[PARTITION_16x4]  = sad_x3<16, 4>;
@@ -459,6 +494,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x3[PARTITION_16x16] = sad_x3<16, 16>;
     p.sad_x3[PARTITION_16x24] = sad_x3<16, 24>;
     p.sad_x3[PARTITION_16x32] = sad_x3<16, 32>;
+    p.sad_x3[PARTITION_16x48] = sad_x3<16, 48>;
     p.sad_x3[PARTITION_16x64] = sad_x3<16, 64>;
 
     p.sad_x3[PARTITION_24x4]  = sad_x3<24, 4>;
@@ -467,6 +503,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x3[PARTITION_24x16] = sad_x3<24, 16>;
     p.sad_x3[PARTITION_24x24] = sad_x3<24, 24>;
     p.sad_x3[PARTITION_24x32] = sad_x3<24, 32>;
+    p.sad_x3[PARTITION_24x48] = sad_x3<24, 48>;
     p.sad_x3[PARTITION_24x64] = sad_x3<24, 64>;
 
     p.sad_x3[PARTITION_32x4]  = sad_x3<32, 4>;
@@ -475,14 +512,25 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x3[PARTITION_32x16] = sad_x3<32, 16>;
     p.sad_x3[PARTITION_32x24] = sad_x3<32, 24>;
     p.sad_x3[PARTITION_32x32] = sad_x3<32, 32>;
+    p.sad_x3[PARTITION_32x48] = sad_x3<32, 48>;
     p.sad_x3[PARTITION_32x64] = sad_x3<32, 64>;
 
+    p.sad_x3[PARTITION_48x4]  = sad_x3<48, 4>;
+    p.sad_x3[PARTITION_48x8]  = sad_x3<48, 8>;
+    p.sad_x3[PARTITION_48x12] = sad_x3<48, 12>;
+    p.sad_x3[PARTITION_48x16] = sad_x3<48, 16>;
+    p.sad_x3[PARTITION_48x24] = sad_x3<48, 24>;
+    p.sad_x3[PARTITION_48x32] = sad_x3<48, 32>;
+    p.sad_x3[PARTITION_48x48] = sad_x3<48, 48>;
+    p.sad_x3[PARTITION_48x64] = sad_x3<48, 64>;
+
     p.sad_x3[PARTITION_64x4]  = sad_x3<64, 4>;
     p.sad_x3[PARTITION_64x8]  = sad_x3<64, 8>;
     p.sad_x3[PARTITION_64x12] = sad_x3<64, 12>;
     p.sad_x3[PARTITION_64x16] = sad_x3<64, 16>;
     p.sad_x3[PARTITION_64x24] = sad_x3<64, 24>;
     p.sad_x3[PARTITION_64x32] = sad_x3<64, 32>;
+    p.sad_x3[PARTITION_64x48] = sad_x3<64, 48>;
     p.sad_x3[PARTITION_64x64] = sad_x3<64, 64>;
     
     // sad_x4
@@ -492,6 +540,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x4[PARTITION_4x16]  = sad_x4<4, 16>;
     p.sad_x4[PARTITION_4x24]  = sad_x4<4, 24>;
     p.sad_x4[PARTITION_4x32]  = sad_x4<4, 32>;
+    p.sad_x4[PARTITION_4x48]  = sad_x4<4, 48>;
     p.sad_x4[PARTITION_4x64]  = sad_x4<4, 64>;
 
     p.sad_x4[PARTITION_8x4]   = sad_x4<8, 4>;
@@ -500,6 +549,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x4[PARTITION_8x16]  = sad_x4<8, 16>;
     p.sad_x4[PARTITION_8x24]  = sad_x4<8, 24>;
     p.sad_x4[PARTITION_8x32]  = sad_x4<8, 32>;
+    p.sad_x4[PARTITION_8x48]  = sad_x4<8, 48>;
     p.sad_x4[PARTITION_8x64]  = sad_x4<8, 64>;
 
     p.sad_x4[PARTITION_12x4]  = sad_x4<12, 4>;
@@ -508,6 +558,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x4[PARTITION_12x16] = sad_x4<12, 16>;
     p.sad_x4[PARTITION_12x24] = sad_x4<12, 24>;
     p.sad_x4[PARTITION_12x32] = sad_x4<12, 32>;
+    p.sad_x4[PARTITION_12x48] = sad_x4<12, 48>;
     p.sad_x4[PARTITION_12x64] = sad_x4<12, 64>;
 
     p.sad_x4[PARTITION_16x4]  = sad_x4<16, 4>;
@@ -516,6 +567,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x4[PARTITION_16x16] = sad_x4<16, 16>;
     p.sad_x4[PARTITION_16x24] = sad_x4<16, 24>;
     p.sad_x4[PARTITION_16x32] = sad_x4<16, 32>;
+    p.sad_x4[PARTITION_16x48] = sad_x4<16, 48>;
     p.sad_x4[PARTITION_16x64] = sad_x4<16, 64>;
 
     p.sad_x4[PARTITION_24x4]  = sad_x4<24, 4>;
@@ -524,6 +576,7 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x4[PARTITION_24x16] = sad_x4<24, 16>;
     p.sad_x4[PARTITION_24x24] = sad_x4<24, 24>;
     p.sad_x4[PARTITION_24x32] = sad_x4<24, 32>;
+    p.sad_x4[PARTITION_24x48] = sad_x4<24, 48>;
     p.sad_x4[PARTITION_24x64] = sad_x4<24, 64>;
 
     p.sad_x4[PARTITION_32x4]  = sad_x4<32, 4>;
@@ -532,14 +585,25 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad_x4[PARTITION_32x16] = sad_x4<32, 16>;
     p.sad_x4[PARTITION_32x24] = sad_x4<32, 24>;
     p.sad_x4[PARTITION_32x32] = sad_x4<32, 32>;
+    p.sad_x4[PARTITION_32x48] = sad_x4<32, 48>;
     p.sad_x4[PARTITION_32x64] = sad_x4<32, 64>;
 
+    p.sad_x4[PARTITION_48x4]  = sad_x4<48, 4>;
+    p.sad_x4[PARTITION_48x8]  = sad_x4<48, 8>;
+    p.sad_x4[PARTITION_48x12] = sad_x4<48, 12>;
+    p.sad_x4[PARTITION_48x16] = sad_x4<48, 16>;
+    p.sad_x4[PARTITION_48x24] = sad_x4<48, 24>;
+    p.sad_x4[PARTITION_48x32] = sad_x4<48, 32>;
+    p.sad_x4[PARTITION_48x48] = sad_x4<48, 48>;
+    p.sad_x4[PARTITION_48x64] = sad_x4<48, 64>;
+
     p.sad_x4[PARTITION_64x4]  = sad_x4<64, 4>;
     p.sad_x4[PARTITION_64x8]  = sad_x4<64, 8>;
     p.sad_x4[PARTITION_64x12] = sad_x4<64, 12>;
     p.sad_x4[PARTITION_64x16] = sad_x4<64, 16>;
     p.sad_x4[PARTITION_64x24] = sad_x4<64, 24>;
     p.sad_x4[PARTITION_64x32] = sad_x4<64, 32>;
+    p.sad_x4[PARTITION_64x48] = sad_x4<64, 48>;
     p.sad_x4[PARTITION_64x64] = sad_x4<64, 64>;
 
     p.cpyblock = blockcopy_p_p;
--- a/source/encoder/primitives.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/primitives.cpp	Mon May 06 14:34:00 2013 +0530
@@ -34,9 +34,9 @@ namespace x265 {
 // x265 private namespace
 
 #if ENABLE_PRIMITIVES
-//                          4   8  12  16      24     32 / 64
-static int8_t psize[16] = { 0,  1,  2,  3, -1,  4, -1, 5,
-                            -1, -1, -1, -1, -1, -1, -1, 6 };
+// indexed by (size >> 2) - 1:  4,8,12,16 -> 0..3;  24 -> 4;  32 -> 5;  48 -> 6;  64 -> 7
+static int8_t psize[16] = {  0,  1,  2,  3, -1,  4, -1, 5,
+                            -1, -1, -1,  6, -1, -1, -1, 7 };
 int *Motion_Cost;
 
 // Returns a Partitions enum if the size matches a supported performance primitive,
@@ -49,8 +49,8 @@ int PartitionFromSizes(int Width, int He
     assert(((Width | Height) & ~(4 | 8 | 16 | 32 | 64)) == 0);
     assert((w | h) >= 0);
 
-    // there are currently seven height partitions per width
-    return w * 7 + h;
+    // there are currently eight height partitions per width
+    return (w << 3) + h;
 }
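
A worked example of the new mapping, assuming the indexing w = psize[(Width >> 2) - 1] earlier in this function:

    // Width = 48, Height = 64:
    //   w = psize[48/4 - 1] = psize[11] = 6
    //   h = psize[64/4 - 1] = psize[15] = 7
    //   (w << 3) + h = 55 == PARTITION_48x64 in the widened enum
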
 
 /* the "authoritative" set of encoder primitives */
@@ -59,12 +59,14 @@ EncoderPrimitives primitives;
 void Setup_C_PixelPrimitives(EncoderPrimitives &p);
 void Setup_C_MacroblockPrimitives(EncoderPrimitives &p);
 void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
+void Setup_C_IPredPrimitives(EncoderPrimitives &p);
 
 void Setup_C_Primitives(EncoderPrimitives &p)
 {
     Setup_C_PixelPrimitives(p);      // pixel.cpp
     Setup_C_MacroblockPrimitives(p); // macroblock.cpp
     Setup_C_IPFilterPrimitives(p);   // InterpolationFilter.cpp
+    Setup_C_IPredPrimitives(p);      // IntraPred.cpp
 }
 
 #endif // if ENABLE_PRIMITIVES
--- a/source/encoder/primitives.h	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/primitives.h	Mon May 06 14:34:00 2013 +0530
@@ -73,13 +73,14 @@ namespace x265 {
 
 enum Partitions
 {
-    PARTITION_4x4,  PARTITION_4x8,  PARTITION_4x12,  PARTITION_4x16,  PARTITION_4x24,  PARTITION_4x32,  PARTITION_4x64,
-    PARTITION_8x4,  PARTITION_8x8,  PARTITION_8x12,  PARTITION_8x16,  PARTITION_8x24,  PARTITION_8x32,  PARTITION_8x64,
-    PARTITION_12x4, PARTITION_12x8, PARTITION_12x12, PARTITION_12x16, PARTITION_12x24, PARTITION_12x32, PARTITION_12x64,
-    PARTITION_16x4, PARTITION_16x8, PARTITION_16x12, PARTITION_16x16, PARTITION_16x24, PARTITION_16x32, PARTITION_16x64,
-    PARTITION_24x4, PARTITION_24x8, PARTITION_24x12, PARTITION_24x16, PARTITION_24x24, PARTITION_24x32, PARTITION_24x64,
-    PARTITION_32x4, PARTITION_32x8, PARTITION_32x12, PARTITION_32x16, PARTITION_32x24, PARTITION_32x32, PARTITION_32x64,
-    PARTITION_64x4, PARTITION_64x8, PARTITION_64x12, PARTITION_64x16, PARTITION_64x24, PARTITION_64x32, PARTITION_64x64,
+    PARTITION_4x4,  PARTITION_4x8,  PARTITION_4x12,  PARTITION_4x16,  PARTITION_4x24,  PARTITION_4x32,  PARTITION_4x48,   PARTITION_4x64,
+    PARTITION_8x4,  PARTITION_8x8,  PARTITION_8x12,  PARTITION_8x16,  PARTITION_8x24,  PARTITION_8x32,  PARTITION_8x48,   PARTITION_8x64,
+    PARTITION_12x4, PARTITION_12x8, PARTITION_12x12, PARTITION_12x16, PARTITION_12x24, PARTITION_12x32, PARTITION_12x48,  PARTITION_12x64,
+    PARTITION_16x4, PARTITION_16x8, PARTITION_16x12, PARTITION_16x16, PARTITION_16x24, PARTITION_16x32, PARTITION_16x48,  PARTITION_16x64,
+    PARTITION_24x4, PARTITION_24x8, PARTITION_24x12, PARTITION_24x16, PARTITION_24x24, PARTITION_24x32, PARTITION_24x48,  PARTITION_24x64,
+    PARTITION_32x4, PARTITION_32x8, PARTITION_32x12, PARTITION_32x16, PARTITION_32x24, PARTITION_32x32, PARTITION_32x48,  PARTITION_32x64,
+    PARTITION_48x4, PARTITION_48x8, PARTITION_48x12, PARTITION_48x16, PARTITION_48x24, PARTITION_48x32, PARTITION_48x48,  PARTITION_48x64,
+    PARTITION_64x4, PARTITION_64x8, PARTITION_64x12, PARTITION_64x16, PARTITION_64x24, PARTITION_64x32, PARTITION_64x48,  PARTITION_64x64,
     NUM_PARTITIONS
 };
 
@@ -162,6 +163,8 @@ typedef void (CDECL * IPFilterConvert_s_
 typedef void (CDECL * blockcpy_p_p)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
 typedef void (CDECL * blockcpy_s_p)(int bx, int by, short *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
 typedef void (CDECL * blockcpy_p_s)(int bx, int by, pixel *dst, intptr_t dstride, short *src, intptr_t sstride); // dst is aligned
+typedef pixel (CDECL * getDCVal_p)(pixel* pSrc, intptr_t iSrcStride, intptr_t width, intptr_t height, int bAbove, int bLeft);
+
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -186,6 +189,7 @@ struct EncoderPrimitives
     blockcpy_p_p cpyblock;
     blockcpy_p_s cpyblock_p_s;
     blockcpy_s_p cpyblock_s_p;
+    getDCVal_p getdcval_p;
 };
 
 /* This copy of the table is what gets used by all by the encoder.
--- a/source/encoder/threadpool.cpp	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/threadpool.cpp	Mon May 06 14:34:00 2013 +0530
@@ -68,11 +68,33 @@ inline int _BitScanReverse64(DWORD *id, 
 }
 #endif // if !_WIN64
 
+#if _WIN32_WINNT <= _WIN32_WINNT_WINXP
+/* XP did not define this intrinsic */
+FORCEINLINE LONGLONG _InterlockedOr64 (
+    __inout LONGLONG volatile *Destination,
+    __in    LONGLONG Value
+    )
+{
+    LONGLONG Old;
+
+    do {
+        Old = *Destination;
+    } while (_InterlockedCompareExchange64(Destination,
+                                          Old | Value,
+                                          Old) != Old);
+
+    return Old;
+}
+#define ATOMIC_OR(ptr, mask)            _InterlockedOr64((volatile LONG64*)ptr, mask)
+#pragma intrinsic(_InterlockedCompareExchange64)
+#else
+#define ATOMIC_OR(ptr, mask)            InterlockedOr64((volatile LONG64*)ptr, mask)
+#endif
+
 #define CLZ64(id, x)                    _BitScanReverse64(&id, x)
 #define ATOMIC_INC(ptr)                 InterlockedIncrement((volatile LONG*)ptr)
 #define ATOMIC_DEC(ptr)                 InterlockedDecrement((volatile LONG*)ptr)
-#define ATOMIC_OR(ptr, mask)            InterlockedOr64((volatile LONG64*)ptr, mask)
-#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
+#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
 #define GIVE_UP_TIME()                  Sleep(0)
 
 #endif // ifdef __GNUC__
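
For comparison, the __GNUC__ half of this conditional (untouched by these hunks) presumably maps the same macros onto the GCC builtins; assumed equivalents, shown for orientation only:

    #define ATOMIC_OR(ptr, mask)            __sync_fetch_and_or(ptr, mask)
    #define ATOMIC_CAS(ptr, oldval, newval) \
        __sync_val_compare_and_swap(ptr, oldval, newval)
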
--- a/source/encoder/vec/CMakeLists.txt	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/vec/CMakeLists.txt	Mon May 06 14:34:00 2013 +0530
@@ -2,6 +2,7 @@ if (MSVC)
     set(PRIMITIVES sse2.cpp sse3.cpp ssse3.cpp sse41.cpp sse42.cpp)
     add_definitions(/wd4127) # conditional expression is constant
     add_definitions(/wd4244) # 'argument' : conversion from 'int' to 'char', possible loss of data
+    set_source_files_properties(sse2.cpp sse3.cpp ssse3.cpp sse41.cpp sse42.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
 endif()
 if (MSVC_VERSION EQUAL 1600) # VC10
     set(PRIMITIVES ${PRIMITIVES} avx.cpp)
@@ -23,10 +24,15 @@ file(GLOB VECTORCLASS ../../VectorClass/
 source_group(VectorClass FILES ${VECTORCLASS})
 
 add_library(PrimitivesVec vec-primitives.cpp ${PRIMITIVES} ${VECTORCLASS}
+            # *.inc files listed here show up in Visual Studio, but are not built
+            # listing them is simply a convenience so they are easy to edit
             vecprimitives.inc
             blockcopy.inc
             pixel.inc
             pixel8.inc
             pixel16.inc
             macroblock.inc
-            interpolationfilter.inc)
+            intrapred.inc
+            ipfilter.inc
+            ipfilter8.inc
+            ipfilter16.inc)
--- a/source/encoder/vec/interpolationfilter.inc	Fri May 03 17:54:14 2013 +0530
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,541 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013 x265 project
- *
- * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
- *          Rajesh Paulraj <rajesh@multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@multicorewareinc.com.
- *****************************************************************************/
-
-#include <assert.h>
-
-#if _MSC_VER
-#pragma warning(disable: 4100) // unreferenced formal parameter
-#endif
-
-#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
-#define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
-#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
-
-template<int N>
-void CDECL filterVertical_short_pel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
-{
-    int row, col;
-
-    int cstride =  srcStride;
-
-    src -= (N / 2 - 1) * cstride;
-
-    int offset;
-    short maxVal;
-    int headRoom = IF_INTERNAL_PREC - bitDepth;
-    int shift = IF_FILTER_PREC;
-
-    shift += headRoom;
-    offset = 1 << (shift - 1);
-    offset +=  IF_INTERNAL_OFFS << IF_FILTER_PREC;
-    maxVal = (1 << bitDepth) - 1;
-
-    int cm[8][4];
-    for (int i = 0; i < N; i++)
-    {
-        cm[i][0] = coeff[i];
-        cm[i][1] = coeff[i];
-        cm[i][2] = coeff[i];
-        cm[i][3] = coeff[i];
-    }
-
-    for (row = 0; row < block_height; row++)
-    {
-        for (col = 0; col < block_width - 7; col += 8)
-        {
-            Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
-            Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
-            Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
-
-            row0.load(&src[col]);
-            row1.load(&src[col + cstride]);
-
-            c0.load(cm[0]);
-            c1.load(cm[1]);
-
-            row0_first = extend_low(row0);
-            row1_first = extend_low(row1);
-            row0_last = extend_high(row0);
-            row1_last = extend_high(row1);
-
-            row0_first = row0_first * c0;
-            row1_first = row1_first * c1;
-            row0_last = row0_last * c0;
-            row1_last = row1_last * c1;
-
-            sum_first = row0_first + row1_first;
-            sum_last = row0_last + row1_last;
-
-            row2.load(&src[col + 2 * cstride]);
-            row3.load(&src[col + 3 * cstride]);
-
-            c2.load(cm[2]);
-            c3.load(cm[3]);
-
-            row0_first = extend_low(row2);
-            row0_last = extend_high(row2);
-            row0_first = row0_first * c2;
-            row0_last = row0_last * c2;
-            row1_first = extend_low(row3);
-            row1_last = extend_high(row3);
-            row1_first = row1_first * c3;
-            row1_last = row1_last * c3;
-            sum_first += row0_first + row1_first;
-            sum_last += row0_last + row1_last;
-
-            if (N == 8)
-            {
-                row4.load(&src[col + 4 * cstride]);
-                row5.load(&src[col + 5 * cstride]);
-
-                c4.load(cm[4]);
-                c5.load(cm[5]);
-
-                row0_first = extend_low(row4);
-                row0_last = extend_high(row4);
-                row0_first = row0_first * c4;
-                row0_last = row0_last * c4;
-                row1_first = extend_low(row5);
-                row1_last = extend_high(row5);
-                row1_first = row1_first * c5;
-                row1_last = row1_last * c5;
-                sum_first += row0_first + row1_first;
-                sum_last += row0_last + row1_last;
-
-                row6.load(&src[col + 6 * cstride]);
-                row7.load(&src[col + 7 * cstride]);
-
-                c6.load(cm[6]);
-                c7.load(cm[7]);
-
-                row0_first = extend_low(row6);
-                row0_last = extend_high(row6);
-                row0_first = row0_first * c6;
-                row0_last = row0_last * c6;
-                row1_first = extend_low(row7);
-                row1_last = extend_high(row7);
-                row1_first = row1_first * c7;
-                row1_last = row1_last * c7;
-                sum_first += row0_first + row1_first;
-                sum_last += row0_last + row1_last;
-            }
-
-            sum_first = (sum_first + offset)  >> shift;
-            sum_last = (sum_last + offset)  >> shift;
-
-            Vec4i zero(0);
-            sum = compress(sum_first, sum_last);
-
-            sum = max(sum, 0);
-            Vec8s maxVal_v(maxVal);
-            sum = min(sum, maxVal_v);
-
-            sum.store(dst + col);
-        }
-
-        //Handle the case when block_width is not multiple of 8
-        for (; col < block_width; col += 4)
-        {
-            Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
-            Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
-            Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
-
-            row0.load(&src[col]);
-            row1.load(&src[col + cstride]);
-
-            c0.load(cm[0]);
-            c1.load(cm[1]);
-
-            row0_first = extend_low(row0);
-            row1_first = extend_low(row1);
-            row0_first = row0_first * c0;
-            row1_first = row1_first * c1;
-
-            sum_first = row0_first + row1_first;
-
-            row2.load(&src[col + 2 * cstride]);
-            row3.load(&src[col + 3 * cstride]);
-
-            c2.load(cm[2]);
-            c3.load(cm[3]);
-
-            row0_first = extend_low(row2);
-            row0_first = row0_first * c2;
-            row1_first = extend_low(row3);
-            row1_first = row1_first * c3;
-            sum_first += row0_first + row1_first;
-
-            if (N == 8)
-            {
-                row4.load(&src[col + 4 * cstride]);
-                row5.load(&src[col + 5 * cstride]);
-
-                c4.load(cm[4]);
-                c5.load(cm[5]);
-
-                row0_first = extend_low(row4);
-                row0_first = row0_first * c4;
-                row1_first = extend_low(row5);
-                row1_first = row1_first * c5;
-                sum_first += row0_first + row1_first;
-
-                row6.load(&src[col + 6 * cstride]);
-                row7.load(&src[col + 7 * cstride]);
-
-                c6.load(cm[6]);
-                c7.load(cm[7]);
-
-                row0_first = extend_low(row6);
-                row0_first = row0_first * c6;
-                row1_first = extend_low(row7);
-                row1_first = row1_first * c7;
-                sum_first += row0_first + row1_first;
-            }
-
-            sum_first = (sum_first + offset)  >> shift;
-
-            Vec4i zero(0);
-            sum = compress(sum_first, zero);
-
-            sum = max(sum, 0);
-            Vec8s maxVal_v(maxVal);
-            sum = min(sum, maxVal_v);
-
-            sum.store_partial(block_width - col, dst + col);
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
-template<int N>
-void CDECL filterHorizontal_pel_pel(int bitDepth, pixel *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
-
-{
-    int row, col;
-
-    src -= (N / 2 - 1);                                   // Here cStride = 1
-
-    int offset;
-    short maxVal;
-    int headRoom = IF_INTERNAL_PREC - bitDepth;
-    offset =  (1 << (headRoom - 1));
-    maxVal = (1 << bitDepth) - 1;
-
-    Vec4i vec_sum_low, vec_sum_high;
-    Vec8s vec_src0, vec_sum, vec_c;
-    vec_c.load(coeff);
-    Vec8s vec_c0(coeff[0]), vec_c1(coeff[1]), vec_c2(coeff[2]), vec_c3(coeff[3]), vec_c4(coeff[4]), vec_c5(coeff[5]), vec_c6(
-        coeff[6]), vec_c7(coeff[7]);
-    Vec4i vec_offset(offset);
-    Vec8s vec_maxVal(maxVal);
-    for (row = 0; row < block_height; row++)
-    {
-        col = 0;
-        for (; col < (block_width - 7); col += 8)                   // Iterations multiple of 8
-        {
-            vec_src0.load(src + col);                         // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c0;                       // Multiply by c[0]
-            vec_sum_low = extend_low(vec_src0);                 // Convert to integer lower bits
-            vec_sum_high = extend_high(vec_src0);               // Convert to integer higher bits
-
-            vec_src0.load(src + col + 1);                     // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c1;                       // Multiply by c[1]
-            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-            vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-
-            vec_src0.load(src + col + 2);                     // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c2;                       // Multiply by c[2]
-            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-            vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-
-            vec_src0.load(src + col + 3);                     // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c3;                       // Multiply by c[2]
-            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-            vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-
-            if (N == 8)
-            {
-                vec_src0.load(src + col + 4);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c4;                       // Multiply by c[2]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-
-                vec_src0.load(src + col + 5);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c5;                       // Multiply by c[2]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-
-                vec_src0.load(src + col + 6);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c6;                       // Multiply by c[2]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-
-                vec_src0.load(src + col + 7);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c7;                       // Multiply by c[2]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higer bits to sum_high bits
-            }
-
-            vec_sum_low = (vec_sum_low + vec_offset);           // Add offset(value copied into all integer vector elements) to sum_low
-            vec_sum_high = (vec_sum_high + vec_offset);         // Add offset(value copied into all integer vector elements) to sum_high
-
-            vec_sum = compress(vec_sum_low, vec_sum_high);       // Save two integer vectors(Vec4i) to single short vector(Vec8s)
-            vec_sum = vec_sum >> headRoom;                         // This shift must be done after saving integer(two vec4i) data to short(Vec8s)
-
-            vec_sum = max(vec_sum, 0);                          // (val < 0) ? 0 : val;
-            vec_sum = min(vec_sum, vec_maxVal);                 // (val > maxVal) ? maxVal : val;
-
-            vec_sum.store(dst + col);                           // Store vector
-        }
-
-        for (; col < block_width; col++)                           // Remaining iterations
-        {
-            if (N == 8)
-            {
-                vec_src0.load(src + col);
-            }
-            else
-            {
-                vec_src0.load_partial(4, src + col);
-            }
-
-            vec_src0 = vec_src0 * vec_c;                        // Assuming that there is no overflow (Everywhere in this function!)
-            int sum = horizontal_add(vec_src0);
-            short val = (short)(sum + offset) >> headRoom;
-            val = (val < 0) ? 0 : val;
-            val = (val > maxVal) ? maxVal : val;
-
-            dst[col] = (pixel)val;
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
-template<int N>
-void CDECL filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int block_width, int block_height, short const *coeff)
-{
-    int row, col;
-
-    src -= (N / 2 - 1);
-
-    int offset;
-    int headRoom = IF_INTERNAL_PREC - bitDepth;
-    int shift = IF_FILTER_PREC;
-
-    shift -= headRoom;
-    offset = -IF_INTERNAL_OFFS << shift;
-
-    Vec4i vec_sum_low, vec_sum_high;
-    Vec8s vec_src0, vec_sum, vec_c;
-    vec_c.load(coeff);
-    Vec8s vec_c0(coeff[0]), vec_c1(coeff[1]), vec_c2(coeff[2]), vec_c3(coeff[3]);
-    Vec8s vec_c4(coeff[4]), vec_c5(coeff[5]), vec_c6(coeff[6]), vec_c7(coeff[7]);
-    Vec4i vec_offset(offset);
-
-    for (row = 0; row < block_height; row++)
-    {
-        col = 0;
-        for (; col < (block_width - 7); col += 8)                   // Iterations multiple of 8
-        {
-            vec_src0.load(src + col);                         // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c0;                       // Multiply by c[0]
-            vec_sum_low = extend_low(vec_src0);                 // Convert to integer lower bits
-            vec_sum_high = extend_high(vec_src0);               // Convert to integer higher bits
-
-            vec_src0.load(src + col + 1);                     // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c1;                       // Multiply by c[1]
-            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-
-            vec_src0.load(src + col + 2);                     // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c2;                       // Multiply by c[2]
-            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-
-            vec_src0.load(src + col + 3);                     // Load the 8 elements
-            vec_src0 = vec_src0 * vec_c3;                       // Multiply by c[3]
-            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-
-            if (N == 8)
-            {
-                vec_src0.load(src + col + 4);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c4;                       // Multiply by c[4]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-
-                vec_src0.load(src + col + 5);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c5;                       // Multiply by c[5]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-
-                vec_src0.load(src + col + 6);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c6;                       // Multiply by c[6]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-
-                vec_src0.load(src + col + 7);                     // Load the 8 elements
-                vec_src0 = vec_src0 * vec_c7;                       // Multiply by c[7]
-                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
-                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
-            }
-
-            vec_sum_low = (vec_sum_low + vec_offset);           // Add offset(value copied into all integer vector elements) to sum_low
-            vec_sum_high = (vec_sum_high + vec_offset);         // Add offset(value copied into all integer vector elements) to sum_high
-
-            vec_sum = compress(vec_sum_low, vec_sum_high);       // Save two integer vectors(Vec4i) to single short vector(Vec8s)
-            vec_sum = vec_sum >> shift;                         // This shift must be done after saving integer(two vec4i) data to short(Vec8s)
-
-            vec_sum.store(dst + col);                           // Store vector
-        }
-
-        for (; col < block_width; col++)                           // Remaining iterations
-        {
-            if (N == 8)
-            {
-                vec_src0.load(src + col);
-            }
-            else
-            {
-                vec_src0.load_partial(4, src + col);
-            }
-
-            vec_src0 = vec_src0 * vec_c;                        // Assuming that there is no overflow (Everywhere in this function!)
-            int sum = horizontal_add(vec_src0);
-            short val = (short)(sum + offset) >> shift;
-
-            dst[col] = val;
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
-void CDECL filterConvertPelToShort(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int width, int height)
-{
-    pixel* srcOrg = src;
-    short* dstOrg = dst;
-    int shift = IF_INTERNAL_PREC - bitDepth;
-    int row, col;
-    Vec8s src_v, dst_v, val_v;
-
-    for (row = 0; row < height; row++)
-    {
-        for (col = 0; col < width - 7; col += 8)
-        {
-            src_v.load(src + col);
-            val_v = src_v << shift;
-            dst_v = val_v - IF_INTERNAL_OFFS;
-            dst_v.store(dst + col);
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-
-    if (width % 8 != 0)
-    {
-        src = srcOrg;
-        dst = dstOrg;
-        col = width - (width % 8);
-        for (row = 0; row < height; row++)
-        {
-            src_v.load(src + col);
-            val_v = src_v << shift;
-            dst_v = val_v - IF_INTERNAL_OFFS;
-            dst_v.store_partial(width - col, dst + col);
-
-            src += srcStride;
-            dst += dstStride;
-        }
-    }
-}
-
-void CDECL filterConvertShortToPel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height)
-{
-    short* srcOrg = src;
-    pixel* dstOrg = dst;
-    int shift = IF_INTERNAL_PREC - bitDepth;
-    short offset = IF_INTERNAL_OFFS;
-
-    offset += shift ? (1 << (shift - 1)) : 0;
-    short maxVal = (1 << bitDepth) - 1;
-    Vec8s minVal(0);
-    int row, col;
-    Vec8s src_c, val_c;
-    for (row = 0; row < height; row++)
-    {
-        for (col = 0; col < width - 7; col += 8)
-        {
-            src_c.load(src + col);
-            val_c = add_saturated(src_c, offset) >> shift;
-            val_c = max(val_c, minVal);
-            val_c = min(val_c, maxVal);
-            val_c.store(dst + col);
-        }
-
-        src += srcStride;
-        dst += dstStride;
-    }
-
-    if (width % 8 != 0)
-    {
-        src = srcOrg;
-        dst = dstOrg;
-        col = width - (width % 8);
-        for (row = 0; row < height; row++)
-        {
-            src_c.load(src + col);
-            val_c = add_saturated(src_c, offset) >> shift;
-            val_c = max(val_c, minVal);
-            val_c = min(val_c, maxVal);
-            val_c.store_partial(width - col, dst + col);
-
-            src += srcStride;
-            dst += dstStride;
-        }
-    }
-}
-
-void Setup_Vec_IPFilterPrimitives(EncoderPrimitives& p)
-{
-#if HIGH_BIT_DEPTH
-    p.ipFilter_s_p[FILTER_V_S_P_8] = filterVertical_short_pel<8>;
-    p.ipFilter_p_p[FILTER_H_P_P_8] = filterHorizontal_pel_pel<8>;
-    p.ipFilter_p_s[FILTER_H_P_S_8] = filterHorizontal_pel_short<8>;
-    p.ipFilter_p_p[FILTER_H_P_P_4] = filterHorizontal_pel_pel<4>;
-    p.ipFilter_p_s[FILTER_H_P_S_4] = filterHorizontal_pel_short<4>;
-    p.ipFilter_s_p[FILTER_V_S_P_4] = filterVertical_short_pel<4>;
-    p.ipfilterConvert_p_s = filterConvertPelToShort;
-    p.ipfilterConvert_s_p = filterConvertShortToPel;
-#endif
-}
-#if _MSC_VER
-#pragma warning(default: 4100)
-#endif
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/encoder/vec/intrapred.inc	Mon May 06 14:34:00 2013 +0530
@@ -0,0 +1,157 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@multicorewareinc.com.
+ *****************************************************************************/
+
+// Function for calculating DC value of the reference samples used in Intra prediction
+pixel CDECL predIntraGetPredValDC(pixel* pSrc, intptr_t iSrcStride, intptr_t iWidth, intptr_t iHeight, int bAbove, int bLeft)
+{
+    //assert(iWidth == iHeight); // all of Intra is NxN
+    int iSum = 0;
+    int logSize = g_aucConvertToBit[iWidth] + 2;
+    pixel *pSrcAbove = &pSrc[-iSrcStride];
+
+#if HIGH_BIT_DEPTH
+    Vec8s sumAbove(0);
+    Vec8s m0;
+
+    if (bAbove)
+    {
+        switch( iWidth )
+        {
+            case 4:
+                m0.load_partial(4, pSrcAbove);   // partial load of the four above samples
+                sumAbove = m0;
+                break;
+            case 8:
+                m0.load(pSrcAbove);
+                sumAbove = m0;
+                break;
+            case 16:
+                m0.load(pSrcAbove    );
+                sumAbove  = m0;
+                m0.load(pSrcAbove + 8);
+                sumAbove += m0;
+                break;
+            case 32:
+                m0.load(pSrcAbove    );
+                sumAbove  = m0;
+                m0.load(pSrcAbove + 8);
+                sumAbove += m0;
+                m0.load(pSrcAbove +16);
+                sumAbove += m0;
+                m0.load(pSrcAbove +24);
+                sumAbove += m0;
+                break;
+                //case 64:
+            default:
+                // CHECK_ME: the maximum supported bit_depth is 13 bits
+                m0.load(pSrcAbove    );
+                sumAbove  = m0;
+                m0.load(pSrcAbove + 8);
+                sumAbove += m0;
+                m0.load(pSrcAbove +16);
+                sumAbove += m0;
+                m0.load(pSrcAbove +24);
+                sumAbove += m0;
+                m0.load(pSrcAbove +32);
+                sumAbove += m0;
+                m0.load(pSrcAbove +40);
+                sumAbove += m0;
+                m0.load(pSrcAbove +48);
+                sumAbove += m0;
+                m0.load(pSrcAbove +56);
+                sumAbove += m0;
+                break;
+        }
+        iSum = horizontal_add_x(sumAbove);
+    }
+
+#else
+
+    if (bAbove)
+    {
+        Vec16uc pix;
+        Vec8us  im;
+        Vec4ui  im1, im2;
+
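+        // For the 32- and 64-wide cases below, pix.sad(zero) (PSADBW) sums
+        // each group of eight bytes into the two 64-bit halves of the
+        // register; the (im1 >> const_int(64)) fold is intended to add the
+        // high half into the low half before toInt32() reads the total.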
+        switch( iWidth )
+        {
+            case 4:
+                pix.fromUint32(*(uint32_t*)pSrcAbove);
+                iSum = horizontal_add(extend_low(pix));
+                break;
+            case 8:
+#if _WIN64
+                pix.fromUint64(*(uint64_t*)pSrcAbove);
+#else
+                pix.load_partial(8, pSrcAbove);
+#endif
+                iSum = horizontal_add(extend_low(pix));
+                break;
+            case 16:
+                pix.load(pSrcAbove);
+                iSum = horizontal_add_x(pix);
+                break;
+            case 32:
+                pix.load(pSrcAbove);
+                im1 = Vec4ui(pix.sad(_mm_setzero_si128()));
+                pix.load(pSrcAbove + 16);
+                im1 += Vec4ui(pix.sad(_mm_setzero_si128()));
+                im1 += (im1 >> const_int(64));
+                iSum += toInt32(im1);
+                break;
+            //case 64:
+            default:
+                pix.load(pSrcAbove);
+                im1 = Vec4ui(pix.sad(_mm_setzero_si128()));
+                pix.load(pSrcAbove + 16);
+                im1 += Vec4ui(pix.sad(_mm_setzero_si128()));
+                pix.load(pSrcAbove + 32);
+                im1 += Vec4ui(pix.sad(_mm_setzero_si128()));
+                pix.load(pSrcAbove + 48);
+                im1 += Vec4ui(pix.sad(_mm_setzero_si128()));
+                im1 += (im1 >> const_int(64));
+                //im1 += extract_hi64(im1);
+                iSum += toInt32(im1);
+                break;
+        }
+    }
+#endif
+
+    if (bLeft)
+    {
+        for (int iInd = 0; iInd < iHeight; iInd++)
+        {
+            iSum += pSrcAbove[iSrcStride - 1];
+            pSrcAbove += iSrcStride;
+        }
+    }
+
+    //assert(bAbove || bLeft);
+    logSize += (bAbove + bLeft - 1);
+    return (iSum + (1<<(logSize-1))) >> logSize;
+}
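+
+// Worked example (illustrative): for an 8x8 block with both neighbour sample
+// rows available, logSize starts at log2(8) = 3 and (bAbove + bLeft - 1)
+// raises it to 4, so the 16 reference samples are averaged as
+// (iSum + 8) >> 4, e.g. predIntraGetPredValDC(fenc, stride, 8, 8, 1, 1).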
+
+void Setup_Vec_IPredPrimitives(EncoderPrimitives& p)
+{
+    p.getdcval_p = predIntraGetPredValDC;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/encoder/vec/ipfilter.inc	Mon May 06 14:34:00 2013 +0530
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
+ *          Rajesh Paulraj <rajesh@multicorewareinc.com>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@multicorewareinc.com.
+ *****************************************************************************/
+
+#include <assert.h>
+
+void Setup_Vec_IPFilterPrimitives(EncoderPrimitives& p)
+{
+#if HIGH_BIT_DEPTH
+    p.ipFilter_s_p[FILTER_V_S_P_8] = filterVertical_short_pel<8>;
+    p.ipFilter_p_p[FILTER_H_P_P_8] = filterHorizontal_pel_pel<8>;
+    p.ipFilter_p_s[FILTER_H_P_S_8] = filterHorizontal_pel_short<8>;
+    p.ipFilter_p_p[FILTER_H_P_P_4] = filterHorizontal_pel_pel<4>;
+    p.ipFilter_p_s[FILTER_H_P_S_4] = filterHorizontal_pel_short<4>;
+    p.ipFilter_s_p[FILTER_V_S_P_4] = filterVertical_short_pel<4>;
+    p.ipfilterConvert_p_s = filterConvertPelToShort;
+    p.ipfilterConvert_s_p = filterConvertShortToPel;
+#else
+    if (&p) return; // reference the parameter to silence unused-parameter warnings
+#endif
+}
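+
+// The <8> and <4> instantiations correspond to HEVC's 8-tap luma and 4-tap
+// chroma interpolation filters; the kernels themselves are expected to come
+// from ipfilter8.inc or ipfilter16.inc, included ahead of this file.
+// Illustrative call through the primitives table, using the names above:
+//   p.ipFilter_p_p[FILTER_H_P_P_8](bitDepth, src, srcStride, dst, dstStride, w, h, coeff);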
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/encoder/vec/ipfilter16.inc	Mon May 06 14:34:00 2013 +0530
@@ -0,0 +1,531 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
+ *          Rajesh Paulraj <rajesh@multicorewareinc.com>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@multicorewareinc.com.
+ *****************************************************************************/
+
+#include <assert.h>
+
+#if _MSC_VER
+#pragma warning(disable: 4100) // unreferenced formal parameter
+#endif
+
+#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
+#define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
+#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
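+
+// For the HIGH_BIT_DEPTH build at bitDepth = 10 (illustrative): headRoom is
+// IF_INTERNAL_PREC - 10 = 4, filterVertical_short_pel below shifts by
+// IF_FILTER_PREC + headRoom = 10, and its rounding offset is
+// (1 << 9) + (IF_INTERNAL_OFFS << IF_FILTER_PREC).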
+
+template<int N>
+void CDECL filterVertical_short_pel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
+{
+    int row, col;
+
+    int cstride =  srcStride;
+
+    src -= (N / 2 - 1) * cstride;
+
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+
+    shift += headRoom;
+    offset = 1 << (shift - 1);
+    offset +=  IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    maxVal = (1 << bitDepth) - 1;
+
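+    // Each 16-bit tap is replicated into four 32-bit lanes so a full Vec4i of
+    // one coefficient can be loaded per tap inside the column loops below.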
+    int cm[8][4];
+    for (int i = 0; i < N; i++)
+    {
+        cm[i][0] = coeff[i];
+        cm[i][1] = coeff[i];
+        cm[i][2] = coeff[i];
+        cm[i][3] = coeff[i];
+    }
+
+    for (row = 0; row < block_height; row++)
+    {
+        for (col = 0; col < block_width - 7; col += 8)
+        {
+            Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
+            Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
+            Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            row0.load(&src[col]);
+            row1.load(&src[col + cstride]);
+
+            c0.load(cm[0]);
+            c1.load(cm[1]);
+
+            row0_first = extend_low(row0);
+            row1_first = extend_low(row1);
+            row0_last = extend_high(row0);
+            row1_last = extend_high(row1);
+
+            row0_first = row0_first * c0;
+            row1_first = row1_first * c1;
+            row0_last = row0_last * c0;
+            row1_last = row1_last * c1;
+
+            sum_first = row0_first + row1_first;
+            sum_last = row0_last + row1_last;
+
+            row2.load(&src[col + 2 * cstride]);
+            row3.load(&src[col + 3 * cstride]);
+
+            c2.load(cm[2]);
+            c3.load(cm[3]);
+
+            row0_first = extend_low(row2);
+            row0_last = extend_high(row2);
+            row0_first = row0_first * c2;
+            row0_last = row0_last * c2;
+            row1_first = extend_low(row3);
+            row1_last = extend_high(row3);
+            row1_first = row1_first * c3;
+            row1_last = row1_last * c3;
+            sum_first += row0_first + row1_first;
+            sum_last += row0_last + row1_last;
+
+            if (N == 8)
+            {
+                row4.load(&src[col + 4 * cstride]);
+                row5.load(&src[col + 5 * cstride]);
+
+                c4.load(cm[4]);
+                c5.load(cm[5]);
+
+                row0_first = extend_low(row4);
+                row0_last = extend_high(row4);
+                row0_first = row0_first * c4;
+                row0_last = row0_last * c4;
+                row1_first = extend_low(row5);
+                row1_last = extend_high(row5);
+                row1_first = row1_first * c5;
+                row1_last = row1_last * c5;
+                sum_first += row0_first + row1_first;
+                sum_last += row0_last + row1_last;
+
+                row6.load(&src[col + 6 * cstride]);
+                row7.load(&src[col + 7 * cstride]);
+
+                c6.load(cm[6]);
+                c7.load(cm[7]);
+
+                row0_first = extend_low(row6);
+                row0_last = extend_high(row6);
+                row0_first = row0_first * c6;
+                row0_last = row0_last * c6;
+                row1_first = extend_low(row7);
+                row1_last = extend_high(row7);
+                row1_first = row1_first * c7;
+                row1_last = row1_last * c7;
+                sum_first += row0_first + row1_first;
+                sum_last += row0_last + row1_last;
+            }
+
+            sum_first = (sum_first + offset)  >> shift;
+            sum_last = (sum_last + offset)  >> shift;
+
+            Vec4i zero(0);
+            sum = compress(sum_first, sum_last);
+
+            sum = max(sum, 0);
+            Vec8s maxVal_v(maxVal);
+            sum = min(sum, maxVal_v);
+
+            sum.store(dst + col);
+        }
+
+        // Handle the case when block_width is not a multiple of 8
+        for (; col < block_width; col += 4)
+        {
+            Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
+            Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
+            Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            row0.load(&src[col]);
+            row1.load(&src[col + cstride]);
+
+            c0.load(cm[0]);
+            c1.load(cm[1]);
+
+            row0_first = extend_low(row0);
+            row1_first = extend_low(row1);
+            row0_first = row0_first * c0;
+            row1_first = row1_first * c1;
+
+            sum_first = row0_first + row1_first;
+
+            row2.load(&src[col + 2 * cstride]);
+            row3.load(&src[col + 3 * cstride]);
+
+            c2.load(cm[2]);
+            c3.load(cm[3]);
+
+            row0_first = extend_low(row2);
+            row0_first = row0_first * c2;
+            row1_first = extend_low(row3);
+            row1_first = row1_first * c3;
+            sum_first += row0_first + row1_first;
+
+            if (N == 8)
+            {
+                row4.load(&src[col + 4 * cstride]);
+                row5.load(&src[col + 5 * cstride]);
+
+                c4.load(cm[4]);
+                c5.load(cm[5]);
+
+                row0_first = extend_low(row4);
+                row0_first = row0_first * c4;
+                row1_first = extend_low(row5);
+                row1_first = row1_first * c5;
+                sum_first += row0_first + row1_first;
+
+                row6.load(&src[col + 6 * cstride]);
+                row7.load(&src[col + 7 * cstride]);
+
+                c6.load(cm[6]);
+                c7.load(cm[7]);
+
+                row0_first = extend_low(row6);
+                row0_first = row0_first * c6;
+                row1_first = extend_low(row7);
+                row1_first = row1_first * c7;
+                sum_first += row0_first + row1_first;
+            }
+
+            sum_first = (sum_first + offset)  >> shift;
+
+            Vec4i zero(0);
+            sum = compress(sum_first, zero);
+
+            sum = max(sum, 0);
+            Vec8s maxVal_v(maxVal);
+            sum = min(sum, maxVal_v);
+
+            sum.store_partial(block_width - col, dst + col);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+template<int N>
+void CDECL filterHorizontal_pel_pel(int bitDepth, pixel *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
+{
+    int row, col;
+
+    src -= (N / 2 - 1);                                   // Here cStride = 1
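+    // Stepping back N/2 - 1 samples centres the N-tap window: the taps then
+    // read src[col] through src[col + N - 1] around the interpolated position.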
+
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    offset =  (1 << (headRoom - 1));
+    maxVal = (1 << bitDepth) - 1;
+
+    Vec4i vec_sum_low, vec_sum_high;
+    Vec8s vec_src0, vec_sum, vec_c;
+    vec_c.load(coeff);
+    // NOTE: all eight taps are broadcast even when N == 4, so coeff[] is
+    // assumed to be addressable through eight entries.
+    Vec8s vec_c0(coeff[0]), vec_c1(coeff[1]), vec_c2(coeff[2]), vec_c3(coeff[3]);
+    Vec8s vec_c4(coeff[4]), vec_c5(coeff[5]), vec_c6(coeff[6]), vec_c7(coeff[7]);
+    Vec4i vec_offset(offset);
+    Vec8s vec_maxVal(maxVal);
+    for (row = 0; row < block_height; row++)
+    {
+        col = 0;
+        for (; col < (block_width - 7); col += 8)                   // Iterations multiple of 8
+        {
+            vec_src0.load(src + col);                         // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c0;                       // Multiply by c[0]
+            vec_sum_low = extend_low(vec_src0);                 // Convert to integer lower bits
+            vec_sum_high = extend_high(vec_src0);               // Convert to integer higher bits
+
+            vec_src0.load(src + col + 1);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c1;                       // Multiply by c[1]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 2);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c2;                       // Multiply by c[2]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 3);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c3;                       // Multiply by c[3]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            if (N == 8)
+            {
+                vec_src0.load(src + col + 4);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c4;                       // Multiply by c[4]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 5);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c5;                       // Multiply by c[5]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 6);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c6;                       // Multiply by c[6]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 7);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c7;                       // Multiply by c[7]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+            }
+
+            vec_sum_low = (vec_sum_low + vec_offset);           // Add offset(value copied into all integer vector elements) to sum_low
+            vec_sum_high = (vec_sum_high + vec_offset);         // Add offset(value copied into all integer vector elements) to sum_high
+
+            vec_sum = compress(vec_sum_low, vec_sum_high);       // Save two integer vectors(Vec4i) to single short vector(Vec8s)
+            vec_sum = vec_sum >> headRoom;                         // This shift must be done after saving integer(two vec4i) data to short(Vec8s)
+
+            vec_sum = max(vec_sum, 0);                          // (val < 0) ? 0 : val;
+            vec_sum = min(vec_sum, vec_maxVal);                 // (val > maxVal) ? maxVal : val;
+
+            vec_sum.store(dst + col);                           // Store vector
+        }
+
+        for (; col < block_width; col++)                           // Remaining iterations
+        {
+            if (N == 8)
+            {
+                vec_src0.load(src + col);
+            }
+            else
+            {
+                vec_src0.load_partial(4, src + col);
+            }
+
+            vec_src0 = vec_src0 * vec_c;                        // Assuming that there is no overflow (Everywhere in this function!)
+            int sum = horizontal_add(vec_src0);
+            short val = (short)((sum + offset) >> headRoom);   // shift in int, then narrow to short
+            val = (val < 0) ? 0 : val;
+            val = (val > maxVal) ? maxVal : val;
+
+            dst[col] = (pixel)val;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+template<int N>
+void CDECL filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int block_width, int block_height, short const *coeff)
+{
+    int row, col;
+
+    src -= (N / 2 - 1);
+
+    int offset;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+
+    shift -= headRoom;
+    offset = -IF_INTERNAL_OFFS << shift;
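+    // No rounding term is added here: the output stays at IF_INTERNAL_PREC
+    // intermediate precision (net shift IF_FILTER_PREC - headRoom), with
+    // IF_INTERNAL_OFFS pre-subtracted via this negative offset.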
+
+    Vec4i vec_sum_low, vec_sum_high;
+    Vec8s vec_src0, vec_sum, vec_c;
+    vec_c.load(coeff);
+    Vec8s vec_c0(coeff[0]), vec_c1(coeff[1]), vec_c2(coeff[2]), vec_c3(coeff[3]);
+    Vec8s vec_c4(coeff[4]), vec_c5(coeff[5]), vec_c6(coeff[6]), vec_c7(coeff[7]);
+    Vec4i vec_offset(offset);
+
+    for (row = 0; row < block_height; row++)
+    {
+        col = 0;
+        for (; col < (block_width - 7); col += 8)                   // Iterations multiple of 8
+        {
+            vec_src0.load(src + col);                         // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c0;                       // Multiply by c[0]
+            vec_sum_low = extend_low(vec_src0);                 // Convert to integer lower bits
+            vec_sum_high = extend_high(vec_src0);               // Convert to integer higher bits
+
+            vec_src0.load(src + col + 1);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c1;                       // Multiply by c[1]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 2);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c2;                       // Multiply by c[2]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 3);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c3;                       // Multiply by c[3]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            if (N == 8)
+            {
+                vec_src0.load(src + col + 4);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c4;                       // Multiply by c[4]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 5);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c5;                       // Multiply by c[5]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 6);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c6;                       // Multiply by c[6]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 7);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c7;                       // Multiply by c[7]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+            }
+
+            vec_sum_low = (vec_sum_low + vec_offset);           // Add offset(value copied into all integer vector elements) to sum_low
+            vec_sum_high = (vec_sum_high + vec_offset);         // Add offset(value copied into all integer vector elements) to sum_high
+
+            vec_sum = compress(vec_sum_low, vec_sum_high);       // Save two integer vectors(Vec4i) to single short vector(Vec8s)
+            vec_sum = vec_sum >> shift;                         // This shift must be done after saving integer(two vec4i) data to short(Vec8s)
+
+            vec_sum.store(dst + col);                           // Store vector
+        }
+
+        for (; col < block_width; col++)                           // Remaining iterations
+        {
+            if (N == 8)
+            {
+                vec_src0.load(src + col);
+            }
+            else
+            {
+                vec_src0.load_partial(4, src + col);
+            }
+
+            vec_src0 = vec_src0 * vec_c;                        // Assuming that there is no overflow (Everywhere in this function!)
+            int sum = horizontal_add(vec_src0);
+            short val = (short)((sum + offset) >> shift);      // shift in int, then narrow to short
+
+            dst[col] = val;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+void CDECL filterConvertPelToShort(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int width, int height)
+{
+    pixel* srcOrg = src;
+    short* dstOrg = dst;
+    int shift = IF_INTERNAL_PREC - bitDepth;
+    int row, col;
+    Vec8s src_v, dst_v, val_v;
+
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width - 7; col += 8)
+        {
+            src_v.load(src + col);
+            val_v = src_v << shift;
+            dst_v = val_v - IF_INTERNAL_OFFS;
+            dst_v.store(dst + col);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+
+    if (width % 8 != 0)
+    {
+        src = srcOrg;
+        dst = dstOrg;
+        col = width - (width % 8);
+        for (row = 0; row < height; row++)
+        {
+            src_v.load(src + col);
+            val_v = src_v << shift;
+            dst_v = val_v - IF_INTERNAL_OFFS;
+            dst_v.store_partial(width - col, dst + col);
+
+            src += srcStride;
+            dst += dstStride;
+        }
+    }
+}
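+
+// Illustrative: at bitDepth = 10 the conversion is dst = (src << 4) - 8192,
+// mapping the pixel range [0, 1023] onto [-8192, 8176] at 14-bit precision.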
+
+void CDECL filterConvertShortToPel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height)
+{
+    short* srcOrg = src;
+    pixel* dstOrg = dst;
+    int shift = IF_INTERNAL_PREC - bitDepth;
+    short offset = IF_INTERNAL_OFFS;
+
+    offset += shift ? (1 << (shift - 1)) : 0;
+    short maxVal = (1 << bitDepth) - 1;
+    Vec8s minVal(0);
+    int row, col;
+    Vec8s src_c, val_c;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width - 7; col += 8)
+        {
+            src_c.load(src + col);
+            val_c = add_saturated(src_c, offset) >> shift;
+            val_c = max(val_c, minVal);
+            val_c = min(val_c, maxVal);
+            val_c.store(dst + col);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+
+    if (width % 8 != 0)
+    {
+        src = srcOrg;
+        dst = dstOrg;
+        col = width - (width % 8);
+        for (row = 0; row < height; row++)
+        {
+            src_c.load(src + col);
+            val_c = add_saturated(src_c, offset) >> shift;
+            val_c = max(val_c, minVal);
+            val_c = min(val_c, maxVal);
+            val_c.store_partial(width - col, dst + col);
+
+            src += srcStride;
+            dst += dstStride;
+        }
+    }
+}
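+
+// Illustrative: at bitDepth = 10 this inverts filterConvertPelToShort with a
+// saturating add of IF_INTERNAL_OFFS + (1 << 3) = 8200, an arithmetic shift
+// right by 4, and a clip to [0, 1023].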
+
+#if _MSC_VER
+#pragma warning(default: 4100)
+#endif
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/encoder/vec/ipfilter8.inc	Mon May 06 14:34:00 2013 +0530
@@ -0,0 +1,529 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Deepthi Devaki <deepthidevaki@multicorewareinc.com>,
+ *          Rajesh Paulraj <rajesh@multicorewareinc.com>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@multicorewareinc.com.
+ *****************************************************************************/
+
+#include <assert.h>
+
+#if _MSC_VER
+#pragma warning(disable: 4100) // unreferenced formal parameter
+#endif
+
+#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
+#define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
+#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
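+
+// In the 8-bit build (illustrative values): headRoom = IF_INTERNAL_PREC - 8 = 6,
+// so filterHorizontal_pel_pel rounds with offset = 1 << 5 = 32 and clips to
+// maxVal = 255, while filterHorizontal_pel_short ends up with shift = 0.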
+
+template<int N>
+void CDECL filterVertical_short_pel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
+{
+    int row, col;
+
+    int cstride =  srcStride;
+
+    src -= (N / 2 - 1) * cstride;
+
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+
+    shift += headRoom;
+    offset = 1 << (shift - 1);
+    offset +=  IF_INTERNAL_OFFS << IF_FILTER_PREC;
+    maxVal = (1 << bitDepth) - 1;
+
+    int cm[8][4];
+    for (int i = 0; i < N; i++)
+    {
+        cm[i][0] = coeff[i];
+        cm[i][1] = coeff[i];
+        cm[i][2] = coeff[i];
+        cm[i][3] = coeff[i];
+    }
+
+    for (row = 0; row < block_height; row++)
+    {
+        for (col = 0; col < block_width - 7; col += 8)
+        {
+            Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
+            Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
+            Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            row0.load(&src[col]);
+            row1.load(&src[col + cstride]);
+
+            c0.load(cm[0]);
+            c1.load(cm[1]);
+
+            row0_first = extend_low(row0);
+            row1_first = extend_low(row1);
+            row0_last = extend_high(row0);
+            row1_last = extend_high(row1);
+
+            row0_first = row0_first * c0;
+            row1_first = row1_first * c1;
+            row0_last = row0_last * c0;
+            row1_last = row1_last * c1;
+
+            sum_first = row0_first + row1_first;
+            sum_last = row0_last + row1_last;
+
+            row2.load(&src[col + 2 * cstride]);
+            row3.load(&src[col + 3 * cstride]);
+
+            c2.load(cm[2]);
+            c3.load(cm[3]);
+
+            row0_first = extend_low(row2);
+            row0_last = extend_high(row2);
+            row0_first = row0_first * c2;
+            row0_last = row0_last * c2;
+            row1_first = extend_low(row3);
+            row1_last = extend_high(row3);
+            row1_first = row1_first * c3;
+            row1_last = row1_last * c3;
+            sum_first += row0_first + row1_first;
+            sum_last += row0_last + row1_last;
+
+            if (N == 8)
+            {
+                row4.load(&src[col + 4 * cstride]);
+                row5.load(&src[col + 5 * cstride]);
+
+                c4.load(cm[4]);
+                c5.load(cm[5]);
+
+                row0_first = extend_low(row4);
+                row0_last = extend_high(row4);
+                row0_first = row0_first * c4;
+                row0_last = row0_last * c4;
+                row1_first = extend_low(row5);
+                row1_last = extend_high(row5);
+                row1_first = row1_first * c5;
+                row1_last = row1_last * c5;
+                sum_first += row0_first + row1_first;
+                sum_last += row0_last + row1_last;
+
+                row6.load(&src[col + 6 * cstride]);
+                row7.load(&src[col + 7 * cstride]);
+
+                c6.load(cm[6]);
+                c7.load(cm[7]);
+
+                row0_first = extend_low(row6);
+                row0_last = extend_high(row6);
+                row0_first = row0_first * c6;
+                row0_last = row0_last * c6;
+                row1_first = extend_low(row7);
+                row1_last = extend_high(row7);
+                row1_first = row1_first * c7;
+                row1_last = row1_last * c7;
+                sum_first += row0_first + row1_first;
+                sum_last += row0_last + row1_last;
+            }
+
+            sum_first = (sum_first + offset)  >> shift;
+            sum_last = (sum_last + offset)  >> shift;
+
+            Vec4i zero(0);
+            sum = compress(sum_first, sum_last);
+
+            sum = max(sum, 0);
+            Vec8s maxVal_v(maxVal);
+            sum = min(sum, maxVal_v);
+
+            sum.store(dst + col);
+        }
+
+        // Handle the case when block_width is not a multiple of 8
+        for (; col < block_width; col += 4)
+        {
+            Vec8s row0, row1, row2, row3, row4, row5, row6, row7, sum;
+            Vec4i row0_first, row0_last, row1_first, row1_last, sum_first, sum_last;
+            Vec4i c0, c1, c2, c3, c4, c5, c6, c7;
+
+            row0.load(&src[col]);
+            row1.load(&src[col + cstride]);
+
+            c0.load(cm[0]);
+            c1.load(cm[1]);
+
+            row0_first = extend_low(row0);
+            row1_first = extend_low(row1);
+            row0_first = row0_first * c0;
+            row1_first = row1_first * c1;
+
+            sum_first = row0_first + row1_first;
+
+            row2.load(&src[col + 2 * cstride]);
+            row3.load(&src[col + 3 * cstride]);
+
+            c2.load(cm[2]);
+            c3.load(cm[3]);
+
+            row0_first = extend_low(row2);
+            row0_first = row0_first * c2;
+            row1_first = extend_low(row3);
+            row1_first = row1_first * c3;
+            sum_first += row0_first + row1_first;
+
+            if (N == 8)
+            {
+                row4.load(&src[col + 4 * cstride]);
+                row5.load(&src[col + 5 * cstride]);
+
+                c4.load(cm[4]);
+                c5.load(cm[5]);
+
+                row0_first = extend_low(row4);
+                row0_first = row0_first * c4;
+                row1_first = extend_low(row5);
+                row1_first = row1_first * c5;
+                sum_first += row0_first + row1_first;
+
+                row6.load(&src[col + 6 * cstride]);
+                row7.load(&src[col + 7 * cstride]);
+
+                c6.load(cm[6]);
+                c7.load(cm[7]);
+
+                row0_first = extend_low(row6);
+                row0_first = row0_first * c6;
+                row1_first = extend_low(row7);
+                row1_first = row1_first * c7;
+                sum_first += row0_first + row1_first;
+            }
+
+            sum_first = (sum_first + offset)  >> shift;
+
+            Vec4i zero(0);
+            sum = compress(sum_first, zero);
+
+            sum = max(sum, 0);
+            Vec8s maxVal_v(maxVal);
+            sum = min(sum, maxVal_v);
+
+            sum.store_partial(block_width - col, dst + col);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+template<int N>
+void CDECL filterHorizontal_pel_pel(int bitDepth, pixel *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
+{
+    int row, col;
+
+    src -= (N / 2 - 1);                                   // Here cStride = 1
+
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    offset =  (1 << (headRoom - 1));
+    maxVal = (1 << bitDepth) - 1;
+
+    Vec4i vec_sum_low, vec_sum_high;
+    Vec8s vec_src0, vec_sum, vec_c;
+    vec_c.load(coeff);
+    Vec8s vec_c0(coeff[0]), vec_c1(coeff[1]), vec_c2(coeff[2]), vec_c3(coeff[3]);
+    Vec8s vec_c4(coeff[4]), vec_c5(coeff[5]), vec_c6(coeff[6]), vec_c7(coeff[7]);
+    Vec4i vec_offset(offset);
+    Vec8s vec_maxVal(maxVal);
+    for (row = 0; row < block_height; row++)
+    {
+        col = 0;
+        for (; col < (block_width - 7); col += 8)                   // Iterations multiple of 8
+        {
+            vec_src0.load(src + col);                         // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c0;                       // Multiply by c[0]
+            vec_sum_low = extend_low(vec_src0);                 // Convert to integer lower bits
+            vec_sum_high = extend_high(vec_src0);               // Convert to integer higher bits
+
+            vec_src0.load(src + col + 1);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c1;                       // Multiply by c[1]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 2);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c2;                       // Multiply by c[2]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 3);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c3;                       // Multiply by c[3]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            if (N == 8)
+            {
+                vec_src0.load(src + col + 4);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c4;                       // Multiply by c[4]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 5);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c5;                       // Multiply by c[5]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 6);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c6;                       // Multiply by c[6]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 7);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c7;                       // Multiply by c[7]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+            }
+
+            vec_sum_low = (vec_sum_low + vec_offset);           // Add offset(value copied into all integer vector elements) to sum_low
+            vec_sum_high = (vec_sum_high + vec_offset);         // Add offset(value copied into all integer vector elements) to sum_high
+
+            vec_sum = compress(vec_sum_low, vec_sum_high);       // Save two integer vectors(Vec4i) to single short vector(Vec8s)
+            vec_sum = vec_sum >> headRoom;                         // This shift must be done after saving integer(two vec4i) data to short(Vec8s)
+
+            vec_sum = max(vec_sum, 0);                          // (val < 0) ? 0 : val;
+            vec_sum = min(vec_sum, vec_maxVal);                 // (val > maxVal) ? maxVal : val;
+
+            vec_sum.store(dst + col);                           // Store vector
+        }
+
+        for (; col < block_width; col++)                           // Remaining iterations
+        {
+            if (N == 8)
+            {
+                vec_src0.load(src + col);
+            }
+            else
+            {
+                vec_src0.load_partial(4, src + col);
+            }
+
+            vec_src0 = vec_src0 * vec_c;                        // Assuming that there is no overflow (Everywhere in this function!)
+            int sum = horizontal_add(vec_src0);
+            short val = (short)((sum + offset) >> headRoom);   // shift in int, then narrow to short
+            val = (val < 0) ? 0 : val;
+            val = (val > maxVal) ? maxVal : val;
+
+            dst[col] = (pixel)val;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
+
+template<int N>
+void CDECL filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int block_width, int block_height, short const *coeff)
+{
+    int row, col;
+
+    src -= (N / 2 - 1);
+
+    int offset;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+
+    shift -= headRoom;
+    offset = -IF_INTERNAL_OFFS << shift;
+
+    Vec4i vec_sum_low, vec_sum_high;
+    Vec8s vec_src0, vec_sum, vec_c;
+    vec_c.load(coeff);
+    Vec8s vec_c0(coeff[0]), vec_c1(coeff[1]), vec_c2(coeff[2]), vec_c3(coeff[3]);
+    Vec8s vec_c4(coeff[4]), vec_c5(coeff[5]), vec_c6(coeff[6]), vec_c7(coeff[7]);
+    Vec4i vec_offset(offset);
+
+    for (row = 0; row < block_height; row++)
+    {
+        col = 0;
+        for (; col < (block_width - 7); col += 8)                   // Iterations multiple of 8
+        {
+            vec_src0.load(src + col);                         // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c0;                       // Multiply by c[0]
+            vec_sum_low = extend_low(vec_src0);                 // Convert to integer lower bits
+            vec_sum_high = extend_high(vec_src0);               // Convert to integer higher bits
+
+            vec_src0.load(src + col + 1);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c1;                       // Multiply by c[1]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 2);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c2;                       // Multiply by c[2]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            vec_src0.load(src + col + 3);                     // Load the 8 elements
+            vec_src0 = vec_src0 * vec_c3;                       // Multiply by c[3]
+            vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+            vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+            if (N == 8)
+            {
+                vec_src0.load(src + col + 4);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c4;                       // Multiply by c[4]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 5);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c5;                       // Multiply by c[5]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 6);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c6;                       // Multiply by c[6]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+
+                vec_src0.load(src + col + 7);                     // Load the 8 elements
+                vec_src0 = vec_src0 * vec_c7;                       // Multiply by c[7]
+                vec_sum_low += extend_low(vec_src0);                // Add integer lower bits to sum_low bits
+                vec_sum_high += extend_high(vec_src0);              // Add integer higher bits to sum_high bits
+            }
+
+            vec_sum_low = (vec_sum_low + vec_offset);           // Add offset(value copied into all integer vector elements) to sum_low
+            vec_sum_high = (vec_sum_high + vec_offset);         // Add offset(value copied into all integer vector elements) to sum_high
+
+            vec_sum = compress(vec_sum_low, vec_sum_high);       // Save two integer vectors(Vec4i) to single short vector(Vec8s)
+            vec_sum = vec_sum >> shift;                         // This shift must be done after saving integer(two vec4i) data to short(Vec8s)
+
+            vec_sum.store(dst + col);                           // Store vector
+        }
+
+        for (; col < block_width; col++)                           // Remaining iterations
+        {
+            if (N == 8)
+            {
+                vec_src0.load(src + col);
+            }
+            else
+            {
+                vec_src0.load_partial(4, src + col);
+            }
+
+            vec_src0 = vec_src0 * vec_c;                        // Assumes no overflow (an assumption made throughout this function)
+            int sum = horizontal_add(vec_src0);
+            short val = (short)((sum + offset) >> shift);       // Shift at integer precision, then narrow to short
+
+            dst[col] = val;
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+}
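For reference, each output sample computed by the vectorized loop above is the plain N-tap convolution shown in this scalar sketch (a minimal illustration, not part of the change; filterHorizontalRef is a hypothetical name, and offset/shift are assumed to match the values derived earlier in the function):

    // Scalar reference for the vectorized horizontal filter above
    // (hypothetical illustration; N is the tap count, c[] the coefficients).
    template<int N>
    void filterHorizontalRef(const short *src, int srcStride, short *dst, int dstStride,
                             int width, int height, const short *c, int offset, int shift)
    {
        for (int row = 0; row < height; row++)
        {
            for (int col = 0; col < width; col++)
            {
                int sum = 0;
                for (int t = 0; t < N; t++)                  // N-tap convolution along the row
                    sum += src[col + t] * c[t];
                dst[col] = (short)((sum + offset) >> shift); // round and narrow, as the SIMD path does
            }
            src += srcStride;
            dst += dstStride;
        }
    }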
+
+void CDECL filterConvertPelToShort(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int width, int height)
+{
+    pixel* srcOrg = src;
+    short* dstOrg = dst;
+    int shift = IF_INTERNAL_PREC - bitDepth;
+    int row, col;
+    Vec8s src_v, dst_v, val_v;
+
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width - 7; col += 8)
+        {
+            src_v.load(src + col);
+            val_v = src_v << shift;
+            dst_v = val_v - IF_INTERNAL_OFFS;
+            dst_v.store(dst + col);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+
+    if (width % 8 != 0)
+    {
+        src = srcOrg;
+        dst = dstOrg;
+        col = width - (width % 8);
+        for (row = 0; row < height; row++)
+        {
+            src_v.load(src + col);                    // full 8-lane load; assumes the row may be read past 'width' (the store below is partial)
+            val_v = src_v << shift;
+            dst_v = val_v - IF_INTERNAL_OFFS;
+            dst_v.store_partial(width - col, dst + col);
+
+            src += srcStride;
+            dst += dstStride;
+        }
+    }
+}
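Per sample, filterConvertPelToShort maps pixels into the filter's internal precision: dst = (src << shift) - IF_INTERNAL_OFFS, with shift = IF_INTERNAL_PREC - bitDepth. A scalar sketch of the same mapping (illustrative only; it relies on the pixel typedef and IF_* constants already used in this file):

    // Scalar equivalent of one element of filterConvertPelToShort (illustration).
    short pelToShortRef(pixel p, int bitDepth)
    {
        int shift = IF_INTERNAL_PREC - bitDepth;      // e.g. 14 - 10 = 4 for 10-bit input
        return (short)((p << shift) - IF_INTERNAL_OFFS);
    }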
+
+void CDECL filterConvertShortToPel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height)
+{
+    short* srcOrg = src;
+    pixel* dstOrg = dst;
+    int shift = IF_INTERNAL_PREC - bitDepth;
+    short offset = IF_INTERNAL_OFFS;
+
+    offset += shift ? (1 << (shift - 1)) : 0;
+    short maxVal = (1 << bitDepth) - 1;
+    Vec8s minVal(0);
+    int row, col;
+    Vec8s src_c, val_c;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width - 7; col += 8)
+        {
+            src_c.load(src + col);
+            val_c = add_saturated(src_c, offset) >> shift;
+            val_c = max(val_c, minVal);
+            val_c = min(val_c, maxVal);
+            val_c.store(dst + col);
+        }
+
+        src += srcStride;
+        dst += dstStride;
+    }
+
+    if (width % 8 != 0)
+    {
+        src = srcOrg;
+        dst = dstOrg;
+        col = width - (width % 8);
+        for (row = 0; row < height; row++)
+        {
+            src_c.load(src + col);
+            val_c = add_saturated(src_c, offset) >> shift;
+            val_c = max(val_c, minVal);
+            val_c = min(val_c, maxVal);
+            val_c.store_partial(width - col, dst + col);
+
+            src += srcStride;
+            dst += dstStride;
+        }
+    }
+}
+#if _MSC_VER
+#pragma warning(default: 4100)
+#endif
\ No newline at end of file
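filterConvertShortToPel inverts that mapping: it adds IF_INTERNAL_OFFS plus a rounding term, shifts back to pixel precision, and clamps to [0, (1 << bitDepth) - 1], which is what the add_saturated/max/min sequence above implements per vector. A scalar sketch (illustrative only; shortToPelRef is a hypothetical name):

    // Scalar equivalent of one element of filterConvertShortToPel (illustration).
    pixel shortToPelRef(short s, int bitDepth)
    {
        int shift  = IF_INTERNAL_PREC - bitDepth;
        int offset = IF_INTERNAL_OFFS + (shift ? (1 << (shift - 1)) : 0);
        int maxVal = (1 << bitDepth) - 1;
        int v = (s + offset) >> shift;
        return (pixel)(v < 0 ? 0 : (v > maxVal ? maxVal : v));
    }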
--- a/source/encoder/vec/pixel.inc	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/vec/pixel.inc	Mon May 06 14:34:00 2013 +0530
@@ -27,159 +27,6 @@
 
 /* File for pixels type-neutral code */
 
-template<int ly>
-int CDECL sad_4x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 16)
-        sum += sad_4<16>(piOrg + row * strideOrg, strideOrg, piCur + row * strideCur, strideCur);
-    return sum;
-}
-
-template<int ly>
-int CDECL sad_8x8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    /* groups of 8x16 blocks, upcasting sum from short to int often enough to avoid overflow */
-    int sum = 0;
-    for (int row = 0; row < ly; row += 8)
-        sum += sad_8<8>(piOrg + row * strideOrg, strideOrg, piCur + row * strideCur, strideCur);
-    return sum;
-}
-
-template<int ly>
-int CDECL sad_8x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    /* groups of 8x16 blocks, upcasting sum from short to int often enough to avoid overflow */
-    int sum = 0;
-    for (int row = 0; row < ly; row += 16)
-        sum += sad_8<16>(piOrg + row * strideOrg, strideOrg, piCur + row * strideCur, strideCur);
-    return sum;
-}
-
-template<int ly>
-int CDECL sad_12(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 4)
-    {
-        // cannot assume 8pel alignment, must use 4x4
-        sum += sad_4<4>(piOrg + row * strideOrg, strideOrg,
-                        piCur + row * strideCur, strideCur);
-        sum += sad_4<4>(piOrg + row * strideOrg + 4, strideOrg,
-                        piCur + row * strideCur + 4, strideCur);
-        sum += sad_4<4>(piOrg + row * strideOrg + 8, strideOrg,
-                        piCur + row * strideCur + 8, strideCur);
-    }
-    return sum;
-}
-
-template<int ly>
-int CDECL sad_12x8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 8)
-    {
-        // cannot assume 8pel alignment, must use 4x8
-        sum += sad_4<8>(piOrg + row * strideOrg, strideOrg,
-                        piCur + row * strideCur, strideCur);
-        sum += sad_4<8>(piOrg + row * strideOrg + 4, strideOrg,
-                        piCur + row * strideCur + 4, strideCur);
-        sum += sad_4<8>(piOrg + row * strideOrg + 8, strideOrg,
-                        piCur + row * strideCur + 8, strideCur);
-    }
-    return sum;
-}
-
-template<int lx, int ly>
-int CDECL sad_16x24(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 24)
-    {
-        for (int col = 0; col < lx; col += 16)
-        {
-            sum += sad_16<24>(piOrg + row * strideOrg + col, strideOrg,
-                              piCur + row * strideCur + col, strideCur);
-        }
-    }
-    return sum;
-}
-
-template<int lx, int ly>
-int CDECL sad_16x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 16)
-    {
-        for (int col = 0; col < lx; col += 16)
-        {
-            sum += sad_16<16>(piOrg + row * strideOrg + col, strideOrg,
-                              piCur + row * strideCur + col, strideCur);
-        }
-    }
-    return sum;
-}
-
-template<int lx, int ly>
-int CDECL sad_24(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 4)
-    {
-        for (int col = 0; col < lx; col += 8)
-        {
-            sum += sad_8<4>(piOrg + row * strideOrg + col, strideOrg,
-                            piCur + row * strideCur + col, strideCur);
-        }
-    }
-    return sum;
-}
-
-template<int lx, int ly>
-int CDECL sad_32x12(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 12)
-    {
-        for (int col = 0; col < lx; col += 32)
-        {
-            sum += sad_32<32,12>(piOrg + row * strideOrg + col, strideOrg,
-                              piCur + row * strideCur + col, strideCur);
-        }
-    }
-    return sum;
-}
-
-template<int lx, int ly>
-int CDECL sad_32x8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 8)
-    {
-        for (int col = 0; col < lx; col += 32)
-        {
-            sum += sad_32<32, 8>(piOrg + row * strideOrg + col, strideOrg,
-                                 piCur + row * strideCur + col, strideCur);
-        }
-    }
-    return sum;
-}
-
-template<int lx, int ly>
-int CDECL sad_32x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int sum = 0;
-    for (int row = 0; row < ly; row += 16)
-    {
-        for (int col = 0; col < lx; col += 32)
-        {
-            sum += sad_32<32, 16>(piOrg + row * strideOrg + col, strideOrg,
-                                 piCur + row * strideCur + col, strideCur);
-        }
-    }
-    return sum;
-}
-
 #if HIGH_BIT_DEPTH
 template<int lx, int ly>
 int CDECL satd(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
@@ -208,145 +55,217 @@ void Setup_Vec_PixelPrimitives(EncoderPr
     p.sad[PARTITION_4x16] = sad_4<16>;
     p.sad[PARTITION_4x24] = sad_4<24>;
     p.sad[PARTITION_4x32] = sad_4<32>;
-    p.sad[PARTITION_4x64] = sad_4x16<64>;
+    p.sad[PARTITION_4x48] = sad_4<48>;
+    p.sad[PARTITION_4x64] = sad_4<64>;
 
     p.sad[PARTITION_8x4] = sad_8<4>;
     p.sad[PARTITION_8x8] = sad_8<8>;
     p.sad[PARTITION_8x12] = sad_8<12>;
     p.sad[PARTITION_8x16] = sad_8<16>;
-    p.sad[PARTITION_8x24] = sad_8x8<24>;
-    p.sad[PARTITION_8x32] = sad_8x16<32>;
-    p.sad[PARTITION_8x64] = sad_8x16<64>;
+    p.sad[PARTITION_8x24] = sad_8<24>;
+    p.sad[PARTITION_8x32] = sad_8<32>;
+    p.sad[PARTITION_8x48] = sad_8<48>;
+    p.sad[PARTITION_8x64] = sad_8<64>;
 
     p.sad[PARTITION_12x4] = sad_12<4>;
     p.sad[PARTITION_12x8] = sad_12<8>;
     p.sad[PARTITION_12x12] = sad_12<12>;
-    p.sad[PARTITION_12x16] = sad_12x8<16>;
-    p.sad[PARTITION_12x24] = sad_12x8<24>;
-    p.sad[PARTITION_12x32] = sad_12x8<32>;
-    p.sad[PARTITION_12x64] = sad_12x8<64>;
+    p.sad[PARTITION_12x16] = sad_12<16>;
+    p.sad[PARTITION_12x24] = sad_12<24>;
+    p.sad[PARTITION_12x32] = sad_12<32>;
+    p.sad[PARTITION_12x48] = sad_12<48>;
+    p.sad[PARTITION_12x64] = sad_12<64>;
 
     p.sad[PARTITION_16x4] = sad_16<4>;
     p.sad[PARTITION_16x8] = sad_16<8>;
     p.sad[PARTITION_16x12] = sad_16<12>;
     p.sad[PARTITION_16x16] = sad_16<16>;
-    //p.sad[PARTITION_16x24] = sad_16x24<16,24>;
-    p.sad[PARTITION_16x32] = sad_16x16<16, 32>;
-    p.sad[PARTITION_16x64] = sad_16x16<16, 64>;
+    p.sad[PARTITION_16x24] = sad_16<24>;
+    p.sad[PARTITION_16x32] = sad_16<32>;
+    p.sad[PARTITION_16x48] = sad_16<48>;
+    p.sad[PARTITION_16x64] = sad_16<64>;
 
-    p.sad[PARTITION_32x4] = sad_32<32, 4>;
-    p.sad[PARTITION_32x8] = sad_32x8<32, 8>;
-    p.sad[PARTITION_32x12] = sad_32<32, 12>;
-    p.sad[PARTITION_32x16] = sad_16x16<32, 16>;
-    p.sad[PARTITION_32x24] = sad_32x8<32, 24>;
-    p.sad[PARTITION_32x32] = sad_32x16<32, 32>;
-    p.sad[PARTITION_32x64] = sad_16x16<32, 64>;
+    p.sad[PARTITION_24x4] = sad_24<4>;
+    p.sad[PARTITION_24x8] = sad_24<8>;
+    p.sad[PARTITION_24x12] = sad_24<12>;
+    p.sad[PARTITION_24x16] = sad_24<16>;
+    p.sad[PARTITION_24x24] = sad_24<24>;
+    p.sad[PARTITION_24x32] = sad_24<32>;
+    p.sad[PARTITION_24x48] = sad_24<48>;
+    p.sad[PARTITION_24x64] = sad_24<64>;
 
-    p.sad[PARTITION_64x4] = sad_32<64, 4>;
-    p.sad[PARTITION_64x8] = sad_32x8<64, 8>;
-    p.sad[PARTITION_64x12] = sad_32x12<64,12>;
-    p.sad[PARTITION_64x16] = sad_16x16<64, 16>;
-    //p.sad[PARTITION_64x24] = sad_16x24<64, 24>;
-    p.sad[PARTITION_64x32] = sad_16x16<64, 32>;
-    p.sad[PARTITION_64x64] = sad_16x16<64, 64>;
+    p.sad[PARTITION_32x4] = sad_32<4>;
+    p.sad[PARTITION_32x8] = sad_32<8>;
+    p.sad[PARTITION_32x12] = sad_32<12>;
+    p.sad[PARTITION_32x16] = sad_32<16>;
+    p.sad[PARTITION_32x24] = sad_32<24>;
+    p.sad[PARTITION_32x32] = sad_32<32>;
+    p.sad[PARTITION_32x48] = sad_32<48>;
+    p.sad[PARTITION_32x64] = sad_32<64>;
+
+    p.sad[PARTITION_48x4] = sad_48<4>;
+    p.sad[PARTITION_48x8] = sad_48<8>;
+    p.sad[PARTITION_48x12] = sad_48<12>;
+    p.sad[PARTITION_48x16] = sad_48<16>;
+    p.sad[PARTITION_48x24] = sad_48<24>;
+    p.sad[PARTITION_48x32] = sad_48<32>;
+    p.sad[PARTITION_48x48] = sad_48<48>;
+    p.sad[PARTITION_48x64] = sad_48<64>;
+
+    p.sad[PARTITION_64x4] = sad_64<4>;
+    p.sad[PARTITION_64x8] = sad_64<8>;
+    p.sad[PARTITION_64x12] = sad_64<12>;
+    p.sad[PARTITION_64x16] = sad_64<16>;
+    p.sad[PARTITION_64x24] = sad_64<24>;
+    p.sad[PARTITION_64x32] = sad_64<32>;
+    p.sad[PARTITION_64x48] = sad_64<48>;
+    p.sad[PARTITION_64x64] = sad_64<64>;
 
     //sad_x3
-    p.sad_x3[PARTITION_4x4] = sad_4_4_x3<4, 4>;
-    p.sad_x3[PARTITION_4x8] = sad_4_x3<4, 8>;
-    p.sad_x3[PARTITION_4x12] = sad_4_4_x3<4, 12>;
-    p.sad_x3[PARTITION_4x16] = sad_4_x3<4, 16>;
-    p.sad_x3[PARTITION_4x24] = sad_4_x3<4, 24>;
-    p.sad_x3[PARTITION_4x32] = sad_4_x3<4, 32>;
-    p.sad_x3[PARTITION_4x64] = sad_4_x3<4, 64>;
-
-    p.sad_x3[PARTITION_8x4] = sad_8_4_x3<8, 4>;
-    p.sad_x3[PARTITION_8x8] = sad_8_x3<8, 8>;
-    p.sad_x3[PARTITION_8x12] = sad_8_4_x3<8, 12>;
-    p.sad_x3[PARTITION_8x16] = sad_8_x3<8, 16>;
-    p.sad_x3[PARTITION_8x24] = sad_8_x3<8, 24>;
-    p.sad_x3[PARTITION_8x32] = sad_8_x3<8, 32>;
-    p.sad_x3[PARTITION_8x64] = sad_8_x3<8, 64>;
+    p.sad_x3[PARTITION_4x4] = sad_4_x3<4>;
+    p.sad_x3[PARTITION_4x8] = sad_4_x3<8>;
+    p.sad_x3[PARTITION_4x12] = sad_4_x3<12>;
+    p.sad_x3[PARTITION_4x16] = sad_4_x3<16>;
+    p.sad_x3[PARTITION_4x24] = sad_4_x3<24>;
+    p.sad_x3[PARTITION_4x32] = sad_4_x3<32>;
+    p.sad_x3[PARTITION_4x48] = sad_4_x3<48>;
+    p.sad_x3[PARTITION_4x64] = sad_4_x3<64>;
 
-    p.sad_x3[PARTITION_12x4] = sad_12_4_x3<12, 4>;
-    p.sad_x3[PARTITION_12x8] = sad_12_x3<12, 8>;
-    p.sad_x3[PARTITION_12x12] = sad_12_4_x3<12, 12>;
-    p.sad_x3[PARTITION_12x16] = sad_12_x3<12, 16>;
-    p.sad_x3[PARTITION_12x24] = sad_12_x3<12, 24>;
-    p.sad_x3[PARTITION_12x32] = sad_12_x3<12, 32>;
-    p.sad_x3[PARTITION_12x64] = sad_12_x3<12, 64>;
+    p.sad_x3[PARTITION_8x4] = sad_8_x3<4>;
+    p.sad_x3[PARTITION_8x8] = sad_8_x3<8>;
+    p.sad_x3[PARTITION_8x12] = sad_8_x3<12>;
+    p.sad_x3[PARTITION_8x16] = sad_8_x3<16>;
+    p.sad_x3[PARTITION_8x24] = sad_8_x3<24>;
+    p.sad_x3[PARTITION_8x32] = sad_8_x3<32>;
+    p.sad_x3[PARTITION_8x48] = sad_8_x3<48>;
+    p.sad_x3[PARTITION_8x64] = sad_8_x3<64>;
 
-    p.sad_x3[PARTITION_16x4] = sad_16_4_x3<16, 4>;
-    p.sad_x3[PARTITION_16x8] = sad_16_x3<16, 8>;
-    p.sad_x3[PARTITION_16x12] = sad_16_4_x3<16, 12>;
-    p.sad_x3[PARTITION_16x16] = sad_16_x3<16, 16>;
-    p.sad_x3[PARTITION_16x24] = sad_16_x3<16, 24>;
-    p.sad_x3[PARTITION_16x32] = sad_16_x3<16, 32>;
-    p.sad_x3[PARTITION_16x64] = sad_16_x3<16, 64>;
+    p.sad_x3[PARTITION_12x4] = sad_12_x3<4>;
+    p.sad_x3[PARTITION_12x8] = sad_12_x3<8>;
+    p.sad_x3[PARTITION_12x12] = sad_12_x3<12>;
+    p.sad_x3[PARTITION_12x16] = sad_12_x3<16>;
+    p.sad_x3[PARTITION_12x24] = sad_12_x3<24>;
+    p.sad_x3[PARTITION_12x32] = sad_12_x3<32>;
+    p.sad_x3[PARTITION_12x48] = sad_12_x3<48>;
+    p.sad_x3[PARTITION_12x64] = sad_12_x3<64>;
 
-    p.sad_x3[PARTITION_32x4] = sad_32_x3<32, 4>;
-    p.sad_x3[PARTITION_32x8] = sad_32_x3<32, 8>;
-    p.sad_x3[PARTITION_32x12] = sad_32_x3<32, 12>;
-    p.sad_x3[PARTITION_32x16] = sad_32_x3<32, 16>;
-    p.sad_x3[PARTITION_32x24] = sad_32_x3<32, 24>;
-    p.sad_x3[PARTITION_32x32] = sad_32_x3<32, 32>;
-    p.sad_x3[PARTITION_32x64] = sad_32_x3<32, 64>;
+    p.sad_x3[PARTITION_16x4] = sad_16_x3<4>;
+    p.sad_x3[PARTITION_16x8] = sad_16_x3<8>;
+    p.sad_x3[PARTITION_16x12] = sad_16_x3<12>;
+    p.sad_x3[PARTITION_16x16] = sad_16_x3<16>;
+    p.sad_x3[PARTITION_16x24] = sad_16_x3<24>;
+    p.sad_x3[PARTITION_16x32] = sad_16_x3<32>;
+    p.sad_x3[PARTITION_16x48] = sad_16_x3<48>;
+    p.sad_x3[PARTITION_16x64] = sad_16_x3<64>;
 
-    p.sad_x3[PARTITION_64x4] = sad_64_x3<64, 4>;
-    p.sad_x3[PARTITION_64x8] = sad_64_x3<64, 8>;
-    p.sad_x3[PARTITION_64x12] = sad_64_x3<64,12>;
-    p.sad_x3[PARTITION_64x16] = sad_64_x3<64, 16>;
-    p.sad_x3[PARTITION_64x24] = sad_64_x3<64, 24>;
-    p.sad_x3[PARTITION_64x32] = sad_64_x3<64, 32>;
-    p.sad_x3[PARTITION_64x64] = sad_64_x3<64, 64>;
+    p.sad_x3[PARTITION_24x4] = sad_24_x3<4>;
+    p.sad_x3[PARTITION_24x8] = sad_24_x3<8>;
+    p.sad_x3[PARTITION_24x12] = sad_24_x3<12>;
+    p.sad_x3[PARTITION_24x16] = sad_24_x3<16>;
+    p.sad_x3[PARTITION_24x24] = sad_24_x3<24>;
+    p.sad_x3[PARTITION_24x32] = sad_24_x3<32>;
+    p.sad_x3[PARTITION_24x48] = sad_24_x3<48>;
+    p.sad_x3[PARTITION_24x64] = sad_24_x3<64>;
+
+    p.sad_x3[PARTITION_32x4] = sad_32_x3<4>;
+    p.sad_x3[PARTITION_32x8] = sad_32_x3<8>;
+    p.sad_x3[PARTITION_32x12] = sad_32_x3<12>;
+    p.sad_x3[PARTITION_32x16] = sad_32_x3<16>;
+    p.sad_x3[PARTITION_32x24] = sad_32_x3<24>;
+    p.sad_x3[PARTITION_32x32] = sad_32_x3<32>;
+    p.sad_x3[PARTITION_32x48] = sad_32_x3<48>;
+    p.sad_x3[PARTITION_32x64] = sad_32_x3<64>;
+
+    p.sad_x3[PARTITION_48x4] = sad_48_x3<4>;
+    p.sad_x3[PARTITION_48x8] = sad_48_x3<8>;
+    p.sad_x3[PARTITION_48x12] = sad_48_x3<12>;
+    p.sad_x3[PARTITION_48x16] = sad_48_x3<16>;
+    p.sad_x3[PARTITION_48x24] = sad_48_x3<24>;
+    p.sad_x3[PARTITION_48x32] = sad_48_x3<32>;
+    p.sad_x3[PARTITION_48x48] = sad_48_x3<48>;
+    p.sad_x3[PARTITION_48x64] = sad_48_x3<64>;
+
+    p.sad_x3[PARTITION_64x4] = sad_64_x3<4>;
+    p.sad_x3[PARTITION_64x8] = sad_64_x3<8>;
+    p.sad_x3[PARTITION_64x12] = sad_64_x3<12>;
+    p.sad_x3[PARTITION_64x16] = sad_64_x3<16>;
+    p.sad_x3[PARTITION_64x24] = sad_64_x3<24>;
+    p.sad_x3[PARTITION_64x32] = sad_64_x3<32>;
+    p.sad_x3[PARTITION_64x48] = sad_64_x3<48>;
+    p.sad_x3[PARTITION_64x64] = sad_64_x3<64>;
 
     //// sad_x4
-    p.sad_x4[PARTITION_4x4] = sad_4_4_x4<4, 4>;
-    p.sad_x4[PARTITION_4x8] = sad_4_x4<4, 8>;
-    p.sad_x4[PARTITION_4x12] = sad_4_4_x4<4, 12>;
-    p.sad_x4[PARTITION_4x16] = sad_4_x4<4, 16>;
-    p.sad_x4[PARTITION_4x24] = sad_4_x4<4, 24>;
-    p.sad_x4[PARTITION_4x32] = sad_4_x4<4, 32>;
-    p.sad_x4[PARTITION_4x64] = sad_4_x4<4, 64>;
-
-    p.sad_x4[PARTITION_8x4] = sad_8_4_x4<8, 4>;
-    p.sad_x4[PARTITION_8x8] = sad_8_x4<8, 8>;
-    p.sad_x4[PARTITION_8x12] = sad_8_4_x4<8, 12>;
-    p.sad_x4[PARTITION_8x16] = sad_8_x4<8, 16>;
-    p.sad_x4[PARTITION_8x24] = sad_8_x4<8, 24>;
-    p.sad_x4[PARTITION_8x32] = sad_8_x4<8, 32>;
-    p.sad_x4[PARTITION_8x64] = sad_8_x4<8, 64>;
+    p.sad_x4[PARTITION_4x4] = sad_4_x4<4>;
+    p.sad_x4[PARTITION_4x8] = sad_4_x4<8>;
+    p.sad_x4[PARTITION_4x12] = sad_4_x4<12>;
+    p.sad_x4[PARTITION_4x16] = sad_4_x4<16>;
+    p.sad_x4[PARTITION_4x24] = sad_4_x4<24>;
+    p.sad_x4[PARTITION_4x32] = sad_4_x4<32>;
+    p.sad_x4[PARTITION_4x48] = sad_4_x4<48>;
+    p.sad_x4[PARTITION_4x64] = sad_4_x4<64>;
 
-    p.sad_x4[PARTITION_12x4] = sad_12_4_x4<12, 4>;
-    p.sad_x4[PARTITION_12x8] = sad_12_x4<12, 8>;
-    p.sad_x4[PARTITION_12x12] = sad_12_4_x4<12, 12>;
-    p.sad_x4[PARTITION_12x16] = sad_12_x4<12, 16>;
-    p.sad_x4[PARTITION_12x24] = sad_12_x4<12, 24>;
-    p.sad_x4[PARTITION_12x32] = sad_12_x4<12, 32>;
-    p.sad_x4[PARTITION_12x64] = sad_12_x4<12, 64>;
+    p.sad_x4[PARTITION_8x4] = sad_8_x4<4>;
+    p.sad_x4[PARTITION_8x8] = sad_8_x4<8>;
+    p.sad_x4[PARTITION_8x12] = sad_8_x4<12>;
+    p.sad_x4[PARTITION_8x16] = sad_8_x4<16>;
+    p.sad_x4[PARTITION_8x24] = sad_8_x4<24>;
+    p.sad_x4[PARTITION_8x32] = sad_8_x4<32>;
+    p.sad_x4[PARTITION_8x48] = sad_8_x4<48>;
+    p.sad_x4[PARTITION_8x64] = sad_8_x4<64>;
 
-    p.sad_x4[PARTITION_16x4] = sad_16_4_x4<16, 4>;
-    p.sad_x4[PARTITION_16x8] = sad_16_x4<16, 8>;
-    p.sad_x4[PARTITION_16x12] = sad_16_4_x4<16, 12>;
-    p.sad_x4[PARTITION_16x16] = sad_16_x4<16, 16>;
-    p.sad_x4[PARTITION_16x24] = sad_16_x4<16, 24>;
-    p.sad_x4[PARTITION_16x32] = sad_16_x4<16, 32>;
-    p.sad_x4[PARTITION_16x64] = sad_16_x4<16, 64>;
+    p.sad_x4[PARTITION_12x4] = sad_12_x4<4>;
+    p.sad_x4[PARTITION_12x8] = sad_12_x4<8>;
+    p.sad_x4[PARTITION_12x12] = sad_12_x4<12>;
+    p.sad_x4[PARTITION_12x16] = sad_12_x4<16>;
+    p.sad_x4[PARTITION_12x24] = sad_12_x4<24>;
+    p.sad_x4[PARTITION_12x32] = sad_12_x4<32>;
+    p.sad_x4[PARTITION_12x48] = sad_12_x4<48>;
+    p.sad_x4[PARTITION_12x64] = sad_12_x4<64>;
 
-    p.sad_x4[PARTITION_32x4] = sad_32_4_x4<32, 4>;
-    p.sad_x4[PARTITION_32x8] = sad_32_x4<32, 8>;
-    p.sad_x4[PARTITION_32x12] = sad_32_4_x4<32, 12>;
-    p.sad_x4[PARTITION_32x16] = sad_32_x4<32, 16>;
-    p.sad_x4[PARTITION_32x24] = sad_32_x4<32, 24>;
-    p.sad_x4[PARTITION_32x32] = sad_32_x4<32, 32>;
-    p.sad_x4[PARTITION_32x64] = sad_32_x4<32, 64>;
+    p.sad_x4[PARTITION_16x4] = sad_16_x4<4>;
+    p.sad_x4[PARTITION_16x8] = sad_16_x4<8>;
+    p.sad_x4[PARTITION_16x12] = sad_16_x4<12>;
+    p.sad_x4[PARTITION_16x16] = sad_16_x4<16>;
+    p.sad_x4[PARTITION_16x24] = sad_16_x4<24>;
+    p.sad_x4[PARTITION_16x32] = sad_16_x4<32>;
+    p.sad_x4[PARTITION_16x48] = sad_16_x4<48>;
+    p.sad_x4[PARTITION_16x64] = sad_16_x4<64>;
 
-    p.sad_x4[PARTITION_64x4] = sad_64_x4<64, 4>;
-    p.sad_x4[PARTITION_64x8] = sad_64_x4<64, 8>;
-    p.sad_x4[PARTITION_64x12] = sad_64_x4<64,12>;
-    p.sad_x4[PARTITION_64x16] = sad_64_x4<64, 16>;
-    p.sad_x4[PARTITION_64x24] = sad_64_x4<64, 24>;
-    p.sad_x4[PARTITION_64x32] = sad_64_x4<64, 32>;
-    p.sad_x4[PARTITION_64x64] = sad_64_x4<64, 64>;
+    p.sad_x4[PARTITION_24x4] = sad_24_x4<4>;
+    p.sad_x4[PARTITION_24x8] = sad_24_x4<8>;
+    p.sad_x4[PARTITION_24x12] = sad_24_x4<12>;
+    p.sad_x4[PARTITION_24x16] = sad_24_x4<16>;
+    p.sad_x4[PARTITION_24x24] = sad_24_x4<24>;
+    p.sad_x4[PARTITION_24x32] = sad_24_x4<32>;
+    p.sad_x4[PARTITION_24x48] = sad_24_x4<48>;
+    p.sad_x4[PARTITION_24x64] = sad_24_x4<64>;
+
+    p.sad_x4[PARTITION_32x4] = sad_32_x4<4>;
+    p.sad_x4[PARTITION_32x8] = sad_32_x4<8>;
+    p.sad_x4[PARTITION_32x12] = sad_32_x4<12>;
+    p.sad_x4[PARTITION_32x16] = sad_32_x4<16>;
+    p.sad_x4[PARTITION_32x24] = sad_32_x4<24>;
+    p.sad_x4[PARTITION_32x32] = sad_32_x4<32>;
+    p.sad_x4[PARTITION_32x48] = sad_32_x4<48>;
+    p.sad_x4[PARTITION_32x64] = sad_32_x4<64>;
+
+    p.sad_x4[PARTITION_48x4] = sad_48_x4<4>;
+    p.sad_x4[PARTITION_48x8] = sad_48_x4<8>;
+    p.sad_x4[PARTITION_48x12] = sad_48_x4<12>;
+    p.sad_x4[PARTITION_48x16] = sad_48_x4<16>;
+    p.sad_x4[PARTITION_48x24] = sad_48_x4<24>;
+    p.sad_x4[PARTITION_48x32] = sad_48_x4<32>;
+    p.sad_x4[PARTITION_48x48] = sad_48_x4<48>;
+    p.sad_x4[PARTITION_48x64] = sad_48_x4<64>;
+
+    p.sad_x4[PARTITION_64x4] = sad_64_x4<4>;
+    p.sad_x4[PARTITION_64x8] = sad_64_x4<8>;
+    p.sad_x4[PARTITION_64x12] = sad_64_x4<12>;
+    p.sad_x4[PARTITION_64x16] = sad_64_x4<16>;
+    p.sad_x4[PARTITION_64x24] = sad_64_x4<24>;
+    p.sad_x4[PARTITION_64x32] = sad_64_x4<32>;
+    p.sad_x4[PARTITION_64x48] = sad_64_x4<48>;
+    p.sad_x4[PARTITION_64x64] = sad_64_x4<64>;
 
 #if HIGH_BIT_DEPTH
     // satd
--- a/source/encoder/vec/pixel16.inc	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/vec/pixel16.inc	Mon May 06 14:34:00 2013 +0530
@@ -30,2442 +30,1670 @@
 template<int ly>
 int CDECL sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    Vec8s sum(0);
-    for (int row = 0; row < ly; row++)
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;   // ly rounded down to a multiple of 16
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s m1, n1;
-        m1.load(piOrg);
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
         n1.load(piCur);
-        sum += abs(m1 - n1);
+        sad += abs(m1 - n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
+    sum += extend_low(sad);
 
-    return horizontal_add(extend_low(sum));
+    return horizontal_add(sum);
 }
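The rewritten SAD kernels in this file all share one overflow-control pattern: 16-bit absolute differences accumulate in the Vec8us sad for a bounded block of rows (main_iters), then fold into the 32-bit Vec4i sum before the unsigned-short lanes can wrap (for 10-bit pixels, 16 accumulations of at most 1023 stay well below 65535). A scalar sketch of that blocking (illustrative only; sadBlockedRef is a hypothetical name):

    // Blocked widening accumulation, as used by the SAD kernels (illustration).
    int sadBlockedRef(const pixel *org, intptr_t strideOrg, const pixel *cur, intptr_t strideCur,
                      int width, int ly, int rowsPerBlock)
    {
        int total = 0;
        int row = 0;
        while (row < ly)
        {
            unsigned partial = 0;                               // plays the role of a Vec8us lane
            int stop = (row + rowsPerBlock < ly) ? (row + rowsPerBlock) : ly;
            for (; row < stop; row++)
            {
                for (int col = 0; col < width; col++)
                {
                    int d = org[col] - cur[col];
                    partial += (unsigned)(d < 0 ? -d : d);      // abs(m1 - n1)
                }
                org += strideOrg;
                cur += strideCur;
            }
            total += (int)partial;                              // widen to 32 bits, like extend_low/extend_high
        }
        return total;
    }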
 
 template<int ly>
 int CDECL sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    Vec8s sum(0);
-    for (int row = 0; row < ly; row++)
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s m1, n1;
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
         m1.load_a(piOrg);
         n1.load(piCur);
-        sum += abs(m1 - n1);
+        sad += abs(m1 - n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
-    return horizontal_add_x(sum);
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_12(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 8); m1.cutoff(4);
+            n1.load(piCur + 8); n1.cutoff(4);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 8); m1.cutoff(4);
+        n1.load(piCur + 8); n1.cutoff(4);
+        sad += abs(m1 - n1);
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
 }
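sad_12 covers its 12-wide rows with one full 8-lane load plus a second load whose upper four lanes are zeroed by cutoff(4), so columns 12..15 never reach the accumulator. A scalar picture of that masking (illustrative only; cutoffRef is a hypothetical name):

    // Scalar picture of Vec8s::cutoff(keep): zero every lane at index >= keep.
    // Applied to both operands, the masked lanes give abs(m - n) == 0 and
    // therefore add nothing to the SAD.
    void cutoffRef(short v[8], int keep)
    {
        for (int lane = keep; lane < 8; lane++)
            v[lane] = 0;
    }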
 
 template<int ly>
 int CDECL sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    Vec16s sum(0);
-    for (int row = 0; row < ly; row++)
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
     {
-        Vec16s m1, n1;
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur + 8);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
         m1.load_a(piOrg);
         n1.load(piCur);
-        sum += abs(m1 - n1);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 8);
+        n1.load(piCur + 8);
+        sad += abs(m1 - n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
+    sum += extend_low(sad) + extend_high(sad);
 
-    return horizontal_add(extend_low(sum)) +
-           horizontal_add(extend_high(sum));
+    return horizontal_add(sum);
 }
 
-template<int lx, int ly>
-int CDECL sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+template<int ly>
+int CDECL sad_24(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur)
 {
-    Vec16s sum(0);
-    for (int row = 0; row < ly; row++)
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
     {
-        for (int col = 0; col < lx; col += 32)
+        for (int i = 0; i < 8; i++)
         {
-            Vec16s m1, n1;
-            m1.load(piOrg + col);
-            n1.load(piCur + col);
-            sum += abs(m1 - n1);
-            Vec16s m2, n2;
-            m2.load(piOrg + col + 16);
-            n2.load(piCur + col + 16);
-            sum += abs(m2 - n2);
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur + 8);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
         }
 
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 8);
+        n1.load(piCur + 8);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur + 16);
+        sad += abs(m1 - n1);
+
         piOrg += strideOrg;
         piCur += strideCur;
     }
-    return horizontal_add_x(extend_low(sum)) +
-           horizontal_add_x(extend_high(sum));
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
 }
 
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
+template<int ly>
+int CDECL sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 3) << 3;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
+
+    for (row = 0; row < main_iters; row += 8)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
+        for (int i = 0; i < 8; i++)
         {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
+            m1.load_a(piOrg + 8);
+            n1.load(piCur + 8);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur + 24);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 8);
+        n1.load(piCur + 8);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur + 16);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 24);
+        n1.load(piCur + 24);
+        sad += abs(m1 - n1);
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_48(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur + 8);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur + 24);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur + 32);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 40);
+            n1.load(piCur + 40);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 8);
+        n1.load(piCur + 8);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur + 16);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 24);
+        n1.load(piCur + 24);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 32);
+        n1.load(piCur + 32);
+        sad += abs(m1 - n1);
+
+        m1.load_a(piOrg + 40);
+        n1.load(piCur + 40);
+        sad += abs(m1 - n1);
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    sum += extend_low(sad) + extend_high(sad);
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_64(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int row;
+
+    for (row = 0; row < ly; row += 4)
+    {
+        for (int i = 0; i < 4; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur + 8);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur + 24);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur + 32);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 40);
+            n1.load(piCur + 40);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 48);
+            n1.load(piCur + 48);
+            sad += abs(m1 - n1);
+
+            m1.load_a(piOrg + 56);
+            n1.load(piCur + 56);
+            sad += abs(m1 - n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+void CDECL sad_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
+    Vec8s m1, n1, n2, n3;
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 4)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
 
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_4_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
+        sum1 += extend_low(sad1);
+        sum2 += extend_low(sad2);
+        sum3 += extend_low(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 4)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
-
-template<int lx, int ly>
-void CDECL sad_8_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 8)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 8)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-}
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
 
-template<int lx, int ly>
-void CDECL sad_8_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 8)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 8)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
 
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1);
+    sum2 += extend_low(sad2);
+    sum3 += extend_low(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
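The sad_*_x3 kernels amortize each load of the encoder's source block across three candidate blocks, returning all three SADs in a single pass (useful when motion search scores several candidates at once). A scalar sketch of the pattern (illustrative only; sadX3Ref is a hypothetical name, FENC_STRIDE is the fixed source-buffer stride used above):

    // One-pass SAD against three candidates (illustration of the x3 pattern).
    void sadX3Ref(const pixel *org, const pixel *c1, const pixel *c2, const pixel *c3,
                  intptr_t strideCur, int width, int ly, int *res)
    {
        res[0] = res[1] = res[2] = 0;
        for (int row = 0; row < ly; row++)
        {
            for (int col = 0; col < width; col++)
            {
                int o = org[col];                       // loaded once, used three times
                int d1 = o - c1[col], d2 = o - c2[col], d3 = o - c3[col];
                res[0] += d1 < 0 ? -d1 : d1;
                res[1] += d2 < 0 ? -d2 : d2;
                res[2] += d3 < 0 ? -d3 : d3;
            }
            org += FENC_STRIDE;
            c1 += strideCur; c2 += strideCur; c3 += strideCur;
        }
    }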
 
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_12_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+template<int ly>
+void CDECL sad_8_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
     Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
+    int main_iters = (ly >> 3) << 3;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 12)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load(piOrg + col + 4);
-                n1.load(piCur1 + col + 4);
-                n2.load(piCur2 + col + 4);
-                n3.load(piCur3 + col + 4);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 12)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 4);
-            n1.load(piCur1 + col + 4);
-            n2.load(piCur2 + col + 4);
-            n3.load(piCur3 + col + 4);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_12_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 12)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load(piOrg + col + 4);
-                n1.load(piCur1 + col + 4);
-                n2.load(piCur2 + col + 4);
-                n3.load(piCur3 + col + 4);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 12)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 4);
-            n1.load(piCur1 + col + 4);
-            n2.load(piCur2 + col + 4);
-            n3.load(piCur3 + col + 4);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
 
-template<int lx, int ly>
-void CDECL sad_16_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 16)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    for (; row < ly; row++)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 16)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-}
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
 
-template<int lx, int ly>
-void CDECL sad_16_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 16)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 16)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
 
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
 
-template<int lx, int ly>
-void CDECL sad_32_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
+template<int ly>
+void CDECL sad_12_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
     Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
+    int main_iters = (ly >> 3) << 3;
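+    // main_iters is ly rounded down to a multiple of 8; the tail loop below
+    // handles any leftover rows. Flushing the 16-bit lane sums into the
+    // 32-bit totals every 8 rows keeps them from wrapping.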
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 32)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
 
-                m1.load_a(piOrg + col + 24);
-                n1.load(piCur1 + col + 24);
-                n2.load(piCur2 + col + 24);
-                n3.load(piCur3 + col + 24);
+            m1.load_a(piOrg + 8); m1.cutoff(4);
+            n1.load(piCur1 + 8); n1.cutoff(4);
+            n2.load(piCur2 + 8); n2.cutoff(4);
+            n3.load(piCur3 + 8); n3.cutoff(4);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 32)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
 
-            m1.load_a(piOrg + col + 24);
-            n1.load(piCur1 + col + 24);
-            n2.load(piCur2 + col + 24);
-            n3.load(piCur3 + col + 24);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
+        m1.load_a(piOrg + 8); m1.cutoff(4);
+        n1.load(piCur1 + 8); n1.cutoff(4);
+        n2.load(piCur2 + 8); n2.cutoff(4);
+        n3.load(piCur3 + 8); n3.cutoff(4);
+
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
 
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
 
-template<int lx, int ly>
-void CDECL sad_64_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+template<int ly>
+void CDECL sad_16_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
     Vec8s m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
+    int main_iters = (ly >> 3) << 3;
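+    // Same shape as sad_12_x3, minus the cutoff: two full 8-lane loads per
+    // 16-wide row, partial sums flushed to 32 bits every 8 rows.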
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-1; row+=2)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for(int temp_rows = 0; temp_rows < 2; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 64)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
 
-                m1.load_a(piOrg + col + 24);
-                n1.load(piCur1 + col + 24);
-                n2.load(piCur2 + col + 24);
-                n3.load(piCur3 + col + 24);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-
-                m1.load_a(piOrg + col + 32);
-                n1.load(piCur1 + col + 32);
-                n2.load(piCur2 + col + 32);
-                n3.load(piCur3 + col + 32);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                m1.load_a(piOrg + col + 40);
-                n1.load(piCur1 + col + 40);
-                n2.load(piCur2 + col + 40);
-                n3.load(piCur3 + col + 40);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
 
-                m1.load_a(piOrg + col + 48);
-                n1.load(piCur1 + col + 48);
-                n2.load(piCur2 + col + 48);
-                n3.load(piCur3 + col + 48);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
 
-                m1.load_a(piOrg + col + 56);
-                n1.load(piCur1 + col + 56);
-                n2.load(piCur2 + col + 56);
-                n3.load(piCur3 + col + 56);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        for (int col = 0; col < lx; col += 64)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 24);
-            n1.load(piCur1 + col + 24);
-            n2.load(piCur2 + col + 24);
-            n3.load(piCur3 + col + 24);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 32);
-            n1.load(piCur1 + col + 32);
-            n2.load(piCur2 + col + 32);
-            n3.load(piCur3 + col + 32);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-
-            m1.load_a(piOrg + col + 40);
-            n1.load(piCur1 + col + 40);
-            n2.load(piCur2 + col + 40);
-            n3.load(piCur3 + col + 40);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
 
-            m1.load_a(piOrg + col + 48);
-            n1.load(piCur1 + col + 48);
-            n2.load(piCur2 + col + 48);
-            n3.load(piCur3 + col + 48);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
 
-            m1.load_a(piOrg + col + 56);
-            n1.load(piCur1 + col + 56);
-            n2.load(piCur2 + col + 56);
-            n3.load(piCur3 + col + 56);
+        m1.load_a(piOrg + 8);
+        n1.load(piCur1 + 8);
+        n2.load(piCur2 + 8);
+        n3.load(piCur3 + 8);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
 
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
 
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
-    Vec4i sum4_low(0);
+template<int ly>
+void CDECL sad_24_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
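+    // 24-wide rows take three 8-lane loads; the 16-bit partial sums are
+    // flushed to the 32-bit totals every 4 rows.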
+    Vec8s m1, n1, n2, n3;
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0;
-    for (row = 0; row < ly-7; row+=8)
+
+    for (row = 0; row < ly; row += 4)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
+        for (int i = 0; i < 4; i++)
         {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
-            piCur4 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 4)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-    res[3] += horizontal_add_x(sum4_low);
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
 
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_4_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
-    Vec4i sum4_low(0);
+template<int ly>
+void CDECL sad_32_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
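+    // 32-wide rows take four 8-lane loads; partial sums are flushed to the
+    // 32-bit totals every 4 rows.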
+    Vec8s m1, n1, n2, n3;
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0;
-    for (row = 0; row < ly-3; row+=4)
+
+    for (row = 0; row < ly; row += 4)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
+        for (int i = 0; i < 4; i++)
         {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur1 + 24);
+            n2.load(piCur2 + 24);
+            n3.load(piCur3 + 24);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
+
+template<int ly>
+void CDECL sad_48_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
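+    // 48-wide rows take six 8-lane loads, so the 16-bit partial sums are
+    // flushed every 2 rows.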
+    Vec8s m1, n1, n2, n3;
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
+    int row;
+
+    for (row = 0; row < ly; row += 2)
+    {
+        for (int i = 0; i < 2; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur1 + 24);
+            n2.load(piCur2 + 24);
+            n3.load(piCur3 + 24);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur1 + 32);
+            n2.load(piCur2 + 32);
+            n3.load(piCur3 + 32);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 40);
+            n1.load(piCur1 + 40);
+            n2.load(piCur2 + 40);
+            n3.load(piCur3 + 40);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
+
+template<int ly>
+void CDECL sad_64_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
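+    // 64-wide rows take eight 8-lane loads; like sad_48_x3, the partial sums
+    // are flushed every 2 rows.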
+    Vec8s m1, n1, n2, n3;
+    Vec8us sad1(0), sad2(0), sad3(0);
+    Vec4i sum1(0), sum2(0), sum3(0);
+    int row;
+
+    for (row = 0; row < ly; row += 2)
+    {
+        for (int i = 0; i < 2; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur1 + 24);
+            n2.load(piCur2 + 24);
+            n3.load(piCur3 + 24);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur1 + 32);
+            n2.load(piCur2 + 32);
+            n3.load(piCur3 + 32);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 40);
+            n1.load(piCur1 + 40);
+            n2.load(piCur2 + 40);
+            n3.load(piCur3 + 40);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 48);
+            n1.load(piCur1 + 48);
+            n2.load(piCur2 + 48);
+            n3.load(piCur3 + 48);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            m1.load_a(piOrg + 56);
+            n1.load(piCur1 + 56);
+            n2.load(piCur2 + 56);
+            n3.load(piCur3 + 56);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
+
+template<int ly>
+void CDECL sad_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
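+    // Each 4-wide row is read with a full 8-lane load; only the low four
+    // lanes (extend_low) reach the totals, but the load itself touches four
+    // pixels past the row, so the caller must keep that over-read legal
+    // (the assumption the old 4-wide versions documented).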
+    Vec8s m1, n1, n2, n3, n4;
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
             piCur4 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 4)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-    res[3] += horizontal_add_x(sum4_low);
-}
-
-template<int lx, int ly>
-void CDECL sad_8_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 8)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-            piCur4 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 8)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
-}
-
-template<int lx, int ly>
-void CDECL sad_8_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 8)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-            piCur4 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 8)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
-}
-
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_12_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 12)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load(piOrg + col + 4);
-                n1.load(piCur1 + col + 4);
-                n2.load(piCur2 + col + 4);
-                n3.load(piCur3 + col + 4);
-                n4.load(piCur4 + col + 4);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-            piCur4 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 12)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 4);
-            n1.load(piCur1 + col + 4);
-            n2.load(piCur2 + col + 4);
-            n3.load(piCur3 + col + 4);
-            n4.load(piCur4 + col + 4);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
+        sum1 += extend_low(sad1);
+        sum2 += extend_low(sad2);
+        sum3 += extend_low(sad3);
+        sum4 += extend_low(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
     }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-    res[3] += horizontal_add_x(sum4_low);
-}
-
-/* For performance - This function assumes that the *last load* can access 8 elements. */
-template<int lx, int ly>
-void CDECL sad_12_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 12)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load(piOrg + col + 4);
-                n1.load(piCur1 + col + 4);
-                n2.load(piCur2 + col + 4);
-                n3.load(piCur3 + col + 4);
-                n4.load(piCur4 + col + 4);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-            piCur4 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-    }
-    for (; row < ly; row++)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 12)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 4);
-            n1.load(piCur1 + col + 4);
-            n2.load(piCur2 + col + 4);
-            n3.load(piCur3 + col + 4);
-            n4.load(piCur4 + col + 4);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum2_low += extend_low(sum2);
-        sum3_low += extend_low(sum3);
-        sum4_low += extend_low(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-    res[3] += horizontal_add_x(sum4_low);
-}
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+        n4.load(piCur4);
 
-template<int lx, int ly>
-void CDECL sad_16_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 16)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
+        sad4 += abs(m1 - n4);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-            piCur4 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 16)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
         piCur4 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
+
+    sum1 += extend_low(sad1);
+    sum2 += extend_low(sad2);
+    sum3 += extend_low(sad3);
+    sum4 += extend_low(sad4);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
 }
 
-template<int lx, int ly>
-void CDECL sad_16_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
+template<int ly>
+void CDECL sad_8_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
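+    // One 8-lane load per row against four candidate blocks; partial sums
+    // are flushed to 32 bits every 8 rows, with leftover rows handled by
+    // the tail loop.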
     Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int main_iters = (ly >> 3) << 3;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-3; row+=4)
+
+    for (row = 0; row < main_iters; row += 8)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
+        for (int i = 0; i < 8; i++)
         {
-            for (int col = 0; col < lx; col += 16)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
 
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
             piCur4 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 16)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
-}
-
-template<int lx, int ly>
-void CDECL sad_32_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
-    Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 32)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-                n4.load(piCur4 + col + 16);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 24);
-                n1.load(piCur1 + col + 24);
-                n2.load(piCur2 + col + 24);
-                n3.load(piCur3 + col + 24);
-                n4.load(piCur4 + col + 24);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-            piCur4 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
     }
-    for (; row < ly; row++)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 32)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+        n4.load(piCur4);
 
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
-            n4.load(piCur4 + col + 16);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
+        sad4 += abs(m1 - n4);
 
-            m1.load_a(piOrg + col + 24);
-            n1.load(piCur1 + col + 24);
-            n2.load(piCur2 + col + 24);
-            n3.load(piCur3 + col + 24);
-            n4.load(piCur4 + col + 24 );
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
         piCur4 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+    sum4 += extend_low(sad4) + extend_high(sad4);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
 }
 
-template<int lx, int ly>
-void CDECL sad_32_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
+template<int ly>
+void CDECL sad_12_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
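+    // Same 8-lane-plus-cutoff(4) split as sad_12_x3, applied against four
+    // candidate blocks at once.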
     Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int main_iters = (ly >> 3) << 3;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 32)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
 
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-                n4.load(piCur4 + col + 16);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
 
-                m1.load_a(piOrg + col + 24);
-                n1.load(piCur1 + col + 24);
-                n2.load(piCur2 + col + 24);
-                n3.load(piCur3 + col + 24);
-                n4.load(piCur4 + col + 24);
+            m1.load_a(piOrg + 8); m1.cutoff(4);
+            n1.load(piCur1 + 8); n1.cutoff(4);
+            n2.load(piCur2 + 8); n2.cutoff(4);
+            n3.load(piCur3 + 8); n3.cutoff(4);
+            n4.load(piCur4 + 8); n4.cutoff(4);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
             piCur4 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 32)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
-            n4.load(piCur4 + col + 16);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+        n4.load(piCur4);
 
-            m1.load_a(piOrg + col + 24);
-            n1.load(piCur1 + col + 24);
-            n2.load(piCur2 + col + 24);
-            n3.load(piCur3 + col + 24);
-            n4.load(piCur4 + col + 24 );
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
+        sad4 += abs(m1 - n4);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
+        m1.load_a(piOrg + 8); m1.cutoff(4);
+        n1.load(piCur1 + 8); n1.cutoff(4);
+        n2.load(piCur2 + 8); n2.cutoff(4);
+        n3.load(piCur3 + 8); n3.cutoff(4);
+        n4.load(piCur4 + 8); n4.cutoff(4);
+
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
+        sad4 += abs(m1 - n4);
+
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
         piCur4 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+    sum4 += extend_low(sad4) + extend_high(sad4);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
 }
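
sad_12_x4 copes with the odd 12-pixel width by loading a full 8-lane vector for columns 8..15 and masking it: cutoff(4) zeroes lanes 4 and above (my reading of the vectorclass semantics, consistent with its use here), so the out-of-block pixels contribute nothing. A reduced sketch with illustrative pointer names:

    // Columns 8..11 of one 12-wide row of 16-bit pixels.
    static inline void sadTail12(const pixel *org, const pixel *cur, Vec8us &sad)
    {
        Vec8s a, b;
        a.load_a(org + 8); a.cutoff(4); // keep lanes 0..3, zero lanes 4..7
        b.load(cur + 8);   b.cutoff(4); // identical mask on both sides
        sad += abs(a - b);              // masked lanes add |0 - 0| = 0
    }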
 
-template<int lx, int ly>
-void CDECL sad_64_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
-{    
+template<int ly>
+void CDECL sad_16_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
     Vec8s m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);
-    Vec4i sum4_low(0), sum4_high(0);
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int main_iters = (ly >> 3) << 3;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-1; row+=2)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 2; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 64)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-                n4.load(piCur4 + col);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 8);
-                n1.load(piCur1 + col + 8);
-                n2.load(piCur2 + col + 8);
-                n3.load(piCur3 + col + 8);
-                n4.load(piCur4 + col + 8);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-                n4.load(piCur4 + col + 16);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 24);
-                n1.load(piCur1 + col + 24);
-                n2.load(piCur2 + col + 24);
-                n3.load(piCur3 + col + 24);
-                n4.load(piCur4 + col + 24);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-
-                m1.load_a(piOrg + col + 32);
-                n1.load(piCur1 + col + 32);
-                n2.load(piCur2 + col + 32);
-                n3.load(piCur3 + col + 32);
-                n4.load(piCur4 + col + 32);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
 
-                m1.load_a(piOrg + col + 40);
-                n1.load(piCur1 + col + 40);
-                n2.load(piCur2 + col + 40);
-                n3.load(piCur3 + col + 40);
-                n4.load(piCur4 + col + 40);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
 
-                m1.load_a(piOrg + col + 48);
-                n1.load(piCur1 + col + 48);
-                n2.load(piCur2 + col + 48);
-                n3.load(piCur3 + col + 48);
-                n4.load(piCur4 + col + 48);
-
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+            n4.load(piCur4 + 8);
 
-                m1.load_a(piOrg + col + 56);
-                n1.load(piCur1 + col + 56);
-                n2.load(piCur2 + col + 56);
-                n3.load(piCur3 + col + 56);
-                n4.load(piCur4 + col + 56);
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
 
-                sum1 += abs(m1 - n1);
-                sum2 += abs(m1 - n2);
-                sum3 += abs(m1 - n3);
-                sum4 += abs(m1 - n4);
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
             piCur4 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 64)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-            n4.load(piCur4 + col);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 8);
-            n1.load(piCur1 + col + 8);
-            n2.load(piCur2 + col + 8);
-            n3.load(piCur3 + col + 8);
-            n4.load(piCur4 + col + 8);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
-            n4.load(piCur4 + col + 16);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 24);
-            n1.load(piCur1 + col + 24);
-            n2.load(piCur2 + col + 24);
-            n3.load(piCur3 + col + 24);
-            n4.load(piCur4 + col + 24);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 32);
-            n1.load(piCur1 + col + 32);
-            n2.load(piCur2 + col + 32);
-            n3.load(piCur3 + col + 32);
-            n4.load(piCur4 + col + 32);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-
-            m1.load_a(piOrg + col + 40);
-            n1.load(piCur1 + col + 40);
-            n2.load(piCur2 + col + 40);
-            n3.load(piCur3 + col + 40);
-            n4.load(piCur4 + col + 40);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+        n4.load(piCur4);
 
-            m1.load_a(piOrg + col + 48);
-            n1.load(piCur1 + col + 48);
-            n2.load(piCur2 + col + 48);
-            n3.load(piCur3 + col + 48);
-            n4.load(piCur4 + col + 48);
-
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
+        sad4 += abs(m1 - n4);
 
-            m1.load_a(piOrg + col + 56);
-            n1.load(piCur1 + col + 56);
-            n2.load(piCur2 + col + 56);
-            n3.load(piCur3 + col + 56);
-            n4.load(piCur4 + col + 56);
+        m1.load_a(piOrg + 8);
+        n1.load(piCur1 + 8);
+        n2.load(piCur2 + 8);
+        n3.load(piCur3 + 8);
+        n4.load(piCur4 + 8);
 
-            sum1 += abs(m1 - n1);
-            sum2 += abs(m1 - n2);
-            sum3 += abs(m1 - n3);
-            sum4 += abs(m1 - n4);
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        sum4_low += extend_low(sum4);
-        sum4_high += extend_high(sum4);
+        sad1 += abs(m1 - n1);
+        sad2 += abs(m1 - n2);
+        sad3 += abs(m1 - n3);
+        sad4 += abs(m1 - n4);
+
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
         piCur4 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-    res[3] += horizontal_add_x(sum4_low) + horizontal_add_x(sum4_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+    sum4 += extend_low(sad4) + extend_high(sad4);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
+}
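
Note how the unroll factor shrinks as blocks widen: 8 rows between flushes at width 16, 4 at widths 24 and 32, 2 at widths 48 and 64. By my count that keeps every 16-bit lane at no more than 16 accumulated absolute differences per flush, which is safe even for 10-bit content:

    // Adds per lane between flushes (my tally, not asserted by the patch):
    //   width 16: 2/row * 8 rows = 16      width 32: 4/row * 4 rows = 16
    //   width 24: 3/row * 4 rows = 12      width 48: 6/row * 2 rows = 12
    //   width 64: 8/row * 2 rows = 16
    static_assert(16 * 1023 < 65536, "Vec8us lanes cannot overflow at 10-bit depth");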
+
+template<int ly>
+void CDECL sad_24_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
+    Vec8s m1, n1, n2, n3, n4;
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int row;
+
+    for (row = 0; row < ly; row += 4)
+    {
+        for (int i = 0; i < 4; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+            n4.load(piCur4 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+            n4.load(piCur4 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+            piCur4 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
+}
+
+template<int ly>
+void CDECL sad_32_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
+    Vec8s m1, n1, n2, n3, n4;
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int row;
+
+    for (row = 0; row < ly; row += 4)
+    {
+        for (int i = 0; i < 4; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+            n4.load(piCur4 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+            n4.load(piCur4 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur1 + 24);
+            n2.load(piCur2 + 24);
+            n3.load(piCur3 + 24);
+            n4.load(piCur4 + 24);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+            piCur4 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
+}
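
The _x4 kernels score one fenc block against four candidate references in a single pass, so every piOrg load is shared by four SAD accumulations; that is the shape motion search wants when probing several predictors at once. A hypothetical call site (pointer names and candidate offsets are illustrative only):

    int costs[4];
    // four neighbouring candidates in one reference plane, common stride
    sad_32_x4<32>(fenc,
                  ref, ref + 1, ref + refStride, ref + refStride + 1,
                  refStride, costs);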
+
+template<int ly>
+void CDECL sad_48_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
+    Vec8s m1, n1, n2, n3, n4;
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int row;
+
+    for (row = 0; row < ly; row += 2)
+    {
+        for (int i = 0; i < 2; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+            n4.load(piCur4 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+            n4.load(piCur4 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur1 + 24);
+            n2.load(piCur2 + 24);
+            n3.load(piCur3 + 24);
+            n4.load(piCur4 + 24);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur1 + 32);
+            n2.load(piCur2 + 32);
+            n3.load(piCur3 + 32);
+            n4.load(piCur4 + 32);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 40);
+            n1.load(piCur1 + 40);
+            n2.load(piCur2 + 40);
+            n3.load(piCur3 + 40);
+            n4.load(piCur4 + 40);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+            piCur4 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
+}
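
The load asymmetry in all of these kernels is deliberate, as far as I can tell (a convention of the file, not stated in the patch): piOrg walks the encoder's own fenc buffer in FENC_STRIDE steps, so the aligned load_a() is safe there, while the reference pointers advance by an arbitrary strideCur and must use the unaligned load():

    Vec8s o, r;
    o.load_a(org); // fenc rows: 16-byte aligned, FENC_STRIDE apart
    r.load(ref);   // reference rows: alignment unknown, unaligned load required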
+
+template<int ly>
+void CDECL sad_64_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
+    Vec8s m1, n1, n2, n3, n4;
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    int row;
+
+    for (row = 0; row < ly; row += 2)
+    {
+        for (int i = 0; i < 2; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+            n4.load(piCur4);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 8);
+            n1.load(piCur1 + 8);
+            n2.load(piCur2 + 8);
+            n3.load(piCur3 + 8);
+            n4.load(piCur4 + 8);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+            n4.load(piCur4 + 16);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 24);
+            n1.load(piCur1 + 24);
+            n2.load(piCur2 + 24);
+            n3.load(piCur3 + 24);
+            n4.load(piCur4 + 24);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur1 + 32);
+            n2.load(piCur2 + 32);
+            n3.load(piCur3 + 32);
+            n4.load(piCur4 + 32);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 40);
+            n1.load(piCur1 + 40);
+            n2.load(piCur2 + 40);
+            n3.load(piCur3 + 40);
+            n4.load(piCur4 + 40);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 48);
+            n1.load(piCur1 + 48);
+            n2.load(piCur2 + 48);
+            n3.load(piCur3 + 48);
+            n4.load(piCur4 + 48);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            m1.load_a(piOrg + 56);
+            n1.load(piCur1 + 56);
+            n2.load(piCur2 + 56);
+            n3.load(piCur3 + 56);
+            n4.load(piCur4 + 56);
+
+            sad1 += abs(m1 - n1);
+            sad2 += abs(m1 - n2);
+            sad3 += abs(m1 - n3);
+            sad4 += abs(m1 - n4);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+            piCur4 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sum4 += extend_low(sad4) + extend_high(sad4);
+        sad1 = 0; sad2 = 0; sad3 = 0; sad4 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+    res[3] = horizontal_add(sum4);
 }
 
 int CDECL satd_4x4(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
@@ -2700,4 +1928,4 @@ int CDECL sa8d_8x8(pixel * piOrg, intptr
     }
 
     return (satd + 2) >> 2;
-}
\ No newline at end of file
+}
--- a/source/encoder/vec/pixel8.inc	Fri May 03 17:54:14 2013 +0530
+++ b/source/encoder/vec/pixel8.inc	Mon May 06 14:34:00 2013 +0530
@@ -25,1037 +25,900 @@
 
 // Vector class versions of pixel comparison performance primitives
 /* intrinsics for when pixel type is uint8_t */
+
 template<int ly>
 int CDECL sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    Vec8s sum(0);
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
 
-    for (int row = 0; row < ly; row++)
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec16uc m1, n1;
+        for (int i = 0; i < 16; i++)
+        {
+            m1.fromUint32(*(uint32_t*)piOrg);
+            n1.fromUint32(*(uint32_t*)piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
         m1.fromUint32(*(uint32_t*)piOrg);
         n1.fromUint32(*(uint32_t*)piCur);
-        sum += Vec8s(m1.sad(n1));
+        sad.addSumAbsDiff(m1, n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
-
+    sum += extend_low(sad);
     return horizontal_add(sum);
 }
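
For 8-bit pixels the kernels switch from explicit abs(m1 - n1) sums to addSumAbsDiff(), which I take to wrap the SSE2 PSADBW instruction: sixteen byte differences collapse into two partial sums, one per 8-byte half of the register. A rough model under that assumption:

    #include <emmintrin.h>

    // acc += sum|a - b|, one 16-bit partial sum per 64-bit half of acc.
    static inline __m128i addSad(__m128i acc, __m128i a, __m128i b)
    {
        return _mm_add_epi64(acc, _mm_sad_epu8(a, b));
    }

Since sad_4 fills only the low four bytes via fromUint32(), the whole row lands in the first half, which is why extend_low(sad) alone is enough here.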
 
 template<int ly>
 int CDECL sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    Vec8s sum(0);
-    for (int row = 0; row < ly; row++)
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec16uc m1, n1;
-        m1.load(piOrg);
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
         n1.load(piCur);
-        sum += Vec8s(m1.sad(n1));
+        sad.addSumAbsDiff(m1, n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
-    return sum[0];
+    sum += extend_low(sad);
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_12(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg); m1.cutoff(12);
+            n1.load(piCur); n1.cutoff(12);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg); m1.cutoff(12);
+        n1.load(piCur); n1.cutoff(12);
+        sad.addSumAbsDiff(m1, n1);
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    sum += extend_low(sad) + extend_high(sad);
+    return horizontal_add(sum);
 }
 
 template<int ly>
 int CDECL sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    Vec8s sum(0);
-    for (int row = 0; row < ly; row++)
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec16uc m1, n1;
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
         m1.load_a(piOrg);
         n1.load(piCur);
-        sum += Vec8s(m1.sad(n1));
+        sad.addSumAbsDiff(m1, n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
-    return horizontal_add_x(sum);
+    sum += extend_low(sad) + extend_high(sad);
+    return horizontal_add(sum);
 }
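
The same model explains why extend_high() reappears from width 16 upward: a full 16-byte row leaves one partial sum in each half of the accumulator (lanes 0 and 4 of the Vec8us view), so both halves must be widened, while the 4- and 8-wide kernels never touch the high half. As a fragment under the same PSADBW assumption:

    // Vec8us view of 'sad' after PSADBW-style accumulation on 16-byte rows:
    //   [ low-half sum, 0, 0, 0, high-half sum, 0, 0, 0 ]
    sum += extend_low(sad) + extend_high(sad); // widths >= 16: both halves matter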
 
-template<int lx, int ly>
-int CDECL sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+template<int ly>
+int CDECL sad_24(pixel *piOrg, intptr_t strideOrg, pixel *piCur, intptr_t strideCur)
 {
-    // TODO: AVX2
-    int sum = 0;
-    for (int row = 0; row < ly; row++)
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s sad(0);
-        for (int col = 0; col < lx; col += 32)
+        for (int i = 0; i < 16; i++)
         {
-            Vec16uc m1, n1;
-            m1.load_a(piOrg + col);
-            n1.load(piCur + col);
-            sad += Vec8s(m1.sad(n1));
-            Vec16uc m2, n2;
-            m2.load_a(piOrg + col + 16);
-            n2.load(piCur + col + 16);
-            sad += Vec8s(m2.sad(n2));
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 16); m1.cutoff(8);
+            n1.load(piCur + 16); n1.cutoff(8);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
         }
 
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad.addSumAbsDiff(m1, n1);
+
+        m1.load_a(piOrg + 16); m1.cutoff(8);
+        n1.load(piCur + 16); n1.cutoff(8);
+        sad.addSumAbsDiff(m1, n1);
+
         piOrg += strideOrg;
         piCur += strideCur;
-        sum += horizontal_add_x(sad);
     }
-    return sum;
+    sum += extend_low(sad) + extend_high(sad);
+    return horizontal_add(sum);
 }
 
-template<int lx, int ly>
+template<int ly>
+int CDECL sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad.addSumAbsDiff(m1, n1);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur + 16);
+        sad.addSumAbsDiff(m1, n1);
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    sum += extend_low(sad) + extend_high(sad);
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_48(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur + 32);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sad.addSumAbsDiff(m1, n1);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur + 16);
+        sad.addSumAbsDiff(m1, n1);
+
+        m1.load_a(piOrg + 32);
+        n1.load(piCur + 32);
+        sad.addSumAbsDiff(m1, n1);
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    sum += extend_low(sad) + extend_high(sad);
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_64(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec16uc m1, n1;
+    Vec4i sum(0);
+    Vec8us sad(0);
+    int main_iters = (ly >> 2) << 2;
+    int row;
+
+    for (row = 0; row < main_iters; row += 4)
+    {
+        for (int i = 0; i < 4; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur + 16);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur + 32);
+            sad.addSumAbsDiff(m1, n1);
+
+            m1.load_a(piOrg + 48);
+            n1.load(piCur + 48);
+            sad.addSumAbsDiff(m1, n1);
+
+            piOrg += strideOrg;
+            piCur += strideCur;
+        }
+
+        sum += extend_low(sad) + extend_high(sad);
+        sad = 0;
+    }
+    return horizontal_add(sum);
+}
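
Unlike its siblings, sad_64 carries no scalar tail after the unrolled loop, so it quietly requires ly to be a multiple of 4. That holds for the 64-wide HEVC partitions (64x64, 64x48, 64x32, 64x16), but I would pin the assumption down rather than leave it implicit:

    // Defensive check that could sit at the top of sad_64 (not in the patch):
    static_assert((ly & 3) == 0, "sad_64 has no tail loop; 4 must divide the height");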
+
+template<int ly>
 void CDECL sad_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
     Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);    
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 4) << 4;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.fromUint32(*(uint32_t*)piOrg);
-                n1.fromUint32(*(uint32_t*)piCur1);
-                n2.fromUint32(*(uint32_t*)piCur2);
-                n3.fromUint32(*(uint32_t*)piCur3);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);        
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 4)
-        {
-            m1.fromUint32(*(uint32_t*)piOrg);
-            n1.fromUint32(*(uint32_t*)piCur1);
-            n2.fromUint32(*(uint32_t*)piCur2);
-            n3.fromUint32(*(uint32_t*)piCur3);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);        
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
-
-template<int lx, int ly>
-void CDECL sad_4_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-3; row+=4)
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.fromUint32(*(uint32_t*)piOrg);
-                n1.fromUint32(*(uint32_t*)piCur1);
-                n2.fromUint32(*(uint32_t*)piCur2);
-                n3.fromUint32(*(uint32_t*)piCur3);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);        
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 4)
+        for (int i = 0; i < 16; i++)
         {
             m1.fromUint32(*(uint32_t*)piOrg);
             n1.fromUint32(*(uint32_t*)piCur1);
             n2.fromUint32(*(uint32_t*)piCur2);
             n3.fromUint32(*(uint32_t*)piCur3);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);         
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-template<int lx, int ly>
-void CDECL sad_8_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 8)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
 
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);         
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 8)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);        
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-template<int lx, int ly>
-void CDECL sad_8_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 8)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);         
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 8)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);         
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-template<int lx, int ly>
-void CDECL sad_12_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 12)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
 
-                m1.fromUint32(*(uint32_t*)(piOrg + 8));
-                n1.fromUint32(*(uint32_t*)(piCur1 + 8));
-                n2.fromUint32(*(uint32_t*)(piCur2 + 8));
-                n3.fromUint32(*(uint32_t*)(piCur3 + 8));
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);         
+        sum1 += extend_low(sad1);
+        sum2 += extend_low(sad2);
+        sum3 += extend_low(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 12)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-
-            m1.fromUint32(*(uint32_t*)(piOrg + 8));
-            n1.fromUint32(*(uint32_t*)(piCur1 + 8));
-            n2.fromUint32(*(uint32_t*)(piCur2 + 8));
-            n3.fromUint32(*(uint32_t*)(piCur3 + 8));
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);         
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
-
-/* For performance - This function assumes that the *last load* can access 16 elements. */
-template<int lx, int ly>
-void CDECL sad_12_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-3; row+=4)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 12)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-
-                m1.fromUint32(*(uint32_t*)(piOrg + 8));
-                n1.fromUint32(*(uint32_t*)(piCur1 + 8));
-                n2.fromUint32(*(uint32_t*)(piCur2 + 8));
-                n3.fromUint32(*(uint32_t*)(piCur3 + 8));
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);       
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 12)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-
-            m1.fromUint32(*(uint32_t*)(piOrg + 8));
-            n1.fromUint32(*(uint32_t*)(piCur1 + 8));
-            n2.fromUint32(*(uint32_t*)(piCur2 + 8));
-            n3.fromUint32(*(uint32_t*)(piCur3 + 8));
+        m1.fromUint32(*(uint32_t*)piOrg);
+        n1.fromUint32(*(uint32_t*)piCur1);
+        n2.fromUint32(*(uint32_t*)piCur2);
+        n3.fromUint32(*(uint32_t*)piCur3);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);       
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-}
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
 
-template<int lx, int ly>
-void CDECL sad_16_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-7; row+=8)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 16)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 16)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+    sum1 += extend_low(sad1);
+    sum2 += extend_low(sad2);
+    sum3 += extend_low(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
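
The _x3 kernels mirror the _x4 ones for three simultaneous candidates. Note also the changed contract across this rewrite: results are written with res[i] = ... instead of accumulated with +=, so callers no longer need to zero the output array. A hypothetical call (names are illustrative):

    int costs[3];
    // three candidates around a current best position
    sad_4_x3<8>(fenc, refBest - 1, refBest + 1, refBest + refStride,
                refStride, costs);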
 
-template<int lx, int ly>
-void CDECL sad_16_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+/* For performance - This function assumes that the *last load* can access 16 elements. */
+template<int ly>
+void CDECL sad_8_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
     Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);    
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 4) << 4;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-3; row+=4)
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
+        for (int i = 0; i < 16; i++)
         {
-            for (int col = 0; col < lx; col += 16)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 16)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
+        sum1 += extend_low(sad1);
+        sum2 += extend_low(sad2);
+        sum3 += extend_low(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
-}
-
-template<int lx, int ly>
-void CDECL sad_32_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
-{    
-    Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);    
-    int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-7; row+=8)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 32)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
 
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
-            piOrg += FENC_STRIDE;
-            piCur1 += strideCur;
-            piCur2 += strideCur;
-            piCur3 += strideCur;
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 32)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1);
+    sum2 += extend_low(sad2);
+    sum3 += extend_low(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
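sad_8_x3 above shows the accumulation scheme shared by all of these kernels: psadbw partial sums collect in a 16-bit Vec8us and are spilled into 32-bit Vec4i lanes every 16 rows, before any lane can wrap; the 8-wide variant keeps only extend_low because the over-read bytes 8..15 land in the high psadbw half, which is simply dropped. A self-contained SSE2 sketch of the same pattern for one 16-pixel-wide block follows; sad16_sketch and its parameter names are hypothetical, not part of this patch:

#include <emmintrin.h>
#include <stdint.h>

static int sad16_sketch(const uint8_t *org, intptr_t strideOrg,
                        const uint8_t *cur, intptr_t strideCur, int ly)
{
    __m128i sum32 = _mm_setzero_si128();  /* 32-bit running totals          */
    __m128i sad16 = _mm_setzero_si128();  /* 16-bit partial SAD accumulator */

    for (int row = 0; row < ly; row++)
    {
        __m128i m = _mm_load_si128((const __m128i *)org);  /* org assumed 16-byte aligned */
        __m128i n = _mm_loadu_si128((const __m128i *)cur);
        /* psadbw: the abs-diff sum of each 8-byte half lands in 16-bit lanes 0 and 4 */
        sad16 = _mm_add_epi16(sad16, _mm_sad_epu8(m, n));

        /* one row adds at most 8 * 255 = 2040 per lane, so flushing every
           16 rows caps a lane at 32640, safely below 65535 */
        if ((row & 15) == 15)
        {
            sum32 = _mm_add_epi32(sum32, _mm_unpacklo_epi16(sad16, _mm_setzero_si128()));
            sum32 = _mm_add_epi32(sum32, _mm_unpackhi_epi16(sad16, _mm_setzero_si128()));
            sad16 = _mm_setzero_si128();
        }
        org += strideOrg;
        cur += strideCur;
    }
    /* flush any residual rows, then reduce the four 32-bit lanes */
    sum32 = _mm_add_epi32(sum32, _mm_unpacklo_epi16(sad16, _mm_setzero_si128()));
    sum32 = _mm_add_epi32(sum32, _mm_unpackhi_epi16(sad16, _mm_setzero_si128()));
    sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 8));
    sum32 = _mm_add_epi32(sum32, _mm_srli_si128(sum32, 4));
    return _mm_cvtsi128_si32(sum32);
}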
 
-template<int lx, int ly>
-void CDECL sad_32_4_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+/* For performance, this function assumes the buffer is padded so the last 16-byte load cannot fault; cutoff(12) zeroes the four over-read lanes before the SAD. */
+template<int ly>
+void CDECL sad_12_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
     Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);    
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 4) << 4;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0; 
-    for (row = 0; row < ly-3; row+=4)
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
+        for (int i = 0; i < 16; i++)
         {
-            for (int col = 0; col < lx; col += 32)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
+            m1.load_a(piOrg); m1.cutoff(12);
+            n1.load(piCur1); n1.cutoff(12);
+            n2.load(piCur2); n2.cutoff(12);
+            n3.load(piCur3); n3.cutoff(12);
 
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
 
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
-           
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    for (; row < ly; row++)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 32)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
+        m1.load_a(piOrg); m1.cutoff(12);
+        n1.load(piCur1); n1.cutoff(12);
+        n2.load(piCur2); n2.cutoff(12);
+        n3.load(piCur3); n3.cutoff(12);
 
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
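The 12-wide kernel above still issues full 16-byte loads (hence the over-read note), but cutoff(12) zeroes lanes 12..15 of both operands, so the extra bytes contribute |0 - 0| = 0 and both psadbw halves remain valid (which is why it sums extend_low and extend_high, unlike sad_8_x3). A sketch of the equivalent masking with raw SSE2 intrinsics; sad12_row is a hypothetical helper, not part of this patch:

#include <emmintrin.h>
#include <stdint.h>

/* One row of a 12-wide SAD: zero lanes 12..15 of *both* operands before
   psadbw, mirroring what cutoff(12) does above. */
static __m128i sad12_row(const uint8_t *org, const uint8_t *cur)
{
    const __m128i mask = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                       -1, -1, -1, -1,  0,  0,  0,  0);
    __m128i m = _mm_and_si128(_mm_load_si128((const __m128i *)org), mask);
    __m128i n = _mm_and_si128(_mm_loadu_si128((const __m128i *)cur), mask);
    return _mm_sad_epu8(m, n);  /* both 8-byte halves are now meaningful */
}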
 
-template<int lx, int ly>
-void CDECL sad_64_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+template<int ly>
+void CDECL sad_16_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
     Vec16uc m1, n1, n2, n3;
-    Vec4i sum1_low(0), sum1_high(0);
-    Vec4i sum2_low(0), sum2_high(0);
-    Vec4i sum3_low(0), sum3_high(0);    
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 4) << 4;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    for (row = 0; row < ly-3; row+=4)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for(int temp_rows = 0; temp_rows < 4; temp_rows++)
-        {
-            for (int col = 0; col < lx; col += 64)
-            {
-                m1.load_a(piOrg + col);
-                n1.load(piCur1 + col);
-                n2.load(piCur2 + col);
-                n3.load(piCur3 + col);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
 
-                m1.load_a(piOrg + col + 16);
-                n1.load(piCur1 + col + 16);
-                n2.load(piCur2 + col + 16);
-                n3.load(piCur3 + col + 16);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
+    for (row = 0; row < main_iters; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                m1.load_a(piOrg + col + 32);
-                n1.load(piCur1 + col + 32);
-                n2.load(piCur2 + col + 32);
-                n3.load(piCur3 + col + 32);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
 
-                m1.load_a(piOrg + col + 48);
-                n1.load(piCur1 + col + 48);
-                n2.load(piCur2 + col + 48);
-                n3.load(piCur3 + col + 48);
-
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-            }
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
         }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
-    }
-    for (; row < ly; row++)
-    {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);        
-        for (int col = 0; col < lx; col += 64)
-        {
-            m1.load_a(piOrg + col);
-            n1.load(piCur1 + col);
-            n2.load(piCur2 + col);
-            n3.load(piCur3 + col);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-
-            m1.load_a(piOrg + col + 16);
-            n1.load(piCur1 + col + 16);
-            n2.load(piCur2 + col + 16);
-            n3.load(piCur3 + col + 16);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-
-            m1.load_a(piOrg + col + 32);
-            n1.load(piCur1 + col + 32);
-            n2.load(piCur2 + col + 32);
-            n3.load(piCur3 + col + 32);
-
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
 
-            m1.load_a(piOrg + col + 48);
-            n1.load(piCur1 + col + 48);
-            n2.load(piCur2 + col + 48);
-            n3.load(piCur3 + col + 48);
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-        }
-        sum1_low += extend_low(sum1);
-        sum1_high += extend_high(sum1);
-        sum2_low += extend_low(sum2);
-        sum2_high += extend_high(sum2);
-        sum3_low += extend_low(sum3);
-        sum3_high += extend_high(sum3);
         piOrg += FENC_STRIDE;
         piCur1 += strideCur;
         piCur2 += strideCur;
         piCur3 += strideCur;
     }
-    res[0] += horizontal_add_x(sum1_low) + horizontal_add_x(sum1_high);
-    res[1] += horizontal_add_x(sum2_low) + horizontal_add_x(sum2_high);
-    res[2] += horizontal_add_x(sum3_low) + horizontal_add_x(sum3_high);
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
 }
 
-
-template<int lx, int ly>
-void CDECL sad_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+template<int ly>
+void CDECL sad_24_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
 {    
-    Vec16uc m1, n1, n2, n3, n4;
-    Vec4i sum1_low(0);
-    Vec4i sum2_low(0);
-    Vec4i sum3_low(0);
-    Vec4i sum4_low(0);
+    Vec16uc m1, n1, n2, n3;
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 4) << 4;
     int row;
-    res[0] = 0;
-    res[1] = 0;
-    res[2] = 0;
-    res[3] = 0; 
-    for (row = 0; row < ly-7; row+=8)
+
+    for (row = 0; row < main_iters; row += 16)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for(int temp_rows = 0; temp_rows < 8; temp_rows++)
+        for (int i = 0; i < 16; i++)
         {
-            for (int col = 0; col < lx; col += 4)
-            {
-                m1.fromUint32(*(uint32_t*)piOrg);
-                n1.fromUint32(*(uint32_t*)piCur1);
-                n2.fromUint32(*(uint32_t*)piCur2);
-                n3.fromUint32(*(uint32_t*)piCur3);
-                n4.fromUint32(*(uint32_t*)piCur4);
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
 
-                sum1 += Vec8s(m1.sad(n1));
-                sum2 += Vec8s(m1.sad(n2));
-                sum3 += Vec8s(m1.sad(n3));
-                sum4 += Vec8s(m1.sad(n4));
-            }
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 16); m1.cutoff(8);
+            n1.load(piCur1 + 16); n1.cutoff(8);
+            n2.load(piCur2 + 16); n2.cutoff(8);
+            n3.load(piCur3 + 16); n3.cutoff(8);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
             piOrg += FENC_STRIDE;
             piCur1 += strideCur;
             piCur2 += strideCur;
             piCur3 += strideCur;
-            piCur4 += strideCur;
         }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);      
-        sum3_low += extend_low(sum3);       
-        sum4_low += extend_low(sum4);       
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
     }
-    for (; row < ly; row++)
+    while (row++ < ly)
     {
-        Vec8s sum1(0);
-        Vec8s sum2(0);
-        Vec8s sum3(0);
-        Vec8s sum4(0);
-        for (int col = 0; col < lx; col += 4)
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        m1.load_a(piOrg + 16); m1.cutoff(8);
+        n1.load(piCur1 + 16); n1.cutoff(8);
+        n2.load(piCur2 + 16); n2.cutoff(8);
+        n3.load(piCur3 + 16); n3.cutoff(8);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        piOrg += FENC_STRIDE;
+        piCur1 += strideCur;
+        piCur2 += strideCur;
+        piCur3 += strideCur;
+    }
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
+
+template<int ly>
+void CDECL sad_32_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
+    Vec16uc m1, n1, n2, n3;
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur1 + 16);
+        n2.load(piCur2 + 16);
+        n3.load(piCur3 + 16);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        piOrg += FENC_STRIDE;
+        piCur1 += strideCur;
+        piCur2 += strideCur;
+        piCur3 += strideCur;
+    }
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
+
+template<int ly>
+void CDECL sad_48_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
+    Vec16uc m1, n1, n2, n3;
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int main_iters = (ly >> 3) << 3;
+    int row;
+
+    for (row = 0; row < main_iters; row += 8)
+    {
+        for (int i = 0; i < 8; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur1 + 32);
+            n2.load(piCur2 + 32);
+            n3.load(piCur3 + 32);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+        }
+
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+    while (row++ < ly)
+    {
+        m1.load_a(piOrg);
+        n1.load(piCur1);
+        n2.load(piCur2);
+        n3.load(piCur3);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        m1.load_a(piOrg + 16);
+        n1.load(piCur1 + 16);
+        n2.load(piCur2 + 16);
+        n3.load(piCur3 + 16);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        m1.load_a(piOrg + 32);
+        n1.load(piCur1 + 32);
+        n2.load(piCur2 + 32);
+        n3.load(piCur3 + 32);
+
+        sad1.addSumAbsDiff(m1, n1);
+        sad2.addSumAbsDiff(m1, n2);
+        sad3.addSumAbsDiff(m1, n3);
+
+        piOrg += FENC_STRIDE;
+        piCur1 += strideCur;
+        piCur2 += strideCur;
+        piCur3 += strideCur;
+    }
+
+    sum1 += extend_low(sad1) + extend_high(sad1);
+    sum2 += extend_low(sad2) + extend_high(sad2);
+    sum3 += extend_low(sad3) + extend_high(sad3);
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
+
+template<int ly>
+void CDECL sad_64_x3(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, intptr_t strideCur, int *res)
+{
+    Vec16uc m1, n1, n2, n3;
+    Vec4i sum1(0), sum2(0), sum3(0);
+    Vec8us sad1(0), sad2(0), sad3(0);
+    int row;
+
+    for (row = 0; row < ly; row += 4)
+    {
+        for (int i = 0; i < 4; i++)
+        {
+            m1.load_a(piOrg);
+            n1.load(piCur1);
+            n2.load(piCur2);
+            n3.load(piCur3);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 16);
+            n1.load(piCur1 + 16);
+            n2.load(piCur2 + 16);
+            n3.load(piCur3 + 16);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 32);
+            n1.load(piCur1 + 32);
+            n2.load(piCur2 + 32);
+            n3.load(piCur3 + 32);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            m1.load_a(piOrg + 48);
+            n1.load(piCur1 + 48);
+            n2.load(piCur2 + 48);
+            n3.load(piCur3 + 48);
+
+            sad1.addSumAbsDiff(m1, n1);
+            sad2.addSumAbsDiff(m1, n2);
+            sad3.addSumAbsDiff(m1, n3);
+
+            piOrg += FENC_STRIDE;
+            piCur1 += strideCur;
+            piCur2 += strideCur;
+            piCur3 += strideCur;
+        }
+        sum1 += extend_low(sad1) + extend_high(sad1);
+        sum2 += extend_low(sad2) + extend_high(sad2);
+        sum3 += extend_low(sad3) + extend_high(sad3);
+        sad1 = 0; sad2 = 0; sad3 = 0;
+    }
+
+    res[0] = horizontal_add(sum1);
+    res[1] = horizontal_add(sum2);
+    res[2] = horizontal_add(sum3);
+}
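Note how the flush cadence tightens as the block widens: every 16 rows up to width 16, every 8 rows for widths 32 and 48, and every 4 rows for 64, because each extra 16-byte column feeds another psadbw result (up to 2040) into the same 16-bit lanes per row; sad_24_x3 keeps the 16-row cadence with a worst case of 16 * 2 * 2040 = 65280, just under the 16-bit limit. sad_64_x3 also drops the residual-row loop, presumably because every 64-wide partition height here is a multiple of 4. A compile-time C++11 sketch of the bound (rowsBeforeOverflow is hypothetical, not part of this patch):

/* each psadbw adds at most 8 * 255 = 2040 to a 16-bit lane, and a row of
   width W feeds W/16 psadbw results into the same accumulator lanes */
constexpr int rowsBeforeOverflow(int width)
{
    return 65535 / ((width / 16) * 8 * 255);
}

static_assert(rowsBeforeOverflow(16) >= 16, "16-wide: 16-row flush is safe");
static_assert(rowsBeforeOverflow(32) >= 8,  "32-wide: 8-row flush is safe");
static_assert(rowsBeforeOverflow(48) >= 8,  "48-wide: 8-row flush is safe");
static_assert(rowsBeforeOverflow(64) >= 4,  "64-wide: 4-row flush is safe");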
+
+
+template<int ly>
+void CDECL sad_4_x4(pixel *piOrg, pixel *piCur1, pixel *piCur2, pixel *piCur3, pixel *piCur4, intptr_t strideCur, int *res)
+{
+    Vec16uc m1, n1, n2, n3, n4;
+    Vec4i sum1(0), sum2(0), sum3(0), sum4(0);
+    Vec8us sad1(0), sad2(0), sad3(0), sad4(0);
+    int main_iters = (ly >> 4) << 4;
+    int row;
+
+    for (row = 0; row < main_iters; row += 16)
+    {
+        for (int i = 0; i < 16; i++)
         {
             m1.fromUint32(*(uint32_t*)piOrg);
             n1.fromUint32(*(uint32_t*)piCur1);
@@ -1063,1021 +926,622 @@ void CDECL sad_4_x4(pixel *piOrg, pixel 
             n3.fromUint32(*(uint32_t*)piCur3);
             n4.fromUint32(*(uint32_t*)piCur4);
 
-            sum1 += Vec8s(m1.sad(n1));
-            sum2 += Vec8s(m1.sad(n2));
-            sum3 += Vec8s(m1.sad(n3));
-            sum4 += Vec8s(m1.sad(n4));
-        }
-        sum1_low += extend_low(sum1);      
-        sum2_low += extend_low(sum2);       
-        sum3_low += extend_low(sum3);     
-        sum4_low += extend_low(sum4);       
-        piOrg += FENC_STRIDE;
-        piCur1 += strideCur;
-        piCur2 += strideCur;
-        piCur3 += strideCur;
-        piCur4 += strideCur;
-    }
-    res[0] += horizontal_add_x(sum1_low);
-    res[1] += horizontal_add_x(sum2_low);
-    res[2] += horizontal_add_x(sum3_low);
-    res[3] += horizontal_add_x(sum4_low);
-}
-
-templat