changeset 9568:8d5f9b7b4733

Merge with public
author Praveen Tiwari <praveen@multicorewareinc.com>
date Wed, 25 Feb 2015 10:05:40 +0530
parents 8be71cee10f3 (current diff) 87173d41df87 (diff)
children 02bac78bde96
files source/encoder/entropy.cpp
diffstat 12 files changed, 422 insertions(+-), 438 deletions(-) [+]
line wrap: on
line diff
--- a/source/common/cudata.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/common/cudata.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -1375,8 +1375,8 @@ bool CUData::hasEqualMotion(uint32_t abs
     return true;
 }
 
-/* Construct list of merging candidates */
-uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const
+/* Construct list of merging candidates, returns count */
+uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*candMvField)[2], uint8_t* candDir) const
 {
     uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;
     const bool isInterB = m_slice->isInterB();
@@ -1385,10 +1385,10 @@ uint32_t CUData::getInterMergeCandidates
 
     for (uint32_t i = 0; i < maxNumMergeCand; ++i)
     {
-        mvFieldNeighbours[i][0].mv = 0;
-        mvFieldNeighbours[i][1].mv = 0;
-        mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID;
-        mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID;
+        candMvField[i][0].mv = 0;
+        candMvField[i][1].mv = 0;
+        candMvField[i][0].refIdx = REF_NOT_VALID;
+        candMvField[i][1].refIdx = REF_NOT_VALID;
     }
 
     /* calculate the location of upper-left corner pixel and size of the current PU */
@@ -1420,11 +1420,11 @@ uint32_t CUData::getInterMergeCandidates
     if (isAvailableA1)
     {
         // get Inter Dir
-        interDirNeighbours[count] = cuLeft->m_interDir[leftPartIdx];
+        candDir[count] = cuLeft->m_interDir[leftPartIdx];
         // get Mv from Left
-        cuLeft->getMvField(cuLeft, leftPartIdx, 0, mvFieldNeighbours[count][0]);
+        cuLeft->getMvField(cuLeft, leftPartIdx, 0, candMvField[count][0]);
         if (isInterB)
-            cuLeft->getMvField(cuLeft, leftPartIdx, 1, mvFieldNeighbours[count][1]);
+            cuLeft->getMvField(cuLeft, leftPartIdx, 1, candMvField[count][1]);
 
         count++;
     
@@ -1444,11 +1444,11 @@ uint32_t CUData::getInterMergeCandidates
     if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx)))
     {
         // get Inter Dir
-        interDirNeighbours[count] = cuAbove->m_interDir[abovePartIdx];
+        candDir[count] = cuAbove->m_interDir[abovePartIdx];
         // get Mv from Left
-        cuAbove->getMvField(cuAbove, abovePartIdx, 0, mvFieldNeighbours[count][0]);
+        cuAbove->getMvField(cuAbove, abovePartIdx, 0, candMvField[count][0]);
         if (isInterB)
-            cuAbove->getMvField(cuAbove, abovePartIdx, 1, mvFieldNeighbours[count][1]);
+            cuAbove->getMvField(cuAbove, abovePartIdx, 1, candMvField[count][1]);
 
         count++;
    
@@ -1465,11 +1465,11 @@ uint32_t CUData::getInterMergeCandidates
     if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx)))
     {
         // get Inter Dir
-        interDirNeighbours[count] = cuAboveRight->m_interDir[aboveRightPartIdx];
+        candDir[count] = cuAboveRight->m_interDir[aboveRightPartIdx];
         // get Mv from Left
-        cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, mvFieldNeighbours[count][0]);
+        cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, candMvField[count][0]);
         if (isInterB)
-            cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, mvFieldNeighbours[count][1]);
+            cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, candMvField[count][1]);
 
         count++;
 
@@ -1486,11 +1486,11 @@ uint32_t CUData::getInterMergeCandidates
     if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx)))
     {
         // get Inter Dir
-        interDirNeighbours[count] = cuLeftBottom->m_interDir[leftBottomPartIdx];
+        candDir[count] = cuLeftBottom->m_interDir[leftBottomPartIdx];
         // get Mv from Left
-        cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, mvFieldNeighbours[count][0]);
+        cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, candMvField[count][0]);
         if (isInterB)
-            cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, mvFieldNeighbours[count][1]);
+            cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, candMvField[count][1]);
 
         count++;
 
@@ -1510,11 +1510,11 @@ uint32_t CUData::getInterMergeCandidates
             && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx)))
         {
             // get Inter Dir
-            interDirNeighbours[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx];
+            candDir[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx];
             // get Mv from Left
-            cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, mvFieldNeighbours[count][0]);
+            cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, candMvField[count][0]);
             if (isInterB)
-                cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, mvFieldNeighbours[count][1]);
+                cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, candMvField[count][1]);
 
             count++;
 
@@ -1563,8 +1563,8 @@ uint32_t CUData::getInterMergeCandidates
         if (bExistMV)
         {
             dir |= 1;
-            mvFieldNeighbours[count][0].mv = colmv;
-            mvFieldNeighbours[count][0].refIdx = refIdx;
+            candMvField[count][0].mv = colmv;
+            candMvField[count][0].refIdx = refIdx;
         }
 
         if (isInterB)
@@ -1576,14 +1576,14 @@ uint32_t CUData::getInterMergeCandidates
             if (bExistMV)
             {
                 dir |= 2;
-                mvFieldNeighbours[count][1].mv = colmv;
-                mvFieldNeighbours[count][1].refIdx = refIdx;
+                candMvField[count][1].mv = colmv;
+                candMvField[count][1].refIdx = refIdx;
             }
         }
 
         if (dir != 0)
         {
-            interDirNeighbours[count] = (uint8_t)dir;
+            candDir[count] = (uint8_t)dir;
 
             count++;
         
@@ -1605,20 +1605,20 @@ uint32_t CUData::getInterMergeCandidates
             priorityList0 >>= 2;
             priorityList1 >>= 2;
 
-            if ((interDirNeighbours[i] & 0x1) && (interDirNeighbours[j] & 0x2))
+            if ((candDir[i] & 0x1) && (candDir[j] & 0x2))
             {
                 // get Mv from cand[i] and cand[j]
-                int refIdxL0 = mvFieldNeighbours[i][0].refIdx;
-                int refIdxL1 = mvFieldNeighbours[j][1].refIdx;
+                int refIdxL0 = candMvField[i][0].refIdx;
+                int refIdxL1 = candMvField[j][1].refIdx;
                 int refPOCL0 = m_slice->m_refPOCList[0][refIdxL0];
                 int refPOCL1 = m_slice->m_refPOCList[1][refIdxL1];
-                if (!(refPOCL0 == refPOCL1 && mvFieldNeighbours[i][0].mv == mvFieldNeighbours[j][1].mv))
+                if (!(refPOCL0 == refPOCL1 && candMvField[i][0].mv == candMvField[j][1].mv))
                 {
-                    mvFieldNeighbours[count][0].mv = mvFieldNeighbours[i][0].mv;
-                    mvFieldNeighbours[count][0].refIdx = refIdxL0;
-                    mvFieldNeighbours[count][1].mv = mvFieldNeighbours[j][1].mv;
-                    mvFieldNeighbours[count][1].refIdx = refIdxL1;
-                    interDirNeighbours[count] = 3;
+                    candMvField[count][0].mv = candMvField[i][0].mv;
+                    candMvField[count][0].refIdx = refIdxL0;
+                    candMvField[count][1].mv = candMvField[j][1].mv;
+                    candMvField[count][1].refIdx = refIdxL1;
+                    candDir[count] = 3;
 
                     count++;
 
@@ -1633,15 +1633,15 @@ uint32_t CUData::getInterMergeCandidates
     int refcnt = 0;
     while (count < maxNumMergeCand)
     {
-        interDirNeighbours[count] = 1;
-        mvFieldNeighbours[count][0].mv = 0;
-        mvFieldNeighbours[count][0].refIdx = r;
+        candDir[count] = 1;
+        candMvField[count][0].mv = 0;
+        candMvField[count][0].refIdx = r;
 
         if (isInterB)
         {
-            interDirNeighbours[count] = 3;
-            mvFieldNeighbours[count][1].mv.word = 0;
-            mvFieldNeighbours[count][1].refIdx = r;
+            candDir[count] = 3;
+            candMvField[count][1].mv.word = 0;
+            candMvField[count][1].refIdx = r;
         }
 
         count++;
--- a/source/common/cudata.h	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/common/cudata.h	Wed Feb 25 10:05:40 2015 +0530
@@ -195,7 +195,7 @@ public:
     uint8_t  getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; }
     uint8_t  getQtRootCbf(uint32_t absPartIdx) const                             { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; }
     int8_t   getRefQP(uint32_t currAbsIdxInCTU) const;
-    uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const;
+    uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const;
     void     clipMv(MV& outMV) const;
     int      fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const;
     void     getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const;
--- a/source/common/lowres.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/common/lowres.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -56,10 +56,7 @@ bool Lowres::create(PicYuv *origPic, int
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
     /* allocate lowres buffers */
-    CHECKED_MALLOC(buffer[0], pixel, 4 * planesize);
-
-    /* initialize the whole buffer to prevent valgrind warnings on right edge */
-    memset(buffer[0], 0, 4 * sizeof(pixel) * planesize);
+    CHECKED_MALLOC_ZERO(buffer[0], pixel, 4 * planesize);
 
     buffer[1] = buffer[0] + planesize;
     buffer[2] = buffer[1] + planesize;
--- a/source/common/predict.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/common/predict.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -34,6 +34,18 @@ using namespace x265;
 #pragma warning(disable: 4127) // conditional expression is constant
 #endif
 
+PredictionUnit::PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx)
+{
+    /* address of CTU */
+    ctuAddr = cu.m_cuAddr;
+
+    /* offset of CU */
+    cuAbsPartIdx = cuGeom.absPartIdx;
+
+    /* offset and dimensions of PU */
+    cu.getPartIndexAndSize(puIdx, puAbsPartIdx, width, height);
+}
+
 namespace
 {
 inline pixel weightBidir(int w0, int16_t P0, int w1, int16_t P1, int round, int shift, int offset)
@@ -112,37 +124,25 @@ void Predict::predIntraChromaAng(uint32_
     primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0);
 }
 
-void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
-{
-    m_predSlice = cu.m_slice;
-    cu.getPartIndexAndSize(partIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
-    m_ctuAddr = cu.m_cuAddr;
-    m_cuAbsPartIdx = cuGeom.absPartIdx;
-}
 
-void Predict::prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
+void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
 {
-    initMotionCompensation(cu, cuGeom, partIdx);
+    int refIdx0 = cu.m_refIdx[0][pu.puAbsPartIdx];
+    int refIdx1 = cu.m_refIdx[1][pu.puAbsPartIdx];
 
-    m_refIdx0      = cu.m_refIdx[0][m_puAbsPartIdx];
-    m_clippedMv[0] = cu.m_mv[0][m_puAbsPartIdx];
-    m_refIdx1      = cu.m_refIdx[1][m_puAbsPartIdx];
-    m_clippedMv[1] = cu.m_mv[1][m_puAbsPartIdx];
-    cu.clipMv(m_clippedMv[0]);
-    cu.clipMv(m_clippedMv[1]);
-}
-
-void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma)
-{
-    if (m_predSlice->isInterP())
+    if (cu.m_slice->isInterP())
     {
         /* P Slice */
         WeightValues wv0[3];
-        X265_CHECK(m_refIdx0 >= 0, "invalid P refidx\n");
-        X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "P refidx out of range\n");
-        const WeightParam *wp0 = m_predSlice->m_weightPredTable[0][m_refIdx0];
 
-        if (m_predSlice->m_pps->bUseWeightPred && wp0->bPresentFlag)
+        X265_CHECK(refIdx0 >= 0, "invalid P refidx\n");
+        X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "P refidx out of range\n");
+        const WeightParam *wp0 = cu.m_slice->m_weightPredTable[0][refIdx0];
+
+        MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
+        cu.clipMv(mv0);
+
+        if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag)
         {
             for (int plane = 0; plane < 3; plane++)
             {
@@ -155,18 +155,18 @@ void Predict::motionCompensation(Yuv& pr
             ShortYuv& shortYuv = m_predShortYuv[0];
 
             if (bLuma)
-                predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
             if (bChroma)
-                predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
 
-            addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
+            addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
         }
         else
         {
             if (bLuma)
-                predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
             if (bChroma)
-                predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
         }
     }
     else
@@ -176,10 +176,10 @@ void Predict::motionCompensation(Yuv& pr
         WeightValues wv0[3], wv1[3];
         const WeightParam *pwp0, *pwp1;
 
-        if (m_predSlice->m_pps->bUseWeightedBiPred)
+        if (cu.m_slice->m_pps->bUseWeightedBiPred)
         {
-            pwp0 = m_refIdx0 >= 0 ? m_predSlice->m_weightPredTable[0][m_refIdx0] : NULL;
-            pwp1 = m_refIdx1 >= 0 ? m_predSlice->m_weightPredTable[1][m_refIdx1] : NULL;
+            pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;
+            pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
             {
@@ -200,7 +200,7 @@ void Predict::motionCompensation(Yuv& pr
             else
             {
                 /* uniprediction weighting, always outputs to wv0 */
-                const WeightParam* pwp = (m_refIdx0 >= 0) ? pwp0 : pwp1;
+                const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;
                 for (int plane = 0; plane < 3; plane++)
                 {
                     wv0[plane].w = pwp[plane].inputWeight;
@@ -213,89 +213,100 @@ void Predict::motionCompensation(Yuv& pr
         else
             pwp0 = pwp1 = NULL;
 
-        if (m_refIdx0 >= 0 && m_refIdx1 >= 0)
+        if (refIdx0 >= 0 && refIdx1 >= 0)
         {
+            MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
+            MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
+            cu.clipMv(mv0);
+            cu.clipMv(mv1);
+
             /* Biprediction */
-            X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "bidir refidx0 out of range\n");
-            X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "bidir refidx1 out of range\n");
+            X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "bidir refidx0 out of range\n");
+            X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "bidir refidx1 out of range\n");
 
             if (bLuma)
             {
-                predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
-                predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
             }
             if (bChroma)
             {
-                predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
-                predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
+                predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
             }
 
             if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag))
-                addWeightBi(predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
+                addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);
             else
-                predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], m_puAbsPartIdx, m_puWidth, m_puHeight, bLuma, bChroma);
+                predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);
         }
-        else if (m_refIdx0 >= 0)
+        else if (refIdx0 >= 0)
         {
+            MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];
+            cu.clipMv(mv0);
+
             /* uniprediction to L0 */
-            X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "unidir refidx0 out of range\n");
+            X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "unidir refidx0 out of range\n");
 
             if (pwp0 && pwp0->bPresentFlag)
             {
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
                 if (bChroma)
-                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
 
-                addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
+                addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
                 if (bChroma)
-                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0);
             }
         }
         else
         {
+            MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];
+            cu.clipMv(mv1);
+
             /* uniprediction to L1 */
-            X265_CHECK(m_refIdx1 >= 0, "refidx1 was not positive\n");
-            X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "unidir refidx1 out of range\n");
+            X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");
+            X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "unidir refidx1 out of range\n");
 
             if (pwp1 && pwp1->bPresentFlag)
             {
                 ShortYuv& shortYuv = m_predShortYuv[0];
 
                 if (bLuma)
-                    predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
                 if (bChroma)
-                    predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
 
-                addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma);
+                addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);
             }
             else
             {
                 if (bLuma)
-                    predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
                 if (bChroma)
-                    predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]);
+                    predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1);
             }
         }
     }
 }
 
-void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
 {
-    pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx);
+    pixel* dst = dstYuv.getLumaAddr(pu.puAbsPartIdx);
     intptr_t dstStride = dstYuv.m_size;
 
     intptr_t srcStride = refPic.m_stride;
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
-    const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+    int partEnum = partitionFromSizes(pu.width, pu.height);
+    const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
 
     int xFrac = mv.x & 0x3;
     int yFrac = mv.y & 0x3;
@@ -310,32 +321,32 @@ void Predict::predInterLumaPixel(Yuv& ds
         primitives.pu[partEnum].luma_hvpp(src, srcStride, dst, dstStride, xFrac, yFrac);
 }
 
-void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
 {
-    int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx);
+    int16_t* dst = dstSYuv.getLumaAddr(pu.puAbsPartIdx);
     int dstStride = dstSYuv.m_size;
 
     intptr_t srcStride = refPic.m_stride;
     intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride;
-    const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset;
+    const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset;
 
     int xFrac = mv.x & 0x3;
     int yFrac = mv.y & 0x3;
 
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
+    int partEnum = partitionFromSizes(pu.width, pu.height);
 
-    X265_CHECK((m_puWidth % 4) + (m_puHeight % 4) == 0, "width or height not divisible by 4\n");
+    X265_CHECK((pu.width % 4) + (pu.height % 4) == 0, "width or height not divisible by 4\n");
     X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n");
 
     if (!(yFrac | xFrac))
-        primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight);
+        primitives.luma_p2s(src, srcStride, dst, pu.width, pu.height);
     else if (!yFrac)
         primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0);
     else if (!xFrac)
         primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac);
     else
     {
-        int tmpStride = m_puWidth;
+        int tmpStride = pu.width;
         int filterSize = NTAPS_LUMA;
         int halfFilterSize = (filterSize >> 1);
         primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1);
@@ -343,7 +354,7 @@ void Predict::predInterLumaShort(ShortYu
     }
 }
 
-void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const
 {
     intptr_t dstStride = dstYuv.m_csize;
     intptr_t refStride = refPic.m_strideC;
@@ -353,16 +364,16 @@ void Predict::predInterChromaPixel(Yuv& 
 
     intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
 
-    const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
-    const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
+    const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
 
-    pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx);
-    pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx);
+    pixel* dstCb = dstYuv.getCbAddr(pu.puAbsPartIdx);
+    pixel* dstCr = dstYuv.getCrAddr(pu.puAbsPartIdx);
 
     int xFrac = mv.x & ((1 << shiftHor) - 1);
     int yFrac = mv.y & ((1 << shiftVer) - 1);
 
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
+    int partEnum = partitionFromSizes(pu.width, pu.height);
     
     if (!(yFrac | xFrac))
     {
@@ -381,7 +392,7 @@ void Predict::predInterChromaPixel(Yuv& 
     }
     else
     {
-        int extStride = m_puWidth >> m_hChromaShift;
+        int extStride = pu.width >> m_hChromaShift;
         int filterSize = NTAPS_CHROMA;
         int halfFilterSize = (filterSize >> 1);
 
@@ -393,7 +404,7 @@ void Predict::predInterChromaPixel(Yuv& 
     }
 }
 
-void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
+void Predict::predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const
 {
     intptr_t refStride = refPic.m_strideC;
     intptr_t dstStride = dstSYuv.m_csize;
@@ -403,19 +414,19 @@ void Predict::predInterChromaShort(Short
 
     intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride;
 
-    const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
-    const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset;
+    const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
+    const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset;
 
-    int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx);
-    int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx);
+    int16_t* dstCb = dstSYuv.getCbAddr(pu.puAbsPartIdx);
+    int16_t* dstCr = dstSYuv.getCrAddr(pu.puAbsPartIdx);
 
     int xFrac = mv.x & ((1 << shiftHor) - 1);
     int yFrac = mv.y & ((1 << shiftVer) - 1);
 
-    int partEnum = partitionFromSizes(m_puWidth, m_puHeight);
+    int partEnum = partitionFromSizes(pu.width, pu.height);
     
-    uint32_t cxWidth  = m_puWidth >> m_hChromaShift;
-    uint32_t cxHeight = m_puHeight >> m_vChromaShift;
+    uint32_t cxWidth  = pu.width >> m_hChromaShift;
+    uint32_t cxHeight = pu.height >> m_vChromaShift;
 
     X265_CHECK(((cxWidth | cxHeight) % 2) == 0, "chroma block size expected to be multiple of 2\n");
 
@@ -447,7 +458,7 @@ void Predict::predInterChromaShort(Short
 }
 
 /* weighted averaging for bi-pred */
-void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const
+void Predict::addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const
 {
     int x, y;
 
@@ -456,9 +467,9 @@ void Predict::addWeightBi(Yuv& predYuv, 
 
     if (bLuma)
     {
-        pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
-        const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx);
-        const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx);
+        pixel* dstY = predYuv.getLumaAddr(pu.puAbsPartIdx);
+        const int16_t* srcY0 = srcYuv0.getLumaAddr(pu.puAbsPartIdx);
+        const int16_t* srcY1 = srcYuv1.getLumaAddr(pu.puAbsPartIdx);
 
         // Luma
         w0      = wp0[0].w;
@@ -473,9 +484,9 @@ void Predict::addWeightBi(Yuv& predYuv, 
         dststride = predYuv.m_size;
 
         // TODO: can we use weight_sp here?
-        for (y = m_puHeight - 1; y >= 0; y--)
+        for (y = pu.height - 1; y >= 0; y--)
         {
-            for (x = m_puWidth - 1; x >= 0; )
+            for (x = pu.width - 1; x >= 0; )
             {
                 // note: luma min width is 4
                 dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset);
@@ -496,12 +507,12 @@ void Predict::addWeightBi(Yuv& predYuv, 
 
     if (bChroma)
     {
-        pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
-        pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-        const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx);
-        const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx);
-        const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx);
-        const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx);
+        pixel* dstU = predYuv.getCbAddr(pu.puAbsPartIdx);
+        pixel* dstV = predYuv.getCrAddr(pu.puAbsPartIdx);
+        const int16_t* srcU0 = srcYuv0.getCbAddr(pu.puAbsPartIdx);
+        const int16_t* srcV0 = srcYuv0.getCrAddr(pu.puAbsPartIdx);
+        const int16_t* srcU1 = srcYuv1.getCbAddr(pu.puAbsPartIdx);
+        const int16_t* srcV1 = srcYuv1.getCrAddr(pu.puAbsPartIdx);
 
         // Chroma U
         w0      = wp0[1].w;
@@ -515,8 +526,8 @@ void Predict::addWeightBi(Yuv& predYuv, 
         src1Stride = srcYuv1.m_csize;
         dststride  = predYuv.m_csize;
 
-        uint32_t cwidth = m_puWidth >> srcYuv0.m_hChromaShift;
-        uint32_t cheight = m_puHeight >> srcYuv0.m_vChromaShift;
+        uint32_t cwidth = pu.width >> srcYuv0.m_hChromaShift;
+        uint32_t cheight = pu.height >> srcYuv0.m_vChromaShift;
 
         // TODO: can we use weight_sp here?
         for (y = cheight - 1; y >= 0; y--)
@@ -561,15 +572,15 @@ void Predict::addWeightBi(Yuv& predYuv, 
 }
 
 /* weighted averaging for uni-pred */
-void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
+void Predict::addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const
 {
     int w0, offset, shiftNum, shift, round;
     uint32_t srcStride, dstStride;
 
     if (bLuma)
     {
-        pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx);
-        const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx);
+        pixel* dstY = predYuv.getLumaAddr(pu.puAbsPartIdx);
+        const int16_t* srcY0 = srcYuv.getLumaAddr(pu.puAbsPartIdx);
 
         // Luma
         w0      = wp[0].w;
@@ -580,15 +591,15 @@ void Predict::addWeightUni(Yuv& predYuv,
         srcStride = srcYuv.m_size;
         dstStride = predYuv.m_size;
 
-        primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset);
+        primitives.weight_sp(srcY0, dstY, srcStride, dstStride, pu.width, pu.height, w0, round, shift, offset);
     }
 
     if (bChroma)
     {
-        pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx);
-        pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx);
-        const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx);
-        const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx);
+        pixel* dstU = predYuv.getCbAddr(pu.puAbsPartIdx);
+        pixel* dstV = predYuv.getCrAddr(pu.puAbsPartIdx);
+        const int16_t* srcU0 = srcYuv.getCbAddr(pu.puAbsPartIdx);
+        const int16_t* srcV0 = srcYuv.getCrAddr(pu.puAbsPartIdx);
 
         // Chroma U
         w0      = wp[1].w;
@@ -600,8 +611,8 @@ void Predict::addWeightUni(Yuv& predYuv,
         srcStride = srcYuv.m_csize;
         dstStride = predYuv.m_csize;
 
-        uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift;
-        uint32_t cheight = m_puHeight >> srcYuv.m_vChromaShift;
+        uint32_t cwidth = pu.width >> srcYuv.m_hChromaShift;
+        uint32_t cheight = pu.height >> srcYuv.m_vChromaShift;
 
         primitives.weight_sp(srcU0, dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset);
 
--- a/source/common/predict.h	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/common/predict.h	Wed Feb 25 10:05:40 2015 +0530
@@ -36,6 +36,17 @@ class CUData;
 class Slice;
 struct CUGeom;
 
+struct PredictionUnit
+{
+    uint32_t     ctuAddr;      // raster index of current CTU within its picture
+    uint32_t     cuAbsPartIdx; // z-order offset of current CU within its CTU
+    uint32_t     puAbsPartIdx; // z-order offset of current PU within its CU
+    int          width;
+    int          height;
+
+    PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx);
+};
+
 class Predict
 {
 public:
@@ -65,38 +76,34 @@ public:
 
     // Unfiltered/filtered neighbours of the current partition.
     pixel     intraNeighbourBuf[2][258];
+
     /* Slice information */
-    const Slice* m_predSlice;
     int       m_csp;
     int       m_hChromaShift;
     int       m_vChromaShift;
 
-    /* cached CU information for prediction */
-    uint32_t  m_ctuAddr;      // raster index of current CTU within its picture
-    uint32_t  m_cuAbsPartIdx; // z-order index of current CU within its CTU
-    uint32_t  m_puAbsPartIdx; // z-order index of current PU with its CU
-    int       m_puWidth;
-    int       m_puHeight;
-    int       m_refIdx0;
-    int       m_refIdx1;
-
-    /* TODO: Need to investigate clipping while writing into the TComDataCU fields itself */
-    MV        m_clippedMv[2];
-
     Predict();
     ~Predict();
 
     bool allocBuffers(int csp);
 
     // motion compensation functions
-    void predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
-    void predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const;
 
-    void predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
-    void predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
+    void predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const;
 
-    void addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const;
-    void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
+    void addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const;
+    void addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const;
+
+    void motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma);
+
+    /* Angular Intra */
+    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
+    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
+    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
 
     /* Intra prediction helper functions */
     static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
@@ -111,19 +118,6 @@ public:
     static int  isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits);
     template<bool cip>
     static int  isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits);
-
-public:
-
-    /* prepMotionCompensation needs to be called to prepare MC with CU-relevant data */
-    void initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx);
-    void prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx);
-    void motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma);
-
-    /* Angular Intra */
-    void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
-    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
-    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
-    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
 };
 }
 
--- a/source/common/threadpool.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/common/threadpool.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -321,7 +321,7 @@ ThreadPool* ThreadPool::allocThreadPools
                 numPools = 0;
                 return NULL;
             }
-            if (bNumaSupport)
+            if (numNumaNodes > 1)
                 x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
             else
                 x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
--- a/source/encoder/analysis.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/encoder/analysis.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -574,8 +574,8 @@ void Analysis::compressInterCU_dist(cons
                 {
                     for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                        motionCompensation(bestInter->predYuv, false, true);
+                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                     }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
@@ -610,8 +610,8 @@ void Analysis::compressInterCU_dist(cons
                     /* finally code the best mode selected from SA8D costs */
                     for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
-                        motionCompensation(md.bestMode->predYuv, false, true);
+                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                     }
                     encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
                 }
@@ -828,8 +828,8 @@ void Analysis::compressInterCU_rd0_4(con
                 {
                     for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(bestInter->cu, cuGeom, puIdx);
-                        motionCompensation(bestInter->predYuv, false, true);
+                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
+                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
                     }
                 }
                 encodeResAndCalcRdInterCU(*bestInter, cuGeom);
@@ -883,8 +883,8 @@ void Analysis::compressInterCU_rd0_4(con
                 {
                     for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++)
                     {
-                        prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx);
-                        motionCompensation(md.bestMode->predYuv, false, true);
+                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
                     }
                     if (m_param->rdLevel == 2)
                         encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
@@ -1217,32 +1217,32 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
     bestPred->cu.setPredModeSubParts(MODE_INTER);
     bestPred->cu.m_mergeFlag[0] = true;
 
-    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
-    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
-    uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
+    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
+    uint8_t candDir[MRG_MAX_NUM_CANDS];
+    uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);
+    PredictionUnit pu(merge.cu, cuGeom, 0);
 
     bestPred->sa8dCost = MAX_INT64;
     int bestSadCand = -1;
     int sizeIdx = cuGeom.log2CUSize - 2;
 
-    for (uint32_t i = 0; i < maxNumMergeCand; ++i)
+    for (uint32_t i = 0; i < numMergeCand; ++i)
     {
         if (m_bFrameParallel &&
-            (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
-            mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
+            (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
+            candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
             continue;
 
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx
-        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
-        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
-        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+        tempPred->cu.m_interDir[0] = candDir[i];
+        tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
+        tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
+        tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
+        tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
 
-        prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, m_bChromaSa8d);
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d);
 
-        tempPred->sa8dBits = getTUBits(i, maxNumMergeCand);
+        tempPred->sa8dBits = getTUBits(i, numMergeCand);
         tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
         if (m_bChromaSa8d)
         {
@@ -1264,10 +1264,7 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
 
     /* calculate the motion compensation for chroma for the best mode selected */
     if (!m_bChromaSa8d) /* Chroma MC was done above */
-    {
-        prepMotionCompensation(bestPred->cu, cuGeom, 0);
-        motionCompensation(bestPred->predYuv, false, true);
-    }
+        motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
 
     if (m_param->rdLevel)
     {
@@ -1278,11 +1275,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
 
         /* Encode with residual */
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;
-        tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
-        tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
-        tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
-        tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
-        tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+        tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
+        tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
+        tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
+        tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
+        tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
         tempPred->sa8dCost = bestPred->sa8dCost;
         tempPred->predYuv.copyFromYuv(bestPred->predYuv);
 
@@ -1294,11 +1291,11 @@ void Analysis::checkMerge2Nx2N_rd0_4(Mod
         md.bestMode = bestPred;
 
     /* broadcast sets of MV field data */
-    bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0);
-    bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0);
-    bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0);
-    bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0);
-    bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0);
+    bestPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);
+    bestPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);
+    bestPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);
+    bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);
+    bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);
 }
 
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1319,52 +1316,47 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mod
     skip.cu.setPartSizeSubParts(SIZE_2Nx2N);
     skip.cu.m_mergeFlag[0] = true;
 
-    MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
-    uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
-    uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours);
+    MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists
+    uint8_t candDir[MRG_MAX_NUM_CANDS];
+    uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir);
+    PredictionUnit pu(merge.cu, cuGeom, 0);
 
     bool foundCbf0Merge = false;
     bool triedPZero = false, triedBZero = false;
     bestPred->rdCost = MAX_INT64;
 
-    if (m_param->analysisMode == X265_ANALYSIS_LOAD && isSkipMode)
+    if (isSkipMode)
     {
         uint32_t i = *m_reuseBestMergeCand;
-        tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
-        tempPred->cu.m_interDir[0] = interDirNeighbours[i];
-        tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-        tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
-        tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-        tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
-        tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
+        bestPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
+        bestPred->cu.m_interDir[0] = candDir[i];
+        bestPred->cu.m_mv[0][0] = candMvField[i][0].mv;
+        bestPred->cu.m_mv[1][0] = candMvField[i][1].mv;
+        bestPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
+        bestPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
 
-        prepMotionCompensation(tempPred->cu, cuGeom, 0);
-        motionCompensation(tempPred->predYuv, true, true);
-
-        encodeResAndCalcRdSkipCU(*tempPred);
-
-        if (tempPred->rdCost < bestPred->rdCost)
-            std::swap(tempPred, bestPred);
+        motionCompensation(bestPred->cu, pu, bestPred->predYuv, true, true);
+        encodeResAndCalcRdSkipCU(*bestPred);
     }
     else
     {
-        for (uint32_t i = 0; i < maxNumMergeCand; i++)
+        for (uint32_t i = 0; i < numMergeCand; i++)
         {
             if (m_bFrameParallel &&
-                (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
-                mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4))
+                (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||
+                candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4))
                 continue;
 
             /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */
-            if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx)
+            if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx)
             {
                 if (triedPZero)
                     continue;
                 triedPZero = true;
             }
-            else if (interDirNeighbours[i] == 3 &&
-                !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx &&
-                !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx)
+            else if (candDir[i] == 3 &&
+                !candMvField[i][0].mv.word && !candMvField[i][0].refIdx &&
+                !candMvField[i][1].mv.word && !candMvField[i][1].refIdx)
             {
                 if (triedBZero)
                     continue;
@@ -1372,15 +1364,14 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mod
             }
 
             tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
-            tempPred->cu.m_interDir[0] = interDirNeighbours[i];
-            tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-            tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
-            tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-            tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+            tempPred->cu.m_interDir[0] = candDir[i];
+            tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
+            tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
+            tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
+            tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
             tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
 
-            prepMotionCompensation(tempPred->cu, cuGeom, 0);
-            motionCompensation(tempPred->predYuv, true, true);
+            motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
 
             uint8_t hasCbf = true;
             bool swapped = false;
@@ -1405,11 +1396,11 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mod
                 if (swapped)
                 {
                     tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;
-                    tempPred->cu.m_interDir[0] = interDirNeighbours[i];
-                    tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
-                    tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx;
-                    tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv;
-                    tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx;
+                    tempPred->cu.m_interDir[0] = candDir[i];
+                    tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;
+                    tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
+                    tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
+                    tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
                     tempPred->cu.setPredModeSubParts(MODE_INTER);
                     tempPred->predYuv.copyFromYuv(bestPred->predYuv);
                 }
@@ -1428,11 +1419,11 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mod
 
         /* broadcast sets of MV field data */
         uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0];
-        bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0);
-        bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0);
-        bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0);
-        bestPred->cu.setPUMv(1, mvFieldNeighbours[bestCand][1].mv, 0, 0);
-        bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0);
+        bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0);
+        bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0);
+        bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0);
+        bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0);
+        bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0);
     }
 
     if (m_param->analysisMode)
@@ -1572,8 +1563,8 @@ void Analysis::checkBidir2Nx2N(Mode& int
     cu.setPUMv(1, bestME[1].mv, 0, 0);
     cu.m_mvd[1][0] = bestME[1].mv - mvp1;
 
-    prepMotionCompensation(cu, cuGeom, 0);
-    motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+    PredictionUnit pu(cu, cuGeom, 0);
+    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d);
 
     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
     if (m_bChromaSa8d)
@@ -1612,8 +1603,7 @@ void Analysis::checkBidir2Nx2N(Mode& int
             cu.m_mv[0][0] = mvzero;
             cu.m_mv[1][0] = mvzero;
 
-            prepMotionCompensation(cu, cuGeom, 0);
-            motionCompensation(tmpPredYuv, true, true);
+            motionCompensation(cu, pu, tmpPredYuv, true, true);
 
             zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
@@ -1621,8 +1611,8 @@ void Analysis::checkBidir2Nx2N(Mode& int
         }
         else
         {
-            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx);
-            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx);
+            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
+            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx);
             intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
 
             primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
@@ -1657,10 +1647,7 @@ void Analysis::checkBidir2Nx2N(Mode& int
                 /* real MC was already performed */
                 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
             else
-            {
-                prepMotionCompensation(cu, cuGeom, 0);
-                motionCompensation(bidir2Nx2N.predYuv, true, true);
-            }
+                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true);
         }
         else if (m_bChromaSa8d)
         {
--- a/source/encoder/entropy.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/encoder/entropy.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -43,6 +43,7 @@ Entropy::Entropy()
 {
     markValid();
     m_fracBits = 0;
+    m_pad = 0;
     X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n");
 }
 
--- a/source/encoder/search.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/encoder/search.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -1806,22 +1806,24 @@ uint32_t Search::estIntraPredChromaQT(Mo
     return totalDistortion;
 }
 
-/* estimation of best merge coding of an inter PU (not a merge CU) */
-uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, int puIdx, MergeData& m)
+/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */
+uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m)
 {
-    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n");
-
-    m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours);
+    X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n");
+
+    MVField  candMvField[MRG_MAX_NUM_CANDS][2];
+    uint8_t  candDir[MRG_MAX_NUM_CANDS];
+    uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir);
 
     if (cu.isBipredRestriction())
     {
-        /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */
-        for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
+        /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */
+        for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
         {
-            if (m.interDirNeighbours[mergeCand] == 3)
+            if (candDir[mergeCand] == 3)
             {
-                m.interDirNeighbours[mergeCand] = 1;
-                m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID;
+                candDir[mergeCand] = 1;
+                candMvField[mergeCand][1].refIdx = REF_NOT_VALID;
             }
         }
     }
@@ -1829,27 +1831,26 @@ uint32_t Search::mergeEstimation(CUData&
     Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv;
 
     uint32_t outCost = MAX_UINT;
-    for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand)
+    for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand)
     {
         /* Prevent TMVP candidates from using unavailable reference pixels */
         if (m_bFrameParallel &&
-            (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
-             m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
+            (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 ||
+             candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4))
             continue;
 
-        cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv;
-        cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx;
-        cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv;
-        cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx;
-
-        prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(tempYuv, true, m_me.bChromaSATD);
-
-        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size);
+        cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv;
+        cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx;
+        cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv;
+        cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx;
+
+        motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD);
+
+        uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size);
         if (m_me.bChromaSATD)
-            costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx);
-
-        uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand);
+            costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx);
+
+        uint32_t bitsCand = getTUBits(mergeCand, numMergeCand);
         costCand = costCand + m_rdCost.getCost(bitsCand);
         if (costCand < outCost)
         {
@@ -1859,9 +1860,9 @@ uint32_t Search::mergeEstimation(CUData&
         }
     }
 
-    m.mvField[0] = m.mvFieldNeighbours[m.index][0];
-    m.mvField[1] = m.mvFieldNeighbours[m.index][1];
-    m.interDir = m.interDirNeighbours[m.index];
+    m.mvField[0] = candMvField[m.index][0];
+    m.mvField[1] = candMvField[m.index][1];
+    m.dir = candDir[m.index];
 
     return outCost;
 }
@@ -1899,17 +1900,17 @@ void Search::processPME(PME& pme, Search
         slave.setQP(*m_slice, m_rdCost.m_qp);
         slave.m_slice = m_slice;
         slave.m_frame = m_frame;
-        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.mode.cu.m_cuAddr, pme.cuGeom.absPartIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
-        slave.prepMotionCompensation(pme.mode.cu, pme.cuGeom, pme.puIdx);
+
+        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height);
     }
 
     /* Perform ME, repeat until no more work is available */
     do
     {
         if (meId < m_slice->m_numRefIdx[0])
-            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.puIdx, 0, meId);
+            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.pu, pme.puIdx, 0, meId);
         else
-            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
+            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.pu, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
 
         meId = -1;
         pme.m_lock.acquire();
@@ -1920,16 +1921,14 @@ void Search::processPME(PME& pme, Search
     while (meId >= 0);
 }
 
-/* this function assumes the caller has configured its MotionEstimation engine with the
- * correct source plane and source PU, and has called prepMotionCompensation() to set
- * m_puAbsPartIdx, m_puWidth, and m_puHeight */
-void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref)
+void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, const PredictionUnit& pu,
+                                    int part, int list, int ref)
 {
     uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS;
     bits += getTUBits(ref, m_slice->m_numRefIdx[list]);
 
     MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1];
-    int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
+    int numMvc = interMode.cu.fillMvpCand(part, pu.puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc);
 
     int mvpIdx = 0;
     int merange = m_param->searchRange;
@@ -1949,8 +1948,8 @@ void Search::singleMotionEstimation(Sear
             interMode.cu.clipMv(mvCand);
 
             Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
-            predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
-            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+            predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand);
+            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
 
             if (bestCost > cost)
             {
@@ -2008,43 +2007,38 @@ void Search::predInterSearch(Mode& inter
     Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
 
     MergeData merge;
+    uint32_t mrgCost;
     memset(&merge, 0, sizeof(merge));
 
     for (int puIdx = 0; puIdx < numPart; puIdx++)
     {
         MotionData* bestME = interMode.bestME[puIdx];
-
-        /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
-        initMotionCompensation(cu, cuGeom, puIdx);
-
-        m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.absPartIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
-
-        uint32_t mrgCost = MAX_UINT;
+        PredictionUnit pu(cu, cuGeom, puIdx);
+
+        m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height);
 
         /* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */
         if (cu.m_partSize[0] != SIZE_2Nx2N)
         {
-            merge.absPartIdx = m_puAbsPartIdx;
-            merge.width      = m_puWidth;
-            merge.height     = m_puHeight;
-            mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
+            mrgCost = mergeEstimation(cu, cuGeom, pu, puIdx, merge);
 
             if (bMergeOnly && mrgCost != MAX_UINT)
             {
-                cu.m_mergeFlag[m_puAbsPartIdx] = true;
-                cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
-                cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
-                cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
-                cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
-                cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
-                cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
+                cu.m_mergeFlag[pu.puAbsPartIdx] = true;
+                cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
+                cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
+                cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
+                cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
+                cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
+                cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
                 totalmebits += merge.bits;
 
-                prepMotionCompensation(cu, cuGeom, puIdx);
-                motionCompensation(*predYuv, true, bChromaSA8D);
+                motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
                 continue;
             }
         }
+        else
+            mrgCost = MAX_UINT;
 
         bestME[0].cost = MAX_UINT;
         bestME[1].cost = MAX_UINT;
@@ -2061,7 +2055,7 @@ void Search::predInterSearch(Mode& inter
                 uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
                 bits += getTUBits(ref, numRefIdx[l]);
 
-                int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+                int numMvc = cu.fillMvpCand(puIdx, pu.puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
 
                 // Pick the best possible MVP from AMVP candidates based on least residual
                 int mvpIdx = 0;
@@ -2079,8 +2073,8 @@ void Search::predInterSearch(Mode& inter
                             continue;
 
                         cu.clipMv(mvCand);
-                        predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
-                        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+                        predInterLumaPixel(pu, tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+                        uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
 
                         if (bestCost > cost)
                         {
@@ -2116,7 +2110,7 @@ void Search::predInterSearch(Mode& inter
         }
         else if (bTryDistributed)
         {
-            PME pme(*this, interMode, cuGeom, puIdx);
+            PME pme(*this, interMode, cuGeom, pu, puIdx);
             pme.m_jobTotal = numME;
             pme.m_jobAcquired = 1; /* reserve L0-0 */
 
@@ -2124,7 +2118,7 @@ void Search::predInterSearch(Mode& inter
             {
                 processPME(pme, *this);
 
-                singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0); /* L0-0 */
+                singleMotionEstimation(*this, interMode, cuGeom, pu, puIdx, 0, 0); /* L0-0 */
 
                 bDoUnidir = false;
 
@@ -2144,7 +2138,7 @@ void Search::predInterSearch(Mode& inter
                     uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS;
                     bits += getTUBits(ref, numRefIdx[l]);
 
-                    int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
+                    int numMvc = cu.fillMvpCand(puIdx, pu.puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc);
 
                     // Pick the best possible MVP from AMVP candidates based on least residual
                     int mvpIdx = 0;
@@ -2162,8 +2156,8 @@ void Search::predInterSearch(Mode& inter
                                 continue;
 
                             cu.clipMv(mvCand);
-                            predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
-                            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size);
+                            predInterLumaPixel(pu, tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand);
+                            uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size);
 
                             if (bestCost > cost)
                             {
@@ -2204,7 +2198,7 @@ void Search::predInterSearch(Mode& inter
         int bidirBits = 0;
 
         if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */
-            cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N &&     /* 2Nx2N biprediction is handled elsewhere */
+            cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */
             bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT)
         {
             bidir[0] = bestME[0];
@@ -2214,16 +2208,14 @@ void Search::predInterSearch(Mode& inter
 
             if (m_me.bChromaSATD)
             {
-                cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv;
-                cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
-                cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv;
-                cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
-
-                prepMotionCompensation(cu, cuGeom, puIdx);
-                motionCompensation(tmpPredYuv, true, true);
-
-                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
-                           m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+                cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;
+                cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
+                cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;
+                cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
+                motionCompensation(cu, pu, tmpPredYuv, true, true);
+
+                satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
+                           m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
             }
             else
             {
@@ -2232,11 +2224,11 @@ void Search::predInterSearch(Mode& inter
                 Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;
 
                 /* Generate reference subpels */
-                predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv);
-                predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv);
-
-                primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size,
-                                                                                              bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32);
+                predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);
+                predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);
+
+                primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,
+                                                                                                 bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);
                 satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
             }
 
@@ -2263,21 +2255,19 @@ void Search::predInterSearch(Mode& inter
                 /* coincident blocks of the two reference pictures */
                 if (m_me.bChromaSATD)
                 {
-                    cu.m_mv[0][m_puAbsPartIdx] = mvzero;
-                    cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref;
-                    cu.m_mv[1][m_puAbsPartIdx] = mvzero;
-                    cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref;
-
-                    prepMotionCompensation(cu, cuGeom, puIdx);
-                    motionCompensation(tmpPredYuv, true, true);
-
-                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) +
-                               m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx);
+                    cu.m_mv[0][pu.puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;
+                    cu.m_mv[1][pu.puAbsPartIdx] = mvzero;
+                    cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;
+                    motionCompensation(cu, pu, tmpPredYuv, true, true);
+
+                    satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +
+                               m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);
                 }
                 else
                 {
-                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + m_puAbsPartIdx);
-                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + m_puAbsPartIdx);
+                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
+                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);
                     intptr_t refStride = slice->m_mref[0][0].lumaStride;
 
                     primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
@@ -2315,13 +2305,13 @@ void Search::predInterSearch(Mode& inter
         /* select best option and store into CU */
         if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost)
         {
-            cu.m_mergeFlag[m_puAbsPartIdx] = true;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
-            cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx);
+            cu.m_mergeFlag[pu.puAbsPartIdx] = true;
+            cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */
+            cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);
 
             totalmebits += merge.bits;
         }
@@ -2329,17 +2319,17 @@ void Search::predInterSearch(Mode& inter
         {
             lastMode = 2;
 
-            cu.m_mergeFlag[m_puAbsPartIdx] = false;
-            cu.setPUInterDir(3, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx;
-
-            cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
-            cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx;
+            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
+            cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;
+            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;
+
+            cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;
+            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;
 
             totalmebits += bidirBits;
         }
@@ -2347,15 +2337,15 @@ void Search::predInterSearch(Mode& inter
         {
             lastMode = 0;
 
-            cu.m_mergeFlag[m_puAbsPartIdx] = false;
-            cu.setPUInterDir(1, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
-            cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx;
-
-            cu.setPURefIdx(1, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx);
+            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
+            cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;
+            cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;
+
+            cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);
 
             totalmebits += bestME[0].bits;
         }
@@ -2363,21 +2353,20 @@ void Search::predInterSearch(Mode& inter
         {
             lastMode = 1;
 
-            cu.m_mergeFlag[m_puAbsPartIdx] = false;
-            cu.setPUInterDir(2, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx);
-            cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx);
-            cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
-            cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx;
-
-            cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx);
-            cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx);
+            cu.m_mergeFlag[pu.puAbsPartIdx] = false;
+            cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);
+            cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);
+            cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;
+            cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;
+
+            cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);
+            cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);
 
             totalmebits += bestME[1].bits;
         }
 
-        prepMotionCompensation(cu, cuGeom, puIdx);
-        motionCompensation(*predYuv, true, bChromaSA8D);
+        motionCompensation(cu, pu, *predYuv, true, bChromaSA8D);
     }
 
     interMode.sa8dBits += totalmebits;
--- a/source/encoder/search.h	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/encoder/search.h	Wed Feb 25 10:05:40 2015 +0530
@@ -288,9 +288,10 @@ public:
         Search&       master;
         Mode&         mode;
         const CUGeom& cuGeom;
+        const PredictionUnit& pu;
         int           puIdx;
 
-        PME(Search& s, Mode& m, const CUGeom& g, int p) : master(s), mode(m), cuGeom(g), puIdx(p) {}
+        PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
 
         void processTasks(int workerThreadId);
 
@@ -300,7 +301,7 @@ public:
     };
 
     void     processPME(PME& pme, Search& slave);
-    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
+    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, const PredictionUnit& pu, int part, int list, int ref);
 
 protected:
 
@@ -347,21 +348,11 @@ protected:
     // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
     void     offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
 
+    /* output of mergeEstimation, best merge candidate */
     struct MergeData
     {
-        /* merge candidate data, cached between calls to mergeEstimation */
-        MVField  mvFieldNeighbours[MRG_MAX_NUM_CANDS][2];
-        uint8_t  interDirNeighbours[MRG_MAX_NUM_CANDS];
-        uint32_t maxNumMergeCand;
-
-        /* data updated for each partition */
-        uint32_t absPartIdx;
-        int      width;
-        int      height;
-
-        /* outputs */
         MVField  mvField[2];
-        uint32_t interDir;
+        uint32_t dir;
         uint32_t index;
         uint32_t bits;
     };
@@ -369,8 +360,8 @@ protected:
     /* inter/ME helper functions */
     void     checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const;
     void     setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const;
-    uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, int partIdx, MergeData& m);
-    static void getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]);
+    uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
+    static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
 
     /* intra helper functions */
     enum { MAX_RD_INTRA_MODES = 16 };
--- a/source/encoder/slicetype.cpp	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/encoder/slicetype.cpp	Wed Feb 25 10:05:40 2015 +0530
@@ -210,7 +210,7 @@ void LookaheadTLD::calcAdaptiveQuantFram
 void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
 {
     ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-    ALIGN_VAR_32(pixel, fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+    pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
     pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
 
     const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
@@ -224,6 +224,9 @@ void LookaheadTLD::lowresIntraEstimate(L
     pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
     pixelcmp_t satd = primitives.pu[sizeIdx].satd;
 
+    fenc.costEst[0][0] = 0;
+    fenc.costEstAq[0][0] = 0;
+
     for (int cuY = 0; cuY < heightInCU; cuY++)
     {
         fenc.rowSatds[0][0][cuY] = 0;
@@ -239,7 +242,7 @@ void LookaheadTLD::lowresIntraEstimate(L
 
             memcpy(neighbours[0], pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
             for (int i = 1; i < cuSize + 1; i++)
-                neighbours[0][i + cuSize2] = pixCur[-1 - fenc.lumaStride + i * fenc.lumaStride]; /* todo: fixme */
+                neighbours[0][i + cuSize2] = pixCur[-1 - fenc.lumaStride + i * fenc.lumaStride]; /* TODO: gcc warning */
 
             for (int i = 0; i < cuSize; i++)
             {
@@ -264,7 +267,7 @@ void LookaheadTLD::lowresIntraEstimate(L
             uint32_t ilowmode = 0;
 
             /* DC and planar */
-            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
+            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, cuSize <= 16);
             cost = satd(fencIntra, cuSize, prediction, cuSize);
             COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
 
@@ -306,8 +309,20 @@ void LookaheadTLD::lowresIntraEstimate(L
             fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
             fenc.intraCost[cuXY] = icost;
             fenc.intraMode[cuXY] = (uint8_t)ilowmode;
-            fenc.rowSatds[0][0][cuY] += icost;
-            fenc.costEst[0][0] += icost;
+
+            /* do not include edge blocks in the frame cost estimates; they are not very accurate */
+            const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
+                                        cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
+
+            int icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] + 128) >> 8) : icost;
+
+            if (bFrameScoreCU)
+            {
+                fenc.costEst[0][0] += icost;
+                fenc.costEstAq[0][0] += icostAq;
+            }
+
+            fenc.rowSatds[0][0][cuY] += icostAq;
         }
     }
 }
@@ -500,7 +515,7 @@ Lookahead::Lookahead(x265_param *param, 
      * do much unnecessary work, some frame cost estimates are not needed, so if
      * the thread pool is small we disable this feature after the initial burst
      * of work */
-    m_bBatchFrameCosts = m_bBatchMotionSearch;
+    m_bBatchFrameCosts = 0 && m_bBatchMotionSearch; /* temporarily disabled */
 
     if (m_bBatchMotionSearch && m_pool->m_numWorkers > 12)
     {
@@ -1635,10 +1650,10 @@ void Lookahead::estimateCUPropagate(Lowr
     if (!referenced)
         memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t));
 
-    int32_t StrideInCU = m_widthInCU;
+    int32_t strideInCU = m_widthInCU;
     for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++)
     {
-        int cuIndex = blocky * StrideInCU;
+        int cuIndex = blocky * strideInCU;
         primitives.propagateCost(m_scratch, propagateCost,
                                  frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                                  frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU);
@@ -1676,10 +1691,10 @@ void Lookahead::estimateCUPropagate(Lowr
                         int32_t y = mvs[list][cuIndex].y;
                         int32_t cux = (x >> 5) + blockx;
                         int32_t cuy = (y >> 5) + blocky;
-                        int32_t idx0 = cux + cuy * StrideInCU;
+                        int32_t idx0 = cux + cuy * strideInCU;
                         int32_t idx1 = idx0 + 1;
-                        int32_t idx2 = idx0 + StrideInCU;
-                        int32_t idx3 = idx0 + StrideInCU + 1;
+                        int32_t idx2 = idx0 + strideInCU;
+                        int32_t idx3 = idx0 + strideInCU + 1;
                         x &= 31;
                         y &= 31;
                         int32_t idx0weight = (32 - y) * (32 - x);
@@ -1781,7 +1796,7 @@ int64_t CostEstimateGroup::singleCost(in
     return estimateFrameCost(tld, p0, p1, b, intraPenalty);
 }
 
-void CostEstimateGroup::add(int p0, int p1, int b, bool intraPenalty)
+void CostEstimateGroup::add(int p0, int p1, int b)
 {
     X265_CHECK(m_batchMode || !m_jobTotal, "single CostEstimateGroup instance cannot mix batch modes\n");
     m_batchMode = true;
@@ -1790,7 +1805,6 @@ void CostEstimateGroup::add(int p0, int 
     e.p0 = p0;
     e.p1 = p1;
     e.b = b;
-    e.bIntraPenalty = intraPenalty;
 
     if (m_jobTotal == MAX_BATCH_SIZE)
         finishBatch();
@@ -1828,7 +1842,7 @@ void CostEstimateGroup::processTasks(int
             ProfileScopeEvent(estCostSingle);
             Estimate& e = m_estimates[i];
 
-            estimateFrameCost(tld, e.p0, e.p1, e.b, e.bIntraPenalty);
+            estimateFrameCost(tld, e.p0, e.p1, e.b, false);
         }
         else
         {
@@ -1888,6 +1902,7 @@ int64_t CostEstimateGroup::estimateFrame
              * going to need motion searches or bidir measurements */
 
             memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);
+            memset(fenc->rowSatds, 0, sizeof(fenc->rowSatds[0]) * m_lookahead.m_heightInCU);
 
             m_lock.acquire();
             X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
--- a/source/encoder/slicetype.h	Tue Feb 24 17:26:37 2015 +0530
+++ b/source/encoder/slicetype.h	Wed Feb 25 10:05:40 2015 +0530
@@ -208,10 +208,9 @@ public:
     struct Estimate
     {
         int  p0, b, p1;
-        bool bIntraPenalty;
     } m_estimates[MAX_BATCH_SIZE];
 
-    void add(int p0, int p1, int b, bool intraPenalty = false);
+    void add(int p0, int p1, int b);
     void finishBatch();
 
 protected: