changeset 9574:ec2cee938666

Merge with public
author Praveen Tiwari <praveen@multicorewareinc.com>
date Thu, 26 Feb 2015 10:34:07 +0530
parents e1b7ddbe1ecb (current diff) 0e89af0781ee (diff)
children 78460bc1d375
files source/common/quant.cpp source/common/x86/asm-primitives.cpp source/common/x86/const-a.asm source/common/x86/intrapred.h source/common/x86/intrapred8.asm source/encoder/entropy.cpp
diffstat 14 files changed, 205 insertions(+-), 151 deletions(-) [+]
line wrap: on
line diff
--- a/source/CMakeLists.txt	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/CMakeLists.txt	Thu Feb 26 10:34:07 2015 +0530
@@ -29,11 +29,6 @@ configure_file("${PROJECT_SOURCE_DIR}/x2
 
 SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
 
-option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF)
-if(CHECKED_BUILD)
-    add_definitions(-DCHECKED_BUILD=1)
-endif()
-
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
@@ -177,6 +172,25 @@ if(YASM_FOUND AND X86)
     endif()
 endif()
 
+option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF)
+if(CHECKED_BUILD)
+    add_definitions(-DCHECKED_BUILD=1) # compiled sources test this macro with #if CHECKED_BUILD
+    if(GCC) # probe each instrumentation flag; only the ones the compiler accepts are added below
+        check_cxx_compiler_flag(-fsanitize=address CC_HAS_FSANITIZE)              # clang and gcc
+        check_cxx_compiler_flag(-fsanitize=undefined-trap CC_HAS_CATCH_UNDEFINED) # clang
+        check_cxx_compiler_flag(-ftrapv CC_HAS_FTRAPV)                            # gcc
+        if(CC_HAS_FSANITIZE)
+            add_definitions(-fsanitize=address) # NOTE(review): compile flag only; ASan normally must also be passed when linking - confirm
+        endif()
+        if(CC_HAS_FTRAPV)
+            add_definitions(-ftrapv) # trap on signed overflow
+        endif()
+        if(CC_HAS_CATCH_UNDEFINED)
+            add_definitions(-fsanitize=undefined-trap -fsanitize-undefined-trap-on-error)
+        endif()
+    endif()
+endif()
+
 # Build options
 set(LIB_INSTALL_DIR lib CACHE STRING "Install location of libraries")
 set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables")
--- a/source/common/bitstream.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/bitstream.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -44,7 +44,7 @@ void Bitstream::push_back(uint8_t val)
 void Bitstream::write(uint32_t val, uint32_t numBits)
 {
     X265_CHECK(numBits <= 32, "numBits out of range\n");
-    X265_CHECK(numBits == 32 || ((val & (~0 << numBits)) == 0), "numBits & val out of range\n");
+    X265_CHECK(numBits == 32 || ((val & (~0u << numBits)) == 0), "numBits & val out of range\n");
 
     uint32_t totalPartialBits = m_partialByteBits + numBits;
     uint32_t nextPartialBits = totalPartialBits & 7;
--- a/source/common/cudata.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/cudata.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -38,7 +38,7 @@ namespace {
 void bcast1(uint8_t* dst, uint8_t val)  { dst[0] = val; }
 
 void copy4(uint8_t* dst, uint8_t* src)  { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; }
-void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101 * val; }
+void bcast4(uint8_t* dst, uint8_t val)  { ((uint32_t*)dst)[0] = 0x01010101u * val; }
 
 void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; }
 void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; }
@@ -1426,9 +1426,7 @@ uint32_t CUData::getInterMergeCandidates
         if (isInterB)
             cuLeft->getMvField(cuLeft, leftPartIdx, 1, candMvField[count][1]);
 
-        count++;
-    
-        if (count == maxNumMergeCand)
+        if (++count == maxNumMergeCand)
             return maxNumMergeCand;
     }
 
@@ -1450,9 +1448,7 @@ uint32_t CUData::getInterMergeCandidates
         if (isInterB)
             cuAbove->getMvField(cuAbove, abovePartIdx, 1, candMvField[count][1]);
 
-        count++;
-   
-        if (count == maxNumMergeCand)
+        if (++count == maxNumMergeCand)
             return maxNumMergeCand;
     }
 
@@ -1471,9 +1467,7 @@ uint32_t CUData::getInterMergeCandidates
         if (isInterB)
             cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, candMvField[count][1]);
 
-        count++;
-
-        if (count == maxNumMergeCand)
+        if (++count == maxNumMergeCand)
             return maxNumMergeCand;
     }
 
@@ -1492,9 +1486,7 @@ uint32_t CUData::getInterMergeCandidates
         if (isInterB)
             cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, candMvField[count][1]);
 
-        count++;
-
-        if (count == maxNumMergeCand)
+        if (++count == maxNumMergeCand)
             return maxNumMergeCand;
     }
 
@@ -1516,9 +1508,7 @@ uint32_t CUData::getInterMergeCandidates
             if (isInterB)
                 cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, candMvField[count][1]);
 
-            count++;
-
-            if (count == maxNumMergeCand)
+            if (++count == maxNumMergeCand)
                 return maxNumMergeCand;
         }
     }
@@ -1553,31 +1543,21 @@ uint32_t CUData::getInterMergeCandidates
                 absPartAddr = 0;
         }
 
-        int refIdx = 0;
-        uint32_t partIdxCenter = deriveCenterIdx(puIdx);
-        uint32_t curCTUIdx = m_cuAddr;
-        int dir = 0;
-        bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, 0, ctuIdx, absPartAddr);
-        if (!bExistMV)
-            bExistMV = getColMVP(colmv, refIdx, 0, curCTUIdx, partIdxCenter);
-        if (bExistMV)
+        int maxList = isInterB ? 2 : 1;
+        int dir = 0, refIdx = 0;
+        for (int list = 0; list < maxList; list++)
         {
-            dir |= 1;
-            candMvField[count][0].mv = colmv;
-            candMvField[count][0].refIdx = refIdx;
-        }
-
-        if (isInterB)
-        {
-            bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, 1, ctuIdx, absPartAddr);
+            bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, list, ctuIdx, absPartAddr);
             if (!bExistMV)
-                bExistMV = getColMVP(colmv, refIdx, 1, curCTUIdx, partIdxCenter);
-
+            {
+                uint32_t partIdxCenter = deriveCenterIdx(puIdx);
+                bExistMV = getColMVP(colmv, refIdx, list, m_cuAddr, partIdxCenter);
+            }
             if (bExistMV)
             {
-                dir |= 2;
-                candMvField[count][1].mv = colmv;
-                candMvField[count][1].refIdx = refIdx;
+                dir |= (1 << list);
+                candMvField[count][list].mv = colmv;
+                candMvField[count][list].refIdx = refIdx;
             }
         }
 
@@ -1585,9 +1565,7 @@ uint32_t CUData::getInterMergeCandidates
         {
             candDir[count] = (uint8_t)dir;
 
-            count++;
-        
-            if (count == maxNumMergeCand)
+            if (++count == maxNumMergeCand)
                 return maxNumMergeCand;
         }
     }
@@ -1598,12 +1576,10 @@ uint32_t CUData::getInterMergeCandidates
         uint32_t priorityList0 = 0xEDC984; // { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }
         uint32_t priorityList1 = 0xB73621; // { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }
 
-        for (uint32_t idx = 0; idx < cutoff; idx++)
+        for (uint32_t idx = 0; idx < cutoff; idx++, priorityList0 >>= 2, priorityList1 >>= 2)
         {
             int i = priorityList0 & 3;
             int j = priorityList1 & 3;
-            priorityList0 >>= 2;
-            priorityList1 >>= 2;
 
             if ((candDir[i] & 0x1) && (candDir[j] & 0x2))
             {
@@ -1620,9 +1596,7 @@ uint32_t CUData::getInterMergeCandidates
                     candMvField[count][1].refIdx = refIdxL1;
                     candDir[count] = 3;
 
-                    count++;
-
-                    if (count == maxNumMergeCand)
+                    if (++count == maxNumMergeCand)
                         return maxNumMergeCand;
                 }
             }
@@ -1658,18 +1632,6 @@ uint32_t CUData::getInterMergeCandidates
     return count;
 }
 
-/* Check whether the current PU and a spatial neighboring PU are in a same ME region */
-bool CUData::isDiffMER(int xN, int yN, int xP, int yP) const
-{
-    uint32_t plevel = 2;
-
-    if ((xN >> plevel) != (xP >> plevel))
-        return true;
-    if ((yN >> plevel) != (yP >> plevel))
-        return true;
-    return false;
-}
-
 /* Constructs a list of candidates for AMVP, and a larger list of motion candidates */
 int CUData::fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const
 {
@@ -1804,16 +1766,17 @@ int CUData::fillMvpCand(uint32_t puIdx, 
 
 void CUData::clipMv(MV& outMV) const
 {
-    int mvshift = 2;
-    int offset = 8;
-    int xmax = (m_slice->m_sps->picWidthInLumaSamples + offset - m_cuPelX - 1) << mvshift;
-    int xmin = (-(int)g_maxCUSize - offset - (int)m_cuPelX + 1) << mvshift;
+    const uint32_t mvshift = 2; /* left shift applied to the pixel-domain bounds below */
+    const uint32_t offset = 8;  /* extra margin added beyond the picture / CTU edge */
 
-    int ymax = (m_slice->m_sps->picHeightInLumaSamples + offset - m_cuPelY - 1) << mvshift;
-    int ymin = (-(int)g_maxCUSize - offset - (int)m_cuPelY + 1) << mvshift;
+    int32_t xmax = (int32_t)((m_slice->m_sps->picWidthInLumaSamples + offset - m_cuPelX - 1) << mvshift);
+    int32_t xmin = -(int32_t)((g_maxCUSize + offset + m_cuPelX - 1) << mvshift);
 
-    outMV.x = (int16_t)X265_MIN(xmax, X265_MAX(xmin, (int)outMV.x));
-    outMV.y = (int16_t)X265_MIN(ymax, X265_MAX(ymin, (int)outMV.y));
+    int32_t ymax = (int32_t)((m_slice->m_sps->picHeightInLumaSamples + offset - m_cuPelY - 1) << mvshift);
+    int32_t ymin = -(int32_t)((g_maxCUSize + offset + m_cuPelY - 1) << mvshift);
+    /* bounds stay 32-bit: they pass INT16_MAX once size + offset reaches 8192; only the clamped MV is narrowed */
+    outMV.x = (int16_t)X265_MIN(xmax, X265_MAX(xmin, (int32_t)outMV.x));
+    outMV.y = (int16_t)X265_MIN(ymax, X265_MAX(ymin, (int32_t)outMV.y));
 }
 
 bool CUData::addMVPCand(MV& mvp, int picList, int refIdx, uint32_t partUnitIdx, MVP_DIR dir) const
@@ -2006,60 +1969,45 @@ uint32_t CUData::deriveCenterIdx(uint32_
                            + (puWidth  >> (LOG2_UNIT_SIZE + 1))];
 }
 
-ScanType CUData::getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const
-{
-    uint32_t dirMode;
-
-    if (!bIsIntra)
-        return SCAN_DIAG;
-
-    // check that MDCS can be used for this TU
-    if (bIsLuma)
-    {
-        if (log2TrSize > MDCS_LOG2_MAX_SIZE)
-            return SCAN_DIAG;
-
-        dirMode = m_lumaIntraDir[absPartIdx];
-    }
-    else
-    {
-        if (log2TrSize > (uint32_t)(MDCS_LOG2_MAX_SIZE - m_hChromaShift))
-            return SCAN_DIAG;
-
-        dirMode = m_chromaIntraDir[absPartIdx];
-        if (dirMode == DM_CHROMA_IDX)
-        {
-            dirMode = m_lumaIntraDir[(m_chromaFormat == X265_CSP_I444) ? absPartIdx : absPartIdx & 0xFC];
-            dirMode = (m_chromaFormat == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[dirMode] : dirMode;
-        }
-    }
-
-    if (abs((int)dirMode - VER_IDX) <= MDCS_ANGLE_LIMIT)
-        return SCAN_HOR;
-    else if (abs((int)dirMode - HOR_IDX) <= MDCS_ANGLE_LIMIT)
-        return SCAN_VER;
-    else
-        return SCAN_DIAG;
-}
-
 void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const
 {
+    bool bIsIntra = isIntra(absPartIdx);
+
     // set the group layout
     result.log2TrSizeCG = log2TrSize - 2;
 
     // set the scan orders
-    result.scanType = getCoefScanIdx(absPartIdx, log2TrSize, bIsLuma, isIntra(absPartIdx));
+    if (bIsIntra)
+    {
+        uint32_t dirMode;
+
+        if (bIsLuma)
+            dirMode = m_lumaIntraDir[absPartIdx];
+        else
+        {
+            dirMode = m_chromaIntraDir[absPartIdx];
+            if (dirMode == DM_CHROMA_IDX) // chroma takes its direction from the luma direction array
+            {
+                dirMode = m_lumaIntraDir[(m_chromaFormat == X265_CSP_I444) ? absPartIdx : absPartIdx & 0xFC];
+                dirMode = (m_chromaFormat == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[dirMode] : dirMode;
+            }
+        }
+        // non-diagonal scans only for small TUs; 22..30 and 6..14 are presumably VER_IDX/HOR_IDX +/- MDCS_ANGLE_LIMIT (confirm)
+        if (log2TrSize <= (MDCS_LOG2_MAX_SIZE - m_hChromaShift) || (bIsLuma && log2TrSize == MDCS_LOG2_MAX_SIZE))
+            result.scanType = dirMode >= 22 && dirMode <= 30 ? SCAN_HOR : dirMode >= 6 && dirMode <= 14 ? SCAN_VER : SCAN_DIAG;
+        else
+            result.scanType = SCAN_DIAG;
+    }
+    else
+        result.scanType = SCAN_DIAG; // non-intra partitions always use the diagonal scan
+
     result.scan     = g_scanOrder[result.scanType][log2TrSize - 2];
     result.scanCG   = g_scanOrderCG[result.scanType][result.log2TrSizeCG];
 
     if (log2TrSize == 2)
         result.firstSignificanceMapContext = 0;
     else if (log2TrSize == 3)
-    {
-        result.firstSignificanceMapContext = 9;
-        if (result.scanType != SCAN_DIAG && bIsLuma)
-            result.firstSignificanceMapContext += 6;
-    }
+        result.firstSignificanceMapContext = (result.scanType != SCAN_DIAG && bIsLuma) ? 15 : 9; // 15 = base 9 + 6 for non-diagonal luma
     else
         result.firstSignificanceMapContext = bIsLuma ? 21 : 12;
 }
--- a/source/common/cudata.h	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/cudata.h	Thu Feb 26 10:34:07 2015 +0530
@@ -122,9 +122,9 @@ public:
     uint32_t      m_cuPelY;           // CU position within the picture, in pixels (Y)
     uint32_t      m_numPartitions;    // maximum number of 4x4 partitions within this CU
 
-    int           m_chromaFormat;
-    int           m_hChromaShift;
-    int           m_vChromaShift;
+    uint32_t      m_chromaFormat;
+    uint32_t      m_hChromaShift;
+    uint32_t      m_vChromaShift;
 
     /* Per-part data, stored contiguously */
     int8_t*       m_qp;               // array of QP values
@@ -216,7 +216,6 @@ public:
     uint32_t getSCUAddr() const                  { return (m_cuAddr << g_unitSizeDepth * 2) + m_absIdxInCTU; }
     uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
     uint32_t getCtxSkipFlag(uint32_t absPartIdx) const;
-    ScanType getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const;
     void     getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const;
 
     const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const;
@@ -241,7 +240,8 @@ protected:
 
     bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const;
 
-    bool isDiffMER(int xN, int yN, int xP, int yP) const;
+    /* Returns true when (xN, yN) and (xP, yP) lie in different merge regions, i.e. either coordinate differs after >> 2 */
+    bool isDiffMER(int xN, int yN, int xP, int yP) const { return ((xN >> 2) != (xP >> 2)) || ((yN >> 2) != (yP >> 2)); }
 
     // add possible motion vector predictor candidates
     bool addMVPCand(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const;
--- a/source/common/deblock.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/deblock.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -358,7 +358,7 @@ static inline void pelFilterChroma(pixel
         int16_t m5  = (int16_t)src[offset];
         int16_t m2  = (int16_t)src[-offset * 2];
 
-        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3));
+        int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
         src[-offset] = x265_clip(m3 + (delta & maskP));
         src[0] = x265_clip(m4 - (delta & maskQ));
     }
--- a/source/common/mv.h	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/mv.h	Thu Feb 26 10:34:07 2015 +0530
@@ -56,12 +56,17 @@ public:
 
     MV& operator >>=(int i)                    { x >>= i; y >>= i; return *this; }
 
+#if CHECKED_BUILD
+    /* checked builds: scale by (1 << i) rather than left-shifting the signed components (shifting a negative value is undefined before C++20) */
+    MV& operator <<=(int i)                    { x *= (1 << i); y *= (1 << i); return *this; }
+    MV operator <<(int i) const                { return MV(x * (1 << i), y * (1 << i)); }
+#else
     MV& operator <<=(int i)                    { x <<= i; y <<= i; return *this; }
+    MV operator <<(int i) const                { return MV(x << i, y << i); }
+#endif
 
     MV operator >>(int i) const                { return MV(x >> i, y >> i); }
 
-    MV operator <<(int i) const                { return MV(x << i, y << i); }
-
     MV operator *(int16_t i) const             { return MV(x * i, y * i); }
 
     MV operator -(const MV& other) const       { return MV(x - other.x, y - other.y); }
--- a/source/common/quant.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/quant.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -1100,7 +1100,8 @@ uint32_t Quant::calcPatternSigCtx(uint64
 
     const uint32_t trSizeCG = 1 << log2TrSizeCG;
     X265_CHECK(trSizeCG <= 8, "transform CG is too large\n");
-    const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY << log2TrSizeCG) + cgPosX));
+    const uint32_t shift = (cgPosY << log2TrSizeCG) + cgPosX + 1;
+    const uint32_t sigPos = (uint32_t)(shift >= 64 ? 0 : sigCoeffGroupFlag64 >> shift);
     const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1);
     const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2;
 
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -1303,6 +1303,8 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2);
         ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
 
+        p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2;
+
         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
 
--- a/source/common/x86/const-a.asm	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/x86/const-a.asm	Thu Feb 26 10:34:07 2015 +0530
@@ -37,6 +37,7 @@ const pw_16,       times 16 dw 16
 const pw_32,       times 16 dw 32
 const pw_128,      times 16 dw 128
 const pw_256,      times 16 dw 256
+const pw_257,      times 16 dw 257
 const pw_512,      times 16 dw 512
 const pw_1023,     times 8  dw 1023
 const pw_1024,     times 16 dw 1024
--- a/source/common/x86/intrapred.h	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/x86/intrapred.h	Thu Feb 26 10:34:07 2015 +0530
@@ -26,6 +26,7 @@
 #ifndef X265_INTRAPRED_H
 #define X265_INTRAPRED_H
 
+void x265_intra_pred_dc4_sse2 (pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
 void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
--- a/source/common/x86/intrapred8.asm	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Thu Feb 26 10:34:07 2015 +0530
@@ -97,6 +97,7 @@ cextern pw_4
 cextern pw_8
 cextern pw_16
 cextern pw_32
+cextern pw_257
 cextern pw_1024
 cextern pb_unpackbd1
 cextern multiL
@@ -108,6 +109,80 @@ cextern multi_2Row
 ;---------------------------------------------------------------------------------------------
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 ;---------------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal intra_pred_dc4, 5,5,3
+    inc         r2                  ; r2 = srcPix + 1
+    pxor        m0, m0              ; m0 = 0
+    movu        m1, [r2]            ; m1 = 16 bytes starting at srcPix[1]
+    pshufd      m1, m1, 0xF8        ; low qword = bytes [0..3] and [8..11] of the load
+    psadbw      m1, m0              ; m1 = sum
+
+    test        r4d, r4d            ; ZF = (bFilter == 0); read by jz below, nothing in between writes EFLAGS
+
+    paddw       m1, [pw_4]
+    psraw       m1, 3               ; (sum + 4) >> 3
+    movd        r4d, m1             ; r4d = dc_val
+    pmullw      m1, [pw_257]        ; dc * 257 puts dc in both bytes of the word
+    pshuflw     m1, m1, 0x00        ; broadcast word 0 across the low four words
+
+    ; store DC 4x4
+    lea         r3, [r1 * 3]        ; r3 = 3 * dstStride
+    movd        [r0], m1
+    movd        [r0 + r1], m1
+    movd        [r0 + r1 * 2], m1
+    movd        [r0 + r3], m1
+
+    ; do DC filter
+    jz         .end
+    lea         r3d, [r4d * 2 + 2]  ; r3d = DC * 2 + 2
+    add         r4d, r3d            ; r4d = DC * 3 + 2
+    movd        m1, r4d
+    pshuflw     m1, m1, 0           ; m1 = pixDCx3
+
+    ; filter top
+    movd        m2, [r2]            ; four bytes at srcPix[1..4]
+    punpcklbw   m2, m0              ; bytes -> words
+    paddw       m2, m1
+    psraw       m2, 2               ; (pix + DC * 3 + 2) >> 2
+    packuswb    m2, m2
+    movd        [r0], m2            ; overwrite top-left pixel, we will update it later
+
+    ; filter top-left
+    movzx       r4d, byte [r2 + 8]  ; srcPix[9]
+    add         r3d, r4d
+    movzx       r4d, byte [r2]      ; srcPix[1]
+    add         r3d, r4d
+    shr         r3d, 2              ; (srcPix[9] + srcPix[1] + DC * 2 + 2) >> 2
+    mov         [r0], r3b
+
+    ; filter left
+    add         r0, r1              ; advance to row 1; row 0 was written above
+    movq        m2, [r2 + 9]        ; eight bytes from srcPix[10]; only the first three results are stored
+    punpcklbw   m2, m0
+    paddw       m2, m1
+    psraw       m2, 2
+    packuswb    m2, m2
+%if ARCH_X86_64
+    movq        r4, m2
+    mov         [r0], r4b           ; row 1, column 0
+    shr         r4, 8
+    mov         [r0 + r1], r4b      ; row 2, column 0
+    shr         r4, 8
+    mov         [r0 + r1 * 2], r4b  ; row 3, column 0
+%else
+    movd        r2d, m2             ; r2 is reused as scratch here; presumably r4 has no byte register on x86-32 (confirm)
+    mov         [r0], r2b
+    shr         r2, 8
+    mov         [r0 + r1], r2b
+    shr         r2, 8
+    mov         [r0 + r1 * 2], r2b
+%endif
+.end:
+    RET
+
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
 INIT_XMM sse4
 cglobal intra_pred_dc4, 5,5,3
     inc         r2
--- a/source/encoder/entropy.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/encoder/entropy.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -540,15 +540,14 @@ void Entropy::encodeCU(const CUData& ctu
 {
     const Slice* slice = ctu.m_slice;
 
-    if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
-        bEncodeDQP = true;
-
     int cuSplitFlag = !(cuGeom.flags & CUGeom::LEAF);
     int cuUnsplitFlag = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
 
     if (!cuUnsplitFlag)
     {
         uint32_t qNumParts = cuGeom.numPartitions >> 2;
+        if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
+            bEncodeDQP = true;
         for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
         {
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
@@ -564,6 +563,8 @@ void Entropy::encodeCU(const CUData& ctu
     if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth)
     {
         uint32_t qNumParts = cuGeom.numPartitions >> 2;
+        if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
+            bEncodeDQP = true;
         for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts)
         {
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx);
@@ -572,6 +573,9 @@ void Entropy::encodeCU(const CUData& ctu
         return;
     }
 
+    if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP)
+        bEncodeDQP = true;
+
     if (slice->m_pps->bTransquantBypassEnabled)
         codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]);
 
--- a/source/encoder/slicetype.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/encoder/slicetype.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -515,7 +515,7 @@ Lookahead::Lookahead(x265_param *param, 
      * do much unnecessary work, some frame cost estimates are not needed, so if
      * the thread pool is small we disable this feature after the initial burst
      * of work */
-    m_bBatchFrameCosts = 0 && m_bBatchMotionSearch; /* temporarily disabled */
+    m_bBatchFrameCosts = m_bBatchMotionSearch;
 
     if (m_bBatchMotionSearch && m_pool->m_numWorkers > 12)
     {
@@ -1049,7 +1049,6 @@ void Lookahead::slicetypeDecide()
             m_outputQueue.pushBack(*list[i]);
         }
     }
-    m_outputLock.release();
 
     bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead;
     if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
@@ -1068,6 +1067,7 @@ void Lookahead::slicetypeDecide()
         frames[j + 1] = NULL;
         slicetypeAnalyse(frames, true);
     }
+    m_outputLock.release();
 }
 
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
@@ -1636,8 +1636,8 @@ void Lookahead::estimateCUPropagate(Lowr
     uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
     int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
     int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
-    MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] };
     int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight };
+    int listDist[2] = { b - p0 - 1, p1 - b - 1 };
 
     memset(m_scratch, 0, m_widthInCU * sizeof(int));
 
@@ -1680,15 +1680,17 @@ void Lookahead::estimateCUPropagate(Lowr
                         if (lists_used == 3)
                             listamount = (listamount * bipredWeights[list] + 32) >> 6;
 
+                        MV *mvs = frames[b]->lowresMvs[list][listDist[list]];
+
                         /* Early termination for simple case of mv0. */
-                        if (!mvs[list][cuIndex].word)
+                        if (!mvs[cuIndex].word)
                         {
                             CLIP_ADD(refCosts[list][cuIndex], listamount);
                             continue;
                         }
 
-                        int32_t x = mvs[list][cuIndex].x;
-                        int32_t y = mvs[list][cuIndex].y;
+                        int32_t x = mvs[cuIndex].x;
+                        int32_t y = mvs[cuIndex].y;
                         int32_t cux = (x >> 5) + blockx;
                         int32_t cuy = (y >> 5) + blocky;
                         int32_t idx0 = cux + cuy * strideInCU;
@@ -1978,11 +1980,7 @@ void CostEstimateGroup::estimateCUCost(L
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;
-
-    MV(*fencMVs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
-                        &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
-    int(*fencCosts[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
-                           &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
+    int listDist[2] = { b - p0 - 1, p1 - b - 1 };
 
     MV mvmin, mvmax;
     int bcost = tld.me.COST_MAX;
@@ -1996,15 +1994,18 @@ void CostEstimateGroup::estimateCUCost(L
 
     for (int i = 0; i < 1 + bBidir; i++)
     {
+        int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
+
         if (!bDoSearch[i])
         {
-            COPY2_IF_LT(bcost, *fencCosts[i], listused, i + 1);
+            COPY2_IF_LT(bcost, fencCost, listused, i + 1);
             continue;
         }
 
         int numc = 0;
         MV mvc[4], mvp;
-        MV *fencMV = fencMVs[i];
+
+        MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];
 
         /* Reverse-order MV prediction */
         mvc[0] = 0;
@@ -2026,8 +2027,8 @@ void CostEstimateGroup::estimateCUCost(L
         else
             median_mv(mvp, mvc[0], mvc[1], mvc[2]);
 
-        *fencCosts[i] = tld.me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, s_merange, *fencMVs[i]);
-        COPY2_IF_LT(bcost, *fencCosts[i], listused, i + 1);
+        fencCost = tld.me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, s_merange, *fencMV);
+        COPY2_IF_LT(bcost, fencCost, listused, i + 1);
     }
 
     if (bBidir) /* B, also consider bidir */
@@ -2038,8 +2039,8 @@ void CostEstimateGroup::estimateCUCost(L
         ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
-        pixel *src0 = fref0->lowresMC(pelOffset, *fencMVs[0], subpelbuf0, stride0);
-        pixel *src1 = fref1->lowresMC(pelOffset, *fencMVs[1], subpelbuf1, stride1);
+        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
+        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
 
         ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
--- a/source/encoder/weightPrediction.cpp	Wed Feb 25 16:25:43 2015 +0530
+++ b/source/encoder/weightPrediction.cpp	Thu Feb 26 10:34:07 2015 +0530
@@ -58,6 +58,7 @@ int sliceHeaderCost(WeightParam *w, int 
 void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
 {
     intptr_t stride = ref.lumaStride;
+    const int mvshift = 1 << 2;
     const int cuSize = 8;
     MV mvmin, mvmax;
 
@@ -66,15 +67,15 @@ void mcLuma(pixel* mcout, Lowres& ref, c
     for (int y = 0; y < ref.lines; y += cuSize)
     {
         intptr_t pixoff = y * stride;
-        mvmin.y = (int16_t)((-y - 8) << 2);
-        mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
+        mvmin.y = (int16_t)((-y - 8) * mvshift);
+        mvmax.y = (int16_t)((ref.lines - y - 1 + 8) * mvshift);
 
         for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
         {
             ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
             intptr_t bstride = 8;
-            mvmin.x = (int16_t)((-x - 8) << 2);
-            mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
+            mvmin.x = (int16_t)((-x - 8) * mvshift);
+            mvmax.x = (int16_t)((ref.width - x - 1 + 8) * mvshift);
 
             /* clip MV to available pixels */
             MV mv = mvs[cu];
@@ -100,6 +101,7 @@ void mcChroma(pixel *      mcout,
     int csp = cache.csp;
     int bw = 16 >> cache.hshift;
     int bh = 16 >> cache.vshift;
+    const int mvshift = 1 << 2;
     MV mvmin, mvmax;
 
     for (int y = 0; y < height; y += bh)
@@ -109,8 +111,8 @@ void mcChroma(pixel *      mcout,
          * into the lowres structures */
         int cu = y * cache.lowresWidthInCU;
         intptr_t pixoff = y * stride;
-        mvmin.y = (int16_t)((-y - 8) << 2);
-        mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
+        mvmin.y = (int16_t)((-y - 8) * mvshift);
+        mvmax.y = (int16_t)((height - y - 1 + 8) * mvshift);
 
         for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
         {
@@ -122,8 +124,8 @@ void mcChroma(pixel *      mcout,
                 mv.y >>= cache.vshift;
 
                 /* clip MV to available pixels */
-                mvmin.x = (int16_t)((-x - 8) << 2);
-                mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
+                mvmin.x = (int16_t)((-x - 8) * mvshift);
+                mvmax.x = (int16_t)((width - x - 1 + 8) * mvshift);
                 mv = mv.clipped(mvmin, mvmax);
 
                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);