changeset 9550:dbce8036e0c4 draft

Merge with public
author Praveen Tiwari <praveen@multicorewareinc.com>
date Thu, 19 Feb 2015 10:01:48 +0530
parents 998c5d235ee7 (current diff) 039ea966d5eb (diff)
children 46de85c1be4d
files source/encoder/entropy.cpp
diffstat 21 files changed, 157 insertions(+-), 98 deletions(-) [+]
line wrap: on
line diff
--- a/doc/reST/cli.rst	Wed Feb 18 17:03:02 2015 +0530
+++ b/doc/reST/cli.rst	Thu Feb 19 10:01:48 2015 +0530
@@ -477,6 +477,23 @@ the prediction quad-tree.
 	and less frame parallelism as well. Because of this the faster
 	presets use a CU size of 32. Default: 64
 
+.. option:: --min-cu-size <64|32|16|8>
+
+	Minimum CU size (width and height). By using 16 or 32 the encoder
+	will not analyze the cost of CUs below that minimum threshold,
+	saving considerable amounts of compute with a predictable increase
+	in bitrate. This setting has a large effect on performance on the
+	faster presets.
+
+	Default: 8 (minimum 8x8 CU for HEVC, best compression efficiency)
+
+.. note::
+
+	All encoders within a single process must use the same settings for
+	the CU size range. :option:`--ctu` and :option:`--min-cu-size` must
+	be consistent for all of them since the encoder configures several
+	key global data structures based on this range.
+
 .. option:: --rect, --no-rect
 
 	Enable analysis of rectangular motion partitions Nx2N and 2NxN
--- a/source/CMakeLists.txt	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/CMakeLists.txt	Thu Feb 19 10:01:48 2015 +0530
@@ -21,7 +21,7 @@ include(CheckSymbolExists)
 include(CheckCXXCompilerFlag)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 45)
+set(X265_BUILD 46)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
--- a/source/common/common.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/common.h	Thu Feb 19 10:01:48 2015 +0530
@@ -258,7 +258,7 @@ typedef int16_t  coeff_t;      // transf
 #define UNIT_SIZE               (1 << LOG2_UNIT_SIZE)       // unit size of CU partition
 
 #define MAX_NUM_PARTITIONS      256
-#define NUM_CU_PARTITIONS       (1U << (g_maxFullDepth << 1))
+#define NUM_4x4_PARTITIONS      (1U << (g_unitSizeDepth << 1)) // number of 4x4 units in max CU size
 
 #define MIN_PU_SIZE             4
 #define MIN_TU_SIZE             4
--- a/source/common/constants.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/constants.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -121,7 +121,7 @@ const uint16_t x265_chroma_lambda2_offse
 
 uint32_t g_maxLog2CUSize = MAX_LOG2_CU_SIZE;
 uint32_t g_maxCUSize     = MAX_CU_SIZE;
-uint32_t g_maxFullDepth  = NUM_FULL_DEPTH - 1;
+uint32_t g_unitSizeDepth = NUM_CU_DEPTH;
 uint32_t g_maxCUDepth    = NUM_CU_DEPTH - 1;
 uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS] = { 0, };
 uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS] = { 0, };
--- a/source/common/constants.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/constants.h	Thu Feb 19 10:01:48 2015 +0530
@@ -55,7 +55,7 @@ extern const uint8_t g_log2Size[MAX_CU_S
 extern uint32_t g_maxLog2CUSize;
 extern uint32_t g_maxCUSize;
 extern uint32_t g_maxCUDepth;
-extern uint32_t g_maxFullDepth;
+extern uint32_t g_unitSizeDepth; // Depth at which 4x4 unit occurs from max CU size
 
 extern const int16_t g_t4[4][4];
 extern const int16_t g_t8[8][8];
--- a/source/common/cudata.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/cudata.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -159,11 +159,11 @@ void CUData::initialize(const CUDataMemP
     m_chromaFormat  = csp;
     m_hChromaShift  = CHROMA_H_SHIFT(csp);
     m_vChromaShift  = CHROMA_V_SHIFT(csp);
-    m_numPartitions = NUM_CU_PARTITIONS >> (depth * 2);
+    m_numPartitions = NUM_4x4_PARTITIONS >> (depth * 2);
 
     if (!s_partSet[0])
     {
-        s_numPartInCUSize = 1 << g_maxFullDepth;
+        s_numPartInCUSize = 1 << g_unitSizeDepth;
         switch (g_maxLog2CUSize)
         {
         case 6:
@@ -272,7 +272,7 @@ void CUData::initCTU(const Frame& frame,
     m_cuPelX        = (cuAddr % m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize;
     m_cuPelY        = (cuAddr / m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize;
     m_absIdxInCTU   = 0;
-    m_numPartitions = NUM_CU_PARTITIONS;
+    m_numPartitions = NUM_4x4_PARTITIONS;
 
     /* sequential memsets */
     m_partSet((uint8_t*)m_qp, (uint8_t)qp);
@@ -559,7 +559,7 @@ const CUData* CUData::getPUAbove(uint32_
         return this;
     }
 
-    aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize];
+    aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize];
     return m_cuAbove;
 }
 
@@ -581,7 +581,7 @@ const CUData* CUData::getPUAboveLeft(uin
                 return this;
             }
         }
-        alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize - 1];
+        alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize - 1];
         return m_cuAbove;
     }
 
@@ -591,7 +591,7 @@ const CUData* CUData::getPUAboveLeft(uin
         return m_cuLeft;
     }
 
-    alPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - 1];
+    alPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - 1];
     return m_cuAboveLeft;
 }
 
@@ -620,14 +620,14 @@ const CUData* CUData::getPUAboveRight(ui
             }
             return NULL;
         }
-        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + 1];
+        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_4x4_PARTITIONS - s_numPartInCUSize + 1];
         return m_cuAbove;
     }
 
     if (!isZeroRow(absPartIdxRT, s_numPartInCUSize))
         return NULL;
 
-    arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize];
+    arPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - s_numPartInCUSize];
     return m_cuAboveRight;
 }
 
@@ -720,21 +720,21 @@ const CUData* CUData::getPUAboveRightAdi
             }
             return NULL;
         }
-        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset];
+        arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_4x4_PARTITIONS - s_numPartInCUSize + partUnitOffset];
         return m_cuAbove;
     }
 
     if (!isZeroRow(absPartIdxRT, s_numPartInCUSize))
         return NULL;
 
-    arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
+    arPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1];
     return m_cuAboveRight;
 }
 
 /* Get left QpMinCu */
 const CUData* CUData::getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t curAbsIdxInCTU) const
 {
-    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
+    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
     uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx];
 
     // check for left CTU boundary
@@ -751,7 +751,7 @@ const CUData* CUData::getQpMinCuLeft(uin
 /* Get above QpMinCu */
 const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdxInCTU) const
 {
-    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
+    uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2);
     uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx];
 
     // check for top CTU boundary
@@ -790,7 +790,7 @@ int CUData::getLastValidPartIdx(int absP
 
 int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const
 {
-    uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
+    uint32_t quPartIdxMask = 0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2;
     int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask);
 
     if (lastValidPartIdx >= 0)
@@ -800,7 +800,7 @@ int8_t CUData::getLastCodedQP(uint32_t a
         if (m_absIdxInCTU)
             return m_encData->getPicCTU(m_cuAddr)->getLastCodedQP(m_absIdxInCTU);
         else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth)))
-            return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS);
+            return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_4x4_PARTITIONS);
         else
             return (int8_t)m_slice->m_sliceQp;
     }
@@ -932,7 +932,7 @@ uint32_t CUData::getCtxSkipFlag(uint32_t
 
 bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth)
 {
-    uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1);
+    uint32_t curPartNumb = NUM_4x4_PARTITIONS >> (depth << 1);
     uint32_t curPartNumQ = curPartNumb >> 2;
 
     if (m_cuDepth[absPartIdx] > depth)
@@ -2066,14 +2066,14 @@ void CUData::getTUEntropyCodingParameter
 
 #define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag))
 
-void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS])
+void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS])
 {
     // Initialize the coding blocks inside the CTB
-    for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--)
+    for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= g_log2Size[minCUSize]; log2CUSize--)
     {
         uint32_t blockSize = 1 << log2CUSize;
         uint32_t sbWidth   = 1 << (g_log2Size[maxCUSize] - log2CUSize);
-        int32_t lastLevelFlag = log2CUSize == MIN_LOG2_CU_SIZE;
+        int32_t lastLevelFlag = log2CUSize == g_log2Size[minCUSize];
         for (uint32_t sbY = 0; sbY < sbWidth; sbY++)
         {
             for (uint32_t sbX = 0; sbX < sbWidth; sbX++)
@@ -2095,7 +2095,7 @@ void CUData::calcCTUGeoms(uint32_t ctuWi
                 cu->log2CUSize = log2CUSize;
                 cu->childOffset = childIdx - cuIdx;
                 cu->encodeIdx = g_depthScanIdx[yOffset][xOffset] * 4;
-                cu->numPartitions = (NUM_CU_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
+                cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
                 cu->depth = g_log2Size[maxCUSize] - log2CUSize;
 
                 cu->flags = 0;
--- a/source/common/cudata.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/cudata.h	Thu Feb 19 10:01:48 2015 +0530
@@ -158,7 +158,7 @@ public:
     CUData();
 
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
-    static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
+    static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
 
     void     initCTU(const Frame& frame, uint32_t cuAddr, int qp);
     void     initSubCU(const CUData& ctu, const CUGeom& cuGeom);
@@ -213,7 +213,7 @@ public:
     void     getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const;
     int      getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const;
 
-    uint32_t getSCUAddr() const                  { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; }
+    uint32_t getSCUAddr() const                  { return (m_cuAddr << g_unitSizeDepth * 2) + m_absIdxInCTU; }
     uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const;
     uint32_t getCtxSkipFlag(uint32_t absPartIdx) const;
     ScanType getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const;
@@ -278,7 +278,7 @@ struct CUDataMemPool
 
     bool create(uint32_t depth, uint32_t csp, uint32_t numInstances)
     {
-        uint32_t numPartition = NUM_CU_PARTITIONS >> (depth * 2);
+        uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2);
         uint32_t cuSize = g_maxCUSize >> depth;
         uint32_t sizeL = cuSize * cuSize;
         uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp));
--- a/source/common/param.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/param.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -127,6 +127,7 @@ void x265_param_default(x265_param *para
 
     /* CU definitions */
     param->maxCUSize = 64;
+    param->minCUSize = 8;
     param->tuQTMaxInterDepth = 1;
     param->tuQTMaxIntraDepth = 1;
     param->maxTUSize = 32;
@@ -570,6 +571,7 @@ int x265_param_parse(x265_param *p, cons
     OPT("repeat-headers") p->bRepeatHeaders = atobool(value);
     OPT("wpp") p->bEnableWavefront = atobool(value);
     OPT("ctu") p->maxCUSize = (uint32_t)atoi(value);
+    OPT("min-cu-size") p->minCUSize = (uint32_t)atoi(value);
     OPT("tu-intra-depth") p->tuQTMaxIntraDepth = (uint32_t)atoi(value);
     OPT("tu-inter-depth") p->tuQTMaxInterDepth = (uint32_t)atoi(value);
     OPT("max-tu-size") p->maxTUSize = (uint32_t)atoi(value);
@@ -961,7 +963,7 @@ int x265_check_params(x265_param *param)
     int check_failed = 0; /* abort if there is a fatal configuration problem */
 
     CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16,
-          "max ctu size must be 16, 32, or 64");
+          "max cu size must be 16, 32, or 64");
     if (check_failed == 1)
         return check_failed;
 
@@ -978,6 +980,10 @@ int x265_check_params(x265_param *param)
           "x265 was compiled for 8bit encodes, only 8bit internal depth supported");
 #endif
 
+    CHECK(param->minCUSize != 64 && param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8,
+          "minimim CU size must be 8, 16, 32, or 64");
+    CHECK(param->minCUSize > param->maxCUSize,
+          "min CU size must be less than or equal to max CU size");
     CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > QP_MAX_SPEC,
           "QP exceeds supported range (-QpBDOffsety to 51)");
     CHECK(param->fpsNum == 0 || param->fpsDenom == 0,
@@ -1156,30 +1162,36 @@ int x265_set_globals(x265_param *param)
 {
     static int once /* = 0 */;
 
+    uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
+    uint32_t minLog2CUSize = (uint32_t)g_log2Size[param->minCUSize];
+
     if (ATOMIC_INC(&once) > 1)
     {
-        if (param->maxCUSize != g_maxCUSize)
+        if (g_maxCUSize != param->maxCUSize)
         {
             x265_log(param, X265_LOG_ERROR, "maxCUSize must be the same for all encoders in a single process");
             return -1;
         }
+        if (g_maxCUDepth != maxLog2CUSize - minLog2CUSize)
+        {
+            x265_log(param, X265_LOG_ERROR, "maxCUDepth must be the same for all encoders in a single process");
+            return -1;
+        }
     }
     else
     {
-        uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize];
-
         // set max CU width & height
         g_maxCUSize     = param->maxCUSize;
         g_maxLog2CUSize = maxLog2CUSize;
 
         // compute actual CU depth with respect to config depth and max transform size
-        g_maxCUDepth   = maxLog2CUSize - MIN_LOG2_CU_SIZE;
-        g_maxFullDepth = maxLog2CUSize - LOG2_UNIT_SIZE;
+        g_maxCUDepth    = maxLog2CUSize - minLog2CUSize;
+        g_unitSizeDepth = maxLog2CUSize - LOG2_UNIT_SIZE;
 
         // initialize partition order
         uint32_t* tmp = &g_zscanToRaster[0];
-        initZscanToRaster(g_maxFullDepth, 1, 0, tmp);
-        initRasterToZscan(g_maxFullDepth);
+        initZscanToRaster(g_unitSizeDepth, 1, 0, tmp);
+        initRasterToZscan(g_unitSizeDepth);
     }
     return 0;
 }
@@ -1195,7 +1207,7 @@ void x265_print_params(x265_param *param
     if (param->interlaceMode)
         x265_log(param, X265_LOG_INFO, "Interlaced field inputs             : %s\n", x265_interlace_names[param->interlaceMode]);
 
-    x265_log(param, X265_LOG_INFO, "Coding QT: max CU size, min CU size : %d / %d\n", param->maxCUSize, 8);
+    x265_log(param, X265_LOG_INFO, "Coding QT: max CU size, min CU size : %d / %d\n", param->maxCUSize, param->minCUSize);
 
     x265_log(param, X265_LOG_INFO, "Residual QT: max TU size, max depth : %d / %d inter / %d intra\n",
              param->maxTUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth);
@@ -1293,6 +1305,7 @@ char *x265_param2string(x265_param *p)
     s += sprintf(s, " bitdepth=%d", p->internalBitDepth);
     BOOL(p->bEnableWavefront, "wpp");
     s += sprintf(s, " ctu=%d", p->maxCUSize);
+    s += sprintf(s, " min-cu-size=%d", p->minCUSize);
     s += sprintf(s, " max-tu-size=%d", p->maxTUSize);
     s += sprintf(s, " tu-intra-depth=%d", p->tuQTMaxIntraDepth);
     s += sprintf(s, " tu-inter-depth=%d", p->tuQTMaxInterDepth);
--- a/source/common/picyuv.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/picyuv.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -84,7 +84,7 @@ fail:
  * allocated by the same encoder. */
 bool PicYuv::createOffsets(const SPS& sps)
 {
-    uint32_t numPartitions = 1 << (g_maxFullDepth * 2);
+    uint32_t numPartitions = 1 << (g_unitSizeDepth * 2);
     CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
     CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight);
     for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++)
--- a/source/common/slice.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/slice.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -183,8 +183,8 @@ void RPS::sortDeltaPOC()
 uint32_t Slice::realEndAddress(uint32_t endCUAddr) const
 {
     // Calculate end address
-    uint32_t internalAddress = (endCUAddr - 1) % NUM_CU_PARTITIONS;
-    uint32_t externalAddress = (endCUAddr - 1) / NUM_CU_PARTITIONS;
+    uint32_t internalAddress = (endCUAddr - 1) % NUM_4x4_PARTITIONS;
+    uint32_t externalAddress = (endCUAddr - 1) / NUM_4x4_PARTITIONS;
     uint32_t xmax = m_sps->picWidthInLumaSamples - (externalAddress % m_sps->numCuInWidth) * g_maxCUSize;
     uint32_t ymax = m_sps->picHeightInLumaSamples - (externalAddress / m_sps->numCuInWidth) * g_maxCUSize;
 
@@ -192,13 +192,13 @@ uint32_t Slice::realEndAddress(uint32_t 
         internalAddress--;
 
     internalAddress++;
-    if (internalAddress == NUM_CU_PARTITIONS)
+    if (internalAddress == NUM_4x4_PARTITIONS)
     {
         internalAddress = 0;
         externalAddress++;
     }
 
-    return externalAddress * NUM_CU_PARTITIONS + internalAddress;
+    return externalAddress * NUM_4x4_PARTITIONS + internalAddress;
 }
 
 
--- a/source/common/x86/intrapred8.asm	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Thu Feb 19 10:01:48 2015 +0530
@@ -714,7 +714,7 @@ cglobal intra_pred_planar32, 3,4,8,0-(4*
 ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal intra_pred_ang4_2, 3,5,4
+cglobal intra_pred_ang4_2, 3,5,3
     lea         r4, [r2 + 2]
     add         r2, 10
     cmp         r3m, byte 34
--- a/source/encoder/analysis.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/analysis.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -139,8 +139,8 @@ Mode& Analysis::compressCTU(CUData& ctu,
         {
             int numPredDir = m_slice->isInterP() ? 1 : 2;
             m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData;
-            reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
-            reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
+            m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir];
+            m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS];
         }
     }
 
@@ -260,7 +260,7 @@ void Analysis::compressIntraCU(const CUD
         checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
         checkBestMode(md.pred[PRED_INTRA], depth);
 
-        if (depth == g_maxCUDepth)
+        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
         {
             md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
             checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
@@ -472,7 +472,7 @@ void Analysis::parallelModeAnalysis(int 
         {
         case 0:
             slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL, NULL);
-            if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
+            if (m_curGeom->log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                 slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL, NULL);
             break;
 
@@ -556,7 +556,7 @@ void Analysis::compressInterCU_dist(cons
         if (bTryIntra)
         {
             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-            if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
+            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
         }
 
@@ -704,7 +704,7 @@ void Analysis::compressInterCU_dist(cons
             if (bTryIntra)
             {
                 checkBestMode(md.pred[PRED_INTRA], depth);
-                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
+                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                     checkBestMode(md.pred[PRED_INTRA_NxN], depth);
             }
         }
@@ -1153,7 +1153,7 @@ void Analysis::compressInterCU_rd5_6(con
                     bHor = true;
                 else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
                     bVer = true;
-                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0))
+                else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
                 {
                     bHor = true;
                     bVer = true;
@@ -1187,7 +1187,7 @@ void Analysis::compressInterCU_rd5_6(con
                 checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL);
                 checkBestMode(md.pred[PRED_INTRA], depth);
 
-                if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize)
+                if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                 {
                     md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
                     checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL);
@@ -1388,7 +1388,7 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mod
 
     if (m_param->analysisMode == X265_ANALYSIS_LOAD && isSkipMode)
     {
-        uint32_t i = *reuseBestMergeCand;
+        uint32_t i = *m_reuseBestMergeCand;
         tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i;    /* merge candidate ID is stored in L0 MVP idx */
         tempPred->cu.m_interDir[0] = interDirNeighbours[i];
         tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv;
@@ -1496,9 +1496,9 @@ void Analysis::checkMerge2Nx2N_rd5_6(Mod
 
     if (m_param->analysisMode)
     {
-        reuseBestMergeCand++;
+        m_reuseBestMergeCand++;
         if (m_param->analysisMode == X265_ANALYSIS_SAVE)
-            *reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0];
+            *m_reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0];
     }
 }
 
@@ -1516,8 +1516,8 @@ void Analysis::checkInter_rd0_4(Mode& in
             MotionData* bestME = interMode.bestME[part];
             for (int32_t i = 0; i < numPredDir; i++)
             {
-                bestME[i].ref = *reuseRef;
-                reuseRef++;
+                bestME[i].ref = *m_reuseRef;
+                m_reuseRef++;
             }
         }
     }
@@ -1542,8 +1542,8 @@ void Analysis::checkInter_rd0_4(Mode& in
                 MotionData* bestME = interMode.bestME[puIdx];
                 for (int32_t i = 0; i < numPredDir; i++)
                 {
-                    *reuseRef = bestME[i].ref;
-                    reuseRef++;
+                    *m_reuseRef = bestME[i].ref;
+                    m_reuseRef++;
                 }
             }
         }
@@ -1569,8 +1569,8 @@ void Analysis::checkInter_rd5_6(Mode& in
             MotionData* bestME = interMode.bestME[puIdx];
             for (int32_t i = 0; i < numPredDir; i++)
             {
-                bestME[i].ref = *reuseRef;
-                reuseRef++;
+                bestME[i].ref = *m_reuseRef;
+                m_reuseRef++;
             }
         }
     }
@@ -1586,8 +1586,8 @@ void Analysis::checkInter_rd5_6(Mode& in
                 MotionData* bestME = interMode.bestME[puIdx];
                 for (int32_t i = 0; i < numPredDir; i++)
                 {
-                    *reuseRef = bestME[i].ref;
-                    reuseRef++;
+                    *m_reuseRef = bestME[i].ref;
+                    m_reuseRef++;
                 }
             }
         }
--- a/source/encoder/analysis.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/analysis.h	Thu Feb 19 10:01:48 2015 +0530
@@ -74,14 +74,11 @@ public:
     bool      m_bTryLossless;
     bool      m_bChromaSa8d;
 
-    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
-    analysis_intra_data* m_reuseIntraDataCTU;
-    analysis_inter_data* m_reuseInterDataCTU;
-    int32_t* reuseRef;
-    uint32_t* reuseBestMergeCand;
     Analysis();
+
     bool create(ThreadLocalData* tld);
     void destroy();
+
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
 
 protected:
@@ -96,6 +93,12 @@ protected:
     void parallelModeAnalysis(int threadId, int jobId);
     void parallelME(int threadId, int meId);
 
+    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
+    analysis_intra_data* m_reuseIntraDataCTU;
+    analysis_inter_data* m_reuseInterDataCTU;
+    int32_t*             m_reuseRef;
+    uint32_t*            m_reuseBestMergeCand;
+
     /* full analysis for an I-slice CU */
     void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder);
 
--- a/source/encoder/api.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/api.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -206,7 +206,7 @@ void x265_picture_init(x265_param *param
 
         uint32_t numCUsInFrame   = widthInCU * heightInCU;
         pic->analysisData.numCUsInFrame = numCUsInFrame;
-        pic->analysisData.numPartitions = NUM_CU_PARTITIONS;
+        pic->analysisData.numPartitions = NUM_4x4_PARTITIONS;
     }
 }
 
--- a/source/encoder/encoder.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/encoder.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -637,7 +637,7 @@ int Encoder::encode(const x265_picture* 
                 slice->m_sps = &m_sps;
                 slice->m_pps = &m_pps;
                 slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
-                slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS);
+                slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS);
                 frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC;
                 frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY;
                 frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC;
@@ -667,7 +667,7 @@ int Encoder::encode(const x265_picture* 
 
                 uint32_t numCUsInFrame   = widthInCU * heightInCU;
                 analysis->numCUsInFrame  = numCUsInFrame;
-                analysis->numPartitions  = NUM_CU_PARTITIONS;
+                analysis->numPartitions  = NUM_4x4_PARTITIONS;
                 allocAnalysis(analysis);
             }
 
@@ -948,6 +948,8 @@ void Encoder::printSummary()
         StatisticLog finalLog;
         for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
         {
+            int cuSize = g_maxCUSize >> depth;
+
             for (int i = 0; i < m_param->frameNumThreads; i++)
             {
                 StatisticLog& enclog = m_frameEncoder[i]->m_sliceTypeLog[sliceType];
@@ -961,7 +963,7 @@ void Encoder::printSummary()
                     finalLog.cuInterDistribution[depth][m] += enclog.cuInterDistribution[depth][m];
                 }
 
-                if (depth == g_maxCUDepth)
+                if (cuSize == 8 && m_sps.quadtreeTULog2MinSize < 3)
                     finalLog.cntIntraNxN += enclog.cntIntraNxN;
                 if (sliceType != I_SLICE)
                 {
@@ -1026,7 +1028,6 @@ void Encoder::printSummary()
             }
 
             // print statistics
-            int cuSize = g_maxCUSize >> depth;
             char stats[256] = { 0 };
             int len = 0;
             if (sliceType != I_SLICE)
@@ -1054,14 +1055,14 @@ void Encoder::printSummary()
                                cuIntraDistribution[1], cuIntraDistribution[2]);
                 if (sliceType != I_SLICE)
                 {
-                    if (depth == g_maxCUDepth)
+                    if (cuSize == 8 && m_sps.quadtreeTULog2MinSize < 3)
                         len += sprintf(stats + len, " %dx%d "X265_LL "%%", cuSize / 2, cuSize / 2, cntIntraNxN);
                 }
 
                 len += sprintf(stats + len, ")");
                 if (sliceType == I_SLICE)
                 {
-                    if (depth == g_maxCUDepth)
+                    if (cuSize == 8 && m_sps.quadtreeTULog2MinSize < 3)
                         len += sprintf(stats + len, " %dx%d: "X265_LL "%%", cuSize / 2, cuSize / 2, cntIntraNxN);
                 }
             }
@@ -1440,8 +1441,8 @@ void Encoder::initSPS(SPS *sps)
     sps->numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize;
     sps->numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize;
     sps->numCUsInFrame = sps->numCuInWidth * sps->numCuInHeight;
-    sps->numPartitions = NUM_CU_PARTITIONS;
-    sps->numPartInCUSize = 1 << g_maxFullDepth;
+    sps->numPartitions = NUM_4x4_PARTITIONS;
+    sps->numPartInCUSize = 1 << g_unitSizeDepth;
 
     sps->log2MinCodingBlockSize = g_maxLog2CUSize - g_maxCUDepth;
     sps->log2DiffMaxMinCodingBlockSize = g_maxCUDepth;
@@ -1708,10 +1709,10 @@ void Encoder::configure(x265_param *p)
     m_conformanceWindow.leftOffset = 0;
 
     /* set pad size if width is not multiple of the minimum CU size */
-    if (p->sourceWidth & (MIN_CU_SIZE - 1))
+    if (p->sourceWidth & (p->minCUSize - 1))
     {
-        uint32_t rem = p->sourceWidth & (MIN_CU_SIZE - 1);
-        uint32_t padsize = MIN_CU_SIZE - rem;
+        uint32_t rem = p->sourceWidth & (p->minCUSize - 1);
+        uint32_t padsize = p->minCUSize - rem;
         p->sourceWidth += padsize;
 
         m_conformanceWindow.bEnabled = true;
@@ -1719,10 +1720,10 @@ void Encoder::configure(x265_param *p)
     }
 
     /* set pad size if height is not multiple of the minimum CU size */
-    if (p->sourceHeight & (MIN_CU_SIZE - 1))
+    if (p->sourceHeight & (p->minCUSize - 1))
     {
-        uint32_t rem = p->sourceHeight & (MIN_CU_SIZE - 1);
-        uint32_t padsize = MIN_CU_SIZE - rem;
+        uint32_t rem = p->sourceHeight & (p->minCUSize - 1);
+        uint32_t padsize = p->minCUSize - rem;
         p->sourceHeight += padsize;
 
         m_conformanceWindow.bEnabled = true;
--- a/source/encoder/entropy.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/entropy.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -671,7 +671,7 @@ void Entropy::finishCU(const CUData& ctu
     {
         // Encode slice finish
         bool bTerminateSlice = false;
-        if (cuAddr + (NUM_CU_PARTITIONS >> (depth << 1)) == realEndAddress)
+        if (cuAddr + (NUM_4x4_PARTITIONS >> (depth << 1)) == realEndAddress)
             bTerminateSlice = true;
 
         // The 1-terminating bit is added to all streams, so don't add it here when it's 1.
@@ -856,7 +856,7 @@ void Entropy::codePUWise(const CUData& c
     PartSize partSize = (PartSize)cu.m_partSize[absPartIdx];
     uint32_t numPU = (partSize == SIZE_2Nx2N ? 1 : (partSize == SIZE_NxN ? 4 : 2));
     uint32_t depth = cu.m_cuDepth[absPartIdx];
-    uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_maxFullDepth - depth) * 2) >> 4;
+    uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_unitSizeDepth - depth) * 2) >> 4;
 
     for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += puOffset)
     {
--- a/source/encoder/frameencoder.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/frameencoder.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -143,6 +143,7 @@ bool FrameEncoder::initializeGeoms()
 {
     /* Geoms only vary between CTUs in the presence of picture edges */
     int maxCUSize = m_param->maxCUSize;
+    int minCUSize = m_param->minCUSize;
     int heightRem = m_param->sourceHeight & (maxCUSize - 1);
     int widthRem = m_param->sourceWidth & (maxCUSize - 1);
     int allocGeoms = 1; // body
@@ -157,7 +158,7 @@ bool FrameEncoder::initializeGeoms()
         return false;
 
     // body
-    CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms);
+    CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, minCUSize, m_cuGeoms);
     memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols);
     if (allocGeoms == 1)
         return true;
@@ -166,7 +167,7 @@ bool FrameEncoder::initializeGeoms()
     if (widthRem)
     {
         // right
-        CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+        CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
         for (uint32_t i = 0; i < m_numRows; i++)
         {
             uint32_t ctuAddr = m_numCols * (i + 1) - 1;
@@ -177,7 +178,7 @@ bool FrameEncoder::initializeGeoms()
     if (heightRem)
     {
         // bottom
-        CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+        CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
         for (uint32_t i = 0; i < m_numCols; i++)
         {
             uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i;
@@ -188,7 +189,7 @@ bool FrameEncoder::initializeGeoms()
         if (widthRem)
         {
             // corner
-            CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
+            CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS);
 
             uint32_t ctuAddr = m_numCols * m_numRows - 1;
             m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS;
@@ -538,7 +539,7 @@ void FrameEncoder::encodeSlice()
 {
     Slice* slice = m_frame->m_encData->m_slice;
     const uint32_t widthInLCUs = slice->m_sps->numCuInWidth;
-    const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS;
+    const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_4x4_PARTITIONS - 1) / NUM_4x4_PARTITIONS;
     const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
 
     SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL;
@@ -1038,7 +1039,7 @@ void FrameEncoder::collectCTUStatistics(
             else if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
             {
                 /* TODO: log intra modes at absPartIdx +0 to +3 */
-                X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
+                X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
                 log->cntIntraNxN++;
                 log->cntIntra[depth]--;
             }
@@ -1086,7 +1087,7 @@ void FrameEncoder::collectCTUStatistics(
 
                 if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N)
                 {
-                    X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n");
+                    X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n");
                     log->cntIntraNxN++;
                     /* TODO: log intra modes at absPartIdx +0 to +3 */
                 }
--- a/source/encoder/search.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/search.h	Thu Feb 19 10:01:48 2015 +0530
@@ -41,9 +41,11 @@
     m_stats[cu.m_encData->m_frameEncoderID].count++; \
     ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc)
 #define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
+#define ProfileCounter(cu, count) m_stats[cu.m_encData->m_frameEncoderID].count++;
 #else
 #define ProfileCUScopeNamed(name, cu, acc, count)
 #define ProfileCUScope(cu, acc, count)
+#define ProfileCounter(cu, count)
 #endif
 
 namespace x265 {
--- a/source/encoder/slicetype.cpp	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/encoder/slicetype.cpp	Thu Feb 19 10:01:48 2015 +0530
@@ -580,7 +580,9 @@ void Lookahead::vbvLookahead(Lowres **fr
         curNonB++;
     int nextNonB = keyframe ? prevNonB : curNonB;
     int nextB = prevNonB + 1;
-    int nextBRef = 0;
+    int nextBRef = 0, curBRef = 0;
+    if (m_param->bBPyramid && curNonB - prevNonB > 1)
+        curBRef = (prevNonB + curNonB + 1) / 2;
     int miniGopEnd = keyframe ? prevNonB : curNonB;
     while (curNonB < numFrames + !keyframe)
     {
@@ -597,7 +599,6 @@ void Lookahead::vbvLookahead(Lowres **fr
                 {
                     frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx];
                     frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx];
-                
                 }
             }
             idx++;
@@ -622,19 +623,19 @@ void Lookahead::vbvLookahead(Lowres **fr
                     satdCost = vbvFrameCost(frames, nextBRef, curNonB, i);
             }
             else
-                satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i);
+                satdCost = vbvFrameCost(frames, prevNonB, curNonB, i);
             frames[nextNonB]->plannedSatd[idx] = satdCost;
             frames[nextNonB]->plannedType[idx] = type;
             /* Save the nextB Cost in each B frame of the current miniGop */
 
             for (int j = nextB; j < miniGopEnd; j++)
             {
-                if (nextBRef && i == nextBRef)
+                if (curBRef && curBRef == i)
                     break;
                 if (j >= i && j !=nextBRef)
                     continue;
                 frames[j]->plannedSatd[frames[j]->indB] = satdCost;
-                frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B;
+                frames[j]->plannedType[frames[j]->indB++] = type;
             }
         }
         prevNonB = curNonB;
--- a/source/x265.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/x265.h	Thu Feb 19 10:01:48 2015 +0530
@@ -229,6 +229,9 @@ typedef enum
 #define X265_B_ADAPT_FAST       1
 #define X265_B_ADAPT_TRELLIS    2
 
+#define X265_REF_LIMIT_DEPTH    1
+#define X265_REF_LIMIT_CU       2
+
 #define X265_BFRAME_MAX         16
 #define X265_MAX_FRAME_THREADS  16
 
@@ -238,13 +241,14 @@ typedef enum
 #define X265_TYPE_P             0x0003
 #define X265_TYPE_BREF          0x0004  /* Non-disposable B-frame */
 #define X265_TYPE_B             0x0005
+#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR)
+#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF)
+
 #define X265_QP_AUTO                 0
 
 #define X265_AQ_NONE                 0
 #define X265_AQ_VARIANCE             1
 #define X265_AQ_AUTO_VARIANCE        2
-#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR)
-#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF)
 
 /* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
 
@@ -576,9 +580,15 @@ typedef struct x265_param
      * The higher the size, the more efficiently x265 can encode areas of low
      * complexity, greatly improving compression efficiency at large
      * resolutions.  The smaller the size, the more effective wavefront and
-     * frame parallelism will become because of the increase in rows. default 64 */
+     * frame parallelism will become because of the increase in rows. default 64
+     * All encoders within the same process must use the same maxCUSize. */
     uint32_t  maxCUSize;
 
+    /* Minimum CU width and height in pixels.  The size must be 64, 32, 16, or
+     * 8. Default 8. All encoders within the same process must use the same
+     * minCUSize. */
+    uint32_t  minCUSize;
+
     /* Enable rectangular motion prediction partitions (vertical and
      * horizontal), available at all CU depths from 64x64 to 8x8. Default is
      * disabled */
@@ -664,6 +674,15 @@ typedef struct x265_param
      * the performance but the less compression efficiency. Default is 3 */
     uint32_t  maxNumMergeCand;
 
+    /* Limit the motion references used for each search based on the results of
+     * previous motion searches already performed for the same CU: If 0 all
+     * references are always searched. If X265_REF_LIMIT_CU all motion searches
+     * will restrict themselves to the references selected by the 2Nx2N search
+     * at the same depth. If X265_REF_LIMIT_DEPTH the 2Nx2N motion search will
+     * only use references that were selected by the best motion searches of the
+     * 4 split CUs at the next lower CU depth.  The two flags may be combined */
+    uint32_t  limitReferences;
+
     /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
      * (methods) are sorted in increasing complexity, with diamond being the
      * simplest and fastest and full being the slowest.  DIA, HEX, and UMH were
--- a/source/x265cli.h	Wed Feb 18 17:03:02 2015 +0530
+++ b/source/x265cli.h	Thu Feb 19 10:01:48 2015 +0530
@@ -71,7 +71,8 @@ static const struct option long_options[
     { "no-wpp",               no_argument, NULL, 0 },
     { "wpp",                  no_argument, NULL, 0 },
     { "ctu",            required_argument, NULL, 's' },
-    { "max-tu-size",    required_argument, NULL, 's' },
+    { "min-cu-size",    required_argument, NULL, 0 },
+    { "max-tu-size",    required_argument, NULL, 0 },
     { "tu-intra-depth", required_argument, NULL, 0 },
     { "tu-inter-depth", required_argument, NULL, 0 },
     { "me",             required_argument, NULL, 0 },
@@ -265,6 +266,7 @@ static void showHelp(x265_param *param)
     H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
     H0("\nQuad-Tree size and depth:\n");
     H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
+    H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
     H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
     H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
     H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);