changeset 1624:bd911514525a

Merged multicoreware/xhevc into default
author Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
date Wed, 22 May 2013 10:37:50 +0530
parents 8aa73d922542 (current diff) c0134d7e756d (diff)
children 55094d80f04a
files
diffstat 26 files changed, 863 insertions(+-), 390 deletions(-) [+]
line wrap: on
line diff
--- a/cfg/encoder_I_15P.cfg	Tue May 21 15:33:13 2013 +0530
+++ b/cfg/encoder_I_15P.cfg	Wed May 22 10:37:50 2013 +0530
@@ -44,8 +44,8 @@ InternalBitDepth              : 8       
 
 #=========== Coding Tools =================
 SAO                           : 0           # Sample adaptive offset  (0: OFF, 1: ON)
+RectInter                     : 1           # Rectangular motion partitions Nx2N and 2NxN (0: OFF, 1: ON)
 AMP                           : 1           # Asymmetric motion partitions (0: OFF, 1: ON)
-AMP_REFINE                    : 1           # Asymmetric refine motion partitions Nx2N and 2NxN(0: OFF, 1: ON)
 TransformSkip                 : 1           # Transform skipping (0: OFF, 1: ON)
 TransformSkipFast             : 1           # Fast Transform skipping (0: OFF, 1: ON)
 SAOLcuBoundary                : 0           # SAOLcuBoundary using non-deblocked pixels (0: OFF, 1: ON)
--- a/cfg/encoder_all_I.cfg	Tue May 21 15:33:13 2013 +0530
+++ b/cfg/encoder_all_I.cfg	Wed May 22 10:37:50 2013 +0530
@@ -38,8 +38,8 @@ InternalBitDepth              : 8       
 
 #=========== Coding Tools =================
 SAO                           : 1           # Sample adaptive offset  (0: OFF, 1: ON)
+RectInter                     : 1           # Rectangular motion partitions Nx2N and 2NxN (0: OFF, 1: ON)
 AMP                           : 1           # Asymmetric motion partitions (0: OFF, 1: ON)
-AMP_REFINE                    : 1           # Asymmetric refine motion partitions Nx2N and 2NxN(0: OFF, 1: ON)
 TransformSkip                 : 1           # Transform skipping (0: OFF, 1: ON)
 TransformSkipFast             : 1           # Fast Transform skipping (0: OFF, 1: ON)
 SAOLcuBoundary                : 0           # SAOLcuBoundary using non-deblocked pixels (0: OFF, 1: ON)
--- a/source/Lib/TLibCommon/TComSlice.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibCommon/TComSlice.h	Wed May 22 10:37:50 2013 +0530
@@ -852,7 +852,6 @@ private:
     UInt        m_pcmLog2MaxSize;
     UInt        m_uiPCMLog2MinSize;
     Bool        m_useAMP;
-    Bool        m_useAMPRefine;
 
     // Parameter
     Int         m_bitDepthY;
@@ -874,7 +873,6 @@ private:
     UInt        m_uiMaxTrSize;
 
     Int m_iAMPAcc[MAX_CU_DEPTH];
-    Int m_iAMPRefineAcc[MAX_CU_DEPTH];
     Bool        m_bUseSAO;
 
     Bool        m_bTemporalIdNestingFlag; // temporal_id_nesting_flag
@@ -981,10 +979,6 @@ public:
 
     Void setUseAMP(Bool b) { m_useAMP = b; }
 
-    Bool getUseAMPRefine() { return m_useAMPRefine; }
-
-    Void setUseAMPRefine(Bool b) { m_useAMPRefine = b; }
-
     Void setQuadtreeTULog2MaxSize(UInt u) { m_uiQuadtreeTULog2MaxSize = u;    }
 
     UInt getQuadtreeTULog2MaxSize()         { return m_uiQuadtreeTULog2MaxSize; }
@@ -1031,10 +1025,6 @@ public:
 
     Void      setAMPAcc(UInt uiDepth, Int iAccu) { assert(uiDepth < g_uiMaxCUDepth);  m_iAMPAcc[uiDepth] = iAccu; }
 
-    Int       getAMPRefineAcc(UInt uiDepth) { return m_iAMPRefineAcc[uiDepth]; }
-
-    Void      setAMPRefineAcc(UInt uiDepth, Int iAccu) { assert(uiDepth < g_uiMaxCUDepth);  m_iAMPRefineAcc[uiDepth] = iAccu; }
-
     // Bit-depth
     Int      getBitDepthY() { return m_bitDepthY; }
 
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Wed May 22 10:37:50 2013 +0530
@@ -625,7 +625,7 @@ void xTrMxN(Int bitDepth, Short *block, 
     Int shift_1st = g_aucConvertToBit[iWidth]  + 1 + bitDepth - 8; // log2(iWidth) - 1 + g_bitDepth - 8
     Int shift_2nd = g_aucConvertToBit[iHeight]  + 8;                 // log2(iHeight) + 6
 
-    Short tmp[64 * 64];
+    ALIGN_VAR_32(Short, tmp[64 * 64]);
 
     if (iWidth == 4 && iHeight == 4)
     {
@@ -636,6 +636,7 @@ void xTrMxN(Int bitDepth, Short *block, 
         }
         else
         {
+
             partialButterfly4(block, tmp, shift_1st, iHeight);
             partialButterfly4(tmp, coeff, shift_2nd, iWidth);
         }
--- a/source/Lib/TLibEncoder/TEncCfg.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncCfg.h	Wed May 22 10:37:50 2013 +0530
@@ -100,7 +100,6 @@ protected:
 
     //==== File I/O ========
     Int       m_iFrameRate;
-    Int       m_FrameSkip;
     Int       m_iSourceWidth;
     Int       m_iSourceHeight;
     Int       m_conformanceMode;
@@ -135,7 +134,7 @@ protected:
 
     Int       m_maxTempLayer;                    ///< Max temporal layer
     Bool      m_useAMP;
-    Bool      m_useAMPRefine;
+    Bool      m_useRectInter;
 
     //======= Transform =============
     UInt      m_uiQuadtreeTULog2MaxSize;
@@ -277,18 +276,14 @@ public:
 
     Void      setFrameRate(Int i)      { m_iFrameRate = i; }
 
-    Void      setFrameSkip(UInt i) { m_FrameSkip = i; }
-
     Void      setSourceWidth(Int i)      { m_iSourceWidth = i; }
 
     Void      setSourceHeight(Int i)      { m_iSourceHeight = i; }
 
-    Window   &getConformanceWindow()                           { return m_conformanceWindow; }
+    Window   &getConformanceWindow()      { return m_conformanceWindow; }
 
     Void      setConformanceWindow(Int confLeft, Int confRight, Int confTop, Int confBottom) { m_conformanceWindow.setWindow(confLeft, confRight, confTop, confBottom); }
 
-    Void      setFramesToBeEncoded(Int i)      { m_framesToBeEncoded = i; }
-
     //====== Coding Structure ========
     Void      setIntraPeriod(Int i)      { m_uiIntraPeriod = (UInt)i; }
 
@@ -327,10 +322,6 @@ public:
 
     Void      setQuadtreeTUMaxDepthIntra(UInt u)      { m_uiQuadtreeTUMaxDepthIntra = u; }
 
-    Void setUseAMP(Bool b) { m_useAMP = b; }
-
-    Void setUseAMPRefine(Bool b) { m_useAMPRefine = b; }
-
     //====== Loop/Deblock Filter ========
     Void      setLoopFilterDisable(Bool b)      { m_bLoopFilterDisable       = b; }
 
@@ -347,7 +338,10 @@ public:
     //====== Motion search ========
     Void      setSearchMethod(Int i)     { m_iSearchMethod = i; }
     Void      setSearchRange(Int i)      { m_iSearchRange = i; }
-    Void      setBipredSearchRange(Int i)      { m_bipredSearchRange = i; }
+    Void      setBipredSearchRange(Int i){ m_bipredSearchRange = i; }
+    Void      setUseRectInter(Bool b)    { m_useRectInter = b; }
+    Bool      getUseRectInter() const    { return m_useRectInter; }
+    Void      setUseAMP(Bool b)          { m_useAMP = b; }
 
     //====== Quality control ========
     Void      setMaxCuDQPDepth(Int i)      { m_iMaxCuDQPDepth = i; }
@@ -360,9 +354,9 @@ public:
 
     Bool      getUseAdaptQpSelect()           { return m_bUseAdaptQpSelect; }
 
-    Void      setUseAdaptiveQP(Bool b)      { m_bUseAdaptiveQP = b; }
+    Void      setUseAdaptiveQP(Bool b)        { m_bUseAdaptiveQP = b; }
 
-    Void      setQPAdaptationRange(Int i)      { m_iQPAdaptationRange = i; }
+    Void      setQPAdaptationRange(Int i)     { m_iQPAdaptationRange = i; }
 
     //====== Lossless ========
     Void      setUseLossless(Bool b)        { m_useLossless = b;  }
@@ -370,8 +364,6 @@ public:
     //====== Sequence ========
     Int       getFrameRate()      { return m_iFrameRate; }
 
-    UInt      getFrameSkip()      { return m_FrameSkip; }
-
     Int       getSourceWidth()      { return m_iSourceWidth; }
 
     Int       getSourceHeight()      { return m_iSourceHeight; }
--- a/source/Lib/TLibEncoder/TEncCu.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncCu.cpp	Wed May 22 10:37:50 2013 +0530
@@ -238,12 +238,9 @@ Void TEncCu::init(TEncTop* pcEncTop)
     m_pcRdCost           = pcEncTop->getRdCost();
 
     m_pcEntropyCoder     = NULL;
-    m_pcCavlcCoder       = pcEncTop->getCavlcCoder();
-    m_pcSbacCoder        = pcEncTop->getSbacCoder();
-    m_pcBinCABAC         = pcEncTop->getBinCABAC();
 
     m_pppcRDSbacCoder   = NULL;
-    m_pcRDGoOnSbacCoder = pcEncTop->getRDGoOnSbacCoder();
+    m_pcRDGoOnSbacCoder = NULL;
 
     m_pcRateCtrl        = pcEncTop->getRateCtrl();
 }
@@ -266,6 +263,7 @@ Void TEncCu::compressCU(TComDataCU* pcCu
 
     m_pcPredSearch->set_pppcRDSbacCoder(m_pppcRDSbacCoder);
     m_pcPredSearch->set_pcEntropyCoder(m_pcEntropyCoder);
+    m_pcPredSearch->set_pcRDGoOnSbacCoder(m_pcRDGoOnSbacCoder);
 
     // analysis of CU
     xCompressCU(m_ppcBestCU[0], m_ppcTempCU[0], NULL, 0);
@@ -565,12 +563,12 @@ Void TEncCu::xCompressCU(TComDataCU*& rp
 
         if(rpcBestCU->getTotalCost() < LAMBDA_PARTITION_SELECT*_NxNCost)              // checking if BestCU is of size_2NX2N
         {
-            rpcBestCU->copyToPic(uiDepth);                                                        // Copy Best data to Picture for next partition prediction.
+            rpcBestCU->copyToPic(uiDepth);                                            // Copy Best data to Picture for next partition prediction.
             xCopyYuv2Pic(rpcBestCU->getPic(), rpcBestCU->getAddr(), rpcBestCU->getZorderIdxInCU(), uiDepth, uiDepth, rpcBestCU, uiLPelX, uiTPelY);        // Copy Yuv data to picture Yuv
             return;
         }
 
-        if (pcPic->getSlice(0)->getSPS()->getAMPRefineAcc(uiDepth))
+        if (m_pcEncCfg->getUseRectInter())
         {
             // 2NxN, Nx2N
             if (doNotBlockPu)
@@ -803,7 +801,7 @@ Void TEncCu::xCompressCU(TComDataCU*& rp
                     }
                 }
             
-                if (pcPic->getSlice(0)->getSPS()->getAMPRefineAcc(uiDepth))
+                if (m_pcEncCfg->getUseRectInter())
                 {
                     // 2NxN, Nx2N
                     if (doNotBlockPu)
--- a/source/Lib/TLibEncoder/TEncCu.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncCu.h	Wed May 22 10:37:50 2013 +0530
@@ -90,9 +90,6 @@ private:
     TComRdCost*             m_pcRdCost;
 
     TEncEntropy*            m_pcEntropyCoder;
-    TEncCavlc*              m_pcCavlcCoder;
-    TEncSbac*               m_pcSbacCoder;
-    TEncBinCABAC*           m_pcBinCABAC;
 
     // SBAC RD
     TEncSbac***             m_pppcRDSbacCoder;
@@ -108,7 +105,7 @@ public:
     Void set_pppcRDSbacCoder(TEncSbac*** pppcRDSbacCoder) { m_pppcRDSbacCoder = pppcRDSbacCoder; }
     Void set_pcEntropyCoder(TEncEntropy* pcEntropyCoder) { m_pcEntropyCoder = pcEntropyCoder; }
     Void set_pcPredSearch(TEncSearch* pcPredSearch) { m_pcPredSearch = pcPredSearch; }
-    
+    Void set_pcRDGoOnSbacCoder(TEncSbac* pcRDGoOnSbacCoder) { m_pcRDGoOnSbacCoder = pcRDGoOnSbacCoder; }
 
     /// copy parameters from encoder class
     Void  init(TEncTop* pcEncTop);
--- a/source/Lib/TLibEncoder/TEncGOP.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncGOP.cpp	Wed May 22 10:37:50 2013 +0530
@@ -1202,7 +1202,7 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
                     m_pcEntropyCoders[0].resetEntropy();
                     m_pcEntropyCoders[0].setBitstream(m_pcBitCounter);
                     // CHECK_ME: I think the SAO is use a temp Sbac only, so I always use [0], am I right?
-                    m_pcSAO->startSaoEnc(pcPic, &m_pcEntropyCoders[0], m_pcEncTop->getRDSbacCoders()[0], m_pcEncTop->getRDGoOnSbacCoder());
+                    m_pcSAO->startSaoEnc(pcPic, &m_pcEntropyCoders[0], m_pcEncTop->getRDSbacCoders()[0], &m_pcEncTop->getRDGoOnSbacCoders()[0]);
                     SAOParam& cSaoParam = *pcSlice->getPic()->getPicSym()->getSaoParam();
 
 #if SAO_CHROMA_LAMBDA
@@ -1582,7 +1582,7 @@ Void TEncGOP::preLoopFilterPicAll(TComPi
     m_pcLoopFilter->setCfg(m_pcCfg->getLFCrossTileBoundaryFlag());
     m_pcLoopFilter->loopFilterPic(pcPic);
 
-    m_pcEntropyCoders[0].setEntropyCoder(m_pcEncTop->getRDGoOnSbacCoder(), pcSlice);
+    m_pcEntropyCoders[0].setEntropyCoder(&m_pcEncTop->getRDGoOnSbacCoders()[0], pcSlice);
     m_pcEntropyCoders[0].resetEntropy();
     m_pcEntropyCoders[0].setBitstream(m_pcBitCounter);
     pcSlice = pcPic->getSlice(0);
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Wed May 22 10:37:50 2013 +0530
@@ -2361,6 +2361,7 @@ Void TEncSearch::preestChromaPredMode(TC
     Bool  bLeftAvail  = false;
 
     x265::pixelcmp sa8d;
+
     switch (uiWidth)
     {
     case 32:
@@ -2401,7 +2402,7 @@ Void TEncSearch::preestChromaPredMode(TC
 
         //--- get SAD ---
         UInt uiSAD = sa8d((pixel*)piOrgU, uiStride, (pixel*)piPredU, uiStride) +
-                     sa8d((pixel*)piOrgV, uiStride, (pixel*)piPredV, uiStride);
+            sa8d((pixel*)piOrgV, uiStride, (pixel*)piPredV, uiStride);
         x265_emms();
 
         //--- check ---
@@ -2437,6 +2438,7 @@ Void TEncSearch::estIntraPredQT(TComData
     Double  CandCostList[FAST_UDI_MAX_RDMODE_NUM];
 
     x265::pixelcmp sa8d;
+
     switch (uiWidth)
     {
     case 64:
@@ -2502,7 +2504,7 @@ Void TEncSearch::estIntraPredQT(TComData
             }
 
             CandNum = 0;
-           
+
             for (Int modeIdx = 0; modeIdx < numModesAvailable; modeIdx++)
             {
                 UInt uiMode = modeIdx;
@@ -2517,8 +2519,7 @@ Void TEncSearch::estIntraPredQT(TComData
                 Double cost      = (Double)uiSad + (Double)iModeBits * m_pcRdCost->getSqrtLambda();
 
                 CandNum += xUpdateCandList(uiMode, cost, numModesForFullRD, uiRdModeList, CandCostList);    //Find N least cost  modes. N = numModesForFullRD
-            }       
-
+            }
 
 #if FAST_UDI_USE_MPM
             Int uiPreds[3] = { -1, -1, -1 };
@@ -3911,9 +3912,9 @@ Void TEncSearch::xMotionEstimation(TComD
     m_me.setSearchLimits(cMvSrchRngLT, cMvSrchRngRB);
     m_me.setQP(pcCU->getQP(0), m_pcRdCost->getSqrtLambda());
 
-    if (m_iSearchMethod < 3 && m_cDistParam.bApplyWeight == false && !bBi)
+    if (m_iSearchMethod != X265_ORIG_SEARCH && m_cDistParam.bApplyWeight == false && !bBi)
     {
-        int satd = m_me.motionEstimate(m_pcRdCost->m_mvPredictor, 3, m_acMvPredictors, iSrchRng, rcMv);
+        int satd = m_me.motionEstimate(*pcMvPred, 3, m_acMvPredictors, iSrchRng, rcMv);
         UInt mvcost = m_me.mvcost(rcMv);
         UInt mvbits = m_me.bitcost(rcMv);
         ruiBits += mvbits;
@@ -3927,7 +3928,7 @@ Void TEncSearch::xMotionEstimation(TComD
 
     // Configure the MV bit cost calculator  (TODO: m_bc will go away)
     m_bc.setQP(pcCU->getQP(0), m_pcRdCost->getSqrtLambda());
-    m_bc.setMVP(m_pcRdCost->m_mvPredictor);
+    m_bc.setMVP(*pcMvPred);
 
     setWpScalingDistParam(pcCU, iRefIdxPred, eRefPicList);
 
@@ -3995,6 +3996,7 @@ Void TEncSearch::xPatternSearch(TComPatt
     m_pcRdCost->setDistParam(pcPatternKey, piRefY, iRefStride,  m_cDistParam);
 
     // fast encoder decision: use subsampled SAD for integer ME
+    if (0)
     {
         if (m_cDistParam.iRows > 12)
         {
--- a/source/Lib/TLibEncoder/TEncSearch.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Wed May 22 10:37:50 2013 +0530
@@ -131,7 +131,7 @@ protected:
 public:
     Void set_pppcRDSbacCoder(TEncSbac*** pppcRDSbacCoder) { m_pppcRDSbacCoder = pppcRDSbacCoder; }
     Void set_pcEntropyCoder(TEncEntropy* pcEntropyCoder) { m_pcEntropyCoder = pcEntropyCoder; }
-    
+    Void set_pcRDGoOnSbacCoder(TEncSbac* pcRDGoOnSbacCoder) { m_pcRDGoOnSbacCoder = pcRDGoOnSbacCoder; }
 
     TEncSearch();
     virtual ~TEncSearch();
--- a/source/Lib/TLibEncoder/TEncSlice.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSlice.cpp	Wed May 22 10:37:50 2013 +0530
@@ -131,7 +131,7 @@ Void TEncSlice::init(TEncTop* pcEncTop)
 
     m_pcBitCounter      = pcEncTop->getBitCounter();
     m_pcRdCost          = pcEncTop->getRdCost();
-    m_pcRDGoOnSbacCoder = pcEncTop->getRDGoOnSbacCoder();
+    m_pcRDGoOnSbacCoders= pcEncTop->getRDGoOnSbacCoders();
 
     m_pcRateCtrl        = pcEncTop->getRateCtrl();
 }
@@ -579,10 +579,10 @@ Void TEncSlice::compressSlice(TComPic* r
             }
 
             // set go-on entropy coder
-            m_pcEntropyCoders[uiSubStrm].setEntropyCoder(m_pcRDGoOnSbacCoder, pcSlice);
+            m_pcEntropyCoders[uiSubStrm].setEntropyCoder(&m_pcRDGoOnSbacCoders[uiSubStrm], pcSlice);
             m_pcEntropyCoders[uiSubStrm].setBitstream(&pcBitCounters[uiSubStrm]);
 
-            ((TEncBinCABAC*)m_pcRDGoOnSbacCoder->getEncBinIf())->setBinCountingEnableFlag(true);
+            ((TEncBinCABAC*)m_pcRDGoOnSbacCoders[uiSubStrm].getEncBinIf())->setBinCountingEnableFlag(true);
 
             Double oldLambda = m_pcRdCost->getLambda();
             if (m_pcCfg->getUseRateCtrl())
@@ -612,6 +612,7 @@ Void TEncSlice::compressSlice(TComPic* r
             m_pcCuEncoders[uiSubStrm].set_pppcRDSbacCoder(ppppcRDSbacCoders[uiSubStrm]);
             m_pcCuEncoders[uiSubStrm].set_pcEntropyCoder(&m_pcEntropyCoders[uiSubStrm]);
             m_pcCuEncoders[uiSubStrm].set_pcPredSearch(&m_pcPredSearchs[uiSubStrm]);
+            m_pcCuEncoders[uiSubStrm].set_pcRDGoOnSbacCoder(&m_pcRDGoOnSbacCoders[uiSubStrm]);
 
             // run CU encoder
             m_pcCuEncoders[uiSubStrm].compressCU(pcCU);
--- a/source/Lib/TLibEncoder/TEncSlice.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSlice.h	Wed May 22 10:37:50 2013 +0530
@@ -88,7 +88,7 @@ private:
     // RD optimization
     TComBitCounter*         m_pcBitCounter;                     ///< bit counter
     TComRdCost*             m_pcRdCost;                         ///< RD cost computation
-    TEncSbac*               m_pcRDGoOnSbacCoder;                ///< go-on SBAC encoder
+    TEncSbac*               m_pcRDGoOnSbacCoders;               ///< go-on SBAC encoder
     UInt64                  m_uiPicTotalBits;                   ///< total bits for the picture
     UInt64                  m_uiPicDist;                        ///< total distortion for the picture
     Double                  m_dPicRdCost;                       ///< picture-level RD cost
--- a/source/Lib/TLibEncoder/TEncTop.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncTop.cpp	Wed May 22 10:37:50 2013 +0530
@@ -43,6 +43,8 @@
 #include "primitives.h"
 #include "threadpool.h"
 
+#include <limits.h>
+
 //! \ingroup TLibEncoder
 //! \{
 
@@ -53,9 +55,10 @@
 TEncTop::TEncTop()
 {
     m_iPOCLast          = -1;
+    m_framesToBeEncoded = INT_MAX;
     m_iNumPicRcvd       =  0;
     m_uiNumAllPicCoded  =  0;
-    m_cRDGoOnSbacCoder.init(&m_cRDGoOnBinCoderCABAC);
+//     m_cRDGoOnSbacCoder.init(&m_cRDGoOnBinCoderCABAC);
 #if ENC_DEC_TRACE
     g_hTrace = fopen("TraceEnc.txt", "wb");
     g_bJustDoIt = g_bEncDecTraceDisable;
@@ -132,7 +135,7 @@ Void TEncTop::createWPPCoders(Int iNumSu
     m_pcSbacCoders           = new TEncSbac[iNumSubstreams];
     m_pcBinCoderCABACs       = new TEncBinCABAC[iNumSubstreams];
     m_pcRDGoOnSbacCoders     = new TEncSbac[iNumSubstreams];
-    m_pcRDGoOnBinCodersCABAC = new TEncBinCABAC[iNumSubstreams];
+    m_pcRDGoOnBinCodersCABAC = new TEncBinCABACCounter[iNumSubstreams];
     m_pcBitCounters          = new TComBitCounter[iNumSubstreams];
     m_pcRdCosts              = new TComRdCost[iNumSubstreams];
     m_pcEntropyCoders        = new TEncEntropy[iNumSubstreams];
@@ -182,12 +185,10 @@ Void TEncTop::destroy()
     }
     m_cLoopFilter.destroy();
     m_cRateCtrl.destroy();
-    // SBAC RD
-    Int iDepth;
 
     for (UInt ui = 0; ui < m_iNumSubstreams; ui++)
     {
-        for (iDepth = 0; iDepth < g_uiMaxCUDepth + 1; iDepth++)
+        for (Int iDepth = 0; iDepth < g_uiMaxCUDepth + 1; iDepth++)
         {
             for (Int iCIIdx = 0; iCIIdx < CI_NUM; iCIIdx++)
             {
@@ -196,7 +197,7 @@ Void TEncTop::destroy()
             }
         }
 
-        for (iDepth = 0; iDepth < g_uiMaxCUDepth + 1; iDepth++)
+        for (Int iDepth = 0; iDepth < g_uiMaxCUDepth + 1; iDepth++)
         {
             delete [] m_ppppcRDSbacCoders[ui][iDepth];
             delete [] m_ppppcBinCodersCABAC[ui][iDepth];
@@ -204,9 +205,12 @@ Void TEncTop::destroy()
 
         delete[] m_ppppcRDSbacCoders[ui];
         delete[] m_ppppcBinCodersCABAC[ui];
+
+        m_pcCuEncoders[ui].destroy();
     }
 
     delete[] m_pcCuEncoders;
+
     delete[] m_pcSearchs;
     delete[] m_pcEntropyCoders;
     delete[] m_ppppcRDSbacCoders;
@@ -261,7 +265,7 @@ Void TEncTop::init()
     // initialize encoder search class
     for(UInt ui=0; ui<m_uiNumSubstreams; ui++)
     {
-        m_pcSearchs[ui].init(this, &m_cTrQuant, m_iSearchRange, m_bipredSearchRange, m_iSearchMethod, &m_cRdCost, getRDGoOnSbacCoder());
+        m_pcSearchs[ui].init(this, &m_cTrQuant, m_iSearchRange, m_bipredSearchRange, m_iSearchMethod, &m_cRdCost, NULL/*getRDGoOnSbacCoder()*/);
     }
 
     m_iMaxRefPicNum = 0;
@@ -295,12 +299,14 @@ Void TEncTop::deletePicBuffer()
  \param   pcPicYuvOrg         original YUV picture
  \retval  rcListPicYuvRecOut  list of reconstruction YUV pictures
  \retval  rcListBitstreamOut  list of output bitstreams
- \retval  iNumEncoded         number of encoded pictures
+ \return                      number of encoded pictures
  */
-Void TEncTop::encode(Bool flush, const x265_picture_t* pic, TComList<TComPicYuv*>& rcListPicYuvRecOut, std::list<AccessUnit>& accessUnitsOut, Int& iNumEncoded)
+int TEncTop::encode(Bool flush, const x265_picture_t* pic, TComList<TComPicYuv*>& rcListPicYuvRecOut, std::list<AccessUnit>& accessUnitsOut)
 {
     if (pic)
     {
+        m_iNumPicRcvd++;
+        
         // get original YUV
         TComPic* pcPicCurr = NULL;
         xGetNewPicBuffer(pcPicCurr);
@@ -313,10 +319,14 @@ Void TEncTop::encode(Bool flush, const x
         }
     }
 
+    // Wait until we have a full GOP of pictures
     if (!m_iNumPicRcvd || (!flush && m_iPOCLast != 0 && m_iNumPicRcvd != m_iGOPSize && m_iGOPSize))
     {
-        iNumEncoded = 0;
-        return;
+        return 0;
+    }
+    if (flush)
+    {
+        m_framesToBeEncoded = m_iNumPicRcvd + m_uiNumAllPicCoded;
     }
 
     if (m_RCEnableRateControl)
@@ -332,9 +342,11 @@ Void TEncTop::encode(Bool flush, const x
         m_cRateCtrl.destroyRCGOP();
     }
 
-    iNumEncoded         = m_iNumPicRcvd;
-    m_iNumPicRcvd       = 0;
-    m_uiNumAllPicCoded += iNumEncoded;
+    m_uiNumAllPicCoded += m_iNumPicRcvd;
+
+    Int iNumEncoded = m_iNumPicRcvd;
+    m_iNumPicRcvd = 0;
+    return iNumEncoded;
 }
 
 // ====================================================================================================================
@@ -390,7 +402,6 @@ Void TEncTop::xGetNewPicBuffer(TComPic*&
     rpcPic->setReconMark(false);
 
     m_iPOCLast++;
-    m_iNumPicRcvd++;
 
     rpcPic->getSlice(0)->setPOC(m_iPOCLast);
     // mark it should be extended
@@ -462,17 +473,13 @@ Void TEncTop::xInitSPS()
     for (i = 0; i < g_uiMaxCUDepth - g_uiAddCUDepth; i++)
     {
         m_cSPS.setAMPAcc(i, m_useAMP);
-        m_cSPS.setAMPRefineAcc(i, m_useAMPRefine);
-        //m_cSPS.setAMPAcc( i, 1 );
     }
 
     m_cSPS.setUseAMP(m_useAMP);
-    m_cSPS.setUseAMPRefine(m_useAMPRefine);
 
     for (i = g_uiMaxCUDepth - g_uiAddCUDepth; i < g_uiMaxCUDepth; i++)
     {
         m_cSPS.setAMPAcc(i, 0);
-        m_cSPS.setAMPRefineAcc(i, 1);
     }
 
     m_cSPS.setBitDepthY(g_bitDepthY);
--- a/source/Lib/TLibEncoder/TEncTop.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncTop.h	Wed May 22 10:37:50 2013 +0530
@@ -102,7 +102,7 @@ private:
     // RD cost computation
     TComBitCounter          m_cBitCounter;                ///< bit counter for RD optimization
     TComRdCost              m_cRdCost;                    ///< RD cost computation class
-    TEncSbac                m_cRDGoOnSbacCoder;           ///< going on SBAC model for RD stage
+//     TEncSbac                m_cRDGoOnSbacCoder;           ///< going on SBAC model for RD stage
     TEncBinCABACCounter     m_cRDGoOnBinCoderCABAC;       ///< going on bin coder CABAC for RD stage
 
     Int                     m_iNumSubstreams;             ///< # of top-level elements allocated.
@@ -180,7 +180,7 @@ public:
 
     TComRdCost*             getRdCost() { return &m_cRdCost;              }
 
-    TEncSbac*               getRDGoOnSbacCoder() { return &m_cRDGoOnSbacCoder;     }
+//     TEncSbac*               getRDGoOnSbacCoder() { return &m_cRDGoOnSbacCoder;     }
 
     TComBitCounter*         getBitCounters() { return m_pcBitCounters;         }
 
@@ -207,8 +207,7 @@ public:
     // -------------------------------------------------------------------------------------------------------------------
 
     /// encode several number of pictures until end-of-sequence
-    Void encode(Bool bEos, const x265_picture_t* pic, TComList<TComPicYuv*>& rcListPicYuvRecOut,
-                std::list<AccessUnit>& accessUnitsOut, Int& iNumEncoded);
+    int encode(Bool bEos, const x265_picture_t* pic, TComList<TComPicYuv*>& rcListPicYuvRecOut, std::list<AccessUnit>& accessUnitsOut);
 
     void printSummary() { m_cGOPEncoder.printOutSummary(m_uiNumAllPicCoded); }
 };
--- a/source/encoder/CMakeLists.txt	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/CMakeLists.txt	Wed May 22 10:37:50 2013 +0530
@@ -3,6 +3,8 @@ if(GCC)
         # force gcc to generate code for sync primitives
         set_source_files_properties(threadpool.cpp PROPERTIES COMPILE_FLAGS -march=i686)
     endif()
+    # encoder.cpp must include HM headers which are not careful about named parameters
+    set_source_files_properties(encoder.cpp PROPERTIES COMPILE_FLAGS -Wno-unused-parameter)
 endif(GCC)
 
 set(CPRIMITIVES pixel.cpp macroblock.cpp ipfilter.cpp IntraPred.cpp)
--- a/source/encoder/common.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/common.cpp	Wed May 22 10:37:50 2013 +0530
@@ -39,7 +39,7 @@ extern "C"
 void x265_param_default( x265_param_t *param )
 {
     memset(param, 0, sizeof(x265_param_t));
-    param->searchMethod = X265_UMH_SEARCH;
+    param->searchMethod = X265_ORIG_SEARCH;
     param->iSearchRange = 96;
     param->bipredSearchRange = 4;
     param->iIntraPeriod = -1; // default to open GOP
@@ -51,7 +51,7 @@ void x265_param_default( x265_param_t *p
     param->uiQuadtreeTUMaxDepthInter = 2;
     param->uiQuadtreeTUMaxDepthIntra = 1;
     param->enableAMP = 1;
-    param->enableAMPRefine = 1;
+    param->enableRectInter = 1;
     param->iQP = 30;
     param->iQPAdaptationRange = 6;
     param->bUseSAO = 1;
@@ -65,9 +65,6 @@ void x265_param_default( x265_param_t *p
     param->useStrongIntraSmoothing = 1;
     param->useRDOQ = 1;
     param->useRDOQTS = 1;
-    param->pcmLog2MaxSize = 5u;
-    param->uiPCMLog2MinSize = 3u;
-    param->bPCMInputBitDepthFlag = 1;
 }
 
 extern "C"
@@ -139,12 +136,8 @@ int x265_check_params(x265_param_t *para
         "QP exceeds supported range (-QpBDOffsety to 51)");
     CONFIRM(param->iFrameRate <= 0,
         "Frame rate must be more than 1");
-    CONFIRM(param->loopFilterBetaOffsetDiv2 < -13 || param->loopFilterBetaOffsetDiv2 > 13,
-        "Loop Filter Beta Offset div. 2 exceeds supported range (-13 to 13)");
-    CONFIRM(param->loopFilterTcOffsetDiv2 < -13 || param->loopFilterTcOffsetDiv2 > 13,
-        "Loop Filter Tc Offset div. 2 exceeds supported range (-13 to 13)");
-    CONFIRM(param->searchMethod < 0 || param->searchMethod > 3,
-        "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM)");
+    CONFIRM(param->searchMethod < 0 || param->searchMethod > X265_ORIG_SEARCH,
+        "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM 4:ORIG)");
     CONFIRM(param->iSearchRange < 0,
         "Search Range must be more than 0");
     CONFIRM(param->bipredSearchRange < 0,
@@ -202,18 +195,6 @@ int x265_check_params(x265_param_t *para
     CONFIRM(param->bUseAdaptQpSelect && (param->cbQpOffset != 0 || param->crQpOffset != 0),
         "AdaptiveQpSelection must be disabled when ChromaQpOffset is not equal to 0.");
 
-    if (param->usePCM)
-    {
-        CONFIRM(param->uiPCMLog2MinSize < 3,
-            "PCMLog2MinSize must be 3 or greater.");
-        CONFIRM(param->uiPCMLog2MinSize > 5,
-            "PCMLog2MinSize must be 5 or smaller.");
-        CONFIRM(param->pcmLog2MaxSize > 5,
-            "PCMLog2MaxSize must be 5 or smaller.");
-        CONFIRM(param->pcmLog2MaxSize < param->uiPCMLog2MinSize,
-            "PCMLog2MaxSize must be equal to or greater than m_uiPCMLog2MinSize.");
-    }
-
     //TODO:ChromaFmt assumes 4:2:0 below
     CONFIRM(param->iSourceWidth  % TComSPS::getWinUnitX(CHROMA_420) != 0,
         "Picture width must be an integer multiple of the specified chroma subsampling");
@@ -231,20 +212,6 @@ int x265_check_params(x265_param_t *para
 
     CONFIRM(param->iWaveFrontSynchro < 0, "WaveFrontSynchro cannot be negative");
 
-    if (param->RCEnableRateControl)
-    {
-        if (param->RCForceIntraQP)
-        {
-            if (param->RCInitialQP == 0)
-            {
-                printf("\nInitial QP for rate control is not specified. Reset not to use force intra QP!");
-                param->RCForceIntraQP = false;
-            }
-        }
-    }
-
-    CONFIRM(!param->TransquantBypassEnableFlag && param->CUTransquantBypassFlagValue, "CUTransquantBypassFlagValue cannot be 1 when TransquantBypassEnableFlag is 0");
-
     CONFIRM(param->log2ParallelMergeLevel < 2, "Log2ParallelMergeLevel should be larger than or equal to 2");
 
     return check_failed;
@@ -276,8 +243,8 @@ void x265_set_globals(x265_param_t *para
     g_uiPCMBitDepthLuma = g_uiPCMBitDepthChroma = 8;
 #endif
 
-    g_uiPCMBitDepthLuma = param->bPCMInputBitDepthFlag ? inputBitDepth : g_bitDepthY;
-    g_uiPCMBitDepthChroma = param->bPCMInputBitDepthFlag ? inputBitDepth : g_bitDepthC;
+    g_uiPCMBitDepthLuma = inputBitDepth;
+    g_uiPCMBitDepthChroma = inputBitDepth;
 }
 
 void x265_print_params(x265_param_t *param)
@@ -289,7 +256,6 @@ void x265_print_params(x265_param_t *par
     printf("CU size / depth              : %d / %d\n", param->uiMaxCUSize, param->uiMaxCUDepth);
     printf("RQT trans. size (min / max)  : %d / %d\n", 1 << param->uiQuadtreeTULog2MinSize, 1 << param->uiQuadtreeTULog2MaxSize);
     printf("Max RQT depth inter / intra  : %d / %d\n", param->uiQuadtreeTUMaxDepthInter, param->uiQuadtreeTUMaxDepthIntra);
-    printf("Min PCM size                 : %d\n", 1 << param->uiPCMLog2MinSize);
     printf("Motion search / range        : %s / %d\n", x265_motion_est_names[param->searchMethod], param->iSearchRange );
     printf("Max Num Merge Candidates     : %d\n", param->maxNumMergeCand);
     printf("Intra period                 : %d\n", param->iIntraPeriod);
@@ -304,16 +270,6 @@ void x265_print_params(x265_param_t *par
     {
         printf("QP adaptation                : %d (range=%d)\n", param->bUseAdaptiveQP, (param->bUseAdaptiveQP ? param->iQPAdaptationRange : 0));
     }
-    if (param->RCEnableRateControl)
-    {
-        printf("RateControl                  : %d\n", param->RCEnableRateControl);
-        printf("TargetBitrate                : %d\n", param->RCTargetBitrate);
-        printf("KeepHierarchicalBit          : %d\n", param->RCKeepHierarchicalBit);
-        printf("LCULevelRC                   : %d\n", param->RCLCULevelRC);
-        printf("UseLCUSeparateModel          : %d\n", param->RCUseLCUSeparateModel);
-        printf("InitialQP                    : %d\n", param->RCInitialQP);
-        printf("ForceIntraQP                 : %d\n", param->RCForceIntraQP);
-    }
     printf("\n");
 
     printf("TOOL CFG: ");
@@ -327,9 +283,7 @@ void x265_print_params(x265_param_t *par
     printf("TransformSkipFast:%d ", param->useTransformSkipFast);
     printf("CIP:%d ", param->bUseConstrainedIntraPred);
     printf("SAO:%d ", (param->bUseSAO) ? (1) : (0));
-    printf("PCM:%d ", (param->usePCM && (1u << param->uiPCMLog2MinSize) <= param->uiMaxCUSize) ? 1 : 0);
     printf("SAOLcuBasedOptimization:%d ", (param->saoLcuBasedOptimization) ? (1) : (0));
-    printf("LosslessCuEnabled:%d ", (param->useLossless) ? 1 : 0);
     printf("WPP:%d ", param->useWeightedPred);
     printf("WPB:%d ", param->useWeightedBiPred);
     printf("PME:%d ", param->log2ParallelMergeLevel);
@@ -338,5 +292,7 @@ void x265_print_params(x265_param_t *par
     printf("TMVPMode:%d ", param->TMVPModeId);
     printf("AQpS:%d ", param->bUseAdaptQpSelect);
     printf("SignBitHidingFlag:%d ", param->signHideFlag);
+    printf("\n\n");
     fflush(stdout);
+
 }
--- a/source/encoder/encoder.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/encoder.cpp	Wed May 22 10:37:50 2013 +0530
@@ -30,10 +30,7 @@
 #include <stdio.h>
 #include <string.h>
 
-struct x265_t : public x265::Encoder
-{
-
-};
+struct x265_t : public x265::Encoder {}; // pointer type returned by x265_encoder_open() and taken by x265_encoder_close(); adds no members of its own
 
 using namespace x265;
 
@@ -50,15 +47,7 @@ void Encoder::configure(x265_param_t *pa
     setIntraPeriod(param->iIntraPeriod);
     setQP(param->iQP);
     setUseAMP(param->enableAMP);
-    setUseAMPRefine(param->enableAMPRefine);
-
-    //====== Loop/Deblock Filter ========
-    setLoopFilterDisable(param->bLoopFilterDisable);
-    setLoopFilterOffsetInPPS(param->loopFilterOffsetInPPS);
-    setLoopFilterBetaOffset(param->loopFilterBetaOffsetDiv2);
-    setLoopFilterTcOffset(param->loopFilterTcOffsetDiv2);
-    setDeblockingFilterControlPresent(param->DeblockingFilterControlPresent);
-    setDeblockingFilterMetric(param->DeblockingFilterMetric);
+    setUseRectInter(param->enableRectInter);
 
     //====== Motion search ========
     setSearchMethod(param->searchMethod);
@@ -71,17 +60,10 @@ void Encoder::configure(x265_param_t *pa
     setChromaCrQpOffset(param->crQpOffset);
     setUseAdaptQpSelect(param->bUseAdaptQpSelect);
 
-    assert(g_bitDepthY);
-    int lowestQP = -6 * (g_bitDepthY - 8); // XXX: check
-    if ((param->iQP == lowestQP) && param->useLossless)
-    {
-        param->bUseAdaptiveQP = 0;
-    }
     setUseAdaptiveQP(param->bUseAdaptiveQP);
     setQPAdaptationRange(param->iQPAdaptationRange);
 
     //====== Coding Tools ========
-    setUseLossless(param->useLossless);
     setUseRDOQ(param->useRDOQ);
     setUseRDOQTS(param->useRDOQTS);
     setRDpenalty(param->rdPenalty);
@@ -95,16 +77,11 @@ void Encoder::configure(x265_param_t *pa
     setUseTransformSkip(param->useTransformSkip);
     setUseTransformSkipFast(param->useTransformSkipFast);
     setUseConstrainedIntraPred(param->bUseConstrainedIntraPred);
-    setPCMLog2MinSize(param->uiPCMLog2MinSize);
-    setUsePCM(param->usePCM);
-    setPCMLog2MaxSize(param->pcmLog2MaxSize);
     setMaxNumMergeCand(param->maxNumMergeCand);
     setUseSAO(param->bUseSAO);
     setMaxNumOffsetsPerPic(param->maxNumOffsetsPerPic);
     setSaoLcuBoundary(param->saoLcuBoundary);
     setSaoLcuBasedOptimization(param->saoLcuBasedOptimization);
-    setPCMInputBitDepthFlag(param->bPCMInputBitDepthFlag);
-    setPCMFilterDisableFlag(param->bPCMFilterDisableFlag);
 
     //====== Parallel Merge Estimation ========
     setLog2ParallelMergeLevelMinus2(param->log2ParallelMergeLevel - 2);
@@ -118,22 +95,9 @@ void Encoder::configure(x265_param_t *pa
     setTMVPModeId(param->TMVPModeId);
     setSignHideFlag(param->signHideFlag);
 
-    setUseRateCtrl(param->RCEnableRateControl);
-    setTargetBitrate(param->RCTargetBitrate);
-    setKeepHierBit(param->RCKeepHierarchicalBit);
-    setLCULevelRC(param->RCLCULevelRC);
-    setUseLCUSeparateModel(param->RCUseLCUSeparateModel);
-    setInitialQP(param->RCInitialQP);
-    setForceIntraQP(param->RCForceIntraQP);
-
-    setTransquantBypassEnableFlag(param->TransquantBypassEnableFlag);
-    setCUTransquantBypassFlagValue(param->CUTransquantBypassFlagValue);
     setUseStrongIntraSmoothing(param->useStrongIntraSmoothing);
 
-
     //====== Settings derived from user configuration ======
-    setFramesToBeEncoded(m_framesToBeEncoded);
-    setFrameSkip(m_FrameSkip);
     setProfile(m_profile);
     setLevel(m_levelTier, m_level);
 
@@ -144,9 +108,8 @@ void Encoder::configure(x265_param_t *pa
     {
         setNumReorderPics(m_numReorderPics[i], i);
         setMaxDecPicBuffering(m_maxDecPicBuffering[i], i);
-        setLambdaModifier(i, m_adLambdaModifier[i]);
+        setLambdaModifier(i, 1.0);
     }
-    setMinSpatialSegmentationIdc(m_minSpatialSegmentationIdc);
 
     TComVPS vps;
     vps.setMaxTLayers(m_maxTempLayer);
@@ -188,17 +151,18 @@ void Encoder::configure(x265_param_t *pa
     setUseRecalculateQPAccordingToLambda(0);
     setActiveParameterSetsSEIEnabled(0);
     setVuiParametersPresentFlag(0);
+    setMinSpatialSegmentationIdc(0);
     setAspectRatioIdc(0);
     setSarWidth(0);
     setSarHeight(0);
     setOverscanInfoPresentFlag(0);
     setOverscanAppropriateFlag(0);
     setVideoSignalTypePresentFlag(0);
-    setVideoFormat(0);
+    setVideoFormat(5);
     setVideoFullRangeFlag(0);
     setColourDescriptionPresentFlag(0);
-    setColourPrimaries(0);
-    setTransferCharacteristics(0);
+    setColourPrimaries(2);
+    setTransferCharacteristics(2);
     setMatrixCoefficients(2);
     setChromaLocInfoPresentFlag(0);
     setChromaSampleLocTypeTopField(0);
@@ -214,6 +178,32 @@ void Encoder::configure(x265_param_t *pa
     setMaxBitsPerMinCuDenom(1);
     setLog2MaxMvLengthHorizontal(15);
     setLog2MaxMvLengthVertical(15);
+
+    setUsePCM(0);
+    setPCMLog2MinSize(3);
+    setPCMLog2MaxSize(5);
+    setPCMInputBitDepthFlag(1);
+    setPCMFilterDisableFlag(0);
+
+    setUseRateCtrl(0);
+    setTargetBitrate(0);
+    setKeepHierBit(0);
+    setLCULevelRC(0);
+    setUseLCUSeparateModel(0);
+    setInitialQP(0);
+    setForceIntraQP(0);
+
+    setUseLossless(0); // x264 configures this via --qp=0
+
+    setLoopFilterDisable(0);
+    setLoopFilterOffsetInPPS(0);
+    setLoopFilterBetaOffset(0);
+    setLoopFilterTcOffset(0);
+    setDeblockingFilterControlPresent(0);
+    setDeblockingFilterMetric(0);
+
+    setTransquantBypassEnableFlag(0);
+    setCUTransquantBypassFlagValue(0);
 }
 
 extern "C"
@@ -251,6 +241,8 @@ void x265_encoder_close(x265_t *encoder)
 /*======= Everything below here will become the new x265main.cpp ==========*/
 
 #include "TLibEncoder/AnnexBwrite.h"
+#include "input/input.h"
+#include "output/output.h"
 #include <getopt.h>
 #include <list>
 #include <ostream>
@@ -263,8 +255,8 @@ struct CLIOptions
 {
     x265::Input*  input;
     x265::Output* recon;
-    char* outputFileName;               ///< output bit-stream file
     x265::ThreadPool *threadPool;
+    fstream bitstreamFile;
 
     uint32_t inputBitDepth;             ///< bit-depth of input file
     uint32_t outputBitDepth;            ///< bit-depth of output file
@@ -277,7 +269,6 @@ struct CLIOptions
     {
         input = NULL;
         recon = NULL;
-        outputFileName = NULL;
         threadPool = NULL;
         inputBitDepth = outputBitDepth = 8;
         framesToBeEncoded = frameSkip = 0;
@@ -338,9 +329,148 @@ struct CLIOptions
     }
 };
 
+#define OPT(longname, var, argreq, flag, helptext) // empty expansion: every OPT(...) row below currently compiles to nothing
+
+static const char short_options[] = "i:b:o:f:s:d:"; // NOTE(review): rows below also use 'w', 'h' and 'r' as flags, which are not listed here
+OPT("help",            help,                            no_argument,   0, "Show help text")
+OPT("cpuid",           cpuid,                     required_argument,   0, "SIMD architecture. 2:MMX2 .. 8:AVX2 (default:0-auto)")
+OPT("threads",         threadcount,               required_argument,   0, "Number of threads for thread pool (default:CPU HT core count)")
+OPT("InputFile",       inputfn,                   required_argument, 'i', "Raw YUV or Y4M input file name")
+OPT("BitstreamFile",   bitstreamfn,               required_argument, 'b', "Bitstream output file name")
+OPT("ReconFile",       reconfn,                   required_argument, 'o', "Reconstructed YUV output file name")
+
+OPT("InputBitDepth",   cliopt->inputBitDepth,     required_argument,   0, "Bit-depth of input file (default: 8)")
+OPT("OutputBitDepth",  cliopt->outputBitDepth,    required_argument,   0, "Bit-depth of output file (default:InternalBitDepth)")
+OPT("FrameSkip",       cliopt->frameSkip,         required_argument,   0, "Number of frames to skip at start of input YUV")
+OPT("frames",          cliopt->framesToBeEncoded, required_argument, 'f', "Number of frames to be encoded (default=all)")
+
+OPT("wpp",             param->iWaveFrontSynchro,        no_argument,   0, "0:no synchro 1:synchro with TR 2:TRR etc") // NOTE(review): declared no_argument although the help text describes values 0/1/2
+OPT("width",           param->iSourceWidth,       required_argument, 'w', "Source picture width")
+OPT("height",          param->iSourceHeight,      required_argument, 'h', "Source picture height")
+OPT("rate",            param->iFrameRate,         required_argument, 'r', "Frame rate")
+OPT("depth",           param->internalBitDepth,   required_argument,   0, "Bit-depth the codec operates at. (default:InputBitDepth)"
+                                                                          "If different to InputBitDepth, source data will be converted")
+OPT("ctu",             param->uiMaxCUSize,        required_argument, 's', "Maximum CU size (default: 64x64)")
+OPT("pdepth",          param->uiMaxCUDepth,       required_argument, 'd', "CU partition depth (default: 4)")
+
+OPT("constrained-intra", param->bUseConstrainedIntraPred,      no_argument, 0, "Constrained intra prediction (use only intra coded reference pixels)")
+OPT("TULog2MaxSize",   param->uiQuadtreeTULog2MaxSize,   required_argument, 0, "Maximum TU size in logarithm base 2")
+OPT("TULog2MinSize",   param->uiQuadtreeTULog2MinSize,   required_argument, 0, "Minimum TU size in logarithm base 2")
+OPT("TUMaxDepthIntra", param->uiQuadtreeTUMaxDepthIntra, required_argument, 0, "Depth of TU tree for intra CUs")
+OPT("TUMaxDepthInter", param->uiQuadtreeTUMaxDepthInter, required_argument, 0, "Depth of TU tree for inter CUs")
+OPT("keyint",          param->iIntraPeriod,              required_argument, 0, "Intra period in frames, (-1: only first frame)")
+OPT("me",              param->searchMethod,              required_argument, 0, "0:dia 1:hex 2:umh 3:tss 4:hm-orig")
+OPT("merange",         param->iSearchRange,              required_argument, 0, "Motion search range (default: 96)")
+OPT("bpredrange",      param->bipredSearchRange,         required_argument, 0, "Motion search range for bipred refinement (default:4)")
+OPT("MaxCuDQPDepth",   param->iMaxCuDQPDepth,            required_argument, 0, "Max depth for a minimum CU dQP")
+OPT("cbqpoffs",        param->cbQpOffset,                required_argument, 0, "Chroma Cb QP Offset")
+OPT("crqpoffs",        param->crQpOffset,                required_argument, 0, "Chroma Cr QP Offset")
+OPT("aqselect",        param->bUseAdaptQpSelect,               no_argument, 0, "Adaptive QP selection")
+OPT("aq",              param->bUseAdaptiveQP,                  no_argument, 0, "QP adaptation based on a psycho-visual model")
+OPT("aqrange",         param->iQPAdaptationRange,        required_argument, 0, "QP adaptation range")
+OPT("rdoq",            param->useRDOQ,                         no_argument, 0, "Use RDO quantization")
+OPT("rdoqts",          param->useRDOQTS,                       no_argument, 0, "Use RDO quantization with transform skip")
+OPT("rdpenalty",       param->rdPenalty,                 required_argument, 0, "RD-penalty for 32x32 TU for intra in non-intra slices. 0:disabled  1:RD-penalty  2:maximum RD-penalty")
+OPT("amp",             param->enableAMP,                       no_argument, 0, "Enable asymmetric motion partitions")
+OPT("rect",            param->enableRectInter,                 no_argument, 0, "Enable rectangular motion partitions Nx2N and 2NxN, disabling also disables AMP")
+OPT("tskip",           param->useTransformSkip,                no_argument, 0, "Intra transform skipping")
+OPT("tskip-fast",      param->useTransformSkipFast,            no_argument, 0, "Fast intra transform skipping")
+OPT("sao",             param->bUseSAO,                         no_argument, 0, "Enable Sample Adaptive Offset")
+OPT("max-sao-offsets", param->maxNumOffsetsPerPic,       required_argument, 0, "Max number of SAO offset per picture (Default: 2048)")
+OPT("SAOLcuBoundary",  param->saoLcuBoundary,                  no_argument, 0, "0: right/bottom LCU boundary areas skipped from SAO parameter estimation, 1: non-deblocked pixels are used for those areas")
+OPT("sao-lcu-opt",     param->saoLcuBasedOptimization,         no_argument, 0, "0: SAO picture-based optimization, 1: SAO LCU-based optimization ")
+OPT("weightp",         param->useWeightedPred,                 no_argument, 0, "Use weighted prediction in P slices")
+OPT("weightbp",        param->useWeightedBiPred,               no_argument, 0, "Use weighted (bidirectional) prediction in B slices")
+OPT("merge-level",     param->log2ParallelMergeLevel,    required_argument, 0, "Parallel merge estimation region")
+OPT("hidesign",        param->signHideFlag,                    no_argument, 0, "Hide sign bit of one coeff per TU (rdo)")
+OPT("MaxNumMergeCand", param->maxNumMergeCand,           required_argument, 0, "Maximum number of merge candidates")
+OPT("tmvp",            param->TMVPModeId,                required_argument, 0, "TMVP mode 0: TMVP disable for all slices. 1: TMVP enable for all slices (default) 2: TMVP enable for certain slices only")
+OPT("fdm",             param->useFastDecisionForMerge,         no_argument, 0, "Fast decision for Merge RD Cost")
+OPT("fast-cbf",        param->bUseCbfFastMode,                 no_argument, 0, "Cbf fast mode setting")
+OPT("early-skip",      param->useEarlySkipDetection,           no_argument, 0, "Early SKIP detection setting")
+OPT("strong-intra-smoothing", param->useStrongIntraSmoothing,  no_argument, 0, "Enable strong intra smoothing for 32x32 blocks")
+
 bool parse(int argc, char **argv, x265_param_t* param, CLIOptions* cliopt)
 {
-    return true;
+    int help = 0;
+    int cpuid = 0;
+    int threadcount = 0;
+    const char *inputfn = NULL, *reconfn = NULL, *bitstreamfn = NULL; // NOTE(review): none of help, cpuid, threadcount, inputfn, reconfn or bitstreamfn is assigned again in this function
+
+    if (argc <= 1 || help) // true is the failure result: new_main() calls exit(1) when parse() returns true
+        return true;
+
+    x265::SetupPrimitives(cpuid);
+    cliopt->threadPool = x265::ThreadPool::AllocThreadPool(threadcount);
+
+    /* if the opened input reports a width (e.g. from a y4m header), take width, height and frame rate from it and set the input depth to 8; otherwise hand the configured dimensions and bit depth to the input */
+    cliopt->input = x265::Input::Open(inputfn);
+    if (!cliopt->input || cliopt->input->isFail())
+    {
+        printf("Unable to open source file\n");
+        return true;
+    }
+    if (cliopt->input->getWidth())
+    {
+        param->iSourceWidth = cliopt->input->getWidth();
+        param->iSourceHeight = cliopt->input->getHeight();
+        param->iFrameRate = (int)cliopt->input->getRate();
+        cliopt->inputBitDepth = 8;
+    }
+    else
+    {
+        cliopt->input->setDimensions(param->iSourceWidth, param->iSourceHeight);
+        cliopt->input->setBitDepth(cliopt->inputBitDepth);
+    }
+
+    /* rules for input, output and internal bitdepths as per help text */
+    if (!param->internalBitDepth) { param->internalBitDepth = cliopt->inputBitDepth; }
+    if (!cliopt->outputBitDepth) { cliopt->outputBitDepth = param->internalBitDepth; }
+
+    uint32_t numRemainingFrames = (uint32_t)cliopt->input->guessFrameCount(); // read before skipFrames() below; TODO confirm whether skipped frames should be excluded from this count
+
+    if (cliopt->frameSkip)
+    {
+        cliopt->input->skipFrames(cliopt->frameSkip);
+    }
+
+    cliopt->framesToBeEncoded = cliopt->framesToBeEncoded ? min(cliopt->framesToBeEncoded, numRemainingFrames) : numRemainingFrames; // 0 means "all": fall back to the guessed count, otherwise cap the request at it
+
+    printf("Input File                   : %s (%d total frames)\n", inputfn, numRemainingFrames);
+
+    if (reconfn)
+    {
+        printf("Reconstruction File          : %s\n", reconfn);
+        cliopt->recon = x265::Output::Open(reconfn, param->iSourceWidth, param->iSourceHeight, cliopt->outputBitDepth, param->iFrameRate);
+        if (cliopt->recon->isFail()) // NOTE(review): recon is dereferenced without the NULL check applied to input above
+        {
+            printf("Unable to write reconstruction file\n");
+            cliopt->recon->release();
+            cliopt->recon = 0; // not fatal: no return here, parsing continues without a reconstruction output
+        }
+    }
+
+#if !HIGH_BIT_DEPTH
+    if (cliopt->inputBitDepth != 8 || cliopt->outputBitDepth != 8 || param->internalBitDepth != 8)
+    {
+        printf("x265 not compiled for bit depths greater than 8\n");
+        return true;
+    }
+#endif
+
+    printf("Bitstream File               : %s\n", bitstreamfn);
+    printf("Frame index                  : %u - %d (%d frames)\n", cliopt->frameSkip, cliopt->frameSkip + cliopt->framesToBeEncoded - 1, cliopt->framesToBeEncoded);
+
+    //    printf("GOP size                     : %d\n", m_iGOPSize);
+
+    cliopt->bitstreamFile.open(bitstreamfn, fstream::binary | fstream::out);
+    if (!cliopt->bitstreamFile)
+    {
+        fprintf(stderr, "failed to open bitstream file <%s> for writing\n", bitstreamfn);
+        return true;
+    }
+
+    return false; // success: input, optional recon output and bitstream file are all open
 }
 
 void new_main(int argc, char **argv)
@@ -353,16 +483,12 @@ void new_main(int argc, char **argv)
     if (parse(argc, argv, &param, &cliopt))
         exit(1);
 
-    fstream bitstreamFile(cliopt.outputFileName, fstream::binary | fstream::out);
-    if (!bitstreamFile)
-    {
-        fprintf(stderr, "failed to open bitstream file <%s> for writing\n", cliopt.outputFileName);
+    x265_set_globals(&param, cliopt.inputBitDepth);
+
+    if (x265_check_params(&param))
         exit(1);
-    }
 
-    TComPicYuv *pcPicYuvRec = NULL;
-    TComList<TComPicYuv *> cListPicYuvRec; ///< list of reconstructed YUV files
-    list<AccessUnit> outputAccessUnits;    ///< list of access units to write out, populated by the encoder5_t
+    x265_print_params(&param);
 
     x265_t *encoder = x265_encoder_open(&param);
     if (!encoder)
@@ -371,11 +497,15 @@ void new_main(int argc, char **argv)
         exit(1);
     }
 
+    TComList<TComPicYuv *> cListPicYuvRec; ///< list of reconstructed YUV files
+    list<AccessUnit> outputAccessUnits;    ///< list of access units to write out, populated by the encoder5_t
+
     // main encoder loop
     uint32_t iFrameRcvd = 0;
-    bool  bEos = false;
+    bool bEos = false;
     while (!bEos)
     {
+        TComPicYuv *pcPicYuvRec = NULL;
         if (cListPicYuvRec.size() == (UInt)encoder->m_iGOPSize)
         {
             pcPicYuvRec = cListPicYuvRec.popFront();
@@ -391,7 +521,7 @@ void new_main(int argc, char **argv)
 
         // read input YUV file
         x265_picture_t pic;
-        bool flush = false;
+        bool nopic = false;
         if (cliopt.input->readPicture(pic))
         {
             iFrameRcvd++;
@@ -399,15 +529,11 @@ void new_main(int argc, char **argv)
         }
         else
         {
-            flush = true;
+            nopic = true;
             bEos = true;
-            encoder->setFramesToBeEncoded(iFrameRcvd);
         }
 
-        PPAStartCpuEventFunc(encode_frame);
-        Int iNumEncoded = 0;
-        encoder->encode(bEos, flush ? 0 : &pic, cListPicYuvRec, outputAccessUnits, iNumEncoded);
-        PPAStopCpuEventFunc(encode_frame);
+        Int iNumEncoded = encoder->encode(bEos, nopic ? 0 : &pic, cListPicYuvRec, outputAccessUnits);
 
         // write bitstream to file if necessary
         if (iNumEncoded > 0)
@@ -423,21 +549,21 @@ void new_main(int argc, char **argv)
                 --iterPicYuvRec;
             }
 
-            x265_picture_t pic;
             for (i = 0; i < iNumEncoded; i++)
             {
                 if (cliopt.recon)
                 {
-                    TComPicYuv  *pcPicYuvRec  = *(iterPicYuvRec++);
-                    pic.planes[0] = pcPicYuvRec->getLumaAddr(); pic.stride[0] = pcPicYuvRec->getStride();
-                    pic.planes[1] = pcPicYuvRec->getCbAddr();   pic.stride[1] = pcPicYuvRec->getCStride();
-                    pic.planes[2] = pcPicYuvRec->getCrAddr();   pic.stride[2] = pcPicYuvRec->getCStride();
-                    pic.bitDepth = sizeof(Pel)*8;
-                    cliopt.recon->writePicture(pic);
+                    x265_picture_t rpic;
+                    TComPicYuv  *recpic  = *(iterPicYuvRec++);
+                    rpic.planes[0] = recpic->getLumaAddr(); rpic.stride[0] = recpic->getStride();
+                    rpic.planes[1] = recpic->getCbAddr();   rpic.stride[1] = recpic->getCStride();
+                    rpic.planes[2] = recpic->getCrAddr();   rpic.stride[2] = recpic->getCStride();
+                    rpic.bitDepth = sizeof(Pel)*8;
+                    cliopt.recon->writePicture(rpic);
                 }
 
                 const AccessUnit &au = *(iterBitstream++);
-                const vector<UInt>& stats = writeAnnexB(bitstreamFile, au);
+                const vector<UInt>& stats = writeAnnexB(cliopt.bitstreamFile, au);
                 cliopt.rateStatsAccum(au, stats);
             }
             outputAccessUnits.clear();
@@ -446,6 +572,7 @@ void new_main(int argc, char **argv)
     }
 
     encoder->printSummary();
+    cliopt.bitstreamFile.close();
 
     double time = (double)iFrameRcvd / param.iFrameRate;
     printf("Bytes written to file: %u (%.3f kbps)\n", cliopt.totalBytes, 0.008 * cliopt.totalBytes / time);
@@ -456,8 +583,8 @@ void new_main(int argc, char **argv)
     size_t iSize = cListPicYuvRec.size();
     for (size_t i = 0; i < iSize; i++)
     {
-        TComPicYuv *pcPicYuvRec = *(iterPicYuvRec++);
-        pcPicYuvRec->destroy();
-        delete pcPicYuvRec;
+        TComPicYuv *recpic = *(iterPicYuvRec++);
+        recpic->destroy();
+        delete recpic;
     }
 }
--- a/source/encoder/encoder.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/encoder.h	Wed May 22 10:37:50 2013 +0530
@@ -25,24 +25,15 @@
 #define __ENCODER__
 
 #include "TLibEncoder/TEncTop.h"
-
-#include "input/input.h"
-#include "output/output.h"
-#include "threadpool.h"
 #include "common.h"
 #include "x265.h"
 
-#include <list>
-#include <ostream>
-
 namespace x265 {
 // private namespace
 
 class Encoder : public TEncTop
 {
 protected:
-    x265_param_t *m_param;
-
     // profile/level
     Profile::Name m_profile;
     Level::Tier   m_levelTier;
@@ -51,21 +42,13 @@ protected:
     // coding structure
     GOPEntry  m_GOPList[MAX_GOP];               ///< the coding structure entries from the config file
     int       m_maxTempLayer;                   ///< Max temporal layer
-    double    m_adLambdaModifier[MAX_TLAYER];   ///< Lambda modifier array for each temporal layer
     int       m_numReorderPics[MAX_TLAYER];     ///< total number of reorder pictures
     int       m_maxDecPicBuffering[MAX_TLAYER]; ///< total number of pictures in the decoded picture buffer
-    int       m_minSpatialSegmentationIdc;      ///< Indicates the maximum size of the spatial segments in the pictures in the coded video sequence
-
-    // internal member functions
-    void      xSetGlobal();                     ///< set global variables
-    void      xCheckParameter();                ///< check validity of configuration values
-    void      xPrintParameter();                ///< print configuration values
-    void      xPrintUsage();                    ///< print usage
 
 public:
     int       m_iGOPSize;                       ///< GOP size of hierarchical structure
 
-    Encoder() : m_param(NULL) {};
+    Encoder() : m_profile(Profile::MAIN), m_levelTier(Level::MAIN), m_level(Level::NONE) {};
 
     virtual ~Encoder() {}
 
--- a/source/encoder/motion.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/motion.cpp	Wed May 22 10:37:50 2013 +0530
@@ -37,17 +37,21 @@
 using namespace x265;
 
 static int size_scale[NUM_PARTITIONS];
-
-#define SAD_THRESH(v) (bcost < (((v>>4) * size_scale[partEnum])))
+#define SAD_THRESH(v) (bcost < (((v >> 4) * size_scale[partEnum]))) // reads bcost and partEnum from the scope where it is expanded
 
 static void init_scales(void)
 {
-    int dims[] = {4, 8, 12, 16, 24, 32, 48, 64};
+    int dims[] = { 4, 8, 12, 16, 24, 32, 48, 64 }; // 8 sizes, so the loops below write 8 * 8 = 64 entries of size_scale
 
     int i = 0;
-    for (size_t h = 0; h < sizeof(dims)/sizeof(int); h++)
-        for (size_t w = 0; w < sizeof(dims)/sizeof(int); w++)
+
+    for (size_t h = 0; h < sizeof(dims) / sizeof(int); h++) // h is the outer index: entries are stored in h-major order
+    {
+        for (size_t w = 0; w < sizeof(dims) / sizeof(int); w++) // each entry is (dims[h] * dims[w]) >> 4
+        {
             size_scale[i++] = (dims[h] * dims[w]) >> 4;
+        }
+    }
 }
 
 void MotionEstimate::setSourcePU(int offset, int width, int height)
@@ -72,18 +76,17 @@ void MotionEstimate::setSourcePU(int off
 }
 
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const MV hex2[8] = { MV( -1, -2 ), MV( -2, 0 ), MV( -1, 2 ), MV( 1, 2 ), MV( 2, 0 ), MV( 1, -2 ), MV( -1, -2 ), MV( -2, 0 ) };
+static const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; // six distinct points; entries 6 and 7 repeat entries 0 and 1
 static const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
-static const MV square1[9] = { MV( 0, 0 ), MV( 0, -1 ), MV( 0, 1 ), MV( -1, 0 ), MV( 1, 0 ), MV( -1, -1 ), MV( -1, 1 ), MV( 1, -1 ), MV( 1, 1 ) };
+static const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) }; // centre first, then the four edge neighbours, then the four diagonals
 static const MV hex4[16] =
 {
-    MV(0,-4),  MV(0,4),  MV(-2,-3), MV(2,-3),
-    MV(-4,-2), MV(4,-2), MV(-4,-1), MV(4,-1),
-    MV(-4,0),  MV(4,0),  MV(-4,1),  MV(4,1),
+    MV(0, -4),  MV(0, 4),  MV(-2, -3), MV(2, -3),
+    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
+    MV(-4, 0),  MV(4, 0),  MV(-4, 1),  MV(4, 1),
     MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
 };
 
-
 static inline int x265_predictor_difference(const MV *mvc, intptr_t numCandidates)
 {
     int sum = 0;
@@ -97,12 +100,25 @@ static inline int x265_predictor_differe
     return sum;
 }
 
+#define COST_MV_HM(mx, my, point, dist) /* needs fref, bcost, bmv, bPointNr and bDistance in scope where it is expanded */ \
+    do \
+    { \
+        MV tmv(mx, my); \
+        int cost = fpelSad(fref, tmv) + mvcost(tmv << 2); \
+        if (cost < bcost) { /* strict less-than: an equal cost keeps the earlier best */ \
+            bcost = cost;\
+            bmv = tmv;\
+            bPointNr = point;\
+            bDistance = dist;\
+        }\
+    } while (0)
+
 #define COST_MV(mx, my) \
     do \
     { \
-        MV tmv(mx, my); \
-        int cost = fpelSad(fref, tmv) + mvcost(tmv<<2); \
-        COPY2_IF_LT(bcost, cost, bmv, tmv); \
+        MV _tmv(mx, my); /* presumably renamed so that arguments naming a caller's own tmv (the raster search passes omv.x + tmv.x) are not shadowed by this local */ \
+        int cost = fpelSad(fref, _tmv) + mvcost(_tmv << 2); \
+        COPY2_IF_LT(bcost, cost, bmv, _tmv); \
     } while (0)
 
 #define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
@@ -205,36 +221,39 @@ int MotionEstimate::motionEstimate(const
 
     /* re-measure full pel rounded MVP with SAD as search start point */
     MV bmv = pmv.roundToFPel();
-    MV omv = bmv;
     int bcost = pmv.isSubpel() ? fpelSad(fref, bmv) + mvcost(bmv << 2) : bprecost;
 
     // measure SAD cost at MV(0) if MVP is not zero
     if (pmv.notZero())
     {
         int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(0);
-        if (cost < bprecost)
+        if (cost < bcost)
         {
-            bprecost = cost;
-            bestpre = 0;
+            bcost = cost;
+            bmv = 0;
         }
     }
 
-    // measure SAD cost at each QPEL motion vector candidate
-    for (int i = 0; i < numCandidates; i++)
+    if (searchMethod != X265_HM_SEARCH)
     {
-        MV m = mvc[i].clipped(qmvmin, qmvmax);
-        if (m.notZero() && m != pmv && m != bestpre) // check already measured
+        // measure SAD cost at each QPEL motion vector candidate
+        for (int i = 0; i < numCandidates; i++)
         {
-            int cost = qpelSad(m) + mvcost(m);
-            if (cost < bprecost)
+            MV m = mvc[i].clipped(qmvmin, qmvmax);
+            if (m.notZero() && m != pmv && m != bestpre) // check already measured
             {
-                bprecost = cost;
-                bestpre = m;
+                int cost = qpelSad(m) + mvcost(m);
+                if (cost < bprecost)
+                {
+                    bprecost = cost;
+                    bestpre = m;
+                }
             }
         }
     }
 
     pmv = pmv.roundToFPel();
+    MV omv = bmv;  // current search origin or starting point
 
     switch (searchMethod)
     {
@@ -266,22 +285,23 @@ int MotionEstimate::motionEstimate(const
 me_hex2:
         /* hexagon search, radius 2 */
 #if 0
-        for (int i = 0; i < merange/2; i++)
+        for (int i = 0; i < merange / 2; i++)
         {
             omv = bmv;
-            COST_MV( omv.x-2, omv.y   );
-            COST_MV( omv.x-1, omv.y+2 );
-            COST_MV( omv.x+1, omv.y+2 );
-            COST_MV( omv.x+2, omv.y   );
-            COST_MV( omv.x+1, omv.y-2 );
-            COST_MV( omv.x-1, omv.y-2 );
-            if( omv == bmv )
+            COST_MV(omv.x - 2, omv.y);
+            COST_MV(omv.x - 1, omv.y + 2);
+            COST_MV(omv.x + 1, omv.y + 2);
+            COST_MV(omv.x + 2, omv.y);
+            COST_MV(omv.x + 1, omv.y - 2);
+            COST_MV(omv.x - 1, omv.y - 2);
+            if (omv == bmv)
                 break;
-            if(!bmv.checkRange(mvmin, mvmax))
+            if (!bmv.checkRange(mvmin, mvmax))
                 break;
         }
-#else
-        /* equivalent to the above, but eliminates duplicate candidates */
+
+#else // if 0
+      /* equivalent to the above, but eliminates duplicate candidates */
         COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
         COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs + 3);
         bcost <<= 3;
@@ -316,7 +336,7 @@ me_hex2:
             }
         }
         bcost >>= 3;
-#endif
+#endif // if 0
 
         /* square refine */
         int dir = 0;
@@ -521,6 +541,60 @@ me_hex2:
             goto me_hex2;
         break;
     }
+
+    case X265_HM_SEARCH: // extendedDiamondSearch - HM Search Algorithm
+    {
+        int bPointNr = 0;
+        int bDistance = 0;
+        int rounds = 0;
+
+        const int earlyStopRounds = 3;
+        for (int16_t dist = 1; dist <= (int16_t)merange; dist *= 2)
+        {
+            int saved = bcost;
+            ExtendedDiamondSearch(bmv, bcost, bPointNr, bDistance, dist, omv);
+
+            // Break if we go earlyStopRounds without an improved prediction
+            if (bcost < saved)
+                rounds = 0;
+            else if (++rounds >= earlyStopRounds)
+                break;
+        }
+        if (bDistance == 1)
+        {
+            // if best distance was only 1, check two missing points
+            TwoPointSearch(bmv, bcost, bPointNr);
+            break;
+        }
+
+        const int rasterDistance = 5;
+        if (bDistance > rasterDistance)
+        {
+            // raster search refinement if distance was too big
+            MV tmv;
+            for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += rasterDistance)
+                for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += rasterDistance)
+                    COST_MV(omv.x + tmv.x, omv.y + tmv.y); // TODO: use sad_x4 here
+        }
+
+        while (bDistance > 0)
+        {
+            // center a new search around current best
+            omv = bmv;
+            bDistance = 0;
+            bPointNr = 0;
+            for (int16_t dist = 1; dist <= (int16_t)merange; dist *= 2)
+                ExtendedDiamondSearch(bmv, bcost, bPointNr, bDistance, dist, omv);
+
+            if (bDistance == 1)
+            {
+                TwoPointSearch(bmv, bcost, bPointNr);
+                break;
+            }
+        }
+    }
+    break;
+
     default:
         assert(0);
         break;
@@ -537,25 +611,26 @@ me_hex2:
 
     /* HPEL refinement followed by QPEL refinement */
 
+    omv = bmv;
     bcost <<= 4;
     int16_t res = 2;
     do
     {
         for (int iter = 0; iter < 2; iter++)
         {
-            MV mv = bmv + MV(0, -res);
+            MV mv = omv + MV(0, -res);
             int cost = qpelSatd(mv) + mvcost(mv);
             COPY1_IF_LT(bcost, (cost << 4) + 1);
 
-            mv = bmv + MV(0,  res);
+            mv = omv + MV(0,  res);
             cost = qpelSatd(mv) + mvcost(mv);
             COPY1_IF_LT(bcost, (cost << 4) + 3);
 
-            mv = bmv + MV(-res, 0);
+            mv = omv + MV(-res, 0);
             cost = qpelSatd(mv) + mvcost(mv);
             COPY1_IF_LT(bcost, (cost << 4) + 4);
 
-            mv = bmv + MV(res,  0);
+            mv = omv + MV(res,  0);
             cost = qpelSatd(mv) + mvcost(mv);
             COPY1_IF_LT(bcost, (cost << 4) + 12);
 
@@ -563,6 +638,7 @@ me_hex2:
             {
                 bmv.x -= res * ((bcost << 28) >> 30);
                 bmv.y -= res * ((bcost << 30) >> 30);
+                omv = bmv;
                 bcost &= ~15;
             }
         }
@@ -575,3 +651,298 @@ me_hex2:
     outQMv = bmv;
     return bcost >> 4;
 }
+
+void MotionEstimate::ExtendedDiamondSearch(MV &bmv, int &bcost, int &bPointNr, int &bDistance, int16_t dist, const MV& omv)
+{
+    pixel *fref = ref->lumaPlane[0][0] + blockOffset;
+
+    if (dist == 1)
+    {
+        const int16_t iTop    = omv.y - dist;
+        const int16_t iBottom = omv.y + dist;
+        const int16_t iLeft   = omv.x - dist;
+        const int16_t iRight  = omv.x + dist;
+
+        if (iTop >= mvmin.y) // check top
+        {
+            COST_MV_HM(omv.x, iTop, 2, dist);
+        }
+        if (iLeft >= mvmin.x) // check middle left
+        {
+            COST_MV_HM(iLeft, omv.y, 4, dist);
+        }
+        if (iRight <= mvmax.x) // check middle right
+        {
+            COST_MV_HM(iRight, omv.y, 5, dist);
+        }
+        if (iBottom <= mvmax.y) // check bottom
+        {
+            COST_MV_HM(omv.x, iBottom, 7, dist);
+        }
+    }
+    else if (dist <= 8)
+    {
+        const int16_t iTop      = omv.y - dist;
+        const int16_t iBottom   = omv.y + dist;
+        const int16_t iLeft     = omv.x - dist;
+        const int16_t iRight    = omv.x + dist;
+        const int16_t iTop_2    = omv.y - (dist >> 1);
+        const int16_t iBottom_2 = omv.y + (dist >> 1);
+        const int16_t iLeft_2   = omv.x - (dist >> 1);
+        const int16_t iRight_2  = omv.x + (dist >> 1);
+
+        if (iTop >= mvmin.y && iLeft >= mvmin.x &&
+            iRight <= mvmax.x && iBottom <= mvmax.y) // check border
+        {
+            // TODO: Use sad_x4 here
+            COST_MV_HM(omv.x, iTop, 2, dist);
+            COST_MV_HM(iLeft_2, iTop_2, 1, dist >> 1);
+            COST_MV_HM(iRight_2, iTop_2, 3, dist >> 1);
+            COST_MV_HM(iLeft, omv.y, 4, dist);
+
+            // TODO: Use sad_x4 here
+            COST_MV_HM(iRight, omv.y, 5, dist);
+            COST_MV_HM(iLeft_2, iBottom_2, 6, dist >> 1);
+            COST_MV_HM(iRight_2, iBottom_2, 8, dist >> 1);
+            COST_MV_HM(omv.x, iBottom, 7, dist);
+        }
+        else // check border for each mv
+        {
+            if (iTop >= mvmin.y) // check top
+            {
+                COST_MV_HM(omv.x, iTop, 2, dist);
+            }
+            if (iTop_2 >= mvmin.y) // check half top
+            {
+                if (iLeft_2 >= mvmin.x) // check half left
+                {
+                    COST_MV_HM(iLeft_2, iTop_2, 1, (dist >> 1));
+                }
+                if (iRight_2 <= mvmax.x) // check half right
+                {
+                    COST_MV_HM(iRight_2, iTop_2, 3, (dist >> 1));
+                }
+            } // check half top
+            if (iLeft >= mvmin.x) // check left
+            {
+                COST_MV_HM(iLeft, omv.y, 4, dist);
+            }
+            if (iRight <= mvmax.x) // check right
+            {
+                COST_MV_HM(iRight, omv.y, 5, dist);
+            }
+            if (iBottom_2 <= mvmax.y) // check half bottom
+            {
+                if (iLeft_2 >= mvmin.x) // check half left
+                {
+                    COST_MV_HM(iLeft_2, iBottom_2, 6, (dist >> 1));
+                }
+                if (iRight_2 <= mvmax.x) // check half right
+                {
+                    COST_MV_HM(iRight_2, iBottom_2, 8, (dist >> 1));
+                }
+            } // check half bottom
+            if (iBottom <= mvmax.y) // check bottom
+            {
+                COST_MV_HM(omv.x, iBottom, 7, dist);
+            }
+        } // check border for each mv
+    }
+    else
+    {
+        const int16_t iTop    = omv.y - dist;
+        const int16_t iBottom = omv.y + dist;
+        const int16_t iLeft   = omv.x - dist;
+        const int16_t iRight  = omv.x + dist;
+
+        if (iTop >= mvmin.y && iLeft >= mvmin.x &&
+            iRight <= mvmax.x && iBottom <= mvmax.y) // check border
+        {
+            // TODO: Use sad_x4
+            COST_MV_HM(omv.x, iTop, 0, dist);
+            COST_MV_HM(iLeft, omv.y, 0, dist);
+            COST_MV_HM(iRight, omv.y, 0, dist);
+            COST_MV_HM(omv.x, iBottom, 0, dist);
+            for (int16_t index = 1; index < 4; index++)
+            {
+                int16_t iPosYT = iTop    + ((dist >> 2) * index);
+                int16_t iPosYB = iBottom - ((dist >> 2) * index);
+                int16_t iPosXL = omv.x - ((dist >> 2) * index);
+                int16_t iPosXR = omv.x + ((dist >> 2) * index);
+                // TODO: Use sad_x4
+                COST_MV_HM(iPosXL, iPosYT, 0, dist);
+                COST_MV_HM(iPosXR, iPosYT, 0, dist);
+                COST_MV_HM(iPosXL, iPosYB, 0, dist);
+                COST_MV_HM(iPosXR, iPosYB, 0, dist);
+            }
+        }
+        else // check border for each mv
+        {
+            if (iTop >= mvmin.y) // check top
+            {
+                COST_MV_HM(omv.x, iTop, 0, dist);
+            }
+            if (iLeft >= mvmin.x) // check left
+            {
+                COST_MV_HM(iLeft, omv.y, 0, dist);
+            }
+            if (iRight <= mvmax.x) // check right
+            {
+                COST_MV_HM(iRight, omv.y, 0, dist);
+            }
+            if (iBottom <= mvmax.y) // check bottom
+            {
+                COST_MV_HM(omv.x, iBottom, 0, dist);
+            }
+            for (int16_t index = 1; index < 4; index++)
+            {
+                int16_t iPosYT = iTop    + ((dist >> 2) * index);
+                int16_t iPosYB = iBottom - ((dist >> 2) * index);
+                int16_t iPosXL = omv.x - ((dist >> 2) * index);
+                int16_t iPosXR = omv.x + ((dist >> 2) * index);
+
+                if (iPosYT >= mvmin.y) // check top
+                {
+                    if (iPosXL >= mvmin.x) // check left
+                    {
+                        COST_MV_HM(iPosXL, iPosYT, 0, dist);
+                    }
+                    if (iPosXR <= mvmax.x) // check right
+                    {
+                        COST_MV_HM(iPosXR, iPosYT, 0, dist);
+                    }
+                } // check top
+                if (iPosYB <= mvmax.y) // check bottom
+                {
+                    if (iPosXL >= mvmin.x) // check left
+                    {
+                        COST_MV_HM(iPosXL, iPosYB, 0, dist);
+                    }
+                    if (iPosXR <= mvmax.x) // check right
+                    {
+                        COST_MV_HM(iPosXR, iPosYB, 0, dist);
+                    }
+                } // check bottom
+            } // for ...
+        } // check border for each mv
+    } // dist > 8
+}
+
+void MotionEstimate::TwoPointSearch(MV &bmv, int &bcost, int bPointNr)
+{
+    pixel *fref = ref->lumaPlane[0][0] + blockOffset;
+    MV omv = bmv;
+
+    /* TODO: turn into a table lookup with per-point offset pairs */
+    switch (bPointNr)
+    {
+    case 1:
+    {
+        if ((omv.x - 1) >= mvmin.x)
+        {
+            COST_MV(omv.x - 1, omv.y);
+        }
+        if ((omv.y - 1) >= mvmin.y)
+        {
+            COST_MV(omv.x, omv.y - 1);
+        }
+    }
+    break;
+    case 2:
+    {
+        if ((omv.y - 1) >= mvmin.y)
+        {
+            if ((omv.x - 1) >= mvmin.x)
+            {
+                COST_MV(omv.x - 1, omv.y - 1);
+            }
+            if ((omv.x + 1) <= mvmax.x)
+            {
+                COST_MV(omv.x + 1, omv.y - 1);
+            }
+        }
+    }
+    break;
+    case 3:
+    {
+        if ((omv.y - 1) >= mvmin.y)
+        {
+            COST_MV(omv.x, omv.y - 1);
+        }
+        if ((omv.x + 1) <= mvmax.x)
+        {
+            COST_MV(omv.x + 1, omv.y);
+        }
+    }
+    break;
+    case 4:
+    {
+        if ((omv.x - 1) >= mvmin.x)
+        {
+            if ((omv.y + 1) <= mvmax.y)
+            {
+                COST_MV(omv.x - 1, omv.y + 1);
+            }
+            if ((omv.y - 1) >= mvmin.y)
+            {
+                COST_MV(omv.x - 1, omv.y - 1);
+            }
+        }
+    }
+    break;
+    case 5:
+    {
+        if ((omv.x + 1) <= mvmax.x)
+        {
+            if ((omv.y - 1) >= mvmin.y)
+            {
+                COST_MV(omv.x + 1, omv.y - 1);
+            }
+            if ((omv.y + 1) <= mvmax.y)
+            {
+                COST_MV(omv.x + 1, omv.y + 1);
+            }
+        }
+    }
+    break;
+    case 6:
+    {
+        if ((omv.x - 1) >= mvmin.x)
+        {
+            COST_MV(omv.x - 1, omv.y);
+        }
+        if ((omv.y + 1) <= mvmax.y)
+        {
+            COST_MV(omv.x, omv.y + 1);
+        }
+    }
+    break;
+    case 7:
+    {
+        if ((omv.y + 1) <= mvmax.y)
+        {
+            if ((omv.x - 1) >= mvmin.x)
+            {
+                COST_MV(omv.x - 1, omv.y + 1);
+            }
+            if ((omv.x + 1) <= mvmax.x)
+            {
+                COST_MV(omv.x + 1, omv.y + 1);
+            }
+        }
+    }
+    break;
+    case 8:
+    {
+        if ((omv.x + 1) <= mvmax.x)
+        {
+            COST_MV(omv.x + 1, omv.y);
+        }
+        if ((omv.y + 1) <= mvmax.y)
+        {
+            COST_MV(omv.x, omv.y + 1);
+        }
+    }
+    break;
+    }
+}
--- a/source/encoder/motion.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/motion.h	Wed May 22 10:37:50 2013 +0530
@@ -42,7 +42,7 @@ struct MotionReference
 
 protected:
 
-    MotionReference& operator=(const MotionReference&);
+    MotionReference& operator =(const MotionReference&);
 };
 
 class MotionEstimate : public BitCost
@@ -80,7 +80,7 @@ protected:
     int partEnum;
     int searchMethod;
 
-    MotionEstimate& operator=(const MotionEstimate&);
+    MotionEstimate& operator =(const MotionEstimate&);
 
 public:
 
@@ -120,6 +120,10 @@ public:
 
 protected:
 
+    /* HM Motion Search */
+    void ExtendedDiamondSearch(MV &bmv, int &bcost, int &bPointNr, int &bDistance, int16_t dist, const MV& omv);
+    void TwoPointSearch(MV &bmv, int &bcost, int bPointNr);
+
     /* Helper functions for motionEstimate.  fref is coincident block in reference frame */
     inline int fpelSad(pixel *fref, const MV& fmv)
     {
@@ -132,6 +136,7 @@ protected:
     {
         MV fmv = qmv >> 2;
         pixel *qfref = ref->lumaPlane[qmv.x & 3][qmv.y & 3] + blockOffset;
+
         return sad(fenc, FENC_STRIDE,
                    qfref + fmv.y * ref->lumaStride + fmv.x,
                    ref->lumaStride);
@@ -141,6 +146,7 @@ protected:
     {
         MV fmv = qmv >> 2;
         pixel *qfref = ref->lumaPlane[qmv.x & 3][qmv.y & 3] + blockOffset;
+
         return satd(fenc, FENC_STRIDE,
                     qfref + fmv.y * ref->lumaStride + fmv.x,
                     ref->lumaStride);
--- a/source/encoder/mv.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/mv.h	Wed May 22 10:37:50 2013 +0530
@@ -38,6 +38,7 @@ public:
 
     union {
         struct { int16_t x, y; };
+
         int32_t word;
     };
 
@@ -84,11 +85,13 @@ public:
     bool inline isSubpel() const               { return (this->word & 0x00030003) != 0; }
 
     MV mvmin(const MV& m) const                { return MV(x > m.x ? m.x : x, y > m.y ? m.y : y); }
+
     MV mvmax(const MV& m) const                { return MV(x < m.x ? m.x : x, y < m.y ? m.y : y); }
 
     MV clipped(const MV& _min, const MV& _max) const
     {
         MV cl = mvmin(_max);
+
         return cl.mvmax(_min);
     }
 
--- a/source/encoder/vec/macroblock.inc	Tue May 21 15:33:13 2013 +0530
+++ b/source/encoder/vec/macroblock.inc	Wed May 22 10:37:50 2013 +0530
@@ -212,6 +212,8 @@ void CDECL partialButterfly16(short *src
     }
 }
 
+#if 0    //partialButterfly32 vector code
+
 void CDECL partialButterfly32(short *src, short *dst, int shift, int line)
 {
     int j;
@@ -489,6 +491,8 @@ void CDECL partialButterfly32(short *src
     }
 }
 
+#endif  //partialButterfly32 vector code
+
 void CDECL partialButterfly8(short *src, short *dst, int shift, int line)
 {
     int j;
@@ -593,7 +597,7 @@ void CDECL partialButterfly4(short *src,
 
 #endif  //partialButterfly4 vector code
 
-#if 0     // partialButterfly32 intrinsic code
+#if 1   //partialButterfly32 intrinsic code
 
 void CDECL partialButterfly32(short *src, short *dst, int nshift, int line)
 {
@@ -1321,11 +1325,6 @@ void CDECL partialButterfly32(short *src
 
 void CDECL partialButterfly4(short *src, short *dst, int nshift, int /* line */)
 {
-    //typedef uint32_t UInt32;
-    //typedef uint64_t UInt64;
-    //line = 4;
-    nshift = 2 - 1;
-
     // Const
     __m128i c_1         = _mm_set1_epi32(1);
     __m128i c16_64_64   = _mm_set1_epi32(0x00400040);
@@ -1338,15 +1337,6 @@ void CDECL partialButterfly4(short *src,
     __m128i c32_64_n64   = _mm_set_epi32(-64, 64, -64, 64);
     __m128i c32_36_n83   = _mm_set_epi32(-83, 36, -83, 36);
 
-    // _mm_cvtsi64_si128 not supported on 32bit build
-
-    /*
-      __m128i T20  = _mm_cvtsi64_si128(*(UInt64*)&src[0 * line]);   // [03 02 01 00]
-      __m128i T21  = _mm_cvtsi64_si128(*(UInt64*)&src[1 * line]);   // [13 12 11 10]
-      __m128i T22 = _mm_cvtsi64_si128(*(UInt64*)&src[2 * line]);    // [23 22 21 20]
-      __m128i T23 = _mm_cvtsi64_si128(*(UInt64*)&src[3 * line]);    // [33 32 31 30]
-    */
-
     __m128i T20  = _mm_loadl_epi64((const __m128i*)(src + 0)); // [03 02 01 00]
     __m128i T21  = _mm_loadl_epi64((const __m128i*)(src + 4)); // [13 12 11 10]
     __m128i T22  = _mm_loadl_epi64((const __m128i*)(src + 8)); // [23 22 21 20]
@@ -1411,8 +1401,8 @@ void CDECL partialButterfly4(short *src,
     Coeff_0_3 = _mm_srai_epi32(Coeff_0_3, nshift);
 
     //Co-effs 4-7
-    __m128i O_0_3 = _mm_mullo_epi32(O0123, c32_83_36); //	[O0*36 O1*83 O2*36 O3*83]
-    __m128i O_4_7 = _mm_mullo_epi32(O4567, c32_83_36); //	[O4*36 O5*83 O6*36 O7*83]
+    __m128i O_0_3 = _mm_mullo_epi32(O0123, c32_83_36); // [O0*36 O1*83 O2*36 O3*83]
+    __m128i O_4_7 = _mm_mullo_epi32(O4567, c32_83_36); // [O4*36 O5*83 O6*36 O7*83]
     __m128i Coeff_4_7 = _mm_hadd_epi32(O_0_3, O_4_7);
     Coeff_4_7 = _mm_add_epi32(Coeff_4_7, c32_128);
     Coeff_4_7 = _mm_srai_epi32(Coeff_4_7, nshift);
@@ -1422,15 +1412,15 @@ void CDECL partialButterfly4(short *src,
     _mm_store_si128((__m128i*)dst, Coeff_0_7);
 
     //Co-effs 8-11
-    __m128i E_8_11  = _mm_mullo_epi32(E0123, c32_64_n64); //	[ E0*-64 E1*64 E2*-64 E3*64]
-    __m128i E_12_15 = _mm_mullo_epi32(E4567, c32_64_n64); //	[ E4*-64 E5*64 E6*-64 E7*64]
+    __m128i E_8_11  = _mm_mullo_epi32(E0123, c32_64_n64); // [ E0*-64 E1*64 E2*-64 E3*64]
+    __m128i E_12_15 = _mm_mullo_epi32(E4567, c32_64_n64); // [ E4*-64 E5*64 E6*-64 E7*64]
     __m128i Coeff_8_11 = _mm_hadd_epi32(E_8_11, E_12_15);
     Coeff_8_11 = _mm_add_epi32(Coeff_8_11, c32_128);
     Coeff_8_11 = _mm_srai_epi32(Coeff_8_11, nshift);
 
     //Co-effs 12-15
-    __m128i O_8_11  = _mm_mullo_epi32(O0123, c32_36_n83); //	[O0*-83 O1*36 O2*-83 O3*36]
-    __m128i O_12_15 = _mm_mullo_epi32(O4567, c32_36_n83); //	[O4*-83 O5*36 O6*-83 O7*36]
+    __m128i O_8_11  = _mm_mullo_epi32(O0123, c32_36_n83); // [O0*-83 O1*36 O2*-83 O3*36]
+    __m128i O_12_15 = _mm_mullo_epi32(O4567, c32_36_n83); // [O4*-83 O5*36 O6*-83 O7*36]
     __m128i Coeff_12_15 = _mm_hadd_epi32(O_8_11, O_12_15);
     Coeff_12_15 = _mm_add_epi32(Coeff_12_15, c32_128);
     Coeff_12_15 = _mm_srai_epi32(Coeff_12_15, nshift);
@@ -1440,6 +1430,8 @@ void CDECL partialButterfly4(short *src,
     _mm_store_si128((__m128i*)(dst + 8), Coeff_8_15);
 }
 
+#if 0 // partialButterflyInverse4 vector code
+
 void CDECL partialButterflyInverse4(short *src, short *dst, int shift, int line)
 {
     int j;
@@ -1479,6 +1471,66 @@ void CDECL partialButterflyInverse4(shor
     }
 }
 
+#endif  // partialButterflyInverse4 vector code
+
+#if 1 // partialButterflyInverse4 intrinsic code
+
+void CDECL partialButterflyInverse4(short *src, short *dst, int shift, int line)
+{
+    int j;
+    int add = 1 << (shift - 1);
+    __m128i c_add = _mm_set1_epi32(add);
+
+    for (j = 0; j < (line / 2); j++)
+    {
+        int src_line = src[line];
+        int src_line3 = src[3 * line];
+        int src_line2_shift = (src[2 * line] << 6);
+        int src_zero_shift = (src[0] << 6);
+
+        int O_first_value = 83 * src_line + 36 * src_line3;
+        int O_second_value = 36 * src_line - 83 * src_line3;
+        int E_first_value = src_zero_shift + src_line2_shift;
+        int E_second_value = src_zero_shift - src_line2_shift;
+
+        int first_value = E_first_value + O_first_value;
+        int second_value = E_second_value + O_second_value;
+        int third_value = E_second_value - O_second_value;
+        int fourth_value = E_first_value - O_first_value;
+
+        __m128i sum_diff_value = _mm_set_epi32(fourth_value, third_value, second_value, first_value);
+        __m128i dst_third = _mm_srai_epi32(_mm_add_epi32(c_add, sum_diff_value), shift);
+
+        src++;
+
+        src_line = src[line];
+        src_line3 = src[3 * line];
+        src_line2_shift = (src[2 * line] << 6);
+        src_zero_shift = (src[0] << 6);
+
+        O_first_value = 83 * src_line + 36 * src_line3;
+        O_second_value = 36 * src_line - 83 * src_line3;
+        E_first_value = src_zero_shift + src_line2_shift;
+        E_second_value = src_zero_shift - src_line2_shift;
+
+        first_value = E_first_value + O_first_value;
+        second_value = E_second_value + O_second_value;
+        third_value = E_second_value - O_second_value;
+        fourth_value = E_first_value - O_first_value;
+
+        sum_diff_value = _mm_set_epi32(fourth_value, third_value, second_value, first_value);
+        __m128i dst_third1 = _mm_srai_epi32(_mm_add_epi32(c_add, sum_diff_value), shift);
+
+        __m128i dst_tmp_final = _mm_packs_epi32(dst_third, dst_third1);
+        _mm_store_si128((__m128i*)(dst), dst_tmp_final);
+
+        src++;
+        dst += 8;
+    }
+}
+
+#endif  // partialButterflyInverse4 intrinsic code
+
 void CDECL partialButterflyInverse8(short *src, short *dst, int shift, int line)
 {
     int j;
--- a/source/x265.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/x265.h	Wed May 22 10:37:50 2013 +0530
@@ -64,17 +64,20 @@ typedef enum
     X265_DIA_SEARCH,
     X265_HEX_SEARCH,
     X265_UMH_SEARCH,
-    X265_HM_SEARCH,  // adapted HM fast-ME method
+    X265_HM_SEARCH,    // adapted HM fast-ME method
+    X265_ORIG_SEARCH,  // original HM functions (deprecated)
 }
 X265_ME_METHODS;
 
-static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "hm", 0 };
+static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "hm", "orig", 0 };
 
 typedef struct x265_param_t
 {
     // coding tools (bit-depth)
     int       internalBitDepth;                 ///< bit-depth codec operates at
 
+    int       iWaveFrontSynchro;                ///< 0: no WPP. >= 1: WPP is enabled, the "Top right" from which inheritance occurs is this LCU offset in the line above the current.
+
     // source specification
     int       iFrameRate;                       ///< source frame-rates (Hz)
     int       iSourceWidth;                     ///< source width in pixel
@@ -97,7 +100,7 @@ typedef struct x265_param_t
     int       useTransformSkip;                 ///< flag for enabling intra transform skipping
     int       useTransformSkipFast;             ///< flag for enabling fast intra transform skipping
     int       enableAMP;                        ///< flag for enabling asymmetrical motion predictions
-    int       enableAMPRefine;                  ///< mis-named, disables rectangular modes 2NxN, Nx2N
+    int       enableRectInter;                  ///< flag for enabling rectangular modes 2NxN, Nx2N
 
     // coding quality
     int       iQP;                              ///< QP value of key-picture (integer)
@@ -109,30 +112,12 @@ typedef struct x265_param_t
     int       bUseAdaptiveQP;                   ///< Flag for enabling QP adaptation based on a psycho-visual model
     int       iQPAdaptationRange;               ///< dQP range by QP adaptation
 
-    // coding tools (PCM bit-depth)
-    int       bPCMInputBitDepthFlag;            ///< 0: PCM bit-depth is internal bit-depth. 1: PCM bit-depth is input bit-depth.
-
     // coding tool (lossless)
-    int       useLossless;                      ///< flag for using lossless coding
     int       bUseSAO;                          ///< Enable SAO filter
     int       maxNumOffsetsPerPic;              ///< SAO maximum number of offset per picture
     int       saoLcuBoundary;                   ///< SAO parameter estimation using non-deblocked pixels for LCU bottom and right boundary areas
     int       saoLcuBasedOptimization;          ///< SAO LCU-based optimization
 
-    // coding tools (loop filter)
-    int       bLoopFilterDisable;               ///< flag for using deblocking filter
-    int       loopFilterOffsetInPPS;            ///< offset for deblocking filter in 0 = slice header, 1 = PPS
-    int       loopFilterBetaOffsetDiv2;         ///< beta offset for deblocking filter
-    int       loopFilterTcOffsetDiv2;           ///< tc offset for deblocking filter
-    int       DeblockingFilterControlPresent;   ///< deblocking filter control present flag in PPS
-    int       DeblockingFilterMetric;           ///< blockiness metric in encoder
-
-    // coding tools (PCM)
-    int       usePCM;                           ///< flag for using IPCM
-    uint32_t  pcmLog2MaxSize;                   ///< log2 of maximum PCM block size
-    uint32_t  uiPCMLog2MinSize;                 ///< log2 of minimum PCM block size
-    int       bPCMFilterDisableFlag;            ///< PCM filter disable flag
-
     // coding tools
     int       useRDOQ;                          ///< flag for using RD optimized quantization
     int       useRDOQTS;                        ///< flag for using RD optimized quantization for transform skip
@@ -145,9 +130,6 @@ typedef struct x265_param_t
     int       searchMethod;                     ///< ME search method (DIA, HEX, UMH, HM)
     int       iSearchRange;                     ///< ME search range
     int       bipredSearchRange;                ///< ME search range for bipred refinement
-
-    int       iWaveFrontSynchro;                ///< 0: no WPP. >= 1: WPP is enabled, the "Top right" from which inheritance occurs is this LCU offset in the line above the current.
-
     int       bUseConstrainedIntraPred;         ///< flag for using constrained intra prediction
 
     // weighted prediction
@@ -158,18 +140,6 @@ typedef struct x265_param_t
     uint32_t  maxNumMergeCand;                  ///< Max number of merge candidates
 
     int       TMVPModeId;                       ///< TMVP mode 0: TMVP disabled for all slices. 1: TMVP enabled for all slices (default) 2: TMVP enabled for certain slices only
-
-    int       RCEnableRateControl;              ///< enable rate control or not
-    int       RCTargetBitrate;                  ///< target bitrate when rate control is enabled
-    int       RCKeepHierarchicalBit;            ///< whether keeping hierarchical bit allocation structure or not
-    int       RCLCULevelRC;                     ///< true: LCU level rate control; false: picture level rate control
-    int       RCUseLCUSeparateModel;            ///< use separate R-lambda model at LCU level
-    int       RCInitialQP;                      ///< inital QP for rate control
-    int       RCForceIntraQP;                   ///< force all intra picture to use initial QP or not
-
-    int       TransquantBypassEnableFlag;       ///< transquant_bypass_enable_flag setting in PPS.
-    int       CUTransquantBypassFlagValue;      ///< if transquant_bypass_enable_flag, the fixed value to use for the per-CU cu_transquant_bypass_flag.
-
     int       useStrongIntraSmoothing;          ///< enable strong intra smoothing for 32x32 blocks where the reference samples are flat
 }
 x265_param_t;
--- a/source/x265cfg.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/x265cfg.cpp	Wed May 22 10:37:50 2013 +0530
@@ -290,11 +290,12 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
     ("QuadtreeTUMaxDepthInter", uiQuadtreeTUMaxDepthInter, 2u, "Depth of TU tree for inter CUs")
 
     // Coding structure parameters
-    ("IntraPeriod,-ip",         iIntraPeriod,                -1, "Intra period in frames, (-1: only first frame)")
-    ("DecodingRefreshType,-dr", m_iDecodingRefreshType,       0, "Intra refresh type (0:none 1:CRA 2:IDR)")
-    ("GOPSize,g",               m_iGOPSize,                   1, "GOP size of temporal structure")
+    ("IntraPeriod,-ip",         iIntraPeriod,              -1, "Intra period in frames, (-1: only first frame)")
+    ("DecodingRefreshType,-dr", m_iDecodingRefreshType,     0, "Intra refresh type (0:none 1:CRA 2:IDR)")
+    ("GOPSize,g",               m_iGOPSize,                 1, "GOP size of temporal structure")
+
     // motion options
-    ("SearchMethod,-me",        searchMethod,               3, "0:DIA 1:HEX 2:UMH 3: UMH")
+    ("SearchMethod,-me",        searchMethod,               3, "0:DIA 1:HEX 2:UMH 3:HM 4:ORIG")
     ("SearchRange,-sr",         iSearchRange,              96, "Motion search range")
     ("BipredSearchRange",       bipredSearchRange,          4, "Motion search range for bipred refinement")
     ("HadamardME",              m_bUseHADME,                1, "Hadamard ME for fractional-pel")
@@ -327,16 +328,16 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
     ("RDpenalty",                     rdPenalty,                0,  "RD-penalty for 32x32 TU for intra in non-intra slices. 0:disbaled  1:RD-penalty  2:maximum RD-penalty")
 
     // Deblocking filter parameters
-    ("LoopFilterDisable",              bLoopFilterDisable,             0)
-    ("LoopFilterOffsetInPPS",          loopFilterOffsetInPPS,          0)
-    ("LoopFilterBetaOffset_div2",      loopFilterBetaOffsetDiv2,       0)
-    ("LoopFilterTcOffset_div2",        loopFilterTcOffsetDiv2,         0)
-    ("DeblockingFilterControlPresent", DeblockingFilterControlPresent, 0)
-    ("DeblockingFilterMetric",         DeblockingFilterMetric,         0)
+    ("LoopFilterDisable",              m_bLoopFilterDisable,             0)
+    ("LoopFilterOffsetInPPS",          m_loopFilterOffsetInPPS,          0)
+    ("LoopFilterBetaOffset_div2",      m_loopFilterBetaOffsetDiv2,       0)
+    ("LoopFilterTcOffset_div2",        m_loopFilterTcOffsetDiv2,         0)
+    ("DeblockingFilterControlPresent", m_DeblockingFilterControlPresent, 0)
+    ("DeblockingFilterMetric",         m_DeblockingFilterMetric,         0)
 
     // Coding tools
     ("AMP",                      enableAMP,                 1,  "Enable asymmetric motion partitions")
-    ("AMP_REFINE",               enableAMPRefine,           1,  "Enable asymmetric refinement motion partitions like include Nx2N and 2NxN")
+    ("RectInter",                enableRectInter,           1,  "Enable rectangular motion partitions Nx2N and 2NxN, disabling also disables AMP")
     ("TransformSkip",            useTransformSkip,          0,  "Intra transform skipping")
     ("TransformSkipFast",        useTransformSkipFast,      0,  "Fast intra transform skipping")
     ("SAO",                      bUseSAO,                   1,  "Enable Sample Adaptive Offset")
@@ -346,13 +347,12 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
 
     ("ConstrainedIntraPred",     bUseConstrainedIntraPred,  0, "Constrained Intra Prediction")
 
-    ("PCMEnabledFlag",           usePCM,                    0)
-    ("PCMLog2MaxSize",           pcmLog2MaxSize,            5u)
-    ("PCMLog2MinSize",           uiPCMLog2MinSize,          3u)
-    ("PCMInputBitDepthFlag",     bPCMInputBitDepthFlag,     1)
-    ("PCMFilterDisableFlag",     bPCMFilterDisableFlag,     0)
-
-    ("LosslessCuEnabled",        useLossless,               0)
+    ("PCMEnabledFlag",           m_usePCM,                    0)
+    ("PCMLog2MaxSize",           m_pcmLog2MaxSize,            5)
+    ("PCMLog2MinSize",           m_uiPCMLog2MinSize,          3)
+    ("PCMInputBitDepthFlag",     m_bPCMInputBitDepthFlag,     1)
+    ("PCMFilterDisableFlag",     m_bPCMFilterDisableFlag,     0)
+    ("LosslessCuEnabled",        m_useLossless,               0)
 
     ("WeightedPredP,-wpP",       useWeightedPred,               0,          "Use weighted prediction in P slices")
     ("WeightedPredB,-wpB",       useWeightedBiPred,             0,          "Use weighted (bidirectional) prediction in B slices")
@@ -374,18 +374,18 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
     ("FDM", useFastDecisionForMerge, 1, "Fast decision for Merge RD Cost")
     ("CFM", bUseCbfFastMode, 0, "Cbf fast mode setting")
     ("ESD", useEarlySkipDetection, 0, "Early SKIP detection setting")
+    ("StrongIntraSmoothing,-sis",      useStrongIntraSmoothing,            1, "Enable strong intra smoothing for 32x32 blocks")
 
-    ("RateControl",         RCEnableRateControl,       0, "Rate control: enable rate control")
-    ("TargetBitrate",       RCTargetBitrate,           0, "Rate control: target bitrate")
-    ("KeepHierarchicalBit", RCKeepHierarchicalBit,     0, "Rate control: keep hierarchical bit allocation in rate control algorithm")
-    ("LCULevelRateControl", RCLCULevelRC,              1, "Rate control: true: LCU level RC; false: picture level RC")
-    ("RCLCUSeparateModel",  RCUseLCUSeparateModel,     1, "Rate control: use LCU level separate R-lambda model")
-    ("InitialQP",           RCInitialQP,               0, "Rate control: initial QP")
-    ("RCForceIntraQP",      RCForceIntraQP,            0, "Rate control: force intra QP to be equal to initial QP")
+    ("RateControl",         m_RCEnableRateControl,       0, "Rate control: enable rate control")
+    ("TargetBitrate",       m_RCTargetBitrate,           0, "Rate control: target bitrate")
+    ("KeepHierarchicalBit", m_RCKeepHierarchicalBit,     0, "Rate control: keep hierarchical bit allocation in rate control algorithm")
+    ("LCULevelRateControl", m_RCLCULevelRC,              1, "Rate control: true: LCU level RC; false: picture level RC")
+    ("RCLCUSeparateModel",  m_RCUseLCUSeparateModel,     1, "Rate control: use LCU level separate R-lambda model")
+    ("InitialQP",           m_RCInitialQP,               0, "Rate control: initial QP")
+    ("RCForceIntraQP",      m_RCForceIntraQP,            0, "Rate control: force intra QP to be equal to initial QP")
 
-    ("TransquantBypassEnableFlag",     TransquantBypassEnableFlag,         0, "transquant_bypass_enable_flag indicator in PPS")
-    ("CUTransquantBypassFlagValue",    CUTransquantBypassFlagValue,        0, "Fixed cu_transquant_bypass_flag value, when transquant_bypass_enable_flag is enabled")
-    ("StrongIntraSmoothing,-sis",      useStrongIntraSmoothing,            1, "Enable strong intra smoothing for 32x32 blocks")
+    ("TransquantBypassEnableFlag",     m_TransquantBypassEnableFlag,         0, "transquant_bypass_enable_flag indicator in PPS")
+    ("CUTransquantBypassFlagValue",    m_CUTransquantBypassFlagValue,        0, "Fixed cu_transquant_bypass_flag value, when transquant_bypass_enable_flag is enabled")
     ("RecalculateQPAccordingToLambda", m_recalculateQPAccordingToLambda,     0, "Recalculate QP values according to lambda values. Do not suggest to be enabled in all intra case")
 
     ("SEIActiveParameterSets",         m_activeParameterSetsSEIEnabled,      0, "Enable generation of active parameter sets SEI messages")
@@ -582,8 +582,6 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
     printf("Frame index                  : %u - %d (%d frames)\n", m_FrameSkip, m_FrameSkip + m_framesToBeEncoded - 1, m_framesToBeEncoded);
     printf("GOP size                     : %d\n", m_iGOPSize);
     x265_print_params(this);
-    printf("\n\n");
-    fflush(stdout);
 
     return true;
 }
@@ -655,12 +653,12 @@ Void TAppEncCfg::xCheckParameter()
         }
     }
 
-    if ((iIntraPeriod != 1) && !loopFilterOffsetInPPS && DeblockingFilterControlPresent && (!bLoopFilterDisable))
+    if ((iIntraPeriod != 1) && !m_loopFilterOffsetInPPS && m_DeblockingFilterControlPresent && (!m_bLoopFilterDisable))
     {
         for (Int i = 0; i < m_iGOPSize; i++)
         {
-            xConfirmPara((m_GOPList[i].m_betaOffsetDiv2 + loopFilterBetaOffsetDiv2) < -6 || (m_GOPList[i].m_betaOffsetDiv2 + loopFilterBetaOffsetDiv2) > 6, "Loop Filter Beta Offset div. 2 for one of the GOP entries exceeds supported range (-6 to 6)");
-            xConfirmPara((m_GOPList[i].m_tcOffsetDiv2 + loopFilterTcOffsetDiv2) < -6 || (m_GOPList[i].m_tcOffsetDiv2 + loopFilterTcOffsetDiv2) > 6, "Loop Filter Tc Offset div. 2 for one of the GOP entries exceeds supported range (-6 to 6)");
+            xConfirmPara((m_GOPList[i].m_betaOffsetDiv2 + m_loopFilterBetaOffsetDiv2) < -6 || (m_GOPList[i].m_betaOffsetDiv2 + m_loopFilterBetaOffsetDiv2) > 6, "Loop Filter Beta Offset div. 2 for one of the GOP entries exceeds supported range (-6 to 6)");
+            xConfirmPara((m_GOPList[i].m_tcOffsetDiv2 + m_loopFilterTcOffsetDiv2) < -6 || (m_GOPList[i].m_tcOffsetDiv2 + m_loopFilterTcOffsetDiv2) > 6, "Loop Filter Tc Offset div. 2 for one of the GOP entries exceeds supported range (-6 to 6)");
         }
     }
     m_extraRPSs = 0;
--- a/source/x265cfg.h	Tue May 21 15:33:13 2013 +0530
+++ b/source/x265cfg.h	Wed May 22 10:37:50 2013 +0530
@@ -121,6 +121,28 @@ protected:
     int       m_SOPDescriptionSEIEnabled;
     int       m_scalableNestingSEIEnabled;
 
+    int m_bLoopFilterDisable;
+    int m_loopFilterOffsetInPPS;
+    int m_loopFilterBetaOffsetDiv2;
+    int m_loopFilterTcOffsetDiv2;
+    int m_DeblockingFilterControlPresent;
+    int m_DeblockingFilterMetric;
+    int m_useLossless;
+    int m_uiPCMLog2MinSize;
+    int m_usePCM;
+    int m_pcmLog2MaxSize;
+    int m_bPCMInputBitDepthFlag;
+    int m_bPCMFilterDisableFlag;
+    int m_RCEnableRateControl;
+    int m_RCTargetBitrate;
+    int m_RCKeepHierarchicalBit;
+    int m_RCLCULevelRC;
+    int m_RCUseLCUSeparateModel;
+    int m_RCInitialQP;
+    int m_RCForceIntraQP;
+    int m_TransquantBypassEnableFlag;
+    int m_CUTransquantBypassFlagValue;
+
     int       m_vuiParametersPresentFlag;         ///< enable generation of VUI parameters
     int       m_aspectRatioInfoPresentFlag;       ///< Signals whether aspect_ratio_idc is present
     int       m_aspectRatioIdc;                   ///< aspect_ratio_idc
--- a/source/x265enc.cpp	Tue May 21 15:33:13 2013 +0530
+++ b/source/x265enc.cpp	Wed May 22 10:37:50 2013 +0530
@@ -97,14 +97,15 @@ Void TAppEncTop::xInitLibCfg()
     m_cTEncTop.setIntraPeriod(iIntraPeriod);
     m_cTEncTop.setQP(iQP);
     m_cTEncTop.setUseAMP(enableAMP);
-    m_cTEncTop.setUseAMPRefine(enableAMPRefine);
+    m_cTEncTop.setUseRectInter(enableRectInter);
+
     //====== Loop/Deblock Filter ========
-    m_cTEncTop.setLoopFilterDisable(bLoopFilterDisable);
-    m_cTEncTop.setLoopFilterOffsetInPPS(loopFilterOffsetInPPS);
-    m_cTEncTop.setLoopFilterBetaOffset(loopFilterBetaOffsetDiv2);
-    m_cTEncTop.setLoopFilterTcOffset(loopFilterTcOffsetDiv2);
-    m_cTEncTop.setDeblockingFilterControlPresent(DeblockingFilterControlPresent);
-    m_cTEncTop.setDeblockingFilterMetric(DeblockingFilterMetric);
+    m_cTEncTop.setLoopFilterDisable(m_bLoopFilterDisable);
+    m_cTEncTop.setLoopFilterOffsetInPPS(m_loopFilterOffsetInPPS);
+    m_cTEncTop.setLoopFilterBetaOffset(m_loopFilterBetaOffsetDiv2);
+    m_cTEncTop.setLoopFilterTcOffset(m_loopFilterTcOffsetDiv2);
+    m_cTEncTop.setDeblockingFilterControlPresent(m_DeblockingFilterControlPresent);
+    m_cTEncTop.setDeblockingFilterMetric(m_DeblockingFilterMetric);
 
     //====== Motion search ========
     m_cTEncTop.setSearchMethod(searchMethod);
@@ -118,7 +119,7 @@ Void TAppEncTop::xInitLibCfg()
 
     m_cTEncTop.setUseAdaptQpSelect(bUseAdaptQpSelect);
     Int lowestQP = -6 * (g_bitDepthY - 8); // XXX: check
-    if ((iQP == lowestQP) && useLossless)
+    if ((iQP == lowestQP) && m_useLossless)
     {
         bUseAdaptiveQP = 0;
     }
@@ -137,9 +138,6 @@ Void TAppEncTop::xInitLibCfg()
     m_cTEncTop.setUseTransformSkip(useTransformSkip);
     m_cTEncTop.setUseTransformSkipFast(useTransformSkipFast);
     m_cTEncTop.setUseConstrainedIntraPred(bUseConstrainedIntraPred);
-    m_cTEncTop.setPCMLog2MinSize(uiPCMLog2MinSize);
-    m_cTEncTop.setUsePCM(usePCM);
-    m_cTEncTop.setPCMLog2MaxSize(pcmLog2MaxSize);
     m_cTEncTop.setMaxNumMergeCand(maxNumMergeCand);
 
     //====== Weighted Prediction ========
@@ -154,26 +152,11 @@ Void TAppEncTop::xInitLibCfg()
 
     m_cTEncTop.setSaoLcuBoundary(saoLcuBoundary);
     m_cTEncTop.setSaoLcuBasedOptimization(saoLcuBasedOptimization);
-    m_cTEncTop.setPCMInputBitDepthFlag(bPCMInputBitDepthFlag);
-    m_cTEncTop.setPCMFilterDisableFlag(bPCMFilterDisableFlag);
     m_cTEncTop.setWaveFrontSynchro(iWaveFrontSynchro);
     m_cTEncTop.setTMVPModeId(TMVPModeId);
     m_cTEncTop.setSignHideFlag(signHideFlag);
-    m_cTEncTop.setUseRateCtrl(RCEnableRateControl);
-    m_cTEncTop.setTargetBitrate(RCTargetBitrate);
-    m_cTEncTop.setKeepHierBit(RCKeepHierarchicalBit);
-    m_cTEncTop.setLCULevelRC(RCLCULevelRC);
-    m_cTEncTop.setUseLCUSeparateModel(RCUseLCUSeparateModel);
-    m_cTEncTop.setInitialQP(RCInitialQP);
-    m_cTEncTop.setForceIntraQP(RCForceIntraQP);
-    m_cTEncTop.setTransquantBypassEnableFlag(TransquantBypassEnableFlag);
-    m_cTEncTop.setCUTransquantBypassFlagValue(CUTransquantBypassFlagValue);
-    m_cTEncTop.setUseStrongIntraSmoothing(useStrongIntraSmoothing);
-    m_cTEncTop.setUseLossless(useLossless);
 
-    m_cTEncTop.setFrameSkip(m_FrameSkip);
     m_cTEncTop.setConformanceWindow(0, 0, 0, 0);
-    m_cTEncTop.setFramesToBeEncoded(m_framesToBeEncoded);
     int nullpad[2] = { 0, 0 };
     m_cTEncTop.setPad(nullpad);
 
@@ -193,9 +176,26 @@ Void TAppEncTop::xInitLibCfg()
         m_cTEncTop.setLambdaModifier(uiLoop, m_adLambdaModifier[uiLoop]);
     }
     m_cTEncTop.setMaxTempLayer(m_maxTempLayer);
-
+    m_cTEncTop.setUseStrongIntraSmoothing(useStrongIntraSmoothing);
 
     //====== Tool list ========
+    m_cTEncTop.setPCMInputBitDepthFlag(m_bPCMInputBitDepthFlag);
+    m_cTEncTop.setPCMFilterDisableFlag(m_bPCMFilterDisableFlag);
+    m_cTEncTop.setUseRateCtrl(m_RCEnableRateControl);
+    m_cTEncTop.setTargetBitrate(m_RCTargetBitrate);
+    m_cTEncTop.setKeepHierBit(m_RCKeepHierarchicalBit);
+    m_cTEncTop.setLCULevelRC(m_RCLCULevelRC);
+    m_cTEncTop.setUseLCUSeparateModel(m_RCUseLCUSeparateModel);
+    m_cTEncTop.setInitialQP(m_RCInitialQP);
+    m_cTEncTop.setForceIntraQP(m_RCForceIntraQP);
+    m_cTEncTop.setTransquantBypassEnableFlag(m_TransquantBypassEnableFlag);
+    m_cTEncTop.setCUTransquantBypassFlagValue(m_CUTransquantBypassFlagValue);
+    m_cTEncTop.setUseLossless(m_useLossless);
+
+    m_cTEncTop.setUsePCM(m_usePCM);
+    m_cTEncTop.setPCMLog2MinSize(m_uiPCMLog2MinSize);
+    m_cTEncTop.setPCMLog2MaxSize(m_pcmLog2MaxSize);
+
     m_cTEncTop.setUseASR(m_bUseASR);
     m_cTEncTop.setUseHADME(m_bUseHADME);
     m_cTEncTop.setdQPs(m_aidQP);
@@ -294,7 +294,6 @@ Void TAppEncTop::encode()
     xInitLib();
 
     // main encoder loop
-    Int   iNumEncoded = 0;
     Bool  bEos = false;
 
     list<AccessUnit> outputAccessUnits; ///< list of access units to write out.  is populated by the encoding process
@@ -309,7 +308,7 @@ Void TAppEncTop::encode()
 
         // read input YUV file
         x265_picture_t pic;
-        Bool flush = false;
+        Bool nopic = false;
         if (m_input->readPicture(pic))
         {
             m_iFrameRcvd++;
@@ -317,23 +316,18 @@ Void TAppEncTop::encode()
         }
         else
         {
-            flush = true;
+            nopic = true;
             bEos = true;
-            m_cTEncTop.setFramesToBeEncoded(m_iFrameRcvd);
         }
 
         // call encoding function for one frame
-        PPAStartCpuEventFunc(encode_frame);
-        m_cTEncTop.encode(bEos, flush ? 0 : &pic, m_cListPicYuvRec, outputAccessUnits, iNumEncoded);
-        PPAStopCpuEventFunc(encode_frame);
+        int iNumEncoded = m_cTEncTop.encode(bEos, nopic ? 0 : &pic, m_cListPicYuvRec, outputAccessUnits);
 
         // write bistream to file if necessary
         if (iNumEncoded > 0)
         {
-            PPAStartCpuEventFunc(bitstream_write);
             xWriteOutput(bitstreamFile, iNumEncoded, outputAccessUnits);
             outputAccessUnits.clear();
-            PPAStopCpuEventFunc(bitstream_write);
         }
     }
 
@@ -402,6 +396,7 @@ Void TAppEncTop::xDeleteBuffer()
  */
 Void TAppEncTop::xWriteOutput(std::ostream &bitstreamFile, Int iNumEncoded, const std::list<AccessUnit>& accessUnits)
 {
+    PPAStartCpuEventFunc(bitstream_write);
     Int i;
 
     TComList<TComPicYuv *>::iterator iterPicYuvRec = m_cListPicYuvRec.end();
@@ -429,6 +424,7 @@ Void TAppEncTop::xWriteOutput(std::ostre
         const vector<UInt>& stats = writeAnnexB(bitstreamFile, au);
         rateStatsAccum(au, stats);
     }
+    PPAStopCpuEventFunc(bitstream_write);
 }
 
 /**