changeset 2254:65768d985016

Merge
author Deepthi Devaki
date Thu, 13 Jun 2013 12:35:25 +0530
parents 2c0ecc7b043d (current diff) c8b90c296a0b (diff)
children 95b415adeffa 15956ceaf16a
files source/tools/HM decoder/TAppDecoder.exe
diffstat 13 files changed, 390 insertions(+-), 529 deletions(-) [+]
line wrap: on
line diff
--- a/.hgtags	Thu Jun 13 12:33:23 2013 +0530
+++ b/.hgtags	Thu Jun 13 12:35:25 2013 +0530
@@ -1,5 +1,2 @@
-681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD
-681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD
-d60578bec82edc058f79bba2f934de950f2c4325 LASTKNOWNGOOD
-d60578bec82edc058f79bba2f934de950f2c4325 LASTKNOWNGOOD
+681eabf8a086faea6141f9c1f5a72c9897ed8b29 LASTKNOWNGOOD1
 3ec4837e6f6c7159f438e1f537dff117c93ee139 LASTKNOWNGOOD
--- a/source/Lib/TLibCommon/TComPic.cpp	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibCommon/TComPic.cpp	Thu Jun 13 12:35:25 2013 +0530
@@ -146,8 +146,7 @@ Void TComPic::compressMotion()
  * \param numTiles number of tiles in picture
  * \param bNDBFilterCrossTileBoundary cross-tile-boundary in-loop filtering; true for "cross".
  */
-Void TComPic::createNonDBFilterInfo(std::vector<Int> sliceStartAddress, Int sliceGranularityDepth
-                                    , Bool bNDBFilterCrossTileBoundary)
+Void TComPic::createNonDBFilterInfo(std::vector<Int> sliceStartAddress, Int sliceGranularityDepth, Bool bNDBFilterCrossTileBoundary)
 {
     UInt maxNumSUInLCU = getNumPartInCU();
     UInt numLCUInPic   = getNumCUsInFrame();
--- a/source/Lib/TLibCommon/TComPic.h	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibCommon/TComPic.h	Thu Jun 13 12:35:25 2013 +0530
@@ -178,8 +178,7 @@ public:
 
     Window&       getDefDisplayWindow()   { return m_defaultDisplayWindow; }
 
-    Void          createNonDBFilterInfo(std::vector<Int> sliceStartAddress, Int sliceGranularityDepth
-                                        , Bool bNDBFilterCrossTileBoundary = true);
+    Void          createNonDBFilterInfo(std::vector<Int> sliceStartAddress, Int sliceGranularityDepth, Bool bNDBFilterCrossTileBoundary = true);
     Void          createNonDBFilterInfoLCU(Int sliceID, TComDataCU* pcCU, UInt startSU, UInt endSU, Int sliceGranularyDepth, UInt picWidth, UInt picHeight);
     Void          destroyNonDBFilterInfo();
 
--- a/source/Lib/TLibCommon/TComPicYuvMD5.cpp	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibCommon/TComPicYuvMD5.cpp	Thu Jun 13 12:35:25 2013 +0530
@@ -141,7 +141,7 @@ void calcCRC(TComPicYuv& pic, UChar dige
 
     width >>= 1;
     height >>= 1;
-    stride >>= 1;
+    stride = pic.getCStride();
 
     compCRC(g_bitDepthC, pic.getCbAddr(), width, height, stride, digest[1]);
     compCRC(g_bitDepthC, pic.getCrAddr(), width, height, stride, digest[2]);
@@ -185,7 +185,7 @@ void calcChecksum(TComPicYuv& pic, UChar
 
     width >>= 1;
     height >>= 1;
-    stride >>= 1;
+    stride = pic.getCStride();
 
     compChecksum(g_bitDepthC, pic.getCbAddr(), width, height, stride, digest[1]);
     compChecksum(g_bitDepthC, pic.getCrAddr(), width, height, stride, digest[2]);
@@ -216,7 +216,7 @@ void calcMD5(TComPicYuv& pic, UChar dige
     md5_plane_func = g_bitDepthC <= 8 ? (MD5PlaneFunc)md5_plane<1> : (MD5PlaneFunc)md5_plane<2>;
     width >>= 1;
     height >>= 1;
-    stride >>= 1;
+    stride = pic.getCStride();
 
     md5_plane_func(md5U, pic.getCbAddr(), width, height, stride);
     md5U.finalize(digest[1]);
--- a/source/Lib/TLibCommon/TComSlice.h	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibCommon/TComSlice.h	Thu Jun 13 12:35:25 2013 +0530
@@ -1435,29 +1435,29 @@ public:
     virtual ~TComSlice();
     Void      initSlice();
 
-    Void      setVPS(TComVPS* pcVPS) { m_pcVPS = pcVPS; }
-
-    TComVPS*  getVPS() { return m_pcVPS; }
-
-    Void      setSPS(TComSPS* pcSPS) { m_pcSPS = pcSPS; }
-
-    TComSPS*  getSPS() { return m_pcSPS; }
-
-    Void      setPPS(TComPPS* pcPPS)         { assert(pcPPS != NULL); m_pcPPS = pcPPS; m_iPPSId = pcPPS->getPPSId(); }
-
-    TComPPS*  getPPS() { return m_pcPPS; }
-
-    Void      setPPSId(Int PPSId)         { m_iPPSId = PPSId; }
-
-    Int       getPPSId() { return m_iPPSId; }
-
-    Void      setPicOutputFlag(Bool b)         { m_PicOutputFlag = b;    }
-
-    Bool      getPicOutputFlag()                 { return m_PicOutputFlag; }
-
-    Void      setSaoEnabledFlag(Bool s) { m_saoEnabledFlag = s; }
-
-    Bool      getSaoEnabledFlag() { return m_saoEnabledFlag; }
+    Void      setVPS(TComVPS* pcVPS)          { m_pcVPS = pcVPS; }
+
+    TComVPS*  getVPS()                        { return m_pcVPS; }
+
+    Void      setSPS(TComSPS* pcSPS)          { m_pcSPS = pcSPS; }
+
+    TComSPS*  getSPS()                        { return m_pcSPS; }
+
+    Void      setPPS(TComPPS* pcPPS)          { assert(pcPPS != NULL); m_pcPPS = pcPPS; m_iPPSId = pcPPS->getPPSId(); }
+
+    TComPPS*  getPPS()                        { return m_pcPPS; }
+
+    Void      setPPSId(Int PPSId)             { m_iPPSId = PPSId; }
+
+    Int       getPPSId()                      { return m_iPPSId; }
+
+    Void      setPicOutputFlag(Bool b)        { m_PicOutputFlag = b; }
+
+    Bool      getPicOutputFlag()              { return m_PicOutputFlag; }
+
+    Void      setSaoEnabledFlag(Bool s)       { m_saoEnabledFlag = s; }
+
+    Bool      getSaoEnabledFlag()             { return m_saoEnabledFlag; }
 
     Void      setSaoEnabledFlagChroma(Bool s) { m_saoEnabledFlagChroma = s; }   //!< set SAO Cb&Cr enabled flag
 
@@ -1465,130 +1465,131 @@ public:
 
     Void      setRPS(TComReferencePictureSet *pcRPS) { m_pcRPS = pcRPS; }
 
-    TComReferencePictureSet*  getRPS() { return m_pcRPS; }
-
-    TComReferencePictureSet*  getLocalRPS() { return &m_LocalRPS; }
-
-    Void      setRPSidx(Int iBDidx) { m_iBDidx = iBDidx; }
-
-    Int       getRPSidx() { return m_iBDidx; }
-
-    Int       getPrevPOC()                          { return m_prevPOC;       }
+    TComReferencePictureSet*  getRPS()            { return m_pcRPS; }
+
+    TComReferencePictureSet*  getLocalRPS()       { return &m_LocalRPS; }
+
+    Void      setRPSidx(Int iBDidx)               { m_iBDidx = iBDidx; }
+
+    Int       getRPSidx()                         { return m_iBDidx; }
+
+    Int       getPrevPOC()                        { return m_prevPOC; }
 
     TComRefPicListModification* getRefPicListModification() { return &m_RefPicListModification; }
 
-    Void      setLastIDR(Int iIDRPOC)                       { m_iLastIDR = iIDRPOC; }
-
-    Int       getLastIDR()                                  { return m_iLastIDR; }
-
-    SliceType getSliceType()                          { return m_eSliceType;         }
-
-    Int       getPOC()                          { return m_iPOC;           }
-
-    Int       getSliceQp()                          { return m_iSliceQp;           }
-
-    Bool      getDependentSliceSegmentFlag() const        { return m_dependentSliceSegmentFlag; }
-
-    void      setDependentSliceSegmentFlag(Bool val)      { m_dependentSliceSegmentFlag = val; }
-
-    Int       getSliceQpBase()                          { return m_iSliceQpBase;       }
-
-    Int       getSliceQpDelta()                          { return m_iSliceQpDelta;      }
-
-    Int       getSliceQpDeltaCb()                          { return m_iSliceQpDeltaCb;      }
-
-    Int       getSliceQpDeltaCr()                          { return m_iSliceQpDeltaCr;      }
-
-    Bool      getDeblockingFilterDisable()                { return m_deblockingFilterDisable; }
-
-    Bool      getDeblockingFilterOverrideFlag()           { return m_deblockingFilterOverrideFlag; }
-
-    Int       getDeblockingFilterBetaOffsetDiv2()         { return m_deblockingFilterBetaOffsetDiv2; }
-
-    Int       getDeblockingFilterTcOffsetDiv2()           { return m_deblockingFilterTcOffsetDiv2; }
-
-    Int       getNumRefIdx(RefPicList e)                { return m_aiNumRefIdx[e];             }
-
-    TComPic*  getPic()                              { return m_pcPic;                      }
-
-    TComPic*  getRefPic(RefPicList e, Int iRefIdx)    { return m_apcRefPicList[e][iRefIdx];  }
-
-    Int       getRefPOC(RefPicList e, Int iRefIdx)    { return m_aiRefPOCList[e][iRefIdx];   }
-
-    Int       getDepth()                              { return m_iDepth;                     }
-
-    UInt      getColFromL0Flag()                              { return m_colFromL0Flag;              }
-
-    UInt      getColRefIdx()                              { return m_colRefIdx;                  }
+    Void      setLastIDR(Int iIDRPOC)             { m_iLastIDR = iIDRPOC; }
+
+    Int       getLastIDR()                        { return m_iLastIDR; }
+
+    SliceType getSliceType()                      { return m_eSliceType; }
+
+    Int       getPOC()                            { return m_iPOC; }
+
+    Int       getSliceQp()                        { return m_iSliceQp; }
+
+    Bool      getDependentSliceSegmentFlag() const  { return m_dependentSliceSegmentFlag; }
+
+    void      setDependentSliceSegmentFlag(Bool val) { m_dependentSliceSegmentFlag = val; }
+
+    Int       getSliceQpBase()                    { return m_iSliceQpBase;       }
+
+    Int       getSliceQpDelta()                   { return m_iSliceQpDelta;      }
+
+    Int       getSliceQpDeltaCb()                 { return m_iSliceQpDeltaCb;      }
+
+    Int       getSliceQpDeltaCr()                 { return m_iSliceQpDeltaCr;      }
+
+    Bool      getDeblockingFilterDisable()        { return m_deblockingFilterDisable; }
+
+    Bool      getDeblockingFilterOverrideFlag()   { return m_deblockingFilterOverrideFlag; }
+
+    Int       getDeblockingFilterBetaOffsetDiv2() { return m_deblockingFilterBetaOffsetDiv2; }
+
+    Int       getDeblockingFilterTcOffsetDiv2()   { return m_deblockingFilterTcOffsetDiv2; }
+
+    Int       getNumRefIdx(RefPicList e)          { return m_aiNumRefIdx[e];             }
+
+    TComPic*  getPic()                            { return m_pcPic;                      }
+
+    TComPic*  getRefPic(RefPicList e, Int iRefIdx){ return m_apcRefPicList[e][iRefIdx];  }
+
+    Int       getRefPOC(RefPicList e, Int iRefIdx){ return m_aiRefPOCList[e][iRefIdx];   }
+
+    Int       getDepth()                          { return m_iDepth;                     }
+
+    UInt      getColFromL0Flag()                  { return m_colFromL0Flag;              }
+
+    UInt      getColRefIdx()                      { return m_colRefIdx;                  }
 
     Void      checkColRefIdx(UInt curSliceIdx, TComPic* pic);
-    Bool      getIsUsedAsLongTerm(Int i, Int j)                  { return m_bIsUsedAsLongTerm[i][j]; }
-
-    Bool      getCheckLDC()                                  { return m_bCheckLDC; }
-
-    Bool      getMvdL1ZeroFlag()                                  { return m_bLMvdL1Zero;    }
+
+    Bool      getIsUsedAsLongTerm(Int i, Int j)   { return m_bIsUsedAsLongTerm[i][j]; }
+
+    Bool      getCheckLDC()                       { return m_bCheckLDC; }
+
+    Bool      getMvdL1ZeroFlag()                  { return m_bLMvdL1Zero; }
 
     Int       getNumRpsCurrTempList();
-    Int       getList1IdxToList0Idx(Int list1Idx)               { return m_list1IdxToList0Idx[list1Idx]; }
-
-    Void      setReferenced(Bool b)                               { m_bRefenced = b; }
-
-    Bool      isReferenced()                                      { return m_bRefenced; }
-
-    Void      setPOC(Int i)                       { m_iPOC              = i; if (getTLayer() == 0) m_prevPOC = i; }
-
-    Void      setNalUnitType(NalUnitType e)               { m_eNalUnitType      = e;      }
-
-    NalUnitType getNalUnitType() const                        { return m_eNalUnitType;        }
+    Int       getList1IdxToList0Idx(Int list1Idx) { return m_list1IdxToList0Idx[list1Idx]; }
+
+    Void      setReferenced(Bool b)            { m_bRefenced = b; }
+
+    Bool      isReferenced()                   { return m_bRefenced; }
+
+    Void      setPOC(Int i)                    { m_iPOC = i; if (getTLayer() == 0) m_prevPOC = i; }
+
+    Void      setNalUnitType(NalUnitType e)    { m_eNalUnitType = e;           }
+
+    NalUnitType getNalUnitType() const         { return m_eNalUnitType;        }
 
     Bool      getRapPicFlag();
-    Bool      getIdrPicFlag()                              { return getNalUnitType() == NAL_UNIT_CODED_SLICE_IDR_W_RADL || getNalUnitType() == NAL_UNIT_CODED_SLICE_IDR_N_LP; }
-
-    Bool      isIRAP() const                        { return (getNalUnitType() >= 16) && (getNalUnitType() <= 23); }
+    Bool      getIdrPicFlag()                  { return getNalUnitType() == NAL_UNIT_CODED_SLICE_IDR_W_RADL || getNalUnitType() == NAL_UNIT_CODED_SLICE_IDR_N_LP; }
+
+    Bool      isIRAP() const                   { return (getNalUnitType() >= 16) && (getNalUnitType() <= 23); }
 
     Void      checkCRA(TComReferencePictureSet *pReferencePictureSet, Int& pocCRA, Bool& prevRAPisBLA, TComList<TComPic *>& rcListPic);
     Void      decodingRefreshMarking(Int& pocCRA, Bool& bRefreshPending, TComList<TComPic*>& rcListPic);
-    Void      setSliceType(SliceType e)                 { m_eSliceType        = e;      }
+    Void      setSliceType(SliceType e)               { m_eSliceType        = e;      }
 
     Void      setSliceQp(Int i)                       { m_iSliceQp          = i;      }
 
-    Void      setSliceQpBase(Int i)                       { m_iSliceQpBase      = i;      }
-
-    Void      setSliceQpDelta(Int i)                       { m_iSliceQpDelta     = i;      }
-
-    Void      setSliceQpDeltaCb(Int i)                       { m_iSliceQpDeltaCb   = i;      }
-
-    Void      setSliceQpDeltaCr(Int i)                       { m_iSliceQpDeltaCr   = i;      }
-
-    Void      setDeblockingFilterDisable(Bool b)                { m_deblockingFilterDisable = b;      }
-
-    Void      setDeblockingFilterOverrideFlag(Bool b)           { m_deblockingFilterOverrideFlag = b; }
-
-    Void      setDeblockingFilterBetaOffsetDiv2(Int i)          { m_deblockingFilterBetaOffsetDiv2 = i; }
-
-    Void      setDeblockingFilterTcOffsetDiv2(Int i)            { m_deblockingFilterTcOffsetDiv2 = i; }
+    Void      setSliceQpBase(Int i)                   { m_iSliceQpBase      = i;      }
+
+    Void      setSliceQpDelta(Int i)                  { m_iSliceQpDelta     = i;      }
+
+    Void      setSliceQpDeltaCb(Int i)                { m_iSliceQpDeltaCb   = i;      }
+
+    Void      setSliceQpDeltaCr(Int i)                { m_iSliceQpDeltaCr   = i;      }
+
+    Void      setDeblockingFilterDisable(Bool b)      { m_deblockingFilterDisable = b; }
+
+    Void      setDeblockingFilterOverrideFlag(Bool b) { m_deblockingFilterOverrideFlag = b; }
+
+    Void      setDeblockingFilterBetaOffsetDiv2(Int i) { m_deblockingFilterBetaOffsetDiv2 = i; }
+
+    Void      setDeblockingFilterTcOffsetDiv2(Int i)   { m_deblockingFilterTcOffsetDiv2 = i; }
 
     Void      setRefPic(TComPic* p, RefPicList e, Int iRefIdx) { m_apcRefPicList[e][iRefIdx] = p; }
 
     Void      setRefPOC(Int i, RefPicList e, Int iRefIdx) { m_aiRefPOCList[e][iRefIdx] = i; }
 
-    Void      setNumRefIdx(RefPicList e, Int i)         { m_aiNumRefIdx[e]    = i;      }
+    Void      setNumRefIdx(RefPicList e, Int i)   { m_aiNumRefIdx[e]    = i;      }
 
     Void      setPic(TComPic* p)                  { m_pcPic             = p;      }
 
-    Void      setDepth(Int iDepth)                  { m_iDepth            = iDepth; }
+    Void      setDepth(Int iDepth)                { m_iDepth            = iDepth; }
 
     Void      setRefPicList(TComList<TComPic*>& rcListPic, Bool checkNumPocTotalCurr = false);
     Void      setRefPOCList();
-    Void      setColFromL0Flag(UInt colFromL0) { m_colFromL0Flag = colFromL0; }
-
-    Void      setColRefIdx(UInt refIdx) { m_colRefIdx = refIdx; }
-
-    Void      setCheckLDC(Bool b)                      { m_bCheckLDC = b; }
-
-    Void      setMvdL1ZeroFlag(Bool b)                       { m_bLMvdL1Zero = b; }
-
-    Bool      isIntra()                          { return m_eSliceType == I_SLICE;  }
+    Void      setColFromL0Flag(UInt colFromL0)    { m_colFromL0Flag = colFromL0; }
+
+    Void      setColRefIdx(UInt refIdx)           { m_colRefIdx = refIdx; }
+
+    Void      setCheckLDC(Bool b)                 { m_bCheckLDC = b; }
+
+    Void      setMvdL1ZeroFlag(Bool b)            { m_bLMvdL1Zero = b; }
+
+    Bool      isIntra()                           { return m_eSliceType == I_SLICE;  }
 
     Bool      isInterB()                          { return m_eSliceType == B_SLICE;  }
 
@@ -1616,47 +1617,47 @@ public:
 
     Void setList1IdxToList0Idx();
 
-    UInt getTLayer()                            { return m_uiTLayer;                      }
-
-    Void setTLayer(UInt uiTLayer)             { m_uiTLayer = uiTLayer;                  }
+    UInt getTLayer()                          { return m_uiTLayer; }
+
+    Void setTLayer(UInt uiTLayer)             { m_uiTLayer = uiTLayer; }
 
     Void setTLayerInfo(UInt uiTLayer);
     Void decodingMarking(TComList<TComPic*>& rcListPic, Int iGOPSIze, Int& iMaxRefPicNum);
     Void applyReferencePictureSet(TComList<TComPic*>& rcListPic, TComReferencePictureSet *RPSList);
     Bool isTemporalLayerSwitchingPoint(TComList<TComPic*>& rcListPic);
     Bool isStepwiseTemporalLayerSwitchingPointCandidate(TComList<TComPic*>& rcListPic);
-    Int       checkThatAllRefPicsAreAvailable(TComList<TComPic*>& rcListPic, TComReferencePictureSet *pReferencePictureSet, Bool printErrors, Int pocRandomAccess = 0);
-    Void      createExplicitReferencePictureSetFromReference(TComList<TComPic*>& rcListPic, TComReferencePictureSet *pReferencePictureSet);
-
-    Void setMaxNumMergeCand(UInt val)         { m_maxNumMergeCand = val;                    }
-
-    UInt getMaxNumMergeCand()                  { return m_maxNumMergeCand;                   }
-
-    Void setSliceCurEndCUAddr(UInt uiAddr)     { m_sliceCurEndCUAddr = uiAddr;             }
-
-    UInt getSliceCurEndCUAddr()                  { return m_sliceCurEndCUAddr;               }
-
-    Void setSliceIdx(UInt i)           { m_sliceIdx = i;                           }
-
-    UInt getSliceIdx()                  { return m_sliceIdx;                       }
+    Int  checkThatAllRefPicsAreAvailable(TComList<TComPic*>& rcListPic, TComReferencePictureSet *pReferencePictureSet, Bool printErrors, Int pocRandomAccess = 0);
+    Void createExplicitReferencePictureSetFromReference(TComList<TComPic*>& rcListPic, TComReferencePictureSet *pReferencePictureSet);
+
+    Void setMaxNumMergeCand(UInt val)          { m_maxNumMergeCand = val; }
+
+    UInt getMaxNumMergeCand()                  { return m_maxNumMergeCand; }
+
+    Void setSliceCurEndCUAddr(UInt uiAddr)     { m_sliceCurEndCUAddr = uiAddr; }
+
+    UInt getSliceCurEndCUAddr()                { return m_sliceCurEndCUAddr; }
+
+    Void setSliceIdx(UInt i)                   { m_sliceIdx = i; }
+
+    UInt getSliceIdx()                         { return m_sliceIdx; }
 
     Void copySliceInfo(TComSlice *pcSliceSrc);
 
-    Void setNextSlice(Bool b)          { m_nextSlice = b;                           }
-
-    Bool isNextSlice()                  { return m_nextSlice;                        }
-
-    Void setSliceBits(UInt uiVal)      { m_sliceBits = uiVal;                      }
-
-    UInt getSliceBits()                  { return m_sliceBits;                       }
-
-    Void setSliceSegmentBits(UInt uiVal)      { m_sliceSegmentBits = uiVal;            }
-
-    UInt getSliceSegmentBits()                  { return m_sliceSegmentBits;             }
-
-    Void setFinalized(Bool uiVal)      { m_bFinalized = uiVal;                       }
-
-    Bool getFinalized()                  { return m_bFinalized;                        }
+    Void setNextSlice(Bool b)                  { m_nextSlice = b; }
+
+    Bool isNextSlice()                         { return m_nextSlice; }
+
+    Void setSliceBits(UInt uiVal)              { m_sliceBits = uiVal; }
+
+    UInt getSliceBits()                        { return m_sliceBits; }
+
+    Void setSliceSegmentBits(UInt uiVal)       { m_sliceSegmentBits = uiVal; }
+
+    UInt getSliceSegmentBits()                 { return m_sliceSegmentBits; }
+
+    Void setFinalized(Bool uiVal)              { m_bFinalized = uiVal; }
+
+    Bool getFinalized()                        { return m_bFinalized; }
 
     Void  setWpScaling(wpScalingParam wp[2][MAX_NUM_REF][3]) { memcpy(m_weightPredTable, wp, sizeof(wpScalingParam) * 2 * MAX_NUM_REF * 3); }
 
@@ -1664,16 +1665,16 @@ public:
 
     Void  resetWpScaling();
     Void  initWpScaling();
-    inline Bool applyWP() { return (m_eSliceType == P_SLICE && m_pcPPS->getUseWP()) || (m_eSliceType == B_SLICE && m_pcPPS->getWPBiPred());  }
+    inline Bool applyWP() { return (m_eSliceType == P_SLICE && m_pcPPS->getUseWP()) || (m_eSliceType == B_SLICE && m_pcPPS->getWPBiPred()); }
 
     Void  setWpAcDcParam(wpACDCParam wp[3]) { memcpy(m_weightACDCParam, wp, sizeof(wpACDCParam) * 3); }
 
     Void  getWpAcDcParam(wpACDCParam *&wp);
     Void  initWpAcDcParam();
 
-    Void setTileLocationCount(UInt cnt)               { return m_tileByteLocation.resize(cnt);    }
-
-    UInt getTileLocationCount()                         { return (UInt)m_tileByteLocation.size();  }
+    Void setTileLocationCount(UInt cnt)          { return m_tileByteLocation.resize(cnt);  }
+
+    UInt getTileLocationCount()                  { return (UInt)m_tileByteLocation.size(); }
 
     Void setTileLocation(Int idx, UInt location)
     {
@@ -1685,20 +1686,20 @@ public:
 
     UInt getTileLocation(Int idx)                { return m_tileByteLocation[idx];           }
 
-    Void setTileOffstForMultES(UInt uiOffset)      { m_uiTileOffstForMultES = uiOffset;        }
-
-    UInt getTileOffstForMultES()                    { return m_uiTileOffstForMultES;            }
+    Void setTileOffstForMultES(UInt uiOffset)    { m_uiTileOffstForMultES = uiOffset;        }
+
+    UInt getTileOffstForMultES()                 { return m_uiTileOffstForMultES;            }
 
     Void allocSubstreamSizes(UInt uiNumSubstreams);
-    UInt* getSubstreamSizes()                  { return m_puiSubstreamSizes; }
+    UInt* getSubstreamSizes()                   { return m_puiSubstreamSizes; }
 
     Void  setScalingList(TComScalingList* scalingList) { m_scalingList = scalingList; }
 
-    TComScalingList*   getScalingList()                               { return m_scalingList; }
+    TComScalingList*   getScalingList()         { return m_scalingList; }
 
     Void  setDefaultScalingList();
     Bool  checkDefaultScalingList();
-    Void      setCabacInitFlag(Bool val) { m_cabacInitFlag = val;      }    //!< set CABAC initial flag
+    Void      setCabacInitFlag(Bool val)   { m_cabacInitFlag = val;      }    //!< set CABAC initial flag
 
     Bool      getCabacInitFlag()           { return m_cabacInitFlag;     }  //!< get CABAC initial flag
 
@@ -1712,12 +1713,12 @@ public:
 
     Void      setEnableTMVPFlag(Bool b)    { m_enableTMVPFlag = b; }
 
-    Bool      getEnableTMVPFlag()              { return m_enableTMVPFlag; }
+    Bool      getEnableTMVPFlag()          { return m_enableTMVPFlag; }
 
 protected:
 
-    TComPic*  xGetRefPic(TComList<TComPic*>& rcListPic,
-                         Int                 poc);
+    TComPic*  xGetRefPic(TComList<TComPic*>& rcListPic, Int poc);
+
     TComPic*  xGetLongTermRefPic(TComList<TComPic*>& rcListPic, Int poc, Bool pocHasMsb);
 }; // END CLASS DEFINITION TComSlice
 
--- a/source/Lib/TLibEncoder/TEncGOP.cpp	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncGOP.cpp	Thu Jun 13 12:35:25 2013 +0530
@@ -99,7 +99,6 @@ TEncGOP::TEncGOP()
     m_iLastIDR            = 0;
     m_iGopSize            = 0;
     m_iNumPicCoded        = 0; //Niko
-    m_bFirst              = true;
     m_bSeqFirst           = true;
 
     m_pcCfg               = NULL;
@@ -123,8 +122,6 @@ TEncGOP::~TEncGOP()
  */
 Void  TEncGOP::create()
 {
-    m_bLongtermTestPictureHasBeenCoded = 0;
-    m_bLongtermTestPictureHasBeenCoded2 = 0;
 }
 
 Void  TEncGOP::destroy()
@@ -198,29 +195,28 @@ Void TEncGOP::xCreateLeadingSEIMessages(
 // ====================================================================================================================
 Void TEncGOP::compressGOP(Int iPOCLast, Int iNumPicRcvd, TComList<TComPic*>& rcListPic, TComList<TComPicYuv*>& rcListPicYuvRecOut, std::list<AccessUnit>& accessUnitsInGOP)
 {
-    TComPic*        pcPic;
-    TComPicYuv*     pcPicYuvRecOut;
-    TComSlice*      pcSlice;
-    TComOutputBitstream  *pcBitstreamRedirect;
-
     PPAScopeEvent(TEncGOP_compressGOP);
 
-    pcBitstreamRedirect = new TComOutputBitstream;
     AccessUnit::iterator  itLocationToPushSliceHeaderNALU; // used to store location where NALU containing slice header is to be inserted
+    Int                   picSptDpbOutputDuDelay = 0;
+    UInt*                 accumBitsDU = NULL;
+    UInt*                 accumNalsDU = NULL;
     UInt                  uiOneBitstreamPerSliceLength = 0;
-    TComOutputBitstream* pcSubstreamsOut = NULL;
-    x265::EncodeFrame* frame = m_pcEncTop->getFrameEncoder(0);
+    TComOutputBitstream*  pcBitstreamRedirect = new TComOutputBitstream;
+    TComOutputBitstream*  pcSubstreamsOut = NULL;
+    x265::EncodeFrame*    pcEncodeFrame  = m_pcEncTop->getFrameEncoder(0);
+    TEncEntropy*          pcEntropyCoder = pcEncodeFrame->getEntropyEncoder(0);
+    TEncSlice*            pcSliceEncoder = pcEncodeFrame->getSliceEncoder();
+    TEncCavlc*            pcCavlcCoder   = pcEncodeFrame->getCavlcCoder();
+    TEncSbac*             pcSbacCoder    = pcEncodeFrame->getSingletonSbac();
+    TEncBinCABAC*         pcBinCABAC     = pcEncodeFrame->getBinCABAC();
+    TComLoopFilter*       pcLoopFilter   = pcEncodeFrame->getLoopFilter();
+    TComBitCounter*       pcBitCounter   = pcEncodeFrame->getBitCounter();
+    TEncSampleAdaptiveOffset* pcSAO      = pcEncodeFrame->getSAO();
 
-    TEncEntropy*    pcEntropyCoder = frame->getEntropyEncoder(0);
-    TEncSlice*      pcSliceEncoder = frame->getSliceEncoder();
-    TEncCavlc*      pcCavlcCoder   = frame->getCavlcCoder();
-    TEncSbac*       pcSbacCoder    = frame->getSingletonSbac();
-    TEncBinCABAC*   pcBinCABAC     = frame->getBinCABAC();
-    TComLoopFilter* pcLoopFilter   = frame->getLoopFilter();
-    TComBitCounter* pcBitCounter   = frame->getBitCounter();
-    TEncSampleAdaptiveOffset* pcSAO = frame->getSAO();
-
-    xInitGOP(iPOCLast, iNumPicRcvd, rcListPic, rcListPicYuvRecOut);
+    // Exception for the first frame
+    m_iGopSize = (iPOCLast == 0) ? 1 : m_pcCfg->getGOPSize();
+    assert(iNumPicRcvd > 0 && m_iGopSize > 0);
 
     m_iNumPicCoded = 0;
     SEIPictureTiming pictureTimingSEI;
@@ -228,18 +224,15 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
 
     // Initialize Scalable Nesting SEI with single layer values
     SEIScalableNesting scalableNestingSEI;
-    scalableNestingSEI.m_bitStreamSubsetFlag           = 1;    // If the nested SEI messages are picture buffereing SEI mesages, picure timing SEI messages or sub-picture timing SEI messages, bitstream_subset_flag shall be equal to 1
+    scalableNestingSEI.m_bitStreamSubsetFlag           = 1;     // If the nested SEI messages are picture buffering SEI messages, picture timing SEI messages or sub-picture timing SEI messages, bitstream_subset_flag shall be equal to 1
     scalableNestingSEI.m_nestingOpFlag                 = 0;
-    scalableNestingSEI.m_nestingNumOpsMinus1           = 0;    //nesting_num_ops_minus1
+    scalableNestingSEI.m_nestingNumOpsMinus1           = 0;     // nesting_num_ops_minus1
     scalableNestingSEI.m_allLayersFlag                 = 0;
-    scalableNestingSEI.m_nestingNoOpMaxTemporalIdPlus1 = 6 + 1; //nesting_no_op_max_temporal_id_plus1
-    scalableNestingSEI.m_nestingNumLayersMinus1        = 1 - 1; //nesting_num_layers_minus1
+    scalableNestingSEI.m_nestingNoOpMaxTemporalIdPlus1 = 6 + 1; // nesting_no_op_max_temporal_id_plus1
+    scalableNestingSEI.m_nestingNumLayersMinus1        = 1 - 1; // nesting_num_layers_minus1
     scalableNestingSEI.m_nestingLayerId[0]             = 0;
     scalableNestingSEI.m_callerOwnsSEIs                = true;
 
-    Int picSptDpbOutputDuDelay = 0;
-    UInt *accumBitsDU = NULL;
-    UInt *accumNalsDU = NULL;
     SEIDecodingUnitInfo decodingUnitInfoSEI;
     for (Int iGOPid = 0; iGOPid < m_iGopSize; iGOPid++)
     {
@@ -310,24 +303,52 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
         // start a new access unit: create an entry in the list of output access units
         accessUnitsInGOP.push_back(AccessUnit());
         AccessUnit& accessUnit = accessUnitsInGOP.back();
-        xGetBuffer(rcListPic, rcListPicYuvRecOut, iNumPicRcvd, iTimeOffset, pcPic, pcPicYuvRecOut, pocCurr);
+
+        TComPic*              pcPic = NULL;
+        TComPicYuv*           pcPicYuvRecOut;
+        TComSlice*            pcSlice;
+        {
+            // Pick reconstruction picture in output time order
+            TComList<TComPicYuv*>::iterator iterPicYuvRec = rcListPicYuvRecOut.end();
+            for (Int i = 0; i < iNumPicRcvd - iTimeOffset + 1; i++)
+                iterPicYuvRec--;
+            pcPicYuvRecOut = *(iterPicYuvRec);
+
+            // Locate input picture with the correct POC (makes no assumption on
+            // input picture ordering)
+            TComList<TComPic*>::iterator iterPic = rcListPic.begin();
+            while (iterPic != rcListPic.end())
+            {
+                pcPic = *(iterPic++);
+                if (pcPic->getPOC() == pocCurr)
+                {
+                    break;
+                }
+            }
+        }
+        if (!pcPic || pcPic->getPOC() != pocCurr)
+        {
+            printf("error: Encode frame POC not found in input list!\n");
+            assert(0);
+            return;
+        }
 
         //  Slice data initialization
         pcPic->clearSliceBuffer();
-        assert(pcPic->getNumAllocatedSlice() == 1);
+        pcPic->setCurrSliceIdx(0);
         pcSliceEncoder->setSliceIdx(0);
-        pcPic->setCurrSliceIdx(0);
 
-        pcSliceEncoder->initEncSlice(pcPic, iPOCLast, pocCurr, iNumPicRcvd, iGOPid, pcSlice, m_pcEncTop->getSPS(), m_pcEncTop->getPPS());
+        pcSlice = pcSliceEncoder->initEncSlice(pcPic, pcEncodeFrame, iPOCLast, pocCurr, iGOPid, m_pcEncTop->getSPS(), m_pcEncTop->getPPS());
         pcSlice->setLastIDR(m_iLastIDR);
         pcSlice->setSliceIdx(0);
+
         //set default slice level flag to the same as SPS level flag
         pcSlice->setScalingList(m_pcEncTop->getScalingList());
         pcSlice->getScalingList()->setUseTransformSkip(m_pcEncTop->getPPS()->getUseTransformSkip());
         if (m_pcEncTop->getUseScalingListId() == SCALING_LIST_OFF)
         {
-            frame->setFlatScalingList();
-            frame->setUseScalingList(false);
+            pcEncodeFrame->setFlatScalingList();
+            pcEncodeFrame->setUseScalingList(false);
             m_pcEncTop->getSPS()->setScalingListPresentFlag(false);
             m_pcEncTop->getPPS()->setScalingListPresentFlag(false);
         }
@@ -336,8 +357,8 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
             pcSlice->setDefaultScalingList();
             m_pcEncTop->getSPS()->setScalingListPresentFlag(false);
             m_pcEncTop->getPPS()->setScalingListPresentFlag(false);
-            frame->setScalingList(pcSlice->getScalingList());
-            frame->setUseScalingList(true);
+            pcEncodeFrame->setScalingList(pcSlice->getScalingList());
+            pcEncodeFrame->setUseScalingList(true);
         }
         else if (m_pcEncTop->getUseScalingListId() == SCALING_LIST_FILE_READ)
         {
@@ -348,8 +369,8 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
             pcSlice->getScalingList()->checkDcOfMatrix();
             m_pcEncTop->getSPS()->setScalingListPresentFlag(pcSlice->checkDefaultScalingList());
             m_pcEncTop->getPPS()->setScalingListPresentFlag(false);
-            frame->setScalingList(pcSlice->getScalingList());
-            frame->setUseScalingList(true);
+            pcEncodeFrame->setScalingList(pcSlice->getScalingList());
+            pcEncodeFrame->setUseScalingList(true);
         }
         else
         {
@@ -612,7 +633,7 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
             sliceQP = Clip3(-pcSlice->getSPS()->getQpBDOffsetY(), MAX_QP, sliceQP);
             m_pcRateCtrl->getRCPic()->setPicEstQP(sliceQP);
 
-            pcSliceEncoder->resetQP(pcPic, sliceQP, lambda);
+            pcSliceEncoder->resetQP(pcPic, pcEncodeFrame, sliceQP, lambda);
         }
 
         UInt uiNumSlices = 1;
@@ -1044,7 +1065,7 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
                     pcSbacCoder->init((TEncBinIf*)pcBinCABAC);
                     pcEntropyCoder->setEntropyCoder(pcSbacCoder, pcSlice);
                     pcEntropyCoder->resetEntropy();
-                    frame->resetEntropy(pcSlice);
+                    pcEncodeFrame->resetEntropy(pcSlice);
                 }
 
                 if (pcSlice->isNextSlice())
@@ -1052,9 +1073,9 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
                     // set entropy coder for writing
                     pcSbacCoder->init((TEncBinIf*)pcBinCABAC);
                     {
-                        frame->resetEntropy(pcSlice);
-                        frame->getSbacCoder(0)->load(pcSbacCoder);
-                        pcEntropyCoder->setEntropyCoder(frame->getSbacCoder(0), pcSlice); //ALF is written in substream #0 with CABAC coder #0 (see ALF param encoding below)
+                        pcEncodeFrame->resetEntropy(pcSlice);
+                        pcEncodeFrame->getSbacCoder(0)->load(pcSbacCoder);
+                        pcEntropyCoder->setEntropyCoder(pcEncodeFrame->getSbacCoder(0), pcSlice); //ALF is written in substream #0 with CABAC coder #0 (see ALF param encoding below)
                     }
                     pcEntropyCoder->resetEntropy();
                     // File writing
@@ -1071,7 +1092,7 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
                 }
                 pcSlice->setFinalized(true);
 
-                pcSbacCoder->load(frame->getSbacCoder(0));
+                pcSbacCoder->load(pcEncodeFrame->getSbacCoder(0));
 
                 pcSlice->setTileOffstForMultES(uiOneBitstreamPerSliceLength);
                 pcSlice->setTileLocationCount(0);
@@ -1088,7 +1109,7 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
                     {
                         // Flush all substreams -- this includes empty ones.
                         // Terminating bit and flush.
-                        pcEntropyCoder->setEntropyCoder(frame->getSbacCoder(ui), pcSlice);
+                        pcEntropyCoder->setEntropyCoder(pcEncodeFrame->getSbacCoder(ui), pcSlice);
                         pcEntropyCoder->setBitstream(&pcSubstreamsOut[ui]);
                         pcEntropyCoder->encodeTerminatingBit(1);
                         pcEntropyCoder->encodeSliceFinish();
@@ -1180,7 +1201,7 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
                     pcEntropyCoder->setBitstream(pcBitCounter);
 
                     // CHECK_ME: I think the SAO is use a temp Sbac only, so I always use [0], am I right?
-                    pcSAO->startSaoEnc(pcPic, pcEntropyCoder, frame->getRDSbacCoders(0), frame->getRDGoOnSbacCoder(0));
+                    pcSAO->startSaoEnc(pcPic, pcEntropyCoder, pcEncodeFrame->getRDSbacCoders(0), pcEncodeFrame->getRDGoOnSbacCoder(0));
 
                     SAOParam& cSaoParam = *pcSlice->getPic()->getPicSym()->getSaoParam();
 
@@ -1493,7 +1514,6 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
         pcPic->getPicYuvRec()->copyToPic(pcPicYuvRecOut);
 
         pcPic->setReconMark(true);
-        m_bFirst = false;
         m_iNumPicCoded++;
         m_totalCoded++;
 
@@ -1550,119 +1570,6 @@ Void TEncGOP::printOutSummary(UInt uiNum
 // Protected member functions
 // ====================================================================================================================
 
-Void TEncGOP::xInitGOP(Int iPOCLast, Int iNumPicRcvd, TComList<TComPic*>& rcListPic, TComList<TComPicYuv*>& rcListPicYuvRecOut)
-{
-    assert(iNumPicRcvd > 0);
-    //  Exception for the first frame
-    if (iPOCLast == 0)
-    {
-        m_iGopSize    = 1;
-    }
-    else
-        m_iGopSize    = m_pcCfg->getGOPSize();
-
-    assert(m_iGopSize > 0);
-}
-
-Void TEncGOP::xGetBuffer(TComList<TComPic*>&    rcListPic,
-                         TComList<TComPicYuv*>& rcListPicYuvRecOut,
-                         Int                    iNumPicRcvd,
-                         Int                    iTimeOffset,
-                         TComPic*&              rpcPic,
-                         TComPicYuv*&           rpcPicYuvRecOut,
-                         Int                    pocCurr)
-{
-    Int i;
-
-    //  Rec. output
-    TComList<TComPicYuv*>::iterator     iterPicYuvRec = rcListPicYuvRecOut.end();
-    for (i = 0; i < iNumPicRcvd - iTimeOffset + 1; i++)
-    {
-        iterPicYuvRec--;
-    }
-
-    rpcPicYuvRecOut = *(iterPicYuvRec);
-
-    //  Current pic.
-    TComList<TComPic*>::iterator iterPic = rcListPic.begin();
-    while (iterPic != rcListPic.end())
-    {
-        rpcPic = *(iterPic);
-        rpcPic->setCurrSliceIdx(0);
-        if (rpcPic->getPOC() == pocCurr)
-        {
-            break;
-        }
-        iterPic++;
-    }
-
-    assert(rpcPic->getPOC() == pocCurr);
-}
-
-UInt64 TEncGOP::xFindDistortionFrame(TComPicYuv* pcPic0, TComPicYuv* pcPic1)
-{
-    Int     x, y;
-    Pel*  pSrc0   = pcPic0->getLumaAddr();
-    Pel*  pSrc1   = pcPic1->getLumaAddr();
-    UInt  uiShift = 2 * DISTORTION_PRECISION_ADJUSTMENT(g_bitDepthY - 8);
-    Int   iTemp;
-
-    Int   iStride = pcPic0->getStride();
-    Int   iWidth  = pcPic0->getWidth();
-    Int   iHeight = pcPic0->getHeight();
-
-    UInt64  uiTotalDiff = 0;
-
-    for (y = 0; y < iHeight; y++)
-    {
-        for (x = 0; x < iWidth; x++)
-        {
-            iTemp = pSrc0[x] - pSrc1[x];
-            uiTotalDiff += (iTemp * iTemp) >> uiShift;
-        }
-
-        pSrc0 += iStride;
-        pSrc1 += iStride;
-    }
-
-    uiShift = 2 * DISTORTION_PRECISION_ADJUSTMENT(g_bitDepthC - 8);
-    iHeight >>= 1;
-    iWidth  >>= 1;
-    iStride >>= 1;
-
-    pSrc0  = pcPic0->getCbAddr();
-    pSrc1  = pcPic1->getCbAddr();
-
-    for (y = 0; y < iHeight; y++)
-    {
-        for (x = 0; x < iWidth; x++)
-        {
-            iTemp = pSrc0[x] - pSrc1[x];
-            uiTotalDiff += (iTemp * iTemp) >> uiShift;
-        }
-
-        pSrc0 += iStride;
-        pSrc1 += iStride;
-    }
-
-    pSrc0  = pcPic0->getCrAddr();
-    pSrc1  = pcPic1->getCrAddr();
-
-    for (y = 0; y < iHeight; y++)
-    {
-        for (x = 0; x < iWidth; x++)
-        {
-            iTemp = pSrc0[x] - pSrc1[x];
-            uiTotalDiff += (iTemp * iTemp) >> uiShift;
-        }
-
-        pSrc0 += iStride;
-        pSrc1 += iStride;
-    }
-
-    return uiTotalDiff;
-}
-
 #if VERBOSE_RATE
 static const Char* nalUnitTypeToString(NalUnitType type)
 {
@@ -1776,7 +1683,7 @@ Void TEncGOP::xCalculateAddPSNR(TComPic*
 
     iHeight >>= 1;
     iWidth  >>= 1;
-    iStride >>= 1;
+    iStride = pcPicD->getCStride();
 
     UInt64 uiSSDU = computeSSD(pcPic->getPicYuvOrg()->getCbAddr(), pcPicD->getCbAddr(), iStride, iWidth, iHeight);
     UInt64 uiSSDV = computeSSD(pcPic->getPicYuvOrg()->getCrAddr(), pcPicD->getCrAddr(), iStride, iWidth, iHeight);
--- a/source/Lib/TLibEncoder/TEncGOP.h	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncGOP.h	Thu Jun 13 12:35:25 2013 +0530
@@ -73,15 +73,12 @@ class TEncGOP
 private:
 
     //  Data
-    Bool                    m_bLongtermTestPictureHasBeenCoded;
-    Bool                    m_bLongtermTestPictureHasBeenCoded2;
     UInt                    m_numLongTermRefPicSPS;
     UInt                    m_ltRefPicPocLsbSps[33];
     Bool                    m_ltRefPicUsedByCurrPicFlag[33];
     Int                     m_iLastIDR;
     Int                     m_iGopSize;
     Int                     m_iNumPicCoded;
-    Bool                    m_bFirst;
 
     //  Access channel
     TEncTop*                m_pcEncTop;
@@ -138,13 +135,8 @@ protected:
 
 protected:
 
-    Void  xInitGOP(Int iPOC, Int iNumPicRcvd, TComList<TComPic*>& rcListPic, TComList<TComPicYuv*>& rcListPicYuvRecOut);
-    Void  xGetBuffer(TComList<TComPic*>& rcListPic, TComList<TComPicYuv*>& rcListPicYuvRecOut, Int iNumPicRcvd, Int iTimeOffset, TComPic*& rpcPic, TComPicYuv*& rpcPicYuvRecOut, Int pocCurr);
-
     Void  xCalculateAddPSNR(TComPic* pcPic, TComPicYuv* pcPicD, const AccessUnit&);
 
-    UInt64 xFindDistortionFrame(TComPicYuv* pcPic0, TComPicYuv* pcPic1);
-
     Double xCalculateRVM();
 
     SEIActiveParameterSets* xCreateSEIActiveParameterSets(TComSPS *sps);
--- a/source/Lib/TLibEncoder/TEncSlice.cpp	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSlice.cpp	Thu Jun 13 12:35:25 2013 +0530
@@ -130,31 +130,29 @@ Void TEncSlice::init(TEncTop* pcEncTop)
  \param pcPic         picture class
  \param pocLast       POC of last picture
  \param pocCurr       current POC
- \param iNumPicRcvd   number of received pictures
  \param iTimeOffset   POC offset for hierarchical structure
  \param iDepth        temporal layer depth
- \param rpcSlice      slice header class
  \param pSPS          SPS associated with the slice
  \param pPPS          PPS associated with the slice
  */
-Void TEncSlice::initEncSlice(TComPic* pcPic, Int pocLast, Int pocCurr, Int iNumPicRcvd, Int iGOPid, TComSlice*& rpcSlice, TComSPS* pSPS, TComPPS *pPPS)
+TComSlice* TEncSlice::initEncSlice(TComPic* pcPic, x265::EncodeFrame *pcEncodeFrame, Int pocLast, Int pocCurr, Int iGOPid, TComSPS* pSPS, TComPPS *pPPS)
 {
     Double dQP;
     Double dLambda;
 
-    rpcSlice = pcPic->getSlice(0);
-    rpcSlice->setSPS(pSPS);
-    rpcSlice->setPPS(pPPS);
-    rpcSlice->setSliceBits(0);
-    rpcSlice->setPic(pcPic);
-    rpcSlice->initSlice();
-    rpcSlice->setPicOutputFlag(true);
-    rpcSlice->setPOC(pocCurr);
+    TComSlice* pcSlice = pcPic->getSlice(0);
+    pcSlice->setSPS(pSPS);
+    pcSlice->setPPS(pPPS);
+    pcSlice->setSliceBits(0);
+    pcSlice->setPic(pcPic);
+    pcSlice->initSlice();
+    pcSlice->setPicOutputFlag(true);
+    pcSlice->setPOC(pocCurr);
 
     // depth computation based on GOP size
     Int depth;
     {
-        Int poc = rpcSlice->getPOC() % m_pcCfg->getGOPSize();
+        Int poc = pcSlice->getPOC() % m_pcCfg->getGOPSize();
         if (poc == 0)
         {
             depth = 0;
@@ -186,7 +184,7 @@ Void TEncSlice::initEncSlice(TComPic* pc
     eSliceType = B_SLICE;
     eSliceType = (pocLast == 0 || pocCurr % m_pcCfg->getIntraPeriod() == 0 || m_pcGOPEncoder->getGOPSize() == 0) ? I_SLICE : eSliceType;
 
-    rpcSlice->setSliceType(eSliceType);
+    pcSlice->setSliceType(eSliceType);
 
     // ------------------------------------------------------------------------------------------------------------------
     // Non-referenced frame marking
@@ -194,13 +192,13 @@ Void TEncSlice::initEncSlice(TComPic* pc
 
     if (pocLast == 0)
     {
-        rpcSlice->setTemporalLayerNonReferenceFlag(false);
+        pcSlice->setTemporalLayerNonReferenceFlag(false);
     }
     else
     {
-        rpcSlice->setTemporalLayerNonReferenceFlag(!m_pcCfg->getGOPEntry(iGOPid).m_refPic);
+        pcSlice->setTemporalLayerNonReferenceFlag(!m_pcCfg->getGOPEntry(iGOPid).m_refPic);
     }
-    rpcSlice->setReferenced(true);
+    pcSlice->setReferenced(true);
 
     // ------------------------------------------------------------------------------------------------------------------
     // QP setting
@@ -209,7 +207,7 @@ Void TEncSlice::initEncSlice(TComPic* pc
     dQP = m_pcCfg->getQP();
     if (eSliceType != I_SLICE)
     {
-        if (!((dQP == -rpcSlice->getSPS()->getQpBDOffsetY()) && (rpcSlice->getSPS()->getUseLossless())))
+        if (!((dQP == -pcSlice->getSPS()->getQpBDOffsetY()) && (pcSlice->getSPS()->getUseLossless())))
         {
             dQP += m_pcCfg->getGOPEntry(iGOPid).m_QPOffset;
         }
@@ -219,8 +217,9 @@ Void TEncSlice::initEncSlice(TComPic* pc
     Int* pdQPs = m_pcCfg->getdQPs();
     if (pdQPs)
     {
-        dQP += pdQPs[rpcSlice->getPOC()];
+        dQP += pdQPs[pcSlice->getPOC()];
     }
+
     // ------------------------------------------------------------------------------------------------------------------
     // Lambda computation
     // ------------------------------------------------------------------------------------------------------------------
@@ -258,47 +257,45 @@ Void TEncSlice::initEncSlice(TComPic* pc
     }
 
     // if hadamard is used in ME process
-    if (!m_pcCfg->getUseHADME() && rpcSlice->getSliceType() != I_SLICE)
+    if (!m_pcCfg->getUseHADME() && pcSlice->getSliceType() != I_SLICE)
     {
         dLambda *= 0.95;
     }
 
     iQP = max(-pSPS->getQpBDOffsetY(), min(MAX_QP, (Int)floor(dQP + 0.5)));
 
-    if (rpcSlice->getSliceType() != I_SLICE)
+    if (pcSlice->getSliceType() != I_SLICE)
     {
         dLambda *= m_pcCfg->getLambdaModifier(m_pcCfg->getGOPEntry(iGOPid).m_temporalId);
     }
 
-    x265::EncodeFrame *frame = ((TEncTop*)m_pcCfg)->getFrameEncoder(0);
-
     // for RDO
     // in RdCost there is only one lambda because the luma and chroma bits are not separated, instead we weight the distortion of chroma.
     Double weight = 1.0;
     Int qpc;
     Int chromaQPOffset;
 
-    chromaQPOffset = rpcSlice->getPPS()->getChromaCbQpOffset() + rpcSlice->getSliceQpDeltaCb();
+    chromaQPOffset = pcSlice->getPPS()->getChromaCbQpOffset() + pcSlice->getSliceQpDeltaCb();
     qpc = Clip3(0, 57, iQP + chromaQPOffset);
     weight = pow(2.0, (iQP - g_aucChromaScale[qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
-    frame->setCbDistortionWeight(weight);
+    pcEncodeFrame->setCbDistortionWeight(weight);
 
-    chromaQPOffset = rpcSlice->getPPS()->getChromaCrQpOffset() + rpcSlice->getSliceQpDeltaCr();
+    chromaQPOffset = pcSlice->getPPS()->getChromaCrQpOffset() + pcSlice->getSliceQpDeltaCr();
     qpc = Clip3(0, 57, iQP + chromaQPOffset);
     weight = pow(2.0, (iQP - g_aucChromaScale[qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
-    frame->setCrDistortionWeight(weight);
+    pcEncodeFrame->setCrDistortionWeight(weight);
 
     // for RDOQ
-    frame->setLambda(dLambda, dLambda / weight);
+    pcEncodeFrame->setLambda(dLambda, dLambda / weight);
 
     // For SAO
-    rpcSlice->setLambda(dLambda, dLambda / weight);
+    pcSlice->setLambda(dLambda, dLambda / weight);
 
 #if HB_LAMBDA_FOR_LDC
     // restore original slice type
     eSliceType = (pocLast == 0 || pocCurr % m_pcCfg->getIntraPeriod() == 0 || m_pcGOPEncoder->getGOPSize() == 0) ? I_SLICE : eSliceType;
 
-    rpcSlice->setSliceType(eSliceType);
+    pcSlice->setSliceType(eSliceType);
 #endif
 
     if (m_pcCfg->getUseRecalculateQPAccordingToLambda())
@@ -307,79 +304,79 @@ Void TEncSlice::initEncSlice(TComPic* pc
         iQP = max(-pSPS->getQpBDOffsetY(), min(MAX_QP, (Int)floor(dQP + 0.5)));
     }
 
-    rpcSlice->setSliceQp(iQP);
-    rpcSlice->setSliceQpBase(iQP);
-    rpcSlice->setSliceQpDelta(0);
-    rpcSlice->setSliceQpDeltaCb(0);
-    rpcSlice->setSliceQpDeltaCr(0);
-    rpcSlice->setNumRefIdx(REF_PIC_LIST_0, m_pcCfg->getGOPEntry(iGOPid).m_numRefPicsActive);
-    rpcSlice->setNumRefIdx(REF_PIC_LIST_1, m_pcCfg->getGOPEntry(iGOPid).m_numRefPicsActive);
+    pcSlice->setSliceQp(iQP);
+    pcSlice->setSliceQpBase(iQP);
+    pcSlice->setSliceQpDelta(0);
+    pcSlice->setSliceQpDeltaCb(0);
+    pcSlice->setSliceQpDeltaCr(0);
+    pcSlice->setNumRefIdx(REF_PIC_LIST_0, m_pcCfg->getGOPEntry(iGOPid).m_numRefPicsActive);
+    pcSlice->setNumRefIdx(REF_PIC_LIST_1, m_pcCfg->getGOPEntry(iGOPid).m_numRefPicsActive);
 
     if (m_pcCfg->getDeblockingFilterMetric())
     {
-        rpcSlice->setDeblockingFilterOverrideFlag(true);
-        rpcSlice->setDeblockingFilterDisable(false);
-        rpcSlice->setDeblockingFilterBetaOffsetDiv2(0);
-        rpcSlice->setDeblockingFilterTcOffsetDiv2(0);
+        pcSlice->setDeblockingFilterOverrideFlag(true);
+        pcSlice->setDeblockingFilterDisable(false);
+        pcSlice->setDeblockingFilterBetaOffsetDiv2(0);
+        pcSlice->setDeblockingFilterTcOffsetDiv2(0);
     }
-    else if (rpcSlice->getPPS()->getDeblockingFilterControlPresentFlag())
+    else if (pcSlice->getPPS()->getDeblockingFilterControlPresentFlag())
     {
-        rpcSlice->getPPS()->setDeblockingFilterOverrideEnabledFlag(!m_pcCfg->getLoopFilterOffsetInPPS());
-        rpcSlice->setDeblockingFilterOverrideFlag(!m_pcCfg->getLoopFilterOffsetInPPS());
-        rpcSlice->getPPS()->setPicDisableDeblockingFilterFlag(m_pcCfg->getLoopFilterDisable());
-        rpcSlice->setDeblockingFilterDisable(m_pcCfg->getLoopFilterDisable());
-        if (!rpcSlice->getDeblockingFilterDisable())
+        pcSlice->getPPS()->setDeblockingFilterOverrideEnabledFlag(!m_pcCfg->getLoopFilterOffsetInPPS());
+        pcSlice->setDeblockingFilterOverrideFlag(!m_pcCfg->getLoopFilterOffsetInPPS());
+        pcSlice->getPPS()->setPicDisableDeblockingFilterFlag(m_pcCfg->getLoopFilterDisable());
+        pcSlice->setDeblockingFilterDisable(m_pcCfg->getLoopFilterDisable());
+        if (!pcSlice->getDeblockingFilterDisable())
         {
             if (!m_pcCfg->getLoopFilterOffsetInPPS() && eSliceType != I_SLICE)
             {
-                rpcSlice->getPPS()->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_betaOffsetDiv2 + m_pcCfg->getLoopFilterBetaOffset());
-                rpcSlice->getPPS()->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_tcOffsetDiv2 + m_pcCfg->getLoopFilterTcOffset());
-                rpcSlice->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_betaOffsetDiv2 + m_pcCfg->getLoopFilterBetaOffset());
-                rpcSlice->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_tcOffsetDiv2 + m_pcCfg->getLoopFilterTcOffset());
+                pcSlice->getPPS()->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_betaOffsetDiv2 + m_pcCfg->getLoopFilterBetaOffset());
+                pcSlice->getPPS()->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_tcOffsetDiv2 + m_pcCfg->getLoopFilterTcOffset());
+                pcSlice->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_betaOffsetDiv2 + m_pcCfg->getLoopFilterBetaOffset());
+                pcSlice->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getGOPEntry(iGOPid).m_tcOffsetDiv2 + m_pcCfg->getLoopFilterTcOffset());
             }
             else
             {
-                rpcSlice->getPPS()->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getLoopFilterBetaOffset());
-                rpcSlice->getPPS()->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getLoopFilterTcOffset());
-                rpcSlice->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getLoopFilterBetaOffset());
-                rpcSlice->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getLoopFilterTcOffset());
+                pcSlice->getPPS()->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getLoopFilterBetaOffset());
+                pcSlice->getPPS()->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getLoopFilterTcOffset());
+                pcSlice->setDeblockingFilterBetaOffsetDiv2(m_pcCfg->getLoopFilterBetaOffset());
+                pcSlice->setDeblockingFilterTcOffsetDiv2(m_pcCfg->getLoopFilterTcOffset());
             }
         }
     }
     else
     {
-        rpcSlice->setDeblockingFilterOverrideFlag(false);
-        rpcSlice->setDeblockingFilterDisable(false);
-        rpcSlice->setDeblockingFilterBetaOffsetDiv2(0);
-        rpcSlice->setDeblockingFilterTcOffsetDiv2(0);
+        pcSlice->setDeblockingFilterOverrideFlag(false);
+        pcSlice->setDeblockingFilterDisable(false);
+        pcSlice->setDeblockingFilterBetaOffsetDiv2(0);
+        pcSlice->setDeblockingFilterTcOffsetDiv2(0);
     }
 
-    rpcSlice->setDepth(depth);
+    pcSlice->setDepth(depth);
 
     pcPic->setTLayer(m_pcCfg->getGOPEntry(iGOPid).m_temporalId);
     if (eSliceType == I_SLICE)
     {
         pcPic->setTLayer(0);
     }
-    rpcSlice->setTLayer(pcPic->getTLayer());
+    pcSlice->setTLayer(pcPic->getTLayer());
 
     assert(m_apcPicYuvPred);
     assert(m_apcPicYuvResi);
 
     pcPic->setPicYuvPred(m_apcPicYuvPred);
     pcPic->setPicYuvResi(m_apcPicYuvResi);
-    rpcSlice->setMaxNumMergeCand(m_pcCfg->getMaxNumMergeCand());
+    pcSlice->setMaxNumMergeCand(m_pcCfg->getMaxNumMergeCand());
     xStoreWPparam(pPPS->getUseWP(), pPPS->getWPBiPred());
+    return pcSlice;
 }
 
-Void TEncSlice::resetQP(TComPic* pic, Int sliceQP, Double lambda)
+Void TEncSlice::resetQP(TComPic* pic, EncodeFrame *pcEncodeFrame, Int sliceQP, Double lambda)
 {
     TComSlice* slice = pic->getSlice(0);
 
     // store lambda
     slice->setSliceQp(sliceQP);
     slice->setSliceQpBase(sliceQP);
-    EncodeFrame *frame = ((TEncTop*)m_pcCfg)->getFrameEncoder(0);
 
     // for RDO
     // in RdCost there is only one lambda because the luma and chroma bits are not separated, instead we weight the distortion of chroma.
@@ -390,15 +387,15 @@ Void TEncSlice::resetQP(TComPic* pic, In
     chromaQPOffset = slice->getPPS()->getChromaCbQpOffset() + slice->getSliceQpDeltaCb();
     qpc = Clip3(0, 57, sliceQP + chromaQPOffset);
     weight = pow(2.0, (sliceQP - g_aucChromaScale[qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
-    frame->setCbDistortionWeight(weight);
+    pcEncodeFrame->setCbDistortionWeight(weight);
 
     chromaQPOffset = slice->getPPS()->getChromaCrQpOffset() + slice->getSliceQpDeltaCr();
     qpc = Clip3(0, 57, sliceQP + chromaQPOffset);
     weight = pow(2.0, (sliceQP - g_aucChromaScale[qpc]) / 3.0); // takes into account of the chroma qp mapping and chroma qp Offset
-    frame->setCrDistortionWeight(weight);
+    pcEncodeFrame->setCrDistortionWeight(weight);
 
     // for RDOQ
-    frame->setLambda(lambda, lambda / weight);
+    pcEncodeFrame->setLambda(lambda, lambda / weight);
 
     // For SAO
     slice->setLambda(lambda, lambda / weight);
--- a/source/Lib/TLibEncoder/TEncSlice.h	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSlice.h	Thu Jun 13 12:35:25 2013 +0530
@@ -53,6 +53,8 @@
 class TEncTop;
 class TEncGOP;
 
+namespace x265 { class EncodeFrame; }
+
 // ====================================================================================================================
 // Class definition
 // ====================================================================================================================
@@ -87,9 +89,10 @@ public:
     Void    init(TEncTop* pcEncTop);
 
     /// preparation of slice encoding (reference marking, QP and lambda)
-    Void    initEncSlice(TComPic* pcPic, Int pocLast, Int pocCurr, Int iNumPicRcvd,
-                         Int iGOPid, TComSlice*& rpcSlice, TComSPS* pSPS, TComPPS *pPPS);
-    Void    resetQP(TComPic* pic, Int sliceQP, Double lambda);
+    TComSlice *initEncSlice(TComPic* pcPic, x265::EncodeFrame *pcEncodeFrame, Int pocLast, Int pocCurr, Int iGOPid, TComSPS* pSPS, TComPPS *pPPS);
+
+    Void    resetQP(TComPic* pic, x265::EncodeFrame *pcEncodeFrame, Int sliceQP, Double lambda);
+
     // compress and encode slice
     Void    compressSlice(TComPic* rpcPic);                                            ///< analysis stage of slice
     Void    encodeSlice(TComPic*& rpcPic, TComOutputBitstream* pcSubstreams);
--- a/source/common/vec/dct.inc	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/common/vec/dct.inc	Thu Jun 13 12:35:25 2013 +0530
@@ -6,6 +6,7 @@
  *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
+ *          Min Chen <min.chen@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -2915,105 +2916,83 @@ void xIDST4(short *pSrc, short *pDst, in
 }
 #endif // INSTRSET >= 5
 
-#if INSTRSET < 5
-void xDCT4(short *pSrc, short *pDst, intptr_t)
+ALIGN_VAR_32(static const short, tab_dct_4x4[][8]) =
 {
-    const int shift_1st = 1;
-
-    partialButterfly4(pSrc, pDst, shift_1st, 4);
-}
-#endif // INSTRSET < 5
-
-
-#if INSTRSET >= 5
+    { 64, 64, 64, 64, 64, 64, 64, 64},
+    { 83, 36, 83, 36, 83, 36, 83, 36},
+    { 64,-64, 64,-64, 64,-64, 64,-64},
+    { 36,-83, 36,-83, 36,-83, 36,-83},
+};
 void xDCT4(short *pSrc, short *pDst, intptr_t)
 {
     // Const
     __m128i c_1         = _mm_set1_epi32(1);
     __m128i c_128       = _mm_set1_epi32(128);
-    __m128i c16_64_64   = _mm_set1_epi32(0x00400040);
-    __m128i c16_n64_64  = _mm_set1_epi32(0xFFC00040);
-    __m128i c16_36_83   = _mm_set1_epi32(0x00240053);
-    __m128i c16_n83_36  = _mm_set1_epi32(0xFFAD0024);
-    __m128i c32_36_83   = _mm_set_epi32( 36, 83, 36, 83);
-    __m128i c32_n64_64  = _mm_set_epi32(-64, 64,-64, 64);
-    __m128i c32_n83_36  = _mm_set_epi32(-83, 36,-83, 36);
-
-    __m128i T20  = _mm_loadu_si128((__m128i *)&pSrc[0 * 4]); // [13 12 11 10 03 02 01 00]
-    __m128i T22  = _mm_loadu_si128((__m128i *)&pSrc[2 * 4]); // [33 32 31 30 23 22 21 20]
+
+    __m128i T20, T21;
+    __m128i T30, T31, T32, T33;
+    __m128i T40, T41, T50, T51, T60, T61, T62, T63, T70, T71, T72, T73;
+    __m128i T50_, T51_;
+
+    T20  = _mm_loadu_si128((__m128i *)&pSrc[0 * 4]); // [13 12 11 10 03 02 01 00]
+    T21  = _mm_loadu_si128((__m128i *)&pSrc[2 * 4]); // [33 32 31 30 23 22 21 20]
 
     // DCT1
-    __m128i T30  = _mm_shuffle_epi32(T20, 0xD8);
-    __m128i T31  = _mm_shuffle_epi32(T22, 0xD8);
-    __m128i T32  = _mm_shufflehi_epi16(T30, 0xB1);
-    __m128i T33  = _mm_shufflehi_epi16(T31, 0xB1);
-
-    __m128i T40  = _mm_unpacklo_epi64(T32, T33);
-    __m128i T41  = _mm_unpackhi_epi64(T32, T33);
-    __m128i T50  = _mm_add_epi16(T40, T41);             // [1+2 0+3]
-    __m128i T51  = _mm_sub_epi16(T40, T41);             // [1-2 0-3]
-    __m128i T60  = _mm_madd_epi16(c16_64_64,  T50);     // [ 64*s12 + 64*s03] = [03 02 01 00]
-    __m128i T61  = _mm_madd_epi16(c16_36_83,  T51);     // [ 36*d12 + 83*d03] = [13 12 11 10]
-    __m128i T62  = _mm_madd_epi16(c16_n64_64, T50);     // [-64*s12 + 64*s03] = [23 22 21 20]
-    __m128i T63  = _mm_madd_epi16(c16_n83_36, T51);     // [-83*d12 + 36*d03] = [33 32 31 30]
-    __m128i T70  = _mm_srai_epi32(_mm_add_epi32(c_1, T60), 1);  // [03 02 01 00]
-    __m128i T71  = _mm_srai_epi32(_mm_add_epi32(c_1, T61), 1);  // [13 12 11 10]
-    __m128i T72  = _mm_srai_epi32(_mm_add_epi32(c_1, T62), 1);  // [23 22 21 20]
-    __m128i T73  = _mm_srai_epi32(_mm_add_epi32(c_1, T63), 1);  // [33 32 31 30]
-
-    // DCT2
-    __m128i T80  = _mm_unpacklo_epi64(T70, T71);
-    __m128i T81_ = _mm_unpackhi_epi64(T70, T71);
-    __m128i T81  = _mm_shuffle_epi32(T81_, 0xB1);
-    __m128i T82  = _mm_unpacklo_epi64(T72, T73);
-    __m128i T83_ = _mm_unpackhi_epi64(T72, T73);
-    __m128i T83  = _mm_shuffle_epi32(T83_, 0xB1);
-    __m128i T90A = _mm_add_epi32(T80, T81);
-    __m128i T90B = _mm_add_epi32(T82, T83);
-    __m128i T91A = _mm_sub_epi32(T80, T81);
-    __m128i T91B = _mm_sub_epi32(T82, T83);
-    __m128i TA0A = _mm_slli_epi32(T90A, 6);
-    __m128i TA0B = _mm_slli_epi32(T90B, 6);
-    __m128i TA1A = _mm_mullo_epi32(c32_36_83,  T91A);
-    __m128i TA1B = _mm_mullo_epi32(c32_36_83,  T91B);
-    __m128i TA2A = _mm_mullo_epi32(c32_n64_64, T90A);
-    __m128i TA2B = _mm_mullo_epi32(c32_n64_64, T90B);
-    __m128i TA3A = _mm_mullo_epi32(c32_n83_36, T91A);
-    __m128i TA3B = _mm_mullo_epi32(c32_n83_36, T91B);
-    __m128i TB0  = _mm_hadd_epi32(TA0A, TA0B);
-    __m128i TB1  = _mm_hadd_epi32(TA1A, TA1B);
-    __m128i TB2  = _mm_hadd_epi32(TA2A, TA2B);
-    __m128i TB3  = _mm_hadd_epi32(TA3A, TA3B);
-    __m128i TC0  = _mm_srai_epi32(_mm_add_epi32(TB0, c_128), 8);
-    __m128i TC1  = _mm_srai_epi32(_mm_add_epi32(TB1, c_128), 8);
-    __m128i TC2  = _mm_srai_epi32(_mm_add_epi32(TB2, c_128), 8);
-    __m128i TC3  = _mm_srai_epi32(_mm_add_epi32(TB3, c_128), 8);
-    __m128i TD0  = _mm_packs_epi32(TC0, TC1);       // [13 12 11 10 03 02 01 00]
-    __m128i TD1  = _mm_packs_epi32(TC2, TC3);       // [33 32 31 30 23 22 21 20]
-
-    _mm_storeu_si128((__m128i*)&pDst[0 * 4], TD0);
-    _mm_storeu_si128((__m128i*)&pDst[2 * 4], TD1);
+    T30  = _mm_shuffle_epi32(T20, 0xD8);        // [13 12 03 02 11 10 01 00]
+    T31  = _mm_shuffle_epi32(T21, 0xD8);        // [33 32 23 22 31 30 21 20]
+    T32  = _mm_shufflehi_epi16(T30, 0xB1);      // [12 13 02 03 11 10 01 00]
+    T33  = _mm_shufflehi_epi16(T31, 0xB1);      // [32 33 22 23 31 30 21 20]
+
+    T40  = _mm_unpacklo_epi64(T32, T33);        // [31 30 21 20 11 10 01 00]
+    T41  = _mm_unpackhi_epi64(T32, T33);        // [32 33 22 23 12 13 02 03]
+    T50  = _mm_add_epi16(T40, T41);             // [1+2 0+3]
+    T51  = _mm_sub_epi16(T40, T41);             // [1-2 0-3]
+    T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_4x4[0])); // [ 64*s12 + 64*s03] = [03 02 01 00]
+    T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_4x4[1])); // [ 36*d12 + 83*d03] = [13 12 11 10]
+    T62  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_4x4[2])); // [-64*s12 + 64*s03] = [23 22 21 20]
+    T63  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_4x4[3])); // [-83*d12 + 36*d03] = [33 32 31 30]
+    T70  = _mm_srai_epi32(_mm_add_epi32(T60, c_1), 1);  // [30 20 10 00]
+    T71  = _mm_srai_epi32(_mm_add_epi32(T61, c_1), 1);  // [31 21 11 01]
+    T72  = _mm_srai_epi32(_mm_add_epi32(T62, c_1), 1);  // [32 22 12 02]
+    T73  = _mm_srai_epi32(_mm_add_epi32(T63, c_1), 1);  // [33 23 13 03]
+
+    // Transpose
+    T20  = _mm_packs_epi32(T70, T71);       // [13 12 11 10 03 02 01 00]
+    T21  = _mm_packs_epi32(T72, T73);       // [33 32 31 30 23 22 21 20]
+
+    T30  = _mm_shuffle_epi32(T20, 0xD8);        // [13 12 03 02 11 10 01 00]
+    T31  = _mm_shuffle_epi32(T21, 0xD8);        // [33 32 23 22 31 30 21 20]
+    T32  = _mm_shufflehi_epi16(T30, 0xB1);      // [12 13 02 03 11 10 01 00]
+    T33  = _mm_shufflehi_epi16(T31, 0xB1);      // [32 33 22 23 31 30 21 20]
+
+    T40  = _mm_unpacklo_epi64(T32, T33);        // [31 30 21 20 11 10 01 00]
+    T41  = _mm_unpackhi_epi64(T32, T33);        // [32 33 22 23 12 13 02 03]
+
+    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4x4[0]));
+    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4x4[0]));
+    T60  = _mm_add_epi32(T50_, T51_);
+    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4x4[1]));
+    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4x4[1]));
+    T61  = _mm_sub_epi32(T50_, T51_);
+    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4x4[2]));
+    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4x4[2]));
+    T62  = _mm_add_epi32(T50_, T51_);
+    T50_ = _mm_madd_epi16(T40, _mm_load_si128((__m128i*)tab_dct_4x4[3]));
+    T51_ = _mm_madd_epi16(T41, _mm_load_si128((__m128i*)tab_dct_4x4[3]));
+    T63  = _mm_sub_epi32(T50_, T51_);
+    
+    T70  = _mm_srai_epi32(_mm_add_epi32(T60, c_128), 8);  // [30 20 10 00]
+    T71  = _mm_srai_epi32(_mm_add_epi32(T61, c_128), 8);  // [31 21 11 01]
+    T72  = _mm_srai_epi32(_mm_add_epi32(T62, c_128), 8);  // [32 22 12 02]
+    T73  = _mm_srai_epi32(_mm_add_epi32(T63, c_128), 8);  // [33 23 13 03]
+
+    T20  = _mm_packs_epi32(T70, T71);       // [13 12 11 10 03 02 01 00]
+    T21  = _mm_packs_epi32(T72, T73);       // [33 32 31 30 23 22 21 20]
+
+    _mm_storeu_si128((__m128i*)&pDst[0 * 4], T20);
+    _mm_storeu_si128((__m128i*)&pDst[2 * 4], T21);
 }
-#endif // INSTRSET >= 5
-
-#if INSTRSET < 5
-void xIDCT4(short *pSrc, short *pDst, intptr_t stride)
-{
-    const int shift_1st = 7;
-    const int shift_2nd = 12;
-    ALIGN_VAR_32(Short, tmp[4 * 4]);
-    ALIGN_VAR_32(Short, tmp2[4 * 4]);
-
-    partialButterflyInverse4(pSrc, tmp, shift_1st, 4);
-    partialButterflyInverse4(tmp, tmp2, shift_2nd, 4);
-    for(int i=0; i<4; i++)
-    {
-        memcpy(&pDst[i * stride], &tmp2[i * 4], 4 * sizeof(short));
-    }
-}
-#endif // INSTRSET < 5
-
-#if INSTRSET >= 5
+
 ALIGN_VAR_32(static const short, tab_idct_4x4[4][8] )=
 {
     { 64,  64, 64,  64, 64,  64, 64,  64 },
@@ -3046,9 +3025,6 @@ void xIDCT4(short *pSrc, short *pDst, in
     m128Tmp1 = _mm_srai_epi32( m128Tmp1, 7  );       // Sum = Sum >> iShiftNum
     m128iA = _mm_packs_epi32( m128iA, m128Tmp1);
 
-
-
-
     m128iD = _mm_sub_epi32( E2, O2 );
     m128iD = _mm_srai_epi32( m128iD, 7  );         // Sum = Sum >> iShiftNum
 
@@ -3103,7 +3079,6 @@ void xIDCT4(short *pSrc, short *pDst, in
     _mm_storel_epi64( (__m128i*)&pDst[2 * stride], m128iD );
     _mm_storeh_pi   ( (__m64*  )&pDst[3 * stride], _mm_castsi128_ps(m128iD));
 }
-#endif // INSTRSET >= 5
 
 #if INSTRSET < 5
 void xDCT8(short *pSrc, short *pDst, intptr_t)
--- a/source/encoder/motion.cpp	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/encoder/motion.cpp	Thu Jun 13 12:35:25 2013 +0530
@@ -59,8 +59,14 @@ void MotionEstimate::setSourcePU(int off
     if (size_scale[0] == 0)
         init_scales();
 
+    blockOffset = offset;
+
+    /* copy PU block into cache */
+    primitives.cpyblock(width, height, fenc, FENC_STRIDE, fencplane + offset, fencLumaStride);
+
 #if SUBSAMPLE_SAD
     subsample = 0;
+    fencSad = fenc;
     if (height > 12)
     {
         partEnum = PartitionFromSizes(width, height / 2);
@@ -72,6 +78,10 @@ void MotionEstimate::setSourcePU(int off
         partEnum = PartitionFromSizes(width, height);
         bufsad = primitives.sad[partEnum];
         satd = primitives.satd[partEnum];
+
+        /* Make sub-sampled copy of fenc block at `fencSad' for SAD calculations */
+        fencSad = fenc + height * FENC_STRIDE;
+        primitives.cpyblock(width, height / 2, fencSad, FENC_STRIDE, fenc, FENC_STRIDE * 2);
     }
     else
 #endif // if SUBSAMPLE_SAD
@@ -82,25 +92,6 @@ void MotionEstimate::setSourcePU(int off
         sad_x3 = primitives.sad_x3[partEnum];
         sad_x4 = primitives.sad_x4[partEnum];
     }
-
-    blockOffset = offset;
-
-    /* copy block into local buffer */
-    pixel *fencblock = fencplane + offset;
-    primitives.cpyblock(width, height, fenc, FENC_STRIDE, fencblock, fencLumaStride);
-#if SUBSAMPLE_SAD
-    if (subsample)
-    {
-        /* Make sub-sampled copy of fenc block at `fencSad' for SAD calculations */
-        fencSad = fenc + height * FENC_STRIDE;
-        primitives.cpyblock(width, height / 2, fencSad, FENC_STRIDE, fenc, FENC_STRIDE * 2);
-    }
-    else
-    {
-        /* Else use non-sub-sampled fenc block for SAD */
-        fencSad = fenc;
-    }
-#endif // if SUBSAMPLE_SAD
 }
 
 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
--- a/source/test/testbench.cpp	Thu Jun 13 12:33:23 2013 +0530
+++ b/source/test/testbench.cpp	Thu Jun 13 12:35:25 2013 +0530
@@ -121,7 +121,7 @@ int main(int argc, char *argv[])
     memset(&cprim, 0, sizeof(EncoderPrimitives));
     Setup_C_Primitives(cprim);
 
-    for (int i = 2; i <= cpuid; i++)
+    for (int i = 6; i <= cpuid; i++)
     {
 #if ENABLE_VECTOR_PRIMITIVES
         EncoderPrimitives vecprim;
Binary file source/tools/HM decoder/TAppDecoder.exe has changed