changeset 2183:318e10a6a50d

Merged in deepthidevaki/xhevc_deepthid (pull request #191) Optimized residual and reconstruction in xIntraCoding[Luma/Chroma]Blk
author Steve Borho <steve@borho.org>
date Tue, 11 Jun 2013 11:36:18 -0500
parents f09d5d9cf0c3 (current diff) 8e690d36e169 (diff)
children a3bf4f55f789
files source/common/vec/pixel8.inc
diffstat 7 files changed, 365 insertions(+-), 107 deletions(-) [+]
line wrap: on
line diff
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jun 11 11:36:18 2013 -0500
@@ -958,24 +958,8 @@ Void TEncSearch::xIntraCodingLumaBlk(TCo
     }
 
     //===== get residual signal =====
-    {
-        // get residual
-        Pel*  pOrg    = piOrg;
-        Pel*  pPred   = piPred;
-        Short* pResi  = piResi;
-        //TODO : performance primitive?
-        for (UInt uiY = 0; uiY < uiHeight; uiY++)
-        {
-            for (UInt uiX = 0; uiX < uiWidth; uiX++)
-            {
-                pResi[uiX] = static_cast<Short>(pOrg[uiX]) - static_cast<Short>(pPred[uiX]);
-            }
-
-            pOrg  += uiStride;
-            pResi += uiStride;
-            pPred += uiStride;
-        }
-    }
+
+    primitives.getResidue((pixel*)piOrg,(pixel*)piPred,piResi, uiHeight, uiWidth, uiStride);
 
     //===== transform and quantization =====
     //--- init rate estimation arrays for RDOQ ---
@@ -1013,30 +997,9 @@ Void TEncSearch::xIntraCodingLumaBlk(TCo
         }
     }
 
-    //===== reconstruction =====
-    {
-        Pel*   pPred      = piPred;
-        Short* pResi      = piResi;
-        Pel*   pReco      = piReco;
-        Short* pRecQt     = piRecQt;
-        Pel*   pRecIPred  = piRecIPred;
-        //TODO : performance primitive?
-        for (UInt uiY = 0; uiY < uiHeight; uiY++)
-        {
-            for (UInt uiX = 0; uiX < uiWidth; uiX++)
-            {
-                pReco[uiX] = ClipY(static_cast<Short>(pPred[uiX]) + pResi[uiX]);
-                pRecQt[uiX] = (Short)pReco[uiX];
-                pRecIPred[uiX] = pReco[uiX];
-            }
-
-            pPred     += uiStride;
-            pResi     += uiStride;
-            pReco     += uiStride;
-            pRecQt    += uiRecQtStride;
-            pRecIPred += uiRecIPredStride;
-        }
-    }
+    //===== reconstruction =====  
+
+    primitives.calcRecons((pixel*)piPred, piResi, (pixel*)piReco, piRecQt, (pixel*)piRecIPred, uiStride, uiRecQtStride, uiRecIPredStride, uiHeight, uiWidth);
 
     //===== update distortion =====
     int Part = PartitionFromSizes(uiWidth, uiHeight);
@@ -1114,18 +1077,9 @@ Void TEncSearch::xIntraCodingChromaBlk(T
         if (default0Save1Load2 == 1)
         {
             Pel*  pPred   = piPred;
-            Pel*  pPredBuf = m_pSharedPredTransformSkip[1 + uiChromaId];
-            Int k = 0;
-            //TODO : performance primitive???
-            for (UInt uiY = 0; uiY < uiHeight; uiY++)
-            {
-                for (UInt uiX = 0; uiX < uiWidth; uiX++)
-                {
-                    pPredBuf[k++] = pPred[uiX];
-                }
-
-                pPred += uiStride;
-            }
+            Pel*  pPredBuf = m_pSharedPredTransformSkip[1 + uiChromaId];            
+            
+            primitives.cpyblock(uiWidth,uiHeight,pPredBuf,uiWidth, pPred, uiStride);
         }
     }
     else
@@ -1133,37 +1087,12 @@ Void TEncSearch::xIntraCodingChromaBlk(T
         // load prediction
         Pel*  pPred   = piPred;
         Pel*  pPredBuf = m_pSharedPredTransformSkip[1 + uiChromaId];
-        Int k = 0;
-        //TODO : performance primitive???
-        for (UInt uiY = 0; uiY < uiHeight; uiY++)
-        {
-            for (UInt uiX = 0; uiX < uiWidth; uiX++)
-            {
-                pPred[uiX] = pPredBuf[k++];
-            }
-
-            pPred += uiStride;
-        }
+        
+        primitives.cpyblock(uiWidth,uiHeight,pPred,uiStride,pPredBuf,uiWidth);
     }
-    //===== get residual signal =====
-    {
-        // get residual
-        Pel*  pOrg    = piOrg;
-        Pel*  pPred   = piPred;
-        Short*  pResi   = piResi;
-        //TODO : performance primitive???
-        for (UInt uiY = 0; uiY < uiHeight; uiY++)
-        {
-            for (UInt uiX = 0; uiX < uiWidth; uiX++)
-            {
-                pResi[uiX] = static_cast<Short>(pOrg[uiX]) - static_cast<Short>(pPred[uiX]);
-            }
-
-            pOrg  += uiStride;
-            pResi += uiStride;
-            pPred += uiStride;
-        }
-    }
+    //===== get residual signal =====    
+
+    primitives.getResidue((pixel*)piOrg,(pixel*)piPred,piResi, uiHeight, uiWidth, uiStride);
 
     //===== transform and quantization =====
     {
@@ -1211,29 +1140,8 @@ Void TEncSearch::xIntraCodingChromaBlk(T
     }
 
     //===== reconstruction =====
-    {
-        Pel* pPred      = piPred;
-        Short* pResi      = piResi;
-        Pel* pReco      = piReco;
-        Short* pRecQt     = piRecQt;
-        Pel* pRecIPred  = piRecIPred;
-        //TODO : performance primitive???
-        for (UInt uiY = 0; uiY < uiHeight; uiY++)
-        {
-            for (UInt uiX = 0; uiX < uiWidth; uiX++)
-            {
-                pReco[uiX] = ClipC(static_cast<Short>(pPred[uiX]) + pResi[uiX]);
-                pRecQt[uiX] = (Short)pReco[uiX];
-                pRecIPred[uiX] = pReco[uiX];
-            }
-
-            pPred     += uiStride;
-            pResi     += uiStride;
-            pReco     += uiStride;
-            pRecQt    += uiRecQtStride;
-            pRecIPred += uiRecIPredStride;
-        }
-    }
+    
+    primitives.calcRecons((pixel*)piPred, piResi, (pixel*)piReco, piRecQt, (pixel*)piRecIPred, uiStride, uiRecQtStride, uiRecIPredStride, uiHeight, uiWidth);
 
     //===== update distortion =====
     int Part = x265::PartitionFromSizes(uiWidth, uiHeight);
--- a/source/common/pixel.cpp	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/common/pixel.cpp	Tue Jun 11 11:36:18 2013 -0500
@@ -24,8 +24,13 @@
  *****************************************************************************/
 
 #include "primitives.h"
+#include <algorithm>
 #include <stdlib.h> // abs()
 
+template<typename T> // Clamp helper: returns x limited to the closed range [0, (1 << 8) - 1], i.e. [0, 255].
+inline T ClipY(T x) { return std::min<T>(T((1 << 8) - 1), std::max<T>(T(0), x)); } // NOTE(review): upper bound is hard-coded for 8-bit samples — confirm for HIGH_BIT_DEPTH builds
+
+
 #define SET_FUNC_PRIMITIVE_TABLE_C(FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
     p. FUNC_PREFIX [PARTITION_4x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF < 4, 4, DATA_TYPE1, DATA_TYPE2 >;  \
     p. FUNC_PREFIX [PARTITION_4x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF < 4, 8, DATA_TYPE1, DATA_TYPE2 >;  \
@@ -484,6 +489,53 @@ void CDECL convert32to16_shr(short *piDs
         piDst[i] = (short)(psOrg[i] >> shift);
     }
 }
+
+void CDECL getResidualIntra(pixel *piOrg, pixel *piPred, short *piResi, int height, int width, int stride)
+    {
+        // C reference residual: piResi[y][x] = piOrg[y][x] - piPred[y][x] over a width x height block.
+        pixel*  pOrg    = piOrg;
+        pixel*  pPred   = piPred;
+        short* pResi  = piResi;
+        // All three buffers are advanced by the same 'stride', counted in elements (pointer arithmetic), once per row.
+        for (int uiY = 0; uiY < height; uiY++)
+        {
+            for (int uiX = 0; uiX < width; uiX++)
+            {
+                pResi[uiX] = static_cast<short>(pOrg[uiX]) - static_cast<short>(pPred[uiX]); // widen both samples to short before subtracting
+            }
+
+            pOrg  += stride;
+            pResi += stride;
+            pPred += stride;
+        }
+    }
+
+
+void CDECL calcRecons(pixel* piPred, short* piResi,pixel*  piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth)
+    { // C reference reconstruction: reads piPred and piResi, writes piReco, piRecQt and piRecIPred for a uiWidth x uiHeight block.
+        pixel*   pPred      = piPred;
+        short* pResi      = piResi;
+        pixel*   pReco      = piReco;
+        short* pRecQt     = piRecQt;
+        pixel*   pRecIPred  = piRecIPred;
+        // Per sample: reco = ClipY(pred + resi); the same value is then stored to all three outputs.
+        for (int uiY = 0; uiY < uiHeight; uiY++)
+        {
+            for (int uiX = 0; uiX < uiWidth; uiX++)
+            {
+                pReco[uiX] = (pixel) ClipY(static_cast<short>(pPred[uiX]) + pResi[uiX]); // sum is clamped by ClipY before narrowing to pixel
+                pRecQt[uiX] = (short)pReco[uiX]; // same clipped value, stored as short
+                pRecIPred[uiX] = pReco[uiX]; // same clipped value, stored as pixel
+            }
+
+            pPred     += uiStride; // pPred, pResi and pReco share uiStride
+            pResi     += uiStride;
+            pReco     += uiStride;
+            pRecQt    += uiRecQtStride; // pRecQt and pRecIPred each have their own stride
+            pRecIPred += uiRecIPredStride;
+        }
+    }
+
 }  // end anonymous namespace
 
 namespace x265 {
@@ -809,5 +861,8 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sa8d_16x16 = pixel_sa8d_16x16;
     p.sa8d_32x32 = pixel_sa8d_32x32;
     p.sa8d_64x64 = pixel_sa8d_64x64;
+
+    p.getResidue = getResidualIntra;
+    p.calcRecons = calcRecons;
 }
 }
--- a/source/common/primitives.h	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/common/primitives.h	Tue Jun 11 11:36:18 2013 -0500
@@ -201,6 +201,9 @@ typedef void (CDECL * cvt16to32_shl_t)(i
 typedef void (CDECL * cvt32to16_t)(int *psOrg, short *piDst, int);
 typedef void (CDECL * cvt32to16_shr_t)(short *piDst, int *psOrg, int, int);
 typedef void (CDECL * dct_t)(short *pSrc, short *pDst, intptr_t stride);
+typedef void (CDECL * getResidue_t)(pixel *piOrig, pixel *piPred, short *piRes, int height, int width, int stride); // residual primitive: two pixel inputs, one short output, a single stride for all three
+typedef void (CDECL * calcRecons_t)(pixel* piPred, short* piResi,pixel*  piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth); // reconstruction primitive: note the order is height then width, and three separate strides
+
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -240,6 +243,8 @@ struct EncoderPrimitives
     cvt16to32_shl_t cvt16to32_shl;
     cvt32to16_t cvt32to16;
     cvt32to16_shr_t cvt32to16_shr;
+    getResidue_t getResidue;
+    calcRecons_t calcRecons;
 };
 
 /* This copy of the table is what gets used by all by the encoder.
--- a/source/common/vec/pixel.inc	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/common/vec/pixel.inc	Tue Jun 11 11:36:18 2013 -0500
@@ -231,5 +231,10 @@ void NAME(Setup_Vec_PixelPrimitives)(Enc
     p.cvt16to32_shl = convert16to32_shl;
     p.cvt32to16     = convert32to16;
     p.cvt32to16_shr = convert32to16_shr;
+
+#if !HIGH_BIT_DEPTH
+    p.getResidue = getResidualIntra;
+    p.calcRecons = calcRecons;
+#endif 
 }
 }
--- a/source/common/vec/pixel8.inc	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/common/vec/pixel8.inc	Tue Jun 11 11:36:18 2013 -0500
@@ -1715,3 +1715,186 @@ void CDECL sad_x4_64(pixel *piOrg, pixel
     res[2] = horizontal_add(sum3);
     res[3] = horizontal_add(sum4);
 }
+
+void CDECL getResidualIntra(pixel *piOrg, pixel *piPred, short *piResi, int height, int width, int stride)
+{
+        // Vector residual (piResi = piOrg - piPred), with separate paths for width 4, width 8, and any other width in steps of 16.
+        pixel*  pOrg    = piOrg;
+        pixel*  pPred   = piPred;
+        short* pResi  = piResi;
+        
+        Vec8s v_pOrg, v_pPred, v_pResilo, v_pResihi; // 16-bit working vectors (presumably 8 lanes each — confirm against the vector class header)
+        Vec16uc v_temp1, v_temp2; // raw pixel loads before widening
+        if(width == 4) // narrow path: partial load of 4 from each input row
+        {
+        for (int uiY = 0; uiY < height; uiY++)
+        {
+                v_temp1 = load_partial(const_int(4), pOrg);
+                
+                v_pOrg = extend_low(v_temp1); // presumably widens the low half of the bytes to 16-bit lanes — confirm
+                v_temp2 = load_partial(const_int(4), pPred);
+                v_pPred = extend_low(v_temp2);
+
+                v_pResilo = v_pOrg - v_pPred;
+                store_partial(const_int(8), pResi, v_pResilo); // NOTE(review): count of 8 presumably means bytes (4 shorts) — confirm store_partial semantics
+
+            pOrg  += stride;
+            pResi += stride;
+            pPred += stride;
+        }
+        }
+        else if(width == 8) // NOTE(review): .load() on a Vec16uc presumably reads 16 bytes while only 8 are used — confirm reading past the row is safe
+        {
+            for (int uiY = 0; uiY < height; uiY++)
+            {
+                // Scalar equivalent: pResi[uiX] = static_cast<short>(pOrg[uiX]) - static_cast<short>(pPred[uiX]);
+                v_temp1.load(pOrg);
+                v_pOrg = extend_low(v_temp1);
+                v_temp2.load(pPred);
+                v_pPred = extend_low(v_temp2);
+
+                v_pResilo = v_pOrg - v_pPred;
+                v_pResilo.store(pResi);
+
+                pOrg  += stride;
+                pResi += stride;
+                pPred += stride;
+            }
+            
+        }
+        else // every other width: 16 columns per inner iteration
+        {
+        for (int uiY = 0; uiY < height; uiY++)
+        {
+            for (int uiX = 0; uiX < width; uiX+=16) // NOTE(review): assumes width is a multiple of 16 — other widths would touch columns beyond 'width'; confirm callers
+            {
+                v_temp1.load(pOrg+uiX);
+                v_pOrg = extend_low(v_temp1);
+                v_temp2.load(pPred+uiX);
+                v_pPred = extend_low(v_temp2);
+
+                v_pResilo = v_pOrg - v_pPred;
+                v_pResilo.store(pResi+uiX); // first half of the 16-column group
+
+                v_pOrg = extend_high(v_temp1);
+                v_pPred = extend_high(v_temp2);
+                v_pResihi = v_pOrg - v_pPred;
+                v_pResihi.store(pResi+uiX+8); // second half, written 8 elements further on
+
+            }
+
+            pOrg  += stride;
+            pResi += stride;
+            pPred += stride;
+        }
+        }
+
+}
+
+
+void CDECL calcRecons(pixel* piPred, short* piResi,pixel*  piReco, short* piRecQt, pixel* piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth)
+    { // Vector reconstruction: reads piPred and piResi, writes piReco, piRecQt and piRecIPred.
+        pixel*   pPred      = piPred;
+        short* pResi      = piResi;
+        pixel*   pReco      = piReco;
+        short* pRecQt     = piRecQt;
+        pixel*   pRecIPred  = piRecIPred;
+        // Three paths selected on uiWidth: 4, 8, or any other width in steps of 16. Each clamps pred + resi to [0, 255].
+
+        if(uiWidth == 4)
+        {
+            for (int uiY = 0; uiY < uiHeight; uiY++)
+            {
+                    Vec8s vresi, vpred, vres, vsum; // note: vres is declared but never used in this path
+                    Vec16uc tmp;
+
+                    tmp = load_partial(const_int(4), pPred );
+                    vpred = extend_low(tmp); // presumably widens the low bytes to 16-bit lanes — confirm
+                
+                    vresi = load_partial(const_int(8), pResi);
+                    vsum = vpred + vresi;
+                
+                    vsum = min(255, max(vsum, 0)); // clamp to [0, 255]; the limit is a literal, not derived from a bit depth
+
+                    store_partial(const_int(8), pRecQt, vsum); // clamped values kept at 16-bit width for pRecQt
+
+                    tmp = compress(vsum, vsum); // presumably narrows the 16-bit lanes back to bytes — confirm
+
+                    store_partial(const_int(4), pReco, tmp);
+                    store_partial(const_int(4), pRecIPred, tmp);
+
+                pPred     += uiStride;
+                pResi     += uiStride;
+                pReco     += uiStride;
+                pRecQt    += uiRecQtStride;
+                pRecIPred += uiRecIPredStride;
+            }
+
+        }
+        else if(uiWidth == 8) // NOTE(review): tmp.load() on a Vec16uc presumably reads 16 bytes of pPred while only 8 are used — confirm this is safe
+        {
+            for (int uiY = 0; uiY < uiHeight; uiY++)
+            {
+                    Vec8s vresi, vpred, vres, vsum; // note: vres is declared but never used in this path
+                    Vec16uc tmp;
+
+                    tmp.load(pPred);
+                    vpred = extend_low(tmp);
+                
+                    vresi.load(pResi);
+                    vsum = vpred + vresi;
+                
+                    vsum = min(255, max(vsum, 0)); // clamp to [0, 255]
+
+                    vsum.store(pRecQt);
+
+                    tmp = compress(vsum, vsum);
+
+                    store_partial(const_int(8), pReco, tmp);
+                    store_partial(const_int(8), pRecIPred, tmp);
+
+                pPred     += uiStride;
+                pResi     += uiStride;
+                pReco     += uiStride;
+                pRecQt    += uiRecQtStride;
+                pRecIPred += uiRecIPredStride;
+            }
+
+        }
+        else // every other width: 16 columns per inner iteration
+        {
+            for (int uiY = 0; uiY < uiHeight; uiY++)
+            {
+                for (int uiX = 0; uiX < uiWidth; uiX+=16) // NOTE(review): assumes uiWidth is a multiple of 16 — confirm callers
+                {
+                    Vec8s vresi, vpred, vres, vsum1, vsum2; // note: vres is declared but never used in this path
+                    Vec16uc tmp;
+
+                    tmp.load(pPred + uiX);
+
+                    vpred = extend_low(tmp);
+                    vresi.load(pResi+uiX);
+                    vsum1 = vpred + vresi;
+                    vsum1 = min(255, max(vsum1, 0));
+                    vsum1.store(pRecQt + uiX); // first half of the group
+
+                    vpred = extend_high(tmp);
+                    vresi.load(pResi+uiX + 8);
+                    vsum2 = vpred + vresi;
+                    vsum2 = min(255, max(vsum2, 0));
+                    vsum2.store(pRecQt + uiX + 8); // second half, 8 elements further on
+
+                    tmp = compress(vsum1, vsum2); // presumably packs both halves into one 16-byte vector — confirm
+                    tmp.store(pReco + uiX);
+                    tmp.store(pRecIPred + uiX);
+                }
+
+                pPred     += uiStride;
+                pResi     += uiStride;
+                pReco     += uiStride;
+                pRecQt    += uiRecQtStride;
+                pRecIPred += uiRecIPredStride;
+            }
+        }
+    }
+
--- a/source/test/pixelharness.cpp	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/test/pixelharness.cpp	Tue Jun 11 11:36:18 2013 -0500
@@ -268,6 +268,69 @@ bool PixelHarness::check_block_copy_p_s(
     return true;
 }
 
+bool PixelHarness::check_getResidue(x265::getResidue_t ref, x265::getResidue_t opt)
+{     // Runs ref and opt on the same inputs and returns false on the first mismatch of their output buffers.
+    ALIGN_VAR_16(short, ref_dest[64 * 64]);
+    ALIGN_VAR_16(short, opt_dest[64 * 64]); 
+    memset(ref_dest,0,64*64*sizeof(short));
+    memset(opt_dest,0,64*64*sizeof(short));
+    int j = 0; // offset into the input buffers, advanced by 100 after every iteration
+    for (int i = 0; i <= 100; i++) // 101 iterations, so j reaches 10000 — NOTE(review): confirm pbuf1/pbuf2 are large enough
+    {
+        int height = 4<<(i%4); // cycles 4, 8, 16, 32 (random sizes disabled: rand()%64 + 1)
+        int width = 4<<(i%4); // always equal to height, so only square blocks are tested (random sizes disabled: rand()%64 + 1)
+        int stride = 64;
+        opt(pbuf1 + j, pbuf2+ j, opt_dest, height,width, stride);
+        ref(pbuf1 + j, pbuf2+ j, ref_dest, height,width, stride);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short))) // compares the whole buffers, which are not cleared between iterations
+            return false;
+
+        j += 100;
+        
+    }
+
+    return true;
+}
+
+bool PixelHarness::check_calcRecons(x265::calcRecons_t ref, x265::calcRecons_t opt)
+{     // Runs ref and opt on the same inputs and returns false on the first mismatch in any of the three output buffers.
+    ALIGN_VAR_16(short, ref_recq[64 * 64]);
+    ALIGN_VAR_16(short, opt_recq[64 * 64]);
+
+    ALIGN_VAR_16(pixel, ref_reco[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_reco[64 * 64]);
+
+    ALIGN_VAR_16(pixel, ref_pred[64 * 64]); // despite the name, passed below as the fifth argument, i.e. an output buffer
+    ALIGN_VAR_16(pixel, opt_pred[64 * 64]); // likewise an output buffer
+
+    memset(ref_recq,0,64*64*sizeof(short));
+    memset(opt_recq,0,64*64*sizeof(short));
+    memset(ref_reco,0,64*64*sizeof(pixel));
+    memset(opt_reco,0,64*64*sizeof(pixel));
+    memset(ref_pred,0,64*64*sizeof(pixel));
+    memset(opt_pred,0,64*64*sizeof(pixel));
+
+    int j = 0; // offset into the input buffers, advanced by 100 after every iteration
+    for (int i = 0; i <= 100; i++) // 101 iterations, so j reaches 10000 — NOTE(review): confirm pbuf1/sbuf1 are large enough
+    {
+        int height = 4<<(i%4); // cycles 4, 8, 16, 32 (random sizes disabled: rand()%64 + 1)
+        int width = 4<<(i%4); // always equal to height, so only square blocks are tested (random sizes disabled: rand()%64 + 1)
+        int stride = 64; // the same value is passed for all three stride arguments
+        opt(pbuf1 + j, sbuf1+ j, opt_reco, opt_recq, opt_pred, stride, stride, stride, height, width );
+        ref(pbuf1 + j, sbuf1+ j, ref_reco, ref_recq, ref_pred, stride, stride, stride, height, width );
+
+        if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(short)) || memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)) || memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        j += 100;
+        
+    }
+
+    return true;
+}
+
+
 bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
@@ -407,6 +470,24 @@ bool PixelHarness::testCorrectness(const
             return false;
         }
     }
+
+    if (opt.getResidue)
+    {
+        if (!check_getResidue(ref.getResidue, opt.getResidue))
+        {
+            printf("getResidue failed!\n");
+            return false;
+        }
+    }
+
+    if (opt.calcRecons)
+    {
+        if (!check_calcRecons(ref.calcRecons, opt.calcRecons))
+        {
+            printf("calcRecon failed!\n");
+            return false;
+        }
+    }
     return true;
 }
 
@@ -507,4 +588,22 @@ void PixelHarness::measureSpeed(const En
         printf("s_c   cpy");
         REPORT_SPEEDUP(opt.cpyblock_s_c, ref.cpyblock_s_c, 64, 64, (short*)pbuf1, FENC_STRIDE, (uint8_t*)pbuf2, STRIDE);
     }
+
+    if (opt.getResidue)
+    {
+        for(int i =4; i<=32; i<<=1)
+        {
+            printf("getResidue%dx%d", i,i);
+            REPORT_SPEEDUP(opt.getResidue, ref.getResidue, pbuf1 , pbuf2, sbuf1, i, i, 64);
+        }
+    }
+
+    if (opt.calcRecons)
+    {
+        for(int i =4; i<=32; i<<=1)
+        {
+            printf("calcRecons%dx%d", i,i);
+            REPORT_SPEEDUP(opt.calcRecons, ref.calcRecons, pbuf1 , sbuf1, pbuf2, sbuf1, pbuf1, 64,64,64, i, i);
+        }
+    }
 }
--- a/source/test/pixelharness.h	Tue Jun 11 12:52:49 2013 +0530
+++ b/source/test/pixelharness.h	Tue Jun 11 11:36:18 2013 -0500
@@ -44,6 +44,9 @@ protected:
     bool check_block_copy_s_p(x265::blockcpy_s_p ref, x265::blockcpy_s_p opt);
     bool check_block_copy_p_s(x265::blockcpy_p_s ref, x265::blockcpy_p_s opt);
     bool check_block_copy_s_c(x265::blockcpy_s_c ref, x265::blockcpy_s_c opt);
+    bool check_getResidue(x265::getResidue_t ref, x265::getResidue_t opt);
+    bool check_calcRecons(x265::calcRecons_t ref, x265::calcRecons_t opt);
+
 
 public: