changeset 2255:95b415adeffa

Merged in deepthidevaki/xhevc_deepthid (pull request #196) Modifications to vertical filter multiplane
author Steve Borho <steve@borho.org>
date Thu, 13 Jun 2013 14:37:59 -0500
parents 9123ea04e339 (current diff) 65768d985016 (diff)
children 5b2b4a7090bb
files source/common/primitives.h
diffstat 7 files changed, 216 insertions(+-), 0 deletions(-) [+]
line wrap: on
line diff
--- a/source/VectorClass/vectori128.h	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/VectorClass/vectori128.h	Thu Jun 13 14:37:59 2013 -0500
@@ -4520,6 +4520,21 @@ static ALWAYSINLINE Vec8s compress (Vec4
 #endif
 }
 
+// Function compress_unsafe : packs two vectors of 32-bit integers into one vector of 16-bit integers
+// Does no range handling of its own: the two code paths below only agree for elements within 0..65535
+static ALWAYSINLINE Vec8s compress_unsafe (Vec4i const & low, Vec4i const & high) {
+#if INSTRSET >= 5   // SSE4.1 supported
+    return  _mm_packus_epi32(low,high);                    // one unsigned pack instruction does it all
+#else
+    // older targets: move the low 16-bit word of every dword to the front and join the two halves
+    __m128i lowWords  = _mm_shufflehi_epi16(_mm_shufflelo_epi16(low,0xD8),0xD8);    // low words first in each 64-bit half
+    __m128i highWords = _mm_shufflehi_epi16(_mm_shufflelo_epi16(high,0xD8),0xD8);   // low words first in each 64-bit half
+    __m128i lowPacked  = _mm_shuffle_epi32(lowWords,0xD8);                          // low dwords of low  to pos. 0 and 32
+    __m128i highPacked = _mm_shuffle_epi32(highWords,0xD8);                         // low dwords of high to pos. 0 and 32
+    return  _mm_unpacklo_epi64(lowPacked,highPacked);                               // interleave
+#endif
+}
+
 // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers
 // Signed with saturation
 static ALWAYSINLINE Vec8s compress_saturated (Vec4i const & low, Vec4i const & high) {
--- a/source/common/ipfilter.cpp	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/common/ipfilter.cpp	Thu Jun 13 14:37:59 2013 -0500
@@ -25,6 +25,7 @@
 #include "primitives.h"
 #include <cstring>
 #include <assert.h>
+#include "TLibCommon/TComPrediction.h"
 
 #if _MSC_VER
 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
@@ -305,7 +306,17 @@ void filterVertical_pel_pel(int bitDepth
         dst += dstStride;
     }
 }
+// C reference for the multiplane vertical filter: plane A comes from filterConvertShortToPel, planes E, I and P from the 8-tap vertical filter with m_lumaFilter rows 1, 2 and 3
+void CDECL filterVertical_short_pel_multiplane(int bitDepth, short *src, int srcStride, pixel *dstA, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height)
+{
+    filterConvertShortToPel(bitDepth, src, srcStride, dstA, dstStride, block_width, block_height);
+    filterVertical_short_pel<8>(bitDepth, src, srcStride, dstI, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[2]);
+    filterVertical_short_pel<8>(bitDepth, src, srcStride, dstE, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[1]);
+    filterVertical_short_pel<8>(bitDepth, src, srcStride, dstP, dstStride, block_width, block_height, TComPrediction::m_lumaFilter[3]);
 }
+
+}
+
 #if _MSC_VER
 #pragma warning(default: 4127) // conditional expression is constant, typical for templated functions
 #pragma warning(default: 4100)
@@ -327,5 +338,7 @@ void Setup_C_IPFilterPrimitives(EncoderP
 
     p.ipFilter_p_p[FILTER_V_P_P_8] = filterVertical_pel_pel<8>;
     p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
+
+    p.filterVmulti = filterVertical_short_pel_multiplane;
 }
 }
--- a/source/common/primitives.h	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/common/primitives.h	Thu Jun 13 14:37:59 2013 -0500
@@ -203,6 +203,7 @@ typedef void (CDECL * cvt32to16_shr_t)(s
 typedef void (CDECL * dct_t)(short *pSrc, short *pDst, intptr_t stride);
 typedef void (CDECL * getResidue_t)(pixel *piOrig, pixel *piPred, short *piRes, int height, int width, int stride);
 typedef void (CDECL * calcRecons_t)(pixel* piPred, short* piResi,pixel*  piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride, int uiHeight, int uiWidth);
+typedef void (CDECL * filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstA, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height); // vertical filter that fills four planes (A, E, I, P) per call, all with the same dstStride
 
 
 /* Define a structure containing function pointers to optimized encoder
@@ -245,6 +246,7 @@ struct EncoderPrimitives
     cvt32to16_shr_t cvt32to16_shr;
     getResidue_t getResidue;
     calcRecons_t calcRecons;
+    filterVmulti_t filterVmulti;
 };
 
 /* This copy of the table is what gets used by all by the encoder.
--- a/source/common/vec/ipfilter.inc	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/common/vec/ipfilter.inc	Thu Jun 13 14:37:59 2013 -0500
@@ -39,5 +39,9 @@ void NAME(Setup_Vec_IPFilterPrimitives)(
     p.ipFilter_s_p[FILTER_V_S_P_4] = filterVertical_short_pel<4>;
     p.ipFilter_p_p[FILTER_V_P_P_8] = filterVertical_pel_pel<8>;
     p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
+
+#if !HIGH_BIT_DEPTH
+     p.filterVmulti = filterVertical_short_pel_multiplane;
+#endif 
 }
 }
--- a/source/common/vec/ipfilter8.inc	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/common/vec/ipfilter8.inc	Thu Jun 13 14:37:59 2013 -0500
@@ -210,6 +210,118 @@ void CDECL filterVertical_short_pel(int 
     }
 }
 
+/*
+    Please refer to Fig. 7 of the HEVC Overview document for the naming convention of the variables.
+    Input: sub-pel samples from the horizontal filter - 'src'
+    Output: all planes in the corresponding column - 'dst<A|E|I|P>'
+*/
+// PROCESSROW: emits one row of planes A, I, E and P for 4 columns; it reads and writes the caller's locals (val, tmp, exp1..6, sume/sumi/sump, row, col, offset, ...) by name
+#define PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) { \
+        /*  plane A: row a3 brought back to pixel range, clamped to 0..255 and stored */  \
+        val = (a3 + IF_INTERNAL_OFFS + 32) >> 6; \
+        val = max(val, 0); \
+        val = min(val, 255); \
+        tmp = compress_unsafe(val, 0); \
+        compress_unsafe(tmp, 0).store_partial(4, dstA + row * dstStride + col); \
+        /*  load source row (row + 7) into the last macro argument */ \
+        tmp.load(src + col + (row + 7) * cstride); \
+        a7 = extend_low(tmp); \
+        /*  calculation
+            (exp1..exp6 are partial sums shared between the three filters)
+            The coefficients for different planes are :
+            e:    { -1, 4, -10, 58, 17,  -5, 1,  0 },
+            i:    { -1, 4, -11, 40, 40, -11, 4, -1 },
+            p:    {  0, 1,  -5, 17, 58, -10, 4, -1 }
+            Thus the expressions are:
+            sume = 4*a1 -a0 - 10*a2 + 58*a3 + 17*a4 -  5*a5 +   a6     ;
+            sumi = 4*a1 -a0 - 11*a2 + 40*a3 + 40*a4 - 11*a5 + 4*a6  -a7;
+            sump =   a1      - 5*a2 + 17*a3 + 58*a4 - 10*a5 + 4*a6  -a7;
+            */\
+        exp1 = (a1 << 2) - a0 - 10 * a2; \
+        exp2 = 40 * a3; \
+        exp3 = 17 * a3; \
+        exp4 = 17 * a4; \
+        exp5 = 40 * a4; \
+        exp6 = (a6 << 2) - a7 - 10 * a5; \
+            \
+        sume = exp1 + exp2 + exp3 + a3 + exp4 - 5 * a5 +   a6; \
+        sumi = exp1 - a2 + exp2 + exp5 + exp6 -   a5; \
+        sump = a1 - 5 * a2 + exp3 + exp4 + exp5 + a4 + exp6; \
+            \
+/* store results: each sum is rounded with 'offset', shifted right by 12 and clamped to 0..255; plane I first, then E, then P */ \
+        sumi = (sumi + offset) >> 12; \
+        tmp  = compress_saturated(sumi, 0); \
+        tmp  = max(tmp, 0); \
+        tmp  = min(tmp, 255); \
+        store_partial(const_int(4), dstI + row * dstStride + col, compress_unsafe(tmp, 0)); \
+            \
+        sume = (sume + offset) >> 12; \
+        tmp  = compress_saturated(sume, 0); \
+        tmp  = max(tmp, 0); \
+        tmp  = min(tmp, 255); \
+        store_partial(const_int(4), dstE + row * dstStride + col, compress_unsafe(tmp, 0)); \
+            \
+        sump = (sump + offset) >> 12; \
+        tmp  = compress_saturated(sump, 0); \
+        tmp  = max(tmp, 0); \
+        tmp  = min(tmp, 255); \
+        store_partial(const_int(4), dstP + row * dstStride + col, compress_unsafe(tmp, 0)); \
+            \
+}
+
+// 8-bit vertical luma filter producing planes A, E, I and P in one pass; handles 4 columns per step (block_width must be a multiple of 4).
+void CDECL filterVertical_short_pel_multiplane(int /*bitDepth*/, short *src, int srcStride, pixel *dstA, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height)
+{
+    int row, col;                                       // PROCESSROW uses these (and most locals below) by name
+    int cstride = srcStride;
+
+    src -= (8 / 2 - 1) * cstride;                       // the 8-tap window starts 3 rows above the output row
+
+    // rounding term for the right shift in PROCESSROW (hard-coded there as 12, presumably equal to 'shift') plus the scaled internal offset
+    int shift = IF_FILTER_PREC + (IF_INTERNAL_PREC - 8);
+    int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
+
+    Vec4i a0, a1, a2, a3, a4, a5, a6, a7;               // sliding window of 8 source rows, 4 columns wide
+    Vec8s tmp;
+    Vec4i val, sume, sumi, sump;
+    Vec4i exp1, exp2, exp3, exp4, exp5, exp6;
+
+    for (col = 0; col < block_width; col += 4)
+    {
+        // prime the window with source rows 0..6; PROCESSROW loads the eighth row itself
+        tmp.load(src + col);
+        a0 = extend_low(tmp);
+        tmp.load(src + col + cstride);
+        a1 = extend_low(tmp);
+        tmp.load(src + col + 2 * cstride);
+        a2 = extend_low(tmp);
+        tmp.load(src + col + 3 * cstride);
+        a3 = extend_low(tmp);
+        tmp.load(src + col + 4 * cstride);
+        a4 = extend_low(tmp);
+        tmp.load(src + col + 5 * cstride);
+        a5 = extend_low(tmp);
+        tmp.load(src + col + 6 * cstride);
+        a6 = extend_low(tmp);
+
+        for (row = 0; row < block_height; row++)
+        {
+            PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7)
+            if (++row >= block_height) break;           // odd height: do not filter or store a row past the block
+            PROCESSROW(a1, a2, a3, a4, a5, a6, a7, a0)
+            // two rows done: slide the window down by two (a7 and a0 now hold the two newly loaded rows)
+            const Vec4i newest = a0;                    // same type as the window rows (was a Vec8s temporary)
+            a0 = a2;
+            a1 = a3;
+            a2 = a4;
+            a3 = a5;
+            a4 = a6;
+            a5 = a7;
+            a6 = newest;
+        }
+    }
+}
+
 template<int N>
 void CDECL filterVertical_pel_pel(int bitDepth, pixel *src, int srcStride, pixel *dst, int dstStride, int block_width, int block_height, short const *coeff)
 {
--- a/source/test/ipfilterharness.cpp	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/test/ipfilterharness.cpp	Thu Jun 13 14:37:59 2013 -0500
@@ -260,6 +260,60 @@ bool IPFilterHarness::check_IPFilter_pri
     return true;
 }
 
+// Cross-checks the optimized multiplane vertical filter against the reference: both filter the same
+// region of short_buff and all four output planes (A, E, I, P) must be byte-identical afterwards.
+bool IPFilterHarness::check_filterVMultiplane(x265::filterVmulti_t ref, x265::filterVmulti_t opt)
+{
+    enum { PLANE_PIXELS = 100 * 100 };      // capacity of each output plane
+
+    const short blockHeight = 32;           // Can be randomly generated Height
+    const short blockWidth = 32;            // Can be randomly generated Width
+
+    pixel optA[PLANE_PIXELS];
+    pixel optE[PLANE_PIXELS];
+    pixel optI[PLANE_PIXELS];
+    pixel optP[PLANE_PIXELS];
+
+    pixel refA[PLANE_PIXELS];
+    pixel refE[PLANE_PIXELS];
+    pixel refI[PLANE_PIXELS];
+    pixel refP[PLANE_PIXELS];
+
+    short *input = short_buff + 3 * 64;     // same start offset into short_buff for both implementations
+
+    for (int pass = 0; pass <= 100; pass++)
+    {
+        const short srcPitch = 64;          // Can be randomly generated
+        const short dstPitch = 64;
+
+        // clear every plane so that pixels neither implementation writes still compare equal
+        memset(optA, 0, sizeof(optA));
+        memset(optE, 0, sizeof(optE));
+        memset(optI, 0, sizeof(optI));
+        memset(optP, 0, sizeof(optP));
+        memset(refA, 0, sizeof(refA));
+        memset(refE, 0, sizeof(refE));
+        memset(refI, 0, sizeof(refI));
+        memset(refP, 0, sizeof(refP));
+
+        opt(8, input, srcPitch,
+            optA, optE, optI, optP,
+            dstPitch, blockWidth, blockHeight);
+        ref(8, input, srcPitch,
+            refA, refE, refI, refP,
+            dstPitch, blockWidth, blockHeight);
+
+        if (memcmp(optA, refA, sizeof(refA)) != 0 || memcmp(optE, refE, sizeof(refE)) != 0
+            || memcmp(optI, refI, sizeof(refI)) != 0 || memcmp(optP, refP, sizeof(refP)) != 0)
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
 bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     for (int value = 0; value < NUM_IPFILTER_P_P; value++)
@@ -316,6 +370,14 @@ bool IPFilterHarness::testCorrectness(co
         }
     }
 
+    if (opt.filterVmulti)
+    {
+        if (!check_filterVMultiplane(ref.filterVmulti, opt.filterVmulti))
+        {
+            printf("\nFilter-multiplane failed\n");
+            return false;
+        }
+    }
     return true;
 }
 
@@ -370,4 +432,11 @@ void IPFilterHarness::measureSpeed(const
         REPORT_SPEEDUP(opt.ipfilterConvert_s_p, ref.ipfilterConvert_s_p,
                        8, short_buff, srcStride, IPF_vec_output_p, dstStride, width, height);
     }
+
+    if (opt.filterVmulti)
+    {
+        printf("Filter-multiplane\t");
+        REPORT_SPEEDUP(opt.filterVmulti, ref.filterVmulti,
+                       8, short_buff, srcStride, IPF_vec_output_p, IPF_C_output_p, IPF_vec_output_p, IPF_C_output_p, dstStride, width, height);
+    }
 }
--- a/source/test/ipfilterharness.h	Thu Jun 13 14:23:05 2013 -0500
+++ b/source/test/ipfilterharness.h	Thu Jun 13 14:37:59 2013 -0500
@@ -45,6 +45,7 @@ protected:
     bool check_IPFilter_primitive(x265::IPFilter_s_p ref, x265::IPFilter_s_p opt);
     bool check_IPFilter_primitive(x265::IPFilterConvert_p_s ref, x265::IPFilterConvert_p_s opt);
     bool check_IPFilter_primitive(x265::IPFilterConvert_s_p ref, x265::IPFilterConvert_s_p opt);
+    bool check_filterVMultiplane(x265::filterVmulti_t ref, x265::filterVmulti_t opt);
 
 public: