changeset 829:57aa9e90c935

Merged in deepthidevaki/xhevc_deepthid (pull request #92) Call Filter Primitives from SamplingQ, SamplingH
author Steve Borho <steve@borho.org>
date Thu, 25 Apr 2013 11:52:27 -0500
parents 0004001ac17b (current diff) 4e66d9fa333c (diff)
children 0d3a7ebba9e9
files source/Lib/TLibEncoder/TEncSearch.cpp source/encoder/primitives.cpp
diffstat 7 files changed, 337 insertions(+-), 54 deletions(-) [+]
line wrap: on
line diff
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Apr 25 11:52:27 2013 -0500
@@ -43,6 +43,7 @@
 #include "InterpolationFilter.h"
 #include <math.h>
 
+using namespace x265;
 //! \ingroup TLibEncoder
 //! \{
 
@@ -5940,28 +5941,51 @@ Void TEncSearch::xExtDIFUpSamplingH(TCom
     srcPtr = (Pel*)pattern->getROIY() - halfFilterSize * srcStride - 1;
 
     dstPtr = m_filteredBlock[0][0].getLumaAddr();
+
     filterCopy(srcPtr + halfFilterSize * srcStride + 1, srcStride, dstPtr, dstStride, width, height);
 
     intPtr = filteredBlockTmp[0].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipfilterConvert_p_s(g_bitDepthY, (pixel*)srcPtr, srcStride, intPtr,
+                                   intStride, width + 1, height + filterSize);
+#else
     filterConvertPelToShort(g_bitDepthY, srcPtr, srcStride, intPtr,
                             intStride, width + 1, height + filterSize);
+#endif
 
     intPtr = filteredBlockTmp[0].getLumaAddr() + (halfFilterSize - 1) * intStride + 1;
     dstPtr = m_filteredBlock[2][0].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr,
+                                            dstStride, width, height + 1, m_lumaFilter[2]);
+#else
     filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr,
                                          dstStride, width, height + 1, m_lumaFilter[2]);
+#endif
 
     intPtr = filteredBlockTmp[2].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_p_s[FILTER_H_P_S_8](g_bitDepthY, (pixel*)srcPtr, srcStride, intPtr, intStride, width + 1, height + filterSize,  m_lumaFilter[2]);
+#else
     filterHorizontal_pel_short<NTAPS_LUMA>(g_bitDepthY, srcPtr, srcStride, intPtr, intStride, width + 1,
                                            height + filterSize,  m_lumaFilter[2]);
+#endif
 
     intPtr = filteredBlockTmp[2].getLumaAddr() + halfFilterSize * intStride;
     dstPtr = m_filteredBlock[0][2].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipfilterConvert_s_p(g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width + 1, height + 0);
+#else
     filterConvertShortToPel(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width + 1, height + 0);
+#endif
 
     intPtr = filteredBlockTmp[2].getLumaAddr() + (halfFilterSize - 1) * intStride;
     dstPtr = m_filteredBlock[2][2].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width + 1, height + 1, m_lumaFilter[2]);
+#else
     filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width + 1, height + 1, m_lumaFilter[2]);
+#endif
 }
 
 /**
@@ -6003,7 +6027,11 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
     {
         srcPtr += 1;
     }
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_p_s[FILTER_H_P_S_8](g_bitDepthY, (pixel*)srcPtr, srcStride, intPtr, intStride, width, extHeight, m_lumaFilter[1]);
+#else
     filterHorizontal_pel_short<NTAPS_LUMA>(g_bitDepthY, srcPtr, srcStride, intPtr, intStride, width, extHeight, m_lumaFilter[1]);
+#endif
 
     // Horizontal filter 3/4
     srcPtr = pattern->getROIY() - halfFilterSize * srcStride - 1;
@@ -6016,7 +6044,11 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
     {
         srcPtr += 1;
     }
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_p_s[FILTER_H_P_S_8](g_bitDepthY, (pixel*)srcPtr, srcStride, intPtr, intStride, width, extHeight, m_lumaFilter[3]);
+#else
     filterHorizontal_pel_short<NTAPS_LUMA>(g_bitDepthY, srcPtr, srcStride, intPtr, intStride, width, extHeight, m_lumaFilter[3]);
+#endif
 
     // Generate @ 1,1
     intPtr = filteredBlockTmp[1].getLumaAddr() + (halfFilterSize - 1) * intStride;
@@ -6025,12 +6057,20 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
     {
         intPtr += intStride;
     }
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#else
     filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#endif
 
     // Generate @ 3,1
     intPtr = filteredBlockTmp[1].getLumaAddr() + (halfFilterSize - 1) * intStride;
     dstPtr = m_filteredBlock[3][1].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#else
     filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#endif
 
     if (halfPelRef.getVer() != 0)
     {
@@ -6041,7 +6081,11 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
         {
             intPtr += intStride;
         }
+#if ENABLE_PRIMITIVES
+        primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[2]);
+#else
         filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[2]);
+#endif
 
         // Generate @ 2,3
         intPtr = filteredBlockTmp[3].getLumaAddr() + (halfFilterSize - 1) * intStride;
@@ -6050,19 +6094,31 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
         {
             intPtr += intStride;
         }
+#if ENABLE_PRIMITIVES
+        primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[2]);
+#else
         filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[2]);
+#endif
     }
     else
     {
         // Generate @ 0,1
         intPtr = filteredBlockTmp[1].getLumaAddr() + halfFilterSize * intStride;
         dstPtr = m_filteredBlock[0][1].getLumaAddr();
+#if ENABLE_PRIMITIVES
+        primitives.ipfilterConvert_s_p(g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height);
+#else
         filterConvertShortToPel(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height);
+#endif
 
         // Generate @ 0,3
         intPtr = filteredBlockTmp[3].getLumaAddr() + halfFilterSize * intStride;
         dstPtr = m_filteredBlock[0][3].getLumaAddr();
+#if ENABLE_PRIMITIVES
+        primitives.ipfilterConvert_s_p(g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height);
+#else
         filterConvertShortToPel(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height);
+#endif
     }
 
     if (halfPelRef.getHor() != 0)
@@ -6078,7 +6134,12 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
         {
             intPtr += intStride;
         }
+
+#if ENABLE_PRIMITIVES
+        primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#else
         filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#endif
 
         // Generate @ 3,2
         intPtr = filteredBlockTmp[2].getLumaAddr() + (halfFilterSize - 1) * intStride;
@@ -6091,7 +6152,11 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
         {
             intPtr += intStride;
         }
+#if ENABLE_PRIMITIVES
+        primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#else
         filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#endif
     }
     else
     {
@@ -6102,7 +6167,11 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
         {
             intPtr += intStride;
         }
+#if ENABLE_PRIMITIVES
+        primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#else
         filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#endif
 
         // Generate @ 3,0
         intPtr = filteredBlockTmp[0].getLumaAddr() + (halfFilterSize - 1) * intStride + 1;
@@ -6111,7 +6180,11 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
         {
             intPtr += intStride;
         }
+#if ENABLE_PRIMITIVES
+        primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#else
         filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#endif
     }
 
     // Generate @ 1,3
@@ -6121,12 +6194,20 @@ Void TEncSearch::xExtDIFUpSamplingQ(TCom
     {
         intPtr += intStride;
     }
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#else
     filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[1]);
+#endif
 
     // Generate @ 3,3
     intPtr = filteredBlockTmp[3].getLumaAddr() + (halfFilterSize - 1) * intStride;
     dstPtr = m_filteredBlock[3][3].getLumaAddr();
+#if ENABLE_PRIMITIVES
+    primitives.ipFilter_s_p[FILTER_V_S_P_8](g_bitDepthY, intPtr, intStride, (pixel*)dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#else
     filterVertical_short_pel<NTAPS_LUMA>(g_bitDepthY, intPtr, intStride, dstPtr, dstStride, width, height, m_lumaFilter[3]);
+#endif
 }
 
 /** set wp tables
--- a/source/encoder/CMakeLists.txt	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/encoder/CMakeLists.txt	Thu Apr 25 11:52:27 2013 -0500
@@ -6,7 +6,7 @@ if(GCC)
 endif(GCC)
 
 if(ENABLE_PRIMITIVES)
-    set(CPRIMITIVES pixel.cpp macroblock.cpp)
+    set(CPRIMITIVES pixel.cpp macroblock.cpp InterpolationFilter.cpp)
 endif(ENABLE_PRIMITIVES)
 
 add_library(x265 ../../COPYING
@@ -16,7 +16,7 @@ add_library(x265 ../../COPYING
     threading.cpp threading.h
     threadpool.cpp threadpool.h
     md5.cpp md5.h
-    InterpolationFilter.cpp InterpolationFilter.h)
+    InterpolationFilter.h)
 
 if(ENABLE_PRIMITIVES)
     if(ENABLE_PRIMITIVES_VEC)
--- a/source/encoder/InterpolationFilter.cpp	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/encoder/InterpolationFilter.cpp	Thu Apr 25 11:52:27 2013 -0500
@@ -22,7 +22,7 @@
  * For more information, contact us at licensing@multicorewareinc.com.
  *****************************************************************************/
 
-#include "InterpolationFilter.h"
+#include "primitives.h"
 #include <cstring>
 #include <assert.h>
 
@@ -31,11 +31,14 @@
 #pragma warning(disable: 4100) // unreferenced formal parameter
 #endif
 
+#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
+#define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
+#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
+
+namespace {
 template<int N>
-void filterVertical_short_pel(int bitDepth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff)
+void CDECL filterVertical_short_pel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height, short const *coeff)
 {
-    assert(bitDepth == 8);   //assuming bitDepth = 8
-
     int cStride = srcStride;
 
     src -= (N / 2 - 1) * cStride;
@@ -89,10 +92,8 @@ void filterVertical_short_pel(int bitDep
 }
 
 template<int N>
-void filterHorizontal_pel_pel(int bitDepth, Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff)
+void CDECL filterHorizontal_pel_pel(int bitDepth, pixel *src, int srcStride, pixel *dst, int dstStride, int width, int height, short const *coeff)
 {
-    assert(bitDepth == 8);
-
     int cStride = 1;
 
     src -= (N / 2 - 1) * cStride;
@@ -141,10 +142,8 @@ void filterHorizontal_pel_pel(int bitDep
 }
 
 template<int N>
-void filterHorizontal_pel_short(int bitDepth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height, short const *coeff)
+void CDECL filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int width, int height, short const *coeff)
 {
-    assert(bitDepth == 8); //assuming bitdepth = 8
-
     int cStride = 1;
 
     src -= (N / 2 - 1) * cStride;
@@ -190,23 +189,8 @@ void filterHorizontal_pel_short(int bitD
     }
 }
 
-void filterCopy(Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height)
+void CDECL filterConvertShortToPel(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height)
 {
-    int row;
-
-    for (row = 0; row < height; row++)
-    {
-        memcpy(dst, src, sizeof(Pel) * width);
-
-        src += srcStride;
-        dst += dstStride;
-    }
-}
-
-void filterConvertShortToPel(int bitDepth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height)
-{
-    assert(bitDepth == 8);
-
     int shift = IF_INTERNAL_PREC - bitDepth;
     short offset = IF_INTERNAL_OFFS;
 
@@ -230,10 +214,8 @@ void filterConvertShortToPel(int bitDept
     }
 }
 
-void filterConvertPelToShort(int bitDepth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height)
+void CDECL filterConvertPelToShort(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int width, int height)
 {
-    assert(bitDepth == 8);
-
     int shift = IF_INTERNAL_PREC - bitDepth;
     int row, col;
 
@@ -249,15 +231,24 @@ void filterConvertPelToShort(int bitDept
         dst += dstStride;
     }
 }
-
-template
-void filterVertical_short_pel<8>(int bit_Depth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff);
-template
-void filterHorizontal_pel_pel<8>(int bit_Depth, Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff);
-template
-void filterHorizontal_pel_short<8>(int bit_Depth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height, short const *coeff);
-
+}
 #if _MSC_VER
 #pragma warning(default: 4127) // conditional expression is constant, typical for templated functions
 #pragma warning(default: 4100)
 #endif
+
+namespace x265 {
+// x265 private namespace
+// Fills the interpolation-filter slots of an EncoderPrimitives table with the C implementations defined above.
+void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
+{
+    p.ipFilter_p_p[FILTER_H_P_P_8] = filterHorizontal_pel_pel<8>;   // 8-tap instantiations
+    p.ipFilter_p_s[FILTER_H_P_S_8] = filterHorizontal_pel_short<8>;
+    p.ipFilter_s_p[FILTER_V_S_P_8] = filterVertical_short_pel<8>;
+    p.ipfilterConvert_p_s = filterConvertPelToShort;                // conversions without filter taps
+    p.ipfilterConvert_s_p = filterConvertShortToPel;
+    p.ipFilter_p_p[FILTER_H_P_P_4] = filterHorizontal_pel_pel<4>;   // 4-tap instantiations
+    p.ipFilter_p_s[FILTER_H_P_S_4] = filterHorizontal_pel_short<4>;
+    p.ipFilter_s_p[FILTER_V_S_P_4] = filterVertical_short_pel<4>;
+}
+} // namespace x265
--- a/source/encoder/InterpolationFilter.h	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/encoder/InterpolationFilter.h	Thu Apr 25 11:52:27 2013 -0500
@@ -28,6 +28,7 @@
 #define X265_INTERPOLATIONFILTER_H
 
 #include "TLibCommon/TypeDef.h"
+#include "assert.h"
 
 const short m_lumaFilter[4][8] =
 {
@@ -51,17 +52,225 @@ const short m_chromaFilter[8][4] =
 
 #define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
 #define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
-#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
+#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally 
 
 template<int N>
-void filterVertical_short_pel(int bit_Depth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff);
-template<int N>
-void filterHorizontal_pel_pel(int bit_Depth, Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff);
+void filterVertical_short_pel(int bitDepth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff) // N-tap vertical filter: short intermediates in, clipped Pel samples out
+{
+    assert(bitDepth == 8);   // this implementation assumes bitDepth == 8
+
+    int cStride = srcStride; // consecutive taps are one source row apart
+
+    src -= (N / 2 - 1) * cStride; // move up so that tap 0 sits (N / 2 - 1) rows above the output row
+
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+
+    shift += headRoom; // removes the filter gain and the extra intermediate precision in one shift
+    offset = 1 << (shift - 1); // rounding term for that shift
+    offset +=  IF_INTERNAL_OFFS << IF_FILTER_PREC; // adds IF_INTERNAL_OFFS back, pre-scaled by 2^IF_FILTER_PREC
+    maxVal = (1 << bitDepth) - 1; // upper clipping bound
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col++)
+        {
+            int sum;
+
+            sum  = src[col + 0 * cStride] * coeff[0];
+            sum += src[col + 1 * cStride] * coeff[1];
+            if (N >= 4) // N is a template constant, so this condition is constant per instantiation
+            {
+                sum += src[col + 2 * cStride] * coeff[2];
+                sum += src[col + 3 * cStride] * coeff[3];
+            }
+            if (N >= 6)
+            {
+                sum += src[col + 4 * cStride] * coeff[4];
+                sum += src[col + 5 * cStride] * coeff[5];
+            }
+            if (N == 8)
+            {
+                sum += src[col + 6 * cStride] * coeff[6];
+                sum += src[col + 7 * cStride] * coeff[7];
+            }
+
+            short val = (short)((sum + offset) >> shift); // narrowed to short before clipping
+
+            val = (val < 0) ? 0 : val; // clip to [0, maxVal]
+            val = (val > maxVal) ? maxVal : val;
+
+            dst[col] = val;
+        }
+
+        src += srcStride; // advance both pointers by one row
+        dst += dstStride;
+    }
+}
+
 template<int N>
-void filterHorizontal_pel_short(int bit_Depth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height, short const *coeff);
+void filterHorizontal_pel_pel(int bitDepth, Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height, short const *coeff) // N-tap horizontal filter: Pel samples in, rounded and clipped Pel samples out
+{
+    assert(bitDepth == 8); // this implementation assumes bitDepth == 8
 
-void filterCopy(Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height);
-void filterConvertShortToPel(int bitDepth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height);
-void filterConvertPelToShort(int bitDepth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height);
+    int cStride = 1; // consecutive taps are adjacent samples of the same row
+
+    src -= (N / 2 - 1) * cStride; // move left so that tap 0 sits (N / 2 - 1) samples before the output column
+
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    offset =  (1 << (headRoom - 1)); // rounding term for the shift below
+    maxVal = (1 << bitDepth) - 1; // upper clipping bound
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col++)
+        {
+            int sum;
+
+            sum  = src[col + 0 * cStride] * coeff[0];
+            sum += src[col + 1 * cStride] * coeff[1];
+            if (N >= 4) // N is a template constant, so this condition is constant per instantiation
+            {
+                sum += src[col + 2 * cStride] * coeff[2];
+                sum += src[col + 3 * cStride] * coeff[3];
+            }
+            if (N >= 6)
+            {
+                sum += src[col + 4 * cStride] * coeff[4];
+                sum += src[col + 5 * cStride] * coeff[5];
+            }
+            if (N == 8)
+            {
+                sum += src[col + 6 * cStride] * coeff[6];
+                sum += src[col + 7 * cStride] * coeff[7];
+            }
+
+            short val = (short)((sum + offset) >> headRoom); // NOTE(review): headRoom equals IF_FILTER_PREC (6) only when bitDepth == 8 - confirm the intended shift before lifting the assert
+
+            if (val < 0) val = 0; // clip to [0, maxVal]
+            if (val > maxVal) val = maxVal;
+            dst[col] = val;
+        }
+
+        src += srcStride; // advance both pointers by one row
+        dst += dstStride;
+    }
+}
+
+template<int N>
+void filterHorizontal_pel_short(int bitDepth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height, short const *coeff) // N-tap horizontal filter: Pel samples in, offset short intermediates out (no clipping)
+{
+    assert(bitDepth == 8); // this implementation assumes bitDepth == 8
+
+    int cStride = 1; // consecutive taps are adjacent samples of the same row
+
+    src -= (N / 2 - 1) * cStride; // move left so that tap 0 sits (N / 2 - 1) samples before the output column
+
+    int offset;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+
+    shift -= headRoom; // 0 when bitDepth == 8, so the filter gain is kept as extra precision
+    offset = -(IF_INTERNAL_OFFS << shift); // shift the positive constant, then negate: left-shifting a negative value is undefined before C++20
+
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col++)
+        {
+            int sum;
+
+            sum  = src[col + 0 * cStride] * coeff[0];
+            sum += src[col + 1 * cStride] * coeff[1];
+            if (N >= 4) // N is a template constant, so this condition is constant per instantiation
+            {
+                sum += src[col + 2 * cStride] * coeff[2];
+                sum += src[col + 3 * cStride] * coeff[3];
+            }
+            if (N >= 6)
+            {
+                sum += src[col + 4 * cStride] * coeff[4];
+                sum += src[col + 5 * cStride] * coeff[5];
+            }
+            if (N == 8)
+            {
+                sum += src[col + 6 * cStride] * coeff[6];
+                sum += src[col + 7 * cStride] * coeff[7];
+            }
+
+            short val = (short)((sum + offset) >> shift); // stored with the IF_INTERNAL_OFFS bias removed
+            dst[col] = val;
+        }
+
+        src += srcStride; // advance both pointers by one row
+        dst += dstStride;
+    }
+}
+
+inline void filterCopy(Pel *src, int srcStride, Pel *dst, int dstStride, int width, int height) // copies a width x height block of Pel samples; inline because this definition lives in a header
+{
+    int row;
+
+    for (row = 0; row < height; row++)
+    {
+        memcpy(dst, src, sizeof(Pel) * width); // one row of width samples per call
+
+        src += srcStride; // pointer arithmetic on Pel*, so strides are counted in samples
+        dst += dstStride;
+    }
+}
+
+inline void filterConvertShortToPel(int bitDepth, short *src, int srcStride, Pel *dst, int dstStride, int width, int height) // converts offset short intermediates to clipped Pel samples; inline because this definition lives in a header
+{
+    assert(bitDepth == 8); // this implementation assumes bitDepth == 8
+
+    int shift = IF_INTERNAL_PREC - bitDepth;
+    short offset = IF_INTERNAL_OFFS; // adds back the IF_INTERNAL_OFFS that filterConvertPelToShort subtracts
+
+    offset += shift ? (1 << (shift - 1)) : 0; // plus a rounding term whenever a shift is applied
+    short maxVal = (1 << bitDepth) - 1;
+    short minVal = 0;
+    int row, col;
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col++)
+        {
+            short val = src[col];
+            val = (val + offset) >> shift; // the sum is evaluated as int, then narrowed back to short
+            if (val < minVal) val = minVal; // clip to [minVal, maxVal]
+            if (val > maxVal) val = maxVal;
+            dst[col] = val;
+        }
+
+        src += srcStride; // advance both pointers by one row
+        dst += dstStride;
+    }
+}
+
+inline void filterConvertPelToShort(int bitDepth, Pel *src, int srcStride, short *dst, int dstStride, int width, int height) // scales Pel samples up to IF_INTERNAL_PREC bits and subtracts IF_INTERNAL_OFFS; inline because this definition lives in a header
+{
+    assert(bitDepth == 8); // this implementation assumes bitDepth == 8
+
+    int shift = IF_INTERNAL_PREC - bitDepth;
+    int row, col;
+
+    for (row = 0; row < height; row++)
+    {
+        for (col = 0; col < width; col++)
+        {
+            short val = src[col] << shift; // widen the sample to the internal precision
+            dst[col] = val - (short)IF_INTERNAL_OFFS; // store with the internal offset removed
+        }
+
+        src += srcStride; // advance both pointers by one row
+        dst += dstStride;
+    }
+}
 
 #endif // ifndef X265_INTERPOLATIONFILTER_H
--- a/source/encoder/primitives.cpp	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/encoder/primitives.cpp	Thu Apr 25 11:52:27 2013 -0500
@@ -76,11 +76,13 @@ EncoderPrimitives primitives;
 
 void Setup_C_PixelPrimitives(EncoderPrimitives &p);
 void Setup_C_MacroblockPrimitives(EncoderPrimitives &p);
+void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
 
 void Setup_C_Primitives(EncoderPrimitives &p)
 {
     Setup_C_PixelPrimitives(p);      // pixel.cpp
     Setup_C_MacroblockPrimitives(p); // macroblock.cpp
+    Setup_C_IPFilterPrimitives(p);   // InterpolationFilter.cpp
 }
 
 #endif // if ENABLE_PRIMITIVES
--- a/source/test/CMakeLists.txt	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/test/CMakeLists.txt	Thu Apr 25 11:52:27 2013 -0500
@@ -29,6 +29,7 @@ add_executable(TestBench
     timer.cpp testharness.h
     pixelharness.cpp pixelharness.h
     filterharness.cpp filterharness.h
-    mbdstharness.cpp mbdstharness.h)
+    mbdstharness.cpp mbdstharness.h
+    ipfilterharness.cpp ipfilterharness.h)
 
 target_link_libraries(TestBench x265 ${PLATFORM_LIBS})
--- a/source/test/ipfilterharness.cpp	Thu Apr 25 11:47:18 2013 -0500
+++ b/source/test/ipfilterharness.cpp	Thu Apr 25 11:52:27 2013 -0500
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <limits.h>
 
 using namespace x265;
 
@@ -57,7 +58,7 @@ IPFilterHarness::IPFilterHarness()
     {
         int isPositive = rand() & 1;                             // To randomly generate Positive and Negative values
         isPositive = (isPositive) ? 1 : -1;
-        pixel_buff[i] = (rand() &  PIXEL_MAX);
+        pixel_buff[i] = (pixel)(rand() &  PIXEL_MAX);
         short_buff[i] = (isPositive) * (rand() &  SHRT_MAX);
     }
 }
@@ -187,14 +188,13 @@ bool IPFilterHarness::check_IPFilter_pri
 {
     int rand_height = rand() % 100;                 // Randomly generated Height
     int rand_width = rand() % 100;                  // Randomly generated Width
-    short rand_val, rand_srcStride, rand_dstStride;
+    short rand_srcStride, rand_dstStride;
 
     for (int i = 0; i <= 100; i++)
     {
         memset(IPF_vec_output_p, 0, ipf_t_size);      // Initialize output buffer to zero
         memset(IPF_C_output_p, 0, ipf_t_size);        // Initialize output buffer to zero
-
-        rand_val = rand() % 4;                     // Random offset in the filter
+                             
         rand_srcStride = rand() % 100;              // Randomly generated srcStride
         rand_dstStride = rand() % 100;              // Randomly generated dstStride
 
@@ -222,14 +222,13 @@ bool IPFilterHarness::check_IPFilter_pri
 {
     int rand_height = rand() % 100;                 // Randomly generated Height
     int rand_width = rand() % 100;                  // Randomly generated Width
-    short rand_val, rand_srcStride, rand_dstStride;
+    short rand_srcStride, rand_dstStride;
 
     for (int i = 0; i <= 100; i++)
     {
         memset(IPF_vec_output_p, 0, ipf_t_size);      // Initialize output buffer to zero
         memset(IPF_C_output_p, 0, ipf_t_size);        // Initialize output buffer to zero
 
-        rand_val = rand() % 4;                     // Random offset in the filter
         rand_srcStride = rand() % 100;              // Randomly generated srcStride
         rand_dstStride = rand() % 100;              // Randomly generated dstStride