changeset 568:f05a5d3879f0

Merged multicoreware/xhevc into default
author Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
date Mon, 15 Apr 2013 17:43:29 +0530
parents 7f8daed75336 (current diff) 9a0056333a34 (diff)
children 66c24df49d97
files source/Lib/TLibCommon/TComInterpolationFilter.cpp source/encoder/macroblock.cpp
diffstat 11 files changed, 477 insertions(+-), 148 deletions(-) [+]
line wrap: on
line diff
--- a/source/Lib/TLibCommon/TComInterpolationFilter.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/Lib/TLibCommon/TComInterpolationFilter.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -45,6 +45,8 @@
 #include <assert.h>
 #include "primitives.h"
 
+using namespace x265;
+
 //! \ingroup TLibCommon
 //! \{
 // ====================================================================================================================
@@ -186,83 +188,106 @@ Void TComInterpolationFilter::filter(Int
     {
         if (N == 8 && !isFirst && !isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_8_0_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_8_0_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
         if (N == 8 && !isFirst && isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_8_0_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_8_0_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
         if (N == 8 && isFirst && !isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_8_1_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_8_1_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
         if (N == 8 && isFirst && isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_8_1_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_8_1_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
+/*
         if (N == 4 && !isFirst && !isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_4_0_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_4_0_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
         if (N == 4 && !isFirst && isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_4_0_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_4_0_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
         if (N == 4 && isFirst && !isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_4_1_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_4_1_0]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
         }
 
         if (N == 4 && isFirst && isLast)
         {
-            x265::primitives.filter[x265::FILTER_H_4_1_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width,
-                                                          height, bitDepth);
+            primitives.filter[FILTER_H_4_1_1]((const short*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
             return;
-        }
+        }*/
     }
 
     //Following will be uncommented when vertical filter is added to primitives
+/*
+    if (isVertical)
+    {
+        if (N == 8 && !isFirst && !isLast)
+        {
+            primitives.filter[FILTER_V_8_0_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
 
-    /* if(isVertical)
-     {
-         if(N==8 && !isFirst && !isLast)
-         {     x265::primitives.filter[x265::FILTER_V_8_0_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
-         if(N==8 && !isFirst && isLast)
-         {     x265::primitives.filter[x265::FILTER_V_8_0_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
-         if(N==8 && isFirst && !isLast)
-        {      x265::primitives.filter[x265::FILTER_V_8_1_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
-         if(N==8 && isFirst && isLast)
-          {    x265::primitives.filter[x265::FILTER_V_8_1_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
+        if (N == 8 && !isFirst && isLast)
+        {
+            primitives.filter[FILTER_V_8_0_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
 
-         if(N==4 && !isFirst && !isLast)
-           {   x265::primitives.filter[x265::FILTER_V_4_0_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
-         if(N==4 && !isFirst && isLast)
-           {   x265::primitives.filter[x265::FILTER_V_4_0_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
-         if(N==4 && isFirst && !isLast)
-           {   x265::primitives.filter[x265::FILTER_V_4_1_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth); return; }
-         if(N==4 && isFirst && isLast)
-           {   x265::primitives.filter[x265::FILTER_V_4_1_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);   return; }
-     }  */
+        if (N == 8 && isFirst && !isLast)
+        {
+            primitives.filter[FILTER_V_8_1_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
+
+        if (N == 8 && isFirst && isLast)
+        {
+            primitives.filter[FILTER_V_8_1_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
+
+        if (N == 4 && !isFirst && !isLast)
+        {
+            primitives.filter[FILTER_V_4_0_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
+
+        if (N == 4 && !isFirst && isLast)
+        {
+            primitives.filter[FILTER_V_4_0_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
+
+        if (N == 4 && isFirst && !isLast)
+        {
+            primitives.filter[FILTER_V_4_1_0]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
+
+        if (N == 4 && isFirst && isLast)
+        {
+            primitives.filter[FILTER_V_4_1_1]((pixel*)coeff, (pixel*)src, srcStride, (pixel*)dst, dstStride, width, height, bitDepth);
+            return;
+        }
+    }*/
 
 #endif     // if ENABLE_PRIMITIVES
 
--- a/source/Lib/TLibCommon/TComRdCost.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/Lib/TLibCommon/TComRdCost.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -543,7 +543,7 @@ UInt TComRdCost::getSADPart(Int bitDepth
     {
         int part = x265::PartitionFromSizes(width, height);
         if (part >= 0)
-            return x265::primitives.sad[part]((pixel*)pelCur, curStride, (pixel*)pelOrg, orgStride);
+            return x265::primitives.sad[part]((pixel*)pelCur, curStride, (pixel*)pelOrg, orgStride)>>shift;
     }
 
 #endif
@@ -590,8 +590,8 @@ UInt TComRdCost::xGetSAD(DistParam* pcDt
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(iCols, iRows);
     if (part >= 0)
-        return x265::primitives.sad[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
+        return x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) >>
+               DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
 #endif
 
     UInt uiSum = 0;
@@ -628,8 +628,8 @@ UInt TComRdCost::xGetSAD4(DistParam* pcD
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(4, iRows >> iSubShift);
     if (part >= 0)
-        return x265::primitives.sad[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
+        return ((x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) <<iSubShift) >>
+                 DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8));
 #endif
 
     UInt uiSum = 0;
@@ -667,8 +667,8 @@ UInt TComRdCost::xGetSAD8(DistParam* pcD
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(8, iRows >> iSubShift);
     if (part >= 0)
-        return x265::primitives.sad[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
+        return ((x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) << iSubShift) >>
+                DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8));
 #endif
 
     UInt uiSum = 0;
@@ -710,8 +710,8 @@ UInt TComRdCost::xGetSAD16(DistParam* pc
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(16, iRows >> iSubShift);
     if (part >= 0)
-        return x265::primitives.sad[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
+        return ((x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) << iSubShift) >>
+                DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8));
 #endif
 
     UInt uiSum = 0;
@@ -802,8 +802,8 @@ UInt TComRdCost::xGetSAD16N(DistParam* p
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(iCols, iRows >> iSubShift);
     if (part >= 0)
-        return x265::primitives.sad[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
+        return ((x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) << iSubShift) >>
+                 DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8));
 #endif
 
     for (; iRows != 0; iRows -= iSubStep)
@@ -854,8 +854,8 @@ UInt TComRdCost::xGetSAD32(DistParam* pc
 #if ENABLE_PRIMITIVES
     int part = x265::PartitionFromSizes(32, iRows >> iSubShift);
     if (part >= 0)
-        return x265::primitives.sad[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
+        return ((x265::primitives.sad[part]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur) << iSubShift) >>
+                DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8));
 #endif
 
     UInt uiSum = 0;
@@ -1650,7 +1650,7 @@ UInt TComRdCost::xCalcHADs4x4(Pel *piOrg
     assert(iStep == 1);
 
 #if ENABLE_PRIMITIVES
-    return x265::primitives.satd[x265::PARTITION_4x4]((pixel*)piCur, iStrideCur, (pixel*)piOrg, iStrideOrg);
+    return x265::primitives.satd[x265::PARTITION_4x4]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur);
 #else
     Int k, satd = 0, diff[16], m[16], d[16];
 
@@ -1751,7 +1751,7 @@ UInt TComRdCost::xCalcHADs8x8(Pel *piOrg
 {
     assert(iStep == 1);
 #if ENABLE_PRIMITIVES
-    return x265::primitives.sa8d_8x8((pixel*)piCur, iStrideCur, (pixel*)piOrg, iStrideOrg);
+    return x265::primitives.sa8d_8x8((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur);
 #else
     Int k, i, j, jj, sad = 0;
     Int diff[64], m1[8][8], m2[8][8], m3[8][8];
@@ -2113,19 +2113,16 @@ UInt TComRdCost::xGetHADs4(DistParam* pc
     Int  iOffsetOrg = iStrideOrg << 2;
     Int  iOffsetCur = iStrideCur << 2;
 
-#if ENABLE_PRIMITIVES
-    assert(iStep == 1);
-    int part = x265::PartitionFromSizes(4, iRows);
-    if (part >= 0)
-        return x265::primitives.satd[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
-#endif
-
     UInt uiSum = 0;
 
     for (y = 0; y < iRows; y += 4)
     {
+#ifdef ENABLE_PRIMITIVES
+        assert(iStep==1);
+        uiSum += x265::primitives.satd[x265::PARTITION_4x4]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur);
+#else
         uiSum += xCalcHADs4x4(piOrg, piCur, iStrideOrg, iStrideCur, iStep);
+#endif
         piOrg += iOffsetOrg;
         piCur += iOffsetCur;
     }
@@ -2148,20 +2145,17 @@ UInt TComRdCost::xGetHADs8(DistParam* pc
     Int  iStep  = pcDtParam->iStep;
     Int  y;
 
-#if ENABLE_PRIMITIVES
-    assert(iStep == 1);
-    int part = x265::PartitionFromSizes(8, iRows);
-    if (part >= 0)
-        return x265::primitives.satd[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                                          iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
-#endif
-
     UInt uiSum = 0;
 
     if (iRows == 4)
     {
+#ifdef ENABLE_PRIMITIVES
+        assert(iStep==1);
+        uiSum += x265::primitives.satd[x265::PARTITION_8x4]((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur);
+#else
         uiSum += xCalcHADs4x4(piOrg + 0, piCur, iStrideOrg, iStrideCur, iStep);
         uiSum += xCalcHADs4x4(piOrg + 4, piCur + 4 * iStep, iStrideOrg, iStrideCur, iStep);
+#endif
     }
     else
     {
@@ -2169,7 +2163,12 @@ UInt TComRdCost::xGetHADs8(DistParam* pc
         Int  iOffsetCur = iStrideCur << 3;
         for (y = 0; y < iRows; y += 8)
         {
+#ifdef ENABLE_PRIMITIVES
+            assert(iStep==1);
+            uiSum += x265::primitives.sa8d_8x8((pixel*)piOrg, iStrideOrg, (pixel*)piCur, iStrideCur);
+#else
             uiSum += xCalcHADs8x8(piOrg, piCur, iStrideOrg, iStrideCur, iStep);
+#endif
             piOrg += iOffsetOrg;
             piCur += iOffsetCur;
         }
@@ -2193,16 +2192,7 @@ UInt TComRdCost::xGetHADs(DistParam* pcD
     Int  iStrideOrg = pcDtParam->iStrideOrg;
     Int  iStep  = pcDtParam->iStep;
 
-#if ENABLE_PRIMITIVES
-    assert(iStep == 1);
-    int part = x265::PartitionFromSizes(iCols, iRows);
-    if (part >= 0)
-        return x265::primitives.satd[part]((pixel*)piCur, iStrideCur, (pixel*)piOrg,
-                iStrideOrg) >> DISTORTION_PRECISION_ADJUSTMENT(pcDtParam->bitDepth - 8);
-#endif
-
     Int  x, y;
-
     UInt uiSum = 0;
 
 #if NS_HAD
@@ -2218,7 +2208,12 @@ UInt TComRdCost::xGetHADs(DistParam* pcD
         {
             for (x = 0; x < iCols; x += 8)
             {
+#ifdef ENABLE_PRIMITIVES
+                assert(iStep==1);
+                uiSum += x265::primitives.sa8d_8x8((pixel*)(&piOrg[x]), iStrideOrg, (pixel*)(&piCur[x*iStep]), iStrideCur);
+#else
                 uiSum += xCalcHADs8x8(&piOrg[x], &piCur[x * iStep], iStrideOrg, iStrideCur, iStep);
+#endif
             }
 
             piOrg += iOffsetOrg;
@@ -2267,7 +2262,12 @@ UInt TComRdCost::xGetHADs(DistParam* pcD
         {
             for (x = 0; x < iCols; x += 4)
             {
+#ifdef ENABLE_PRIMITIVES
+                assert(iStep==1);
+                uiSum += x265::primitives.satd[x265::PARTITION_4x4]((pixel*)(&piOrg[x]), iStrideOrg, (pixel*)(&piCur[x*iStep]), iStrideCur);
+#else
                 uiSum += xCalcHADs4x4(&piOrg[x], &piCur[x * iStep], iStrideOrg, iStrideCur, iStep);
+#endif
             }
 
             piOrg += iOffsetOrg;
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -891,8 +891,8 @@ void xITrMxN(Int bitDepth, Short *coeff,
         if (uiMode != REG_DCT)
         {
 #ifdef ENABLE_PRIMITIVES
-			x265::primitives.inversedst(coeff, tmp, shift_1st);
-			x265::primitives.inversedst(tmp, block, shift_2nd);
+            x265::primitives.inversedst(coeff, tmp, shift_1st);
+            x265::primitives.inversedst(tmp, block, shift_2nd);
 #else
             fastInverseDst(coeff, tmp, shift_1st); // Inverse DST by FAST Algorithm, coeff input, tmp output
             fastInverseDst(tmp, block, shift_2nd); // Inverse DST by FAST Algorithm, tmp input, coeff output
@@ -1527,7 +1527,7 @@ Void TComTrQuant::xIT(Int bitDepth, UInt
         }
 
         xITrMxN(bitDepth, coeff, block, iWidth, iHeight, uiMode);
-		{
+        {
             for (j = 0; j < iHeight; j++)
             {
                 memcpy(pResidual + j * uiStride, block + j * iWidth, iWidth * sizeof(Short));
--- a/source/Lib/config.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/Lib/config.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -708,6 +708,7 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
     /*
      * Set any derived parameters
      */
+
     /* convert std::string to c string for compatability */
     m_pchInputFile = cfg_InputFile.empty() ? NULL : strdup(cfg_InputFile.c_str());
     m_pchBitstreamFile = cfg_BitstreamFile.empty() ? NULL : strdup(cfg_BitstreamFile.c_str());
@@ -723,7 +724,7 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
     {
         m_cTVideoIOInputFile = new TVideoIOY4m();
         m_cTVideoIOReconFile = new TVideoIOY4m();
-		/* get the video information like width,height,framerate */
+        /* get the video information like width,height,framerate */
         m_cTVideoIOInputFile->open(m_pchInputFile,
                                    false,
                                    m_inputBitDepthY,
@@ -744,29 +745,6 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
         m_cTVideoIOReconFile = new TVideoIOYuv();
     }
 
-#if 0
-    if ((m_iSourceWidth == 0) && (m_iSourceHeight == 0))
-    {
-        FILE *hFile = fopen(m_pchInputFile, "rb");
-        Char source[5];
-        Int bytesRead;
-        Int width = 0;
-        Int height = 0;
-        Int rateNumerator = 0;
-        Int rateDenominator = 0;
-        Double rate = 30.0;
-
-#if defined(_MSC_VER)
-// Allow this warning temporarily until this code is moved into a cleaner location
-#pragma warning(disable: 4127) // conditional expression is constant
-#endif
-
-        m_iSourceWidth = width;
-        m_iSourceHeight = height;
-        m_iFrameRate = ceil(rate);
-    }
-
-#endif // if 0
     Char *pColumnWidth = cfg_ColumnWidth.empty() ? NULL : strdup(cfg_ColumnWidth.c_str());
     Char *pRowHeight = cfg_RowHeight.empty() ? NULL : strdup(cfg_RowHeight.c_str());
     if (m_iUniformSpacingIdr == 0 && m_iNumColumnsMinus1 > 0)
--- a/source/encoder/macroblock.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/encoder/macroblock.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -1,7 +1,10 @@
 /*****************************************************************************
  * Copyright (C) 2013 x265 project
  *
- * Authors:
+ * Authors: Mandar Gurav <mandar@multicorewareinc.com>
+ *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -46,10 +49,10 @@ void CDECL inversedst(short *tmp, short 
         c[2] = tmp[i] - tmp[12 + i];
         c[3] = 74 * tmp[4 + i];
 
-        block[4 * i + 0] = (pixel)Clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
-        block[4 * i + 1] = (pixel)Clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
-        block[4 * i + 2] = (pixel)Clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
-        block[4 * i + 3] = (pixel)Clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
+        block[4 * i + 0] = (short) Clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
+        block[4 * i + 1] = (short) Clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
+        block[4 * i + 2] = (short) Clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
+        block[4 * i + 3] = (short) Clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
     }
 }
 
--- a/source/encoder/pixel.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/encoder/pixel.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -277,6 +277,15 @@ void Setup_C_PixelPrimitives(EncoderPrim
     p.sad[PARTITION_16x32] = sad<16, 32>;
     p.sad[PARTITION_32x16] = sad<32, 16>;
     p.sad[PARTITION_32x32] = sad<32, 32>;
+    p.sad[PARTITION_64x4] = sad<64, 4>;
+    p.sad[PARTITION_4x64] = sad<4, 64>;
+    p.sad[PARTITION_64x8] = sad<64, 8>;
+    p.sad[PARTITION_8x64] = sad<8, 64>;
+    p.sad[PARTITION_16x64] = sad<16, 64>;
+    p.sad[PARTITION_64x16] = sad<64, 16>;
+    p.sad[PARTITION_32x64] = sad<32, 64>;
+    p.sad[PARTITION_64x32] = sad<64, 32>;
+    p.sad[PARTITION_64x64] = sad<64, 64>;
 
     p.satd[PARTITION_4x4]   = satd_4x4;
     p.satd[PARTITION_8x4]   = satd_8x4;
--- a/source/encoder/primitives.h	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/encoder/primitives.h	Mon Apr 15 17:43:29 2013 +0530
@@ -2,6 +2,10 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
--- a/source/encoder/threadpool.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/encoder/threadpool.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -67,8 +67,8 @@ inline int __lzcnt_2x32(uint64_t x64)
 
 #endif // if _WIN64
 
-#define ATOMIC_INC(ptr)                 InterlockedIncrement((volatile unsigned int*)ptr)
-#define ATOMIC_DEC(ptr)                 InterlockedDecrement((volatile unsigned int*)ptr)
+#define ATOMIC_INC(ptr)                 InterlockedIncrement((volatile LONG*)ptr)
+#define ATOMIC_DEC(ptr)                 InterlockedDecrement((volatile LONG*)ptr)
 #define ATOMIC_OR(ptr, mask)            InterlockedOr64((volatile LONG64*)ptr, mask)
 #define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
 #define GIVE_UP_TIME()                  Sleep(0)
@@ -257,7 +257,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numTh
         {
             new (buffer) PoolThread(*this);
             buffer += sizeof(PoolThread);
-            m_ok &= m_threads[i].Start();
+            m_ok = m_ok && m_threads[i].Start();
         }
     }
 }
@@ -276,7 +276,7 @@ ThreadPoolImpl::~ThreadPoolImpl()
         for (int i = 0; i < m_numThreads; i++)
             m_threads[i].~PoolThread();
 
-        delete reinterpret_cast<char*>(m_threads);
+        delete[] reinterpret_cast<char*>(m_threads);
     }
 
     // leak threads on program exit if there were resource failures
--- a/source/encoder/vec/macroblock.inc	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/encoder/vec/macroblock.inc	Mon Apr 15 17:43:29 2013 +0530
@@ -2,6 +2,10 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Steve Borho <steve@borho.org>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
+ *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,6 +27,11 @@
 
 // Vector class versions of macroblock performance primitives
 
+/* Used for filter */
+#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
+#define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
+#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
+ 
 void CDECL inversedst(short *tmp, short *block, int shift)  // input tmp, output block
 {
     int rnd_factor = 1 << (shift - 1);
@@ -83,47 +92,75 @@ void CDECL inversedst(short *tmp, short 
     block[15] = half[7];
 }
 
-void CDECL filter_8_nonvertical(const short *coeff,
-                                pixel *src,
-                                int    srcStride,
-                                pixel *dst,
-                                int    dstStride,
-                                int    block_width,
-                                int    block_height,
-                                short  maxVal,
-                                int    shift,
-                                int    offset,
-                                bool   isLast)
+template<bool isFirst, bool isLast>    // isFirst/isLast select shift, offset and clipping below
+void CDECL filter_Horizontal_8(const short *coeff,
+                               pixel *      src,
+                               int          srcStride,
+                               pixel *      dst,
+                               int          dstStride,
+                               int          block_width,
+                               int          block_height,
+                               int          bitDepth)
 {
     int row, col;
-    Vec8s vec_c, vec_src;
+    short *src_s = (short*)src;     // NOTE(review): treats pixel as a 16-bit type - confirm pixel's typedef
+    short *dst_s = (short*)dst;
 
+    src_s -= (8 / 2 - 1);     // Here N = 8 and cStride = 1
+    int offset;
+    short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
+    int shift = IF_FILTER_PREC;
+    if (isLast)
+    {
+        shift += (isFirst) ? 0 : headRoom;
+        offset = 1 << (shift - 1);      // rounding term for the final right shift
+        offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+        maxVal = (1 << bitDepth) - 1;   // upper clip bound for the output sample
+    }
+    else
+    {
+        shift -= (isFirst) ? headRoom : 0;
+        offset = (isFirst) ? -IF_INTERNAL_OFFS << shift : 0;
+        maxVal = 0;     // not read: clipping is only applied when isLast
+    }
+
+    Vec8s vec_c, vec_src1;
     vec_c.load(coeff);
-
+    int sum = 0;
     for (row = 0; row < block_height; row++)
     {
         for (col = 0; col < block_width; col++)
         {
-            int sum;
-            vec_src.load(src + col);
-            vec_src = vec_src * vec_c;
-            sum = horizontal_add_x(vec_src);
-            short val = (sum + offset) >> shift;
+            vec_src1.load(src_s + col);
+            vec_src1 = vec_src1 * vec_c;            // Assuming that there is no overflow
+            sum = horizontal_add_x(vec_src1);
+            short val = (short)((sum + offset) >> shift);   // shift in int first; the cast binds tighter than >>
             if (isLast)
             {
                 val = (val < 0) ? 0 : val;
                 val = (val > maxVal) ? maxVal : val;
             }
 
-            dst[col] = val;
+            dst_s[col] = val;
         }
 
-        src += srcStride;
-        dst += dstStride;
+        src_s += srcStride;
+        dst_s += dstStride;
     }
 }
 
 void Setup_Vec_MacroblockPrimitives(EncoderPrimitives &p)
 {
     p.inversedst = inversedst;
+
+    /*p.filter[FILTER_H_4_0_0] = filter_Horizontal_4< 0, 0>;
+      p.filter[FILTER_H_4_0_1] = filter_Horizontal_4< 0, 1>;
+      p.filter[FILTER_H_4_1_0] = filter_Horizontal_4< 1, 0>;
+      p.filter[FILTER_H_4_1_1] = filter_Horizontal_4< 1, 1>;*/
+
+    p.filter[FILTER_H_8_0_0] = filter_Horizontal_8<0, 0>;   // template args are <isFirst, isLast>, mirroring the enum suffix
+    p.filter[FILTER_H_8_0_1] = filter_Horizontal_8<0, 1>;
+    p.filter[FILTER_H_8_1_0] = filter_Horizontal_8<1, 0>;
+    p.filter[FILTER_H_8_1_1] = filter_Horizontal_8<1, 1>;
 }
--- a/source/encoder/vec/pixel.inc	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/encoder/vec/pixel.inc	Mon Apr 15 17:43:29 2013 +0530
@@ -24,9 +24,32 @@
 // Vector class versions of pixel comparison performance primitives
 
 template<int lx, int ly>
+int CDECL sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    int uiSum = 0;  // NOTE(review): said to be slower than the C primitive, yet Setup_Vec_PixelPrimitives installs it - confirm
+
+    for (int Row = 0; Row < ly; Row++)
+    {
+        for (int col = 0; col < lx; col += 4)    // four columns per step
+        {
+            Vec8s m1, n1;
+            m1.load_partial(4, piOrg + col);     // only 4 elements are read from each row
+            n1.load_partial(4, piCur + col);
+            m1 = m1 - n1;
+            m1 = abs(m1);
+            uiSum += horizontal_add_x(m1);       // accumulate |org - cur| for this group of columns
+        }
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+
+    return uiSum;
+}
+
+template<int lx, int ly>
 int CDECL sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    // NOTE: This is completely untested, and ignores alignment issues
     int uiSum = 0;
 
     for (int Row = 0; Row < ly; Row++)
@@ -48,7 +71,32 @@ int CDECL sad_8(pixel * piOrg, intptr_t 
     return uiSum;
 }
 
-int CDECL satd_4x4(pixel * piCur, intptr_t iStrideCur, pixel * piOrg, intptr_t iStrideOrg)
+template<int lx, int ly>
+int CDECL sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    int uiSum = 0;   // running sum of absolute differences over the lx-by-ly block
+
+    for (int Row = 0; Row < ly; Row++)
+    {
+        for (int col = 0; col < lx; col += 16)   // 16 columns per step, so lx is expected to be a multiple of 16
+        {
+            Vec16s m1, n1;
+            m1.load(piOrg + col);
+            n1.load(piCur + col);
+            m1 = m1 - n1;
+            m1 = abs(m1);
+            uiSum += horizontal_add_x(m1);       // accumulate |org - cur| for this group of columns
+        }
+
+        piOrg += strideOrg;   // advance both pointers to the next row
+        piCur += strideCur;
+    }
+
+    return uiSum;
+}
+
+
+int CDECL satd_4x4(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
 {
     int satd = 0;
 
@@ -116,7 +164,27 @@ int CDECL satd_4x4(pixel * piCur, intptr
     return satd;
 }
 
-int CDECL sa8d_8x8(pixel * piCur, intptr_t iStrideCur, pixel * piOrg, intptr_t iStrideOrg)
+template<int lx, int ly>
+int CDECL satd(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    int uiSum = 0;   // sum of satd_4x4 over every 4x4 tile of the lx-by-ly block
+
+    for (int Row = 0; Row < ly; Row += 4)
+    {
+        for (int col = 0; col < lx; col += 4)
+        {
+            uiSum += satd_4x4( piOrg + col, strideOrg, piCur + col, strideCur );
+        }
+
+        piOrg += 4 * strideOrg;   // step down one row of 4x4 tiles
+        piCur += 4 * strideCur;
+    }
+
+    return uiSum;
+}
+
+
+int CDECL sa8d_8x8(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
 {
     ALIGN_VAR_16(short, m2[8][8]);
 
@@ -127,12 +195,12 @@ int CDECL sa8d_8x8(pixel * piCur, intptr
 
     for (j = 0; j < 8; j += 2)
     {
-        piOrg_v1.load(piOrg);
+        piOrg_v1.load_a(piOrg);
         piCur_v1.load(piCur);
         piCur += iStrideCur;
         piOrg += iStrideOrg;
 
-        piOrg_v2.load(piOrg);
+        piOrg_v2.load_a(piOrg);
         piCur_v2.load(piCur);
         piCur += iStrideCur;
         piOrg += iStrideOrg;
@@ -299,9 +367,35 @@ int CDECL sa8d_8x8(pixel * piCur, intptr
 
 void Setup_Vec_PixelPrimitives(EncoderPrimitives &p)
 {
+    p.sad[PARTITION_4x4] = sad_4<4, 4>;          // SAD table: 4-wide blocks use sad_4,
+    p.sad[PARTITION_4x8] = sad_4<4, 8>;
+    p.sad[PARTITION_8x4] = sad_8<8, 4>;          // 8-wide blocks use sad_8,
     p.sad[PARTITION_8x8] = sad_8<8, 8>;
-    p.sad[PARTITION_16x16] = sad_8<16, 16>;
-    p.sad[PARTITION_32x32] = sad_8<32, 32>;
+    p.sad[PARTITION_16x4] = sad_16<16, 4>;       // and widths 16, 32 and 64 use sad_16
+    p.sad[PARTITION_4x16] = sad_4<4, 16>;
+    p.sad[PARTITION_16x8] = sad_16<16, 8>;
+    p.sad[PARTITION_8x16] = sad_8<8, 16>;
+    p.sad[PARTITION_16x16] = sad_16<16, 16>;
+    p.sad[PARTITION_4x32] = sad_4<4, 32>;
+    p.sad[PARTITION_32x4] = sad_16<32, 4>;
+    p.sad[PARTITION_8x32] = sad_8<8, 32>;
+    p.sad[PARTITION_32x8] = sad_16<32, 8>;
+    p.sad[PARTITION_16x32] = sad_16<16, 32>;
+    p.sad[PARTITION_32x16] = sad_16<32, 16>;
+    p.sad[PARTITION_32x32] = sad_16<32, 32>;
+    p.sad[PARTITION_4x64] = sad_4<4, 64>;
+    p.sad[PARTITION_64x4] = sad_16<64, 4>;
+    p.sad[PARTITION_64x8] = sad_16<64, 8>;
+    p.sad[PARTITION_8x64] = sad_8<8, 64>;
+    p.sad[PARTITION_16x64] = sad_16<16, 64>;
+    p.sad[PARTITION_64x16] = sad_16<64, 16>;
+    p.sad[PARTITION_32x64] = sad_16<32, 64>;
+    p.sad[PARTITION_64x32] = sad_16<64, 32>;
+    p.sad[PARTITION_64x64] = sad_16<64, 64>;
+
     p.satd[PARTITION_4x4] = satd_4x4;
+    p.satd[PARTITION_4x8] = satd<4, 8>;          // generic satd<>, which sums satd_4x4 over 4x4 tiles
+    // p.satd[PARTITION_8x4] = satd<8, 4>;  // slower than SWAR C version
+
     p.sa8d_8x8 = sa8d_8x8;
 }
--- a/source/test/testbench.cpp	Mon Apr 15 17:42:33 2013 +0530
+++ b/source/test/testbench.cpp	Mon Apr 15 17:43:29 2013 +0530
@@ -2,6 +2,8 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Gopu Govindaswamy <gopu@govindaswamy.org>
+ *          Mandar Gurav <mandar@multicorewareinc.com>
+ *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -84,23 +86,73 @@
 
 using namespace x265;
 
+/* Interpolation-filter test data: precision constants, luma coefficient table, labels and buffers */
+#define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision
+#define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps
+#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1)) ///< Offset used internally
+#define NTAPS_LUMA       8 ///< Number of taps for luma
+const short m_lumaFilter[4][NTAPS_LUMA] =   // four rows of NTAPS_LUMA coefficients; every row sums to 64
+{
+{
+    0, 0,   0, 64,  0,   0, 0,  0
+},
+{
+    -1, 4, -10, 58, 17,  -5, 1,  0
+},
+{
+    -1, 4, -11, 40, 40, -11, 4, -1
+},
+{
+    0, 1,  -5, 17, 58, -10, 4, -1
+}
+};
+char FilterConf_names[16][40] =             // printable label for each of the 16 filter configurations
+{
+    // Entry i labels filter configuration i as: direction (Hor/Ver), tap count N, isFirst, isLast
+    "Hor_N=4_isFirst=0_isLast=0",
+    "Hor_N=4_isFirst=0_isLast=1",
+    "Hor_N=4_isFirst=1_isLast=0",
+    "Hor_N=4_isFirst=1_isLast=1",
+
+    "Hor_N=8_isFirst=0_isLast=0",
+    "Hor_N=8_isFirst=0_isLast=1",
+    "Hor_N=8_isFirst=1_isLast=0",
+    "Hor_N=8_isFirst=1_isLast=1",
+
+    "Ver_N=4_isFirst=0_isLast=0",
+    "Ver_N=4_isFirst=0_isLast=1",
+    "Ver_N=4_isFirst=1_isLast=0",
+    "Ver_N=4_isFirst=1_isLast=1",
+
+    "Ver_N=8_isFirst=0_isLast=0",
+    "Ver_N=8_isFirst=0_isLast=1",
+    "Ver_N=8_isFirst=1_isLast=0",
+    "Ver_N=8_isFirst=1_isLast=1"
+};
+pixel *pixel_buff;                          // input samples for the filter tests (allocated in init_IPFilter_buffers)
+short *IPF_vec_output, *IPF_C_output;       // outputs of the vectorized and of the C filter primitive
+int t_size;                                 // element count of each filter test buffer (set in init_IPFilter_buffers)
+
 /* pbuf1, pbuf2: initialized to random pixel data and shouldn't write into them. */
 pixel *pbuf1, *pbuf2;
 short *mbuf1, *mbuf2, *mbuf3;
 #define BENCH_ALIGNS 16
 
 // Initialize the Func Names for all the Pixel Comp
-static const char *FuncNames[NUM_PARTITIONS] = {
-"4x4", "8x4", "4x8", "8x8", "4x16", "16x4", "8x16", "16x8", "16x16", "4x32", "32x4", "8x32",
-"32x8", "16x32", "32x16", "32x32", "4x64", "64x4", "8x64", "64x8", "16x64", "64x16", "32x64", "64x32", "64x64"
+static const char *FuncNames[NUM_PARTITIONS] =   // one printable block-size label per partition entry
+{
+    "4x4", "8x4", "4x8", "8x8", "4x16", "16x4", "8x16", "16x8", "16x16", "4x32", "32x4", "8x32",
+    "32x8", "16x32", "32x16", "32x32", "4x64", "64x4", "8x64", "64x8", "16x64", "64x16", "32x64", "64x32", "64x64"
 };
 
 #if HIGH_BIT_DEPTH
-#define PIXEL_MAX ((1 << 10) - 1)
+#define BIT_DEPTH 10   // bits per sample when HIGH_BIT_DEPTH is enabled
 #else
-#define PIXEL_MAX ((1 << 8) - 1)
+#define BIT_DEPTH 8    // bits per sample otherwise
 #endif
 
+#define PIXEL_MAX ((1 << BIT_DEPTH) - 1)   // largest value representable in BIT_DEPTH bits
+
 /* To-do List: Generate the stride values at run time in each run
  *
  */
@@ -232,6 +284,41 @@ static void check_cycle_count(const Enco
     }
 
     /* Add logic here for testing performance of your new primitive*/
+    int rand_height = rand() % 100;                 // Randomly generated Height
+    int rand_width = rand() % 100;                  // Randomly generated Width
+    short rand_val, rand_srcStride, rand_dstStride;
+
+    rand_val = rand() % 24;                     // Random offset in the filter
+    rand_srcStride = rand() % 100;              // Randomly generated srcStride
+    rand_dstStride = rand() % 100;              // Randomly generated dstStride
+
+    for (int value = 4; value < 8; value++)
+    {
+        memset(IPF_vec_output, 0, t_size);      // Initialize output buffer to zero
+        memset(IPF_C_output, 0, t_size);        // Initialize output buffer to zero
+        if (vecprim.filter[value])
+        {
+            gettimeofday(&ts, NULL);
+            for (int j = 0; j < NUM_ITERATIONS_CYCLE; j++)
+            {
+                vecprim.filter[value]((short*)(m_lumaFilter + rand_val), pixel_buff, rand_srcStride, (pixel*)IPF_vec_output,
+                                      rand_dstStride, rand_height, rand_width, BIT_DEPTH);
+            }
+
+            gettimeofday(&te, NULL);
+            printf("\nfilter[%s] vectorized primitive: (%1.4f ms) ", FilterConf_names[value], timevaldiff(&ts, &te));
+
+            gettimeofday(&ts, NULL);
+            for (int j = 0; j < NUM_ITERATIONS_CYCLE; j++)
+            {
+                cprim.filter[value]((short*)(m_lumaFilter + rand_val), pixel_buff, rand_srcStride, (pixel*)IPF_vec_output,
+                                    rand_dstStride, rand_height, rand_width, BIT_DEPTH);
+            }
+
+            gettimeofday(&te, NULL);
+            printf("\tC primitive: (%1.4f ms) ", timevaldiff(&ts, &te));
+        }
+    }
 }
 
 static int check_pixel_primitive(pixelcmp ref, pixelcmp opt)
@@ -240,8 +327,8 @@ static int check_pixel_primitive(pixelcm
 
     for (int i = 0; i <= 100; i++)
     {
-        int vres = opt(pbuf1 + j, STRIDE, pbuf2, STRIDE);
-        int cres = ref(pbuf1 + j, STRIDE, pbuf2, STRIDE);
+        int vres = opt(pbuf1, STRIDE, pbuf2 + j, STRIDE);
+        int cres = ref(pbuf1, STRIDE, pbuf2 + j, STRIDE);
         if (vres != cres)
             return -1;
 
@@ -273,6 +360,49 @@ static int check_mbdst_primitive(mbdst r
     return 0;
 }
 
+// Compare an optimized interpolation filter against the C reference.
+//
+// Both primitives are run on the same randomly chosen coefficients, strides and
+// block size for 101 rounds, each writing into its own zeroed output buffer.
+// Returns 0 when every round produces identical output, -1 on the first mismatch.
+static int check_IPFilter_primitive(IPFilter ref, IPFilter opt)
+{
+    // t_size counts elements; memset and memcmp need the size in bytes.
+    const size_t outBytes = t_size * sizeof(short);
+    int rand_width = 1 + rand() % 100;                  // block width in [1, 100]
+    int rand_height = 1 + rand() % 100;                 // block height in [1, 100]
+
+    for (int i = 0; i <= 100; i++)
+    {
+        memset(IPF_vec_output, 0, outBytes);            // clear both outputs completely so the
+        memset(IPF_C_output, 0, outBytes);              // comparison only sees this round's data
+
+        // Pick one of the four coefficient rows. m_lumaFilter + k advances by whole
+        // rows, so any k above 3 would point past the end of the table.
+        const short *coeff = m_lumaFilter[rand() % 4];
+
+        // Strides lie in [rand_width, 100] so that consecutive rows never overlap.
+        int rand_srcStride = rand_width + rand() % (101 - rand_width);
+        int rand_dstStride = rand_width + rand() % (101 - rand_width);
+
+        // NOTE(review): the 8-tap filters presumably read a few samples before src;
+        // start three rows and three columns into the buffer to be safe - confirm.
+        pixel *src = pixel_buff + 3 * rand_srcStride + 3;
+
+        // Dimensions are passed as (width, height), the order used by the callers in
+        // TComInterpolationFilter.
+        opt((short*)coeff, src, rand_srcStride, (pixel*)IPF_vec_output, rand_dstStride,
+            rand_width, rand_height, BIT_DEPTH);
+        ref((short*)coeff, src, rand_srcStride, (pixel*)IPF_C_output, rand_dstStride,
+            rand_width, rand_height, BIT_DEPTH);
+
+        if (memcmp(IPF_vec_output, IPF_C_output, outBytes))
+            return -1;                                  // outputs differ: test failed
+    }
+
+    return 0;
+}
+
 int init_pixelcmp_buffers()
 {
     pbuf1 = (pixel*)malloc(0x1e00 * sizeof(pixel) + 16 * BENCH_ALIGNS);
@@ -293,6 +423,29 @@ int init_pixelcmp_buffers()
     return 0;
 }
 
+// Allocate and randomize the IPFilter test buffers. Returns 0 on success, -1 if an allocation fails.
+int init_IPFilter_buffers()
+{
+    t_size = 200 * 200;                                         // elements per buffer; assumes height, width and strides <= 100
+    pixel_buff = (pixel*)malloc(t_size * sizeof(pixel));        // filter input
+    IPF_vec_output = (short*)malloc(t_size * sizeof(short));    // output of the vectorized primitive
+    IPF_C_output = (short*)malloc(t_size * sizeof(short));      // output of the C primitive
+    if (!pixel_buff || !IPF_vec_output || !IPF_C_output)
+    {
+        fprintf(stderr, "init_IPFilter_buffers: malloc failed, unable to initiate tests!\n");
+        free(pixel_buff);       pixel_buff = NULL;              // release whatever was allocated;
+        free(IPF_vec_output);   IPF_vec_output = NULL;          // free(NULL) is a no-op, and the
+        free(IPF_C_output);     IPF_C_output = NULL;            // globals must not be left dangling
+        return -1;
+    }
+    for (int i = 0; i < t_size; i++)
+    {
+        int sign = (rand() & 1) ? 1 : -1;                       // NOTE(review): negative samples only make sense if pixel is signed - confirm
+        pixel_buff[i] = sign * (rand() & PIXEL_MAX);            // random magnitude in [0, PIXEL_MAX]
+    }
+    return 0;
+}
+
 int clean_pixelcmp_buffers()
 {
     free(pbuf1);
@@ -300,6 +453,14 @@ int clean_pixelcmp_buffers()
     return 0;
 }
 
+int clean_IPFilter_buffers()   // Releases the IPFilter test buffers; always returns 0.
+{
+    free(IPF_vec_output);   IPF_vec_output = NULL;      // pointers are reset so that a repeated
+    free(IPF_C_output);     IPF_C_output = NULL;        // call cannot free the same block twice
+    free(pixel_buff);       pixel_buff = NULL;
+    return 0;
+}
+
 int init_mbdst_buffers()
 {
     int t_size = 32;
@@ -384,6 +545,24 @@ static int check_all_primitives(const En
         printf("\nsa8d_16x16: passed ");
     }
 
+    /********** Run Filter Primitives *******************/
+    if (init_IPFilter_buffers() < 0)
+        return -1;
+
+    for (int value = 4; value < 8; value++)
+    {
+        if (vectorprimitives.filter[value])
+        {
+            if (check_IPFilter_primitive(cprimitives.filter[value], vectorprimitives.filter[value]) < 0)
+            {
+                printf("\nfilter: Failed!\n");
+                return -1;
+            }
+
+            printf("\nFilter[%s]: passed ", FilterConf_names[value]);
+        }
+    }
+
     /********** Initialise and run mbdst Primitives *******************/
 
     if (init_mbdst_buffers() < 0)
@@ -408,7 +587,7 @@ static int check_all_primitives(const En
     /********************* Clean all buffers *****************************/
     clean_pixelcmp_buffers();
     clean_mbdst_buffers();
-
+    clean_IPFilter_buffers();
     return 0;
 }