changeset 783:07bd1df8ced7 draft

Merged multicoreware/xhevc into default
author Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
date Wed, 24 Apr 2013 12:00:28 +0530
parents 189e1c23ebdb (current diff) d9d313f7abe1 (diff)
children 5bcab5c66709
files source/Lib/TLibEncoder/TEncSearch.cpp
diffstat 29 files changed, 468 insertions(+-), 410 deletions(-) [+]
line wrap: on
line diff
--- a/doc/uncrustify/codingstyle.cfg	Wed Apr 24 11:58:14 2013 +0530
+++ b/doc/uncrustify/codingstyle.cfg	Wed Apr 24 12:00:28 2013 +0530
@@ -84,8 +84,8 @@ mod_sort_include=false
 mod_sort_using=false
 newlines=lf
 nl_after_access_spec=2
-nl_after_brace_close=ignore
-nl_after_brace_open=ignore
+#nl_after_brace_close=ignore
+#nl_after_brace_open=ignore
 nl_after_brace_open_cmt=true
 nl_after_case=false
 nl_after_class=2
--- a/source/Lib/TLibCommon/CommonDef.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/Lib/TLibCommon/CommonDef.h	Wed Apr 24 12:00:28 2013 +0530
@@ -55,7 +55,7 @@
 // Version information
 // ====================================================================================================================
 
-#define NV_VERSION        "10.0"                 ///< Current software version
+#define NV_VERSION        "10.1rc"                 ///< Current software version
 
 // ====================================================================================================================
 // Platform information
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -198,7 +198,7 @@ Void TComSampleAdaptiveOffset::create(UI
     m_iUpBuff1++;
     m_iUpBuff2++;
     m_iUpBufft++;
-    Pel i;
+    Short i;
 
     UInt uiMaxY  = (1 << g_bitDepthY) - 1;
     UInt uiMinY  = 0;
--- a/source/Lib/TLibEncoder/TEncGOP.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncGOP.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -674,6 +674,10 @@ Void TEncGOP::compressGOP(Int iPOCLast, 
 
             pcSlice->setCheckLDC(bLowDelay);
         }
+        else
+        {
+            pcSlice->setCheckLDC(true);
+        }
 
         uiColDir = 1 - uiColDir;
 
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -1154,7 +1154,7 @@ Void TEncSearch::xIntraCodingLumaBlk(TCo
         {
             for (UInt uiX = 0; uiX < uiWidth; uiX++)
             {
-                pReco[uiX] = ClipY(pPred[uiX] + pResi[uiX]);
+                pReco[uiX] = ClipY(static_cast<Short>(pPred[uiX]) + pResi[uiX]);
                 pRecQt[uiX] = pReco[uiX];
                 pRecIPred[uiX] = pReco[uiX];
             }
@@ -1353,7 +1353,7 @@ Void TEncSearch::xIntraCodingChromaBlk(T
         {
             for (UInt uiX = 0; uiX < uiWidth; uiX++)
             {
-                pReco[uiX] = ClipC(pPred[uiX] + pResi[uiX]);
+                pReco[uiX] = ClipC(static_cast<Short> (pPred[uiX]) + pResi[uiX]);
                 pRecQt[uiX] = pReco[uiX];
                 pRecIPred[uiX] = pReco[uiX];
             }
--- a/source/PPA/ppaCPUEvents.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/PPA/ppaCPUEvents.h	Wed Apr 24 12:00:28 2013 +0530
@@ -1,4 +1,5 @@
 PPA_REGISTER_CPU_EVENT(encode_block)
 PPA_REGISTER_CPU_EVENT(read_yuv)
+PPA_REGISTER_CPU_EVENT(write_yuv)
 PPA_REGISTER_CPU_EVENT(encode_frame)
 PPA_REGISTER_CPU_EVENT(bitstream_write)
--- a/source/VectorClass/vectori128.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/VectorClass/vectori128.h	Wed Apr 24 12:00:28 2013 +0530
@@ -302,6 +302,10 @@ public:
     Vec16c(int i) {
         xmm = _mm_set1_epi8(i);
     }
+    // MCW Added - assign lowest 4 byte values from uint32_t
+    void fromUint32(uint32_t i) {
+        xmm = _mm_cvtsi32_si128(i);
+    }
     // Constructor to build from all elements:
     Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7,
         int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15) {
@@ -320,6 +324,10 @@ public:
     operator __m128i() const {
         return xmm;
     }
+    // MCW Added - PSADBW
+    Vec16c sad(__m128i const & x) {
+        return _mm_sad_epu8(xmm, x);
+    }
     // Member function to load from array (unaligned)
     Vec16c & load(void const * p) {
         xmm = _mm_loadu_si128((__m128i const*)p);
@@ -3698,7 +3706,7 @@ static inline Vec16uc blend16uc(Vec16uc 
 }
 
 #if _MSC_VER
-#pragma warning(disable: 4700)
+#pragma warning(disable: 4700)
 #endif
 
 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
--- a/source/encoder/md5.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/encoder/md5.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -44,6 +44,8 @@ static void byteReverse(uint8_t_t *buf, 
 }
 #endif
 
+void MD5Transform(uint32_t *buf, uint32_t *in);
+
 /*
  * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
  * initialization constants.
@@ -152,7 +154,7 @@ void MD5Final(MD5Context *ctx, uint8_t *
     byteReverse((uint8_t *) ctx->buf, 4);
     memcpy(digest, ctx->buf, 16);
 
-    memset(ctx, 0, sizeof(ctx));        /* In case it's sensitive */
+    memset(ctx, 0, sizeof(*ctx));        /* In case it's sensitive */
 }
 
 /* The four core functions - F1 is optimized somewhat */
--- a/source/encoder/md5.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/encoder/md5.h	Wed Apr 24 12:00:28 2013 +0530
@@ -39,7 +39,6 @@ typedef struct MD5Context {
 void MD5Init(MD5Context *context);
 void MD5Update(MD5Context *context, unsigned char *buf, uint32_t len);
 void MD5Final(MD5Context *ctx, uint8_t *digest);
-void MD5Transform(uint32_t *buf, uint32_t *in);
 
 class MD5
 {
--- a/source/encoder/pixel.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/encoder/pixel.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -30,7 +30,6 @@ namespace {
 template<int lx, int ly>
 int CDECL sad(pixel *pix1, intptr_t stride_pix1, pixel *pix2, intptr_t stride_pix2)
 {
-    // TODO: we could use SWAR here fairly easily.  Would it help?
     int sum = 0;
 
     for (int y = 0; y < ly; y++)
--- a/source/encoder/threadpool.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/encoder/threadpool.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -57,12 +57,12 @@
 #define CLZ64(x)                        __lzcnt_2x32(x)
 inline int __lzcnt_2x32(uint64_t x64)
 {
-    int val = __lzcnt((uint32_t)(x64 >> 32));
-
-    if (val)
-        return val + 32;
-
-    return __lzcnt((uint32_t)x64);
+    uint32_t high32 = (uint32_t)(x64 >> 32);
+    uint32_t low32 = (uint32_t)x64;
+    if (high32)
+        return __lzcnt(high32);
+    else
+        return __lzcnt(low32) + 32;
 }
 
 #endif // if _WIN64
--- a/source/encoder/vec/pixel.inc	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/encoder/vec/pixel.inc	Wed Apr 24 12:00:28 2013 +0530
@@ -23,20 +23,20 @@
 
 // Vector class versions of pixel comparison performance primitives
 
-template<int lx, int ly>
+#if HIGH_BIT_DEPTH
+
+/* intrinsics for when pixel type is short */
+
+template<int ly>
 int CDECL sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
     Vec8s sum(0);
-
     for (int row = 0; row < ly; row++)
     {
-        for (int col = 0; col < lx; col += 4)
-        {
-            Vec8s m1, n1;
-            m1.load(piOrg + col);
-            n1.load(piCur + col);
-            sum += abs(m1 - n1);
-        }
+        Vec8s m1, n1;
+        m1.load(piOrg);
+        n1.load(piCur);
+        sum += abs(m1 - n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
@@ -45,42 +45,42 @@ int CDECL sad_4(pixel * piOrg, intptr_t 
     return horizontal_add(extend_low(sum));
 }
 
-template<int lx, int ly>
+template<int ly>
 int CDECL sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
     Vec8s sum(0);
-
     for (int row = 0; row < ly; row++)
     {
-        for (int col = 0; col < lx; col += 8)
-        {
-            Vec8s m1, n1;
-            m1.load_a(piOrg + col);
-            n1.load(piCur + col);
-            sum += abs(m1 - n1);
-        }
+        Vec8s m1, n1;
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sum += abs(m1 - n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
     }
-
     return horizontal_add_x(sum);
 }
 
-template<int lx, int ly>
+template<int ly>
+int CDECL sad_8x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    int sum = 0;
+    for (int row = 0; row < ly; row += 16)
+        sum += sad_8<16>(piOrg + row * strideOrg, strideOrg, piCur + row * strideCur, strideCur);
+    return sum;
+}
+
+template<int ly>
 int CDECL sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
     Vec16s sum(0);
-
     for (int row = 0; row < ly; row++)
     {
-        for (int col = 0; col < lx; col += 16)
-        {
-            Vec16s m1, n1;
-            m1.load_a(piOrg + col);
-            n1.load(piCur + col);
-            sum += abs(m1 - n1);
-        }
+        Vec16s m1, n1;
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sum += abs(m1 - n1);
 
         piOrg += strideOrg;
         piCur += strideCur;
@@ -91,20 +91,29 @@ int CDECL sad_16(pixel * piOrg, intptr_t
 }
 
 template<int lx, int ly>
-int CDECL sad_16x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+int CDECL sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
 {
-    int sum = 0;
-
-    for (int row = 0; row < ly; row += 16)
+    Vec16s sum(0);
+    for (int row = 0; row < ly; row++)
     {
-        for (int col = 0; col < lx; col += 16)
+        for (int col = 0; col < lx; col += 32)
         {
-            sum += sad_16<16, 16>(piOrg + row * strideOrg + col, strideOrg,
-                                  piCur + row * strideCur + col, strideCur);
+            Vec16s m1, n1;
+            m1.load(piOrg + col);
+            n1.load(piCur + col);
+            sum += abs(m1 - n1);
+            Vec16s m2, n2;
+            m2.load(piOrg + col + 16);
+            n2.load(piCur + col + 16);
+            sum += abs(m2 - n2);
         }
-    }
 
-    return sum;
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    return horizontal_add_x(extend_low(sum)) +
+           horizontal_add_x(extend_high(sum));
+
 }
 
 int CDECL satd_4x4(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
@@ -175,23 +184,6 @@ int CDECL satd_4x4(pixel * piOrg, intptr
     return satd;
 }
 
-template<int lx, int ly>
-int CDECL satd(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
-{
-    int uiSum = 0;
-
-    for (int row = 0; row < ly; row += 4)
-    {
-        for (int col = 0; col < lx; col += 4)
-        {
-            uiSum += satd_4x4(piOrg + strideOrg * row + col, strideOrg,
-                              piCur + strideCur * row + col, strideCur);
-        }
-    }
-
-    return uiSum;
-}
-
 int CDECL sa8d_8x8(pixel * piOrg, intptr_t iStrideOrg, pixel * piCur, intptr_t iStrideCur)
 {
     ALIGN_VAR_16(short, m2[8][8]);
@@ -317,8 +309,6 @@ int CDECL sa8d_8x8(pixel * piOrg, intptr
         v6 = abs(t1);
         v7 = abs(t2);
 
-#if HIGH_BIT_DEPTH
-
         Vec4i s0, s1, s2, s3, s4, s5, s6, s7, s8;
         s0 = extend_low(v0);
         s1 = extend_high(v0);
@@ -355,56 +345,173 @@ int CDECL sa8d_8x8(pixel * piOrg, intptr
         s0 = (s0 + s1) + (s2 + s3) + (s4 + s5) + (s6 + s7);
 
         satd = horizontal_add_x(s0);
-#else /* if HIGH_BIT_DEPTH */
-        v0 = v0 + v1;
-        v2 = v2 + v3;
-        v0 = v0 + v2;
-
-        v4 = v4 + v5;
-        v6 = v6 + v7;
-        v4 = v4 + v6;
-
-        v0 = v0 + v4;
-
-        satd = horizontal_add_x(v0);
-#endif /* if HIGH_BIT_DEPTH */
     }
 
     return (satd + 2) >> 2;
 }
 
+#else
+
+/* intrinsics for when pixel type is uint8_t */
+
+template<int ly>
+int CDECL sad_4(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s sum(0);
+
+    for (int row = 0; row < ly; row++)
+    {
+        Vec16uc m1, n1;
+        m1.fromUint32(*(uint32_t*)piOrg);
+        n1.fromUint32(*(uint32_t*)piCur);
+        sum += Vec8s(m1.sad(n1));
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+
+    return horizontal_add(sum);
+}
+
+template<int ly>
+int CDECL sad_8(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s sum(0);
+    for (int row = 0; row < ly; row++)
+    {
+        Vec16uc m1, n1;
+        m1.load(piOrg);
+        n1.load(piCur);
+        sum += Vec8s(m1.sad(n1));
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    return sum[0];
+}
+
+template<int ly>
+int CDECL sad_8x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    /* groups of 8x16 blocks, upcasting sum from short to int often enough to avoid overflow */
+    int sum = 0;
+    for (int row = 0; row < ly; row += 16)
+        sum += sad_8<16>(piOrg + row * strideOrg, strideOrg, piCur + row * strideCur, strideCur);
+    return sum;
+}
+
+template<int ly>
+int CDECL sad_16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    Vec8s sum(0);
+    for (int row = 0; row < ly; row++)
+    {
+        Vec16uc m1, n1;
+        m1.load_a(piOrg);
+        n1.load(piCur);
+        sum += Vec8s(m1.sad(n1));
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+    }
+    return horizontal_add_x(sum);
+}
+
+template<int lx, int ly>
+int CDECL sad_32(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    // TODO: AVX2
+    int sum = 0;
+    for (int row = 0; row < ly; row++)
+    {
+        Vec8s sad(0);
+        for (int col = 0; col < lx; col += 32)
+        {
+            Vec16uc m1, n1;
+            m1.load_a(piOrg + col);
+            n1.load(piCur + col);
+            sad += Vec8s(m1.sad(n1));
+            Vec16uc m2, n2;
+            m2.load_a(piOrg + col + 16);
+            n2.load(piCur + col + 16);
+            sad += Vec8s(m2.sad(n2));
+        }
+
+        piOrg += strideOrg;
+        piCur += strideCur;
+        sum += horizontal_add_x(sad);
+    }
+    return sum;
+}
+
+#endif
+
+template<int lx, int ly>
+int CDECL sad_16x16(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    int sum = 0;
+    for (int row = 0; row < ly; row += 16)
+    {
+        for (int col = 0; col < lx; col += 16)
+        {
+            sum += sad_16<16>(piOrg + row * strideOrg + col, strideOrg,
+                              piCur + row * strideCur + col, strideCur);
+        }
+    }
+    return sum;
+}
+
+template<int lx, int ly>
+int CDECL satd(pixel * piOrg, intptr_t strideOrg, pixel * piCur, intptr_t strideCur)
+{
+    int uiSum = 0;
+
+    for (int row = 0; row < ly; row += 4)
+    {
+        for (int col = 0; col < lx; col += 4)
+        {
+            uiSum += satd_4x4(piOrg + strideOrg * row + col, strideOrg,
+                              piCur + strideCur * row + col, strideCur);
+        }
+    }
+
+    return uiSum;
+}
+
 void Setup_Vec_PixelPrimitives(EncoderPrimitives &p)
 {
-    p.sad[PARTITION_4x4] = sad_4<4, 4>;
-    p.sad[PARTITION_4x8] = sad_4<4, 8>;
-    p.sad[PARTITION_8x4] = sad_8<8, 4>;
-    p.sad[PARTITION_8x8] = sad_8<8, 8>;
-    p.sad[PARTITION_16x4] = sad_16<16, 4>;
-    p.sad[PARTITION_4x16] = sad_4<4, 16>;
-    p.sad[PARTITION_16x8] = sad_16<16, 8>;
-    p.sad[PARTITION_8x16] = sad_8<8, 16>;
-    p.sad[PARTITION_16x16] = sad_16<16, 16>;
-    p.sad[PARTITION_4x32] = sad_4<4, 32>;
-    p.sad[PARTITION_32x4] = sad_16<32, 4>;
-    p.sad[PARTITION_8x32] = sad_8<8, 32>;
-    p.sad[PARTITION_32x8] = sad_16<32, 8>;
-    p.sad[PARTITION_16x32] = sad_16<16, 32>;
-    p.sad[PARTITION_32x16] = sad_16<32, 16>;
+    p.sad[PARTITION_4x4] = sad_4<4>;
+    p.sad[PARTITION_4x8] = sad_4<8>;
+    p.sad[PARTITION_8x4] = sad_8<4>;
+    p.sad[PARTITION_8x8] = sad_8<8>;
+    p.sad[PARTITION_16x4] = sad_16<4>;
+    p.sad[PARTITION_4x16] = sad_4<16>;
+    p.sad[PARTITION_16x8] = sad_16<8>;
+    p.sad[PARTITION_8x16] = sad_8<16>;
+    p.sad[PARTITION_16x16] = sad_16<16>;
+    p.sad[PARTITION_4x32] = sad_4<32>;
+    p.sad[PARTITION_32x4] = sad_32<32, 4>;
+    p.sad[PARTITION_8x32] = sad_8x16<32>;
+    p.sad[PARTITION_32x8] = sad_32<32, 8>;
+    p.sad[PARTITION_16x32] = sad_16x16<16, 32>;
+    p.sad[PARTITION_32x16] = sad_16x16<32, 16>;
     p.sad[PARTITION_32x32] = sad_16x16<32, 32>;
-    p.sad[PARTITION_4x64] = sad_4<4, 64>;
-    p.sad[PARTITION_64x4] = sad_16<64, 4>;
-    p.sad[PARTITION_64x8] = sad_16<64, 8>;
-    p.sad[PARTITION_8x64] = sad_8<8, 64>;
+    p.sad[PARTITION_4x64] = sad_4<64>;
+    p.sad[PARTITION_64x4] = sad_32<64, 4>;
+    p.sad[PARTITION_64x8] = sad_32<64, 8>;
+    p.sad[PARTITION_8x64] = sad_8x16<64>;
     p.sad[PARTITION_16x64] = sad_16x16<16, 64>;
     p.sad[PARTITION_64x16] = sad_16x16<64, 16>;
     p.sad[PARTITION_32x64] = sad_16x16<32, 64>;
     p.sad[PARTITION_64x32] = sad_16x16<64, 32>;
     p.sad[PARTITION_64x64] = sad_16x16<64, 64>;
 
+#if HIGH_BIT_DEPTH
     p.satd[PARTITION_4x4] = satd_4x4;
     p.satd[PARTITION_4x8] = satd<4, 8>;
     p.satd[PARTITION_16x4] = satd<16, 4>;
     // p.satd[PARTITION_8x4] = satd<8, 4>;  // slower than SWAR C version
-
     p.sa8d_8x8 = sa8d_8x8;
+#else
+#endif
 }
--- a/source/encoder/x86/asm-primitives.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/encoder/x86/asm-primitives.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -77,7 +77,9 @@ void Setup_Assembly_Primitives(EncoderPr
 {
     if (cpuid >= 1)
     {
-//        INIT7( sad, _mmx2 );
+#if 0
+        INIT7( sad, _mmx2 );
+#endif
         INIT8( satd, _mmx2 );
 
         // Intra predictions max out at 32x32 (but subpel refine can use larger blocks)
@@ -85,17 +87,19 @@ void Setup_Assembly_Primitives(EncoderPr
         p.satd[PARTITION_32x8]  = cmp<32, 8, 16, 8, x264_pixel_satd_16x8_mmx2>;
         p.satd[PARTITION_32x32] = cmp<32, 32, 16, 16, x264_pixel_satd_16x16_mmx2>;
 
+#if 0
         // For large CU motion search
-//        p.sad[PARTITION_32x32]  = cmp<32, 32, 16, 16, x264_pixel_sad_16x16_mmx2>;
-//        p.sad[PARTITION_64x32]  = cmp<64, 32, 16, 16, x264_pixel_sad_16x16_mmx2>;
-//        p.sad[PARTITION_32x64]  = cmp<32, 64, 16, 16, x264_pixel_sad_16x16_mmx2>;
-//        p.sad[PARTITION_64x64]  = cmp<64, 64, 16, 16, x264_pixel_sad_16x16_mmx2>;
+        p.sad[PARTITION_32x32]  = cmp<32, 32, 16, 16, x264_pixel_sad_16x16_mmx2>;
+        p.sad[PARTITION_64x32]  = cmp<64, 32, 16, 16, x264_pixel_sad_16x16_mmx2>;
+        p.sad[PARTITION_32x64]  = cmp<32, 64, 16, 16, x264_pixel_sad_16x16_mmx2>;
+        p.sad[PARTITION_64x64]  = cmp<64, 64, 16, 16, x264_pixel_sad_16x16_mmx2>;
+#endif
     }
 #if 0
     if (cpuid >= 2)
     {
         p.satd[PARTITION_4x16] = x264_pixel_satd_4x16_sse2;
-        //p.sa8d_8x8 = x264_pixel_sa8d_8x8_sse2;
+        p.sa8d_8x8 = x264_pixel_sa8d_8x8_sse2;
         p.sa8d_16x16 = x264_pixel_sa8d_16x16_sse2;
         p.sad[PARTITION_16x16] = x264_pixel_sad_16x16_sse2;
         p.sad[PARTITION_16x8]  = x264_pixel_sad_16x8_sse2;
@@ -109,13 +113,13 @@ void Setup_Assembly_Primitives(EncoderPr
     }
     if (cpuid >= 3)
     {
-        //p.sa8d_8x8 = x264_pixel_sa8d_8x8_ssse3;
+        p.sa8d_8x8 = x264_pixel_sa8d_8x8_ssse3;
         p.sa8d_16x16 = x264_pixel_sa8d_16x16_ssse3;
     }
     if (cpuid >= 4)
     {
         p.satd[PARTITION_4x16] = x264_pixel_satd_4x16_sse4;
-        //p.sa8d_8x8 = x264_pixel_sa8d_8x8_sse4;
+        p.sa8d_8x8 = x264_pixel_sa8d_8x8_sse4;
         p.sa8d_16x16 = x264_pixel_sa8d_16x16_sse4;
     }
     if (cpuid == 7)
--- a/source/input/input.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/input/input.h	Wed Apr 24 12:00:28 2013 +0530
@@ -25,8 +25,6 @@
 #define _INPUT_H_
 
 #include "x265.h"
-#include <stdint.h>
-#include <iostream>
 
 namespace x265 {
 // private x265 namespace
@@ -63,7 +61,7 @@ public:
 
     virtual bool isFail() const = 0;
 
-    virtual int  guessFrameCount() const = 0;
+    virtual int  guessFrameCount() = 0;
 };
 }
 
--- a/source/input/y4m.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/input/y4m.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -27,148 +27,134 @@
 #include <string.h>
 
 using namespace x265;
-
-#define Y4M_FRAME_MAGIC 5 // "FRAME"
+using namespace std;
 
 Y4MInput::Y4MInput(const char *filename)
 {
-    fp = fopen(filename, "rb");
-    if (fp)
+    ifs.open(filename, ios::binary | ios::in);
+    if (!ifs.fail())
         parseHeader();
-    buf = new uint8_t[3 * width * height / 2];
+    buf = new char[3 * width * height / 2];
 }
 
 Y4MInput::~Y4MInput()
 {
-    if (fp) fclose(fp);
+    ifs.close();
     if (buf) delete[] buf;
 }
 
-#if _MSC_VER
-#pragma warning(disable: 4127)
-#endif
 void Y4MInput::parseHeader()
 {
-    char source[5];
     int t_width = 0;
     int t_height = 0;
     int t_rateNumerator = 0;
     int t_rateDenominator = 0;
 
-    while (1)
+    while (ifs)
     {
-        source[0] = 0x0;
+        // Skip Y4MPEG string
+        char byte = ifs.get();
+        while (!ifs.eof() && (byte != ' ') && (byte != '\n'))
+            byte = ifs.get();
 
-        while ((source[0] != 0x20) && (source[0] != 0x0a))
-        {
-            if (fread(&source[0], 1, 1, fp) == 0)
-            {
-                break;
-            }
-        }
-
-        if (source[0] == 0x00)
-        {
-            break;
-        }
-
-        while (source[0] == 0x20)
+        while (byte == ' ' && ifs)
         {
             // read parameter identifier
-            fread(&source[1], 1, 1, fp);
-            if (source[1] == 'W')
+            switch (ifs.get())
             {
+            case 'W':
                 t_width = 0;
-                while (true)
+                while (ifs)
                 {
-                    fread(&source[0], 1, 1, fp);
+                    byte = ifs.get();
 
-                    if (source[0] == 0x20 || source[0] == 0x0a)
+                    if (byte == ' ' || byte == '\n')
                     {
                         break;
                     }
                     else
                     {
-                        t_width = t_width * 10 + (source[0] - '0');
+                        t_width = t_width * 10 + (byte - '0');
                     }
                 }
-
-                continue;
-            }
+                break;
 
-            if (source[1] == 'H')
-            {
+            case 'H':
                 t_height = 0;
-                while (true)
+                while (ifs)
                 {
-                    fread(&source[0], 1, 1, fp);
-                    if (source[0] == 0x20 || source[0] == 0x0a)
+                    byte = ifs.get();
+                    if (byte == ' ' || byte == '\n')
                     {
                         break;
                     }
                     else
                     {
-                        t_height = t_height * 10 + (source[0] - '0');
+                        t_height = t_height * 10 + (byte - '0');
                     }
                 }
+                break;
 
-                continue;
-            }
-
-            if (source[1] == 'F')
-            {
+            case 'F':
                 t_rateNumerator = 0;
                 t_rateDenominator = 0;
-                while (true)
+                while (ifs)
                 {
-                    fread(&source[0], 1, 1, fp);
-                    if (source[0] == '.')
+                    byte = ifs.get();
+                    if (byte == '.')
                     {
                         t_rateDenominator = 1;
-                        while (true)
+                        while (ifs)
                         {
-                            fread(&source[0], 1, 1, fp);
-                            if (source[0] == 0x20 || source[0] == 0x10)
+                            byte = ifs.get();
+                            if (byte == ' ' || byte == '\n')
                             {
                                 break;
                             }
                             else
                             {
-                                t_rateNumerator = t_rateNumerator * 10 + (source[0] - '0');
+                                t_rateNumerator = t_rateNumerator * 10 + (byte - '0');
                                 t_rateDenominator = t_rateDenominator * 10;
                             }
                         }
 
                         break;
                     }
-                    else if (source[0] == ':')
+                    else if (byte == ':')
                     {
-                        while (true)
+                        while (ifs)
                         {
-                            fread(&source[0], 1, 1, fp);
-                            if (source[0] == 0x20 || source[0] == 0x0a)
+                            byte = ifs.get();
+                            if (byte == ' ' || byte == '\n')
                             {
                                 break;
                             }
                             else
-                                t_rateDenominator = t_rateDenominator * 10 + (source[0] - '0');
+                                t_rateDenominator = t_rateDenominator * 10 + (byte - '0');
                         }
 
                         break;
                     }
                     else
                     {
-                        t_rateNumerator = t_rateNumerator * 10 + (source[0] - '0');
+                        t_rateNumerator = t_rateNumerator * 10 + (byte - '0');
                     }
                 }
+                break;
 
-                continue;
+            default:
+                while (ifs)
+                {
+                    // consume this unsupported configuration word
+                    byte = ifs.get();
+                    if (byte == ' ' || byte == '\n')
+                        break;
+                }
+                break;
             }
-
-            break;
         }
 
-        if (source[0] == 0x0a)
+        if (byte == '\n')
         {
             break;
         }
@@ -182,10 +168,16 @@ void Y4MInput::parseHeader()
     rateDenom = t_rateDenominator;
 }
 
-int  Y4MInput::guessFrameCount() const
+static const char header[] = "FRAME";
+
+int Y4MInput::guessFrameCount()
 {
-    /* TODO: Get file size, subtract file header, divide by (framesize+frameheader) */
-    return 0;
+    long cur = ifs.tellg();
+    ifs.seekg (0, ios::end);
+    long size = ifs.tellg();
+    ifs.seekg (cur, ios::beg);
+
+    return (int) ((size - cur) / ((width * height * 3 / 2) + strlen(header) + 1));
 }
 
 void Y4MInput::skipFrames(int numFrames)
@@ -203,42 +195,33 @@ bool Y4MInput::readPicture(x265_picture&
     PPAStartCpuEventFunc(read_yuv);
 
     /* strip off the FRAME header */
-    char header[Y4M_FRAME_MAGIC];
-
-    if (fread(&header, 1, sizeof(header), fp) < sizeof(header))
-        return false;
-    if (!strncmp(header, "FRAME", Y4M_FRAME_MAGIC))
+    char hbuf[sizeof(header)];
+    ifs.read(hbuf, strlen(header));
+    if (!ifs || strncmp(hbuf, header, strlen(header)))
     {
         fprintf(stderr, "Y4M frame header missing\n");
         return false;
     }
 
     /* consume bytes up to line feed */
-    char byte;
-    do
-    {
-        if (fread(&byte, 1, 1, fp) == 0)
-        {
-            fprintf(stderr, "Y4M frame header incomplete\n");
-            return false;
-        }
-    }
-    while (byte != '\n');
+    char byte = ifs.get();
+    while (byte != '\n' && !ifs)
+        byte = ifs.get();
 
     const size_t count = width * height * 3 / 2;
 
     pic.planes[0] = buf;
 
-    pic.planes[1] = buf + (width * height);
+    pic.planes[1] = buf + width * height;
 
-    pic.planes[2] = buf + ((width * height) + ((width >> 1) * (height >> 1)));
+    pic.planes[2] = buf + width * height + ((width * height) >> 2);
 
     pic.stride[0] = width;
 
     pic.stride[1] = pic.stride[2] = pic.stride[0] >> 1;
 
-    size_t bytes = fread(buf, 1, count, fp);
+    ifs.read(buf, count);
     PPAStopCpuEventFunc(read_yuv);
 
-    return bytes == count;
+    return ifs.good();
 }
--- a/source/input/y4m.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/input/y4m.h	Wed Apr 24 12:00:28 2013 +0530
@@ -25,8 +25,7 @@
 #define _Y4M_H_
 
 #include "input.h"
-#include <stdio.h>
-#include <stdint.h>
+#include <fstream>
 
 namespace x265 {
 // x265 private namespace
@@ -43,11 +42,9 @@ protected:
 
     int height;
 
-    uint8_t* buf;
+    char* buf;
 
-    FILE* fp;
-
-    bool eof;
+    std::ifstream ifs;
 
     void parseHeader();
 
@@ -67,13 +64,13 @@ public:
 
     int getHeight() const                         { return height; }
 
-    bool isEof() const                            { return !!feof(fp); }
+    bool isEof() const                            { return ifs.eof(); }
 
-    bool isFail() const                           { return !fp; }
+    bool isFail() const                           { return !ifs.is_open(); }
 
     void release()                                { delete this; }
 
-    int  guessFrameCount() const;
+    int  guessFrameCount();
 
     void skipFrames(int numFrames);
 
--- a/source/input/yuv.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/input/yuv.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -27,10 +27,11 @@
 #include <string.h>
 
 using namespace x265;
+using namespace std;
 
 YUVInput::YUVInput(const char *filename)
 {
-    fp = fopen(filename, "rb");
+    ifs.open(filename, ios::binary | ios::in);
     width = height = 0;
     depth = 8;
     buf = NULL;
@@ -38,14 +39,19 @@ YUVInput::YUVInput(const char *filename)
 
 YUVInput::~YUVInput()
 {
-    if (fp) fclose(fp);
+    ifs.close();
     if (buf) delete[] buf;
 }
 
-int  YUVInput::guessFrameCount() const
+int YUVInput::guessFrameCount()
 {
-    /* TODO: Get file size, divide by bufsize */
-    return 0;
+    long cur = ifs.tellg();
+    ifs.seekg (0, ios::end);
+    long size = ifs.tellg();
+    ifs.seekg (cur, ios::beg);
+    int pixelbytes = depth > 8 ? 2 : 1;
+
+    return (size - cur) / (width * height * pixelbytes * 3 / 2);
 }
 
 void YUVInput::skipFrames(int numFrames)
@@ -54,7 +60,7 @@ void YUVInput::skipFrames(int numFrames)
 
     int framesize = (width * height * 3 / 2) * pixelbytes;
 
-    fseek(fp, framesize * numFrames, SEEK_CUR);
+    ifs.seekg(framesize * numFrames, ios::cur);
 }
 
 // TODO: only supports 4:2:0 chroma sampling
@@ -68,7 +74,7 @@ bool YUVInput::readPicture(x265_picture&
 
     if (!buf)
     {
-        buf = new uint8_t[bufsize];
+        buf = new char[bufsize];
     }
 
     pic.planes[0] = buf;
@@ -83,8 +89,8 @@ bool YUVInput::readPicture(x265_picture&
 
     pic.stride[1] = pic.stride[2] = pic.stride[0] >> 1;
 
-    size_t bytes = fread(buf, 1, bufsize, fp);
+    ifs.read(buf, bufsize);
     PPAStopCpuEventFunc(read_yuv);
 
-    return bytes == (size_t)bufsize;
+    return ifs.good();
 }
--- a/source/input/yuv.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/input/yuv.h	Wed Apr 24 12:00:28 2013 +0530
@@ -25,8 +25,7 @@
 #define _YUV_H_
 
 #include "input.h"
-#include <stdio.h>
-#include <stdint.h>
+#include <fstream>
 
 namespace x265 {
 // private x265 namespace
@@ -41,11 +40,9 @@ protected:
 
     int depth;
 
-    uint8_t* buf;
+    char* buf;
 
-    FILE *fp;
-
-    bool eof;
+    std::ifstream ifs;
 
 public:
 
@@ -63,19 +60,13 @@ public:
 
     int getHeight() const                         { return height; }
 
-    bool isEof() const                            { return !!feof(fp); }
+    bool isEof() const                            { return ifs.eof(); }
 
-    bool isFail() const                           { return !fp; }
+    bool isFail() const                           { return !ifs.is_open(); }
 
-    void release()
-    {
-        if (fp)
-            fclose(fp);
+    void release()                                { delete this; }
 
-        delete this;
-    }
-
-    int  guessFrameCount() const;
+    int  guessFrameCount();
 
     void skipFrames(int numFrames);
 
--- a/source/output/output.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/output/output.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -28,11 +28,11 @@
 
 using namespace x265;
 
-Output* Output::Open(const char *fname, int width, int height, int bitdepth)
+Output* Output::Open(const char *fname, int width, int height, int bitdepth, int rate)
 {
     const char * s = strrchr(fname, '.');
     if (s && !strcmp(s, ".y4m"))
-        return new Y4MOutput(fname, width, height, bitdepth);
+        return new Y4MOutput(fname, width, height, rate);
     else
         return new YUVOutput(fname, width, height, bitdepth);
 }
--- a/source/output/output.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/output/output.h	Wed Apr 24 12:00:28 2013 +0530
@@ -25,7 +25,6 @@
 #define _OUTPUT_H_
 
 #include "x265.h"
-#include <stdint.h>
 
 namespace x265 {
 // private x265 namespace
@@ -41,7 +40,9 @@ public:
 
     Output()           {}
 
-    static  Output* Open(const char *fname, int width, int height, int bitdepth);
+    static Output* Open(const char *fname, int width, int height, int bitdepth, int rate);
+
+    virtual bool isFail() const = 0;
 
     virtual void release() = 0;
 
--- a/source/output/y4m.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/output/y4m.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -21,37 +21,35 @@
  * For more information, contact us at licensing@multicorewareinc.com.
  *****************************************************************************/
 
+#include "PPA/ppa.h"
 #include "output.h"
 #include "y4m.h"
-#include <stdio.h>
-#include <assert.h>
 
 using namespace x265;
+using namespace std;
 
-Y4MOutput::Y4MOutput(const char *filename, int t_width, int t_height, int bitdepth)
+Y4MOutput::Y4MOutput(const char *filename, int w, int h, int rate)
+    : width(w)
+    , height(h)
 {
-    fp = fopen(filename, "wb");
-    width = t_width;
-    height = t_height;
-    assert(bitdepth == 8);
+    ofs.open(filename, ios::binary | ios::out);
     buf = new char[width];
-    if (fp)
+    if (ofs)
     {
-        // TODO: need to get frame rate
-        fprintf(fp, "YUV4MPEG2 W%d H%d F30:1 Ip C420\n", width, height);
+        ofs << "YUV4MPEG2 W" << width << " H" << height << " F" << rate << ":1 Ip C420\n";
     }
 }
 
 Y4MOutput::~Y4MOutput()
 {
-    if (fp) fclose(fp);
+    ofs.close();
     if (buf) delete [] buf;
 }
 
-
 bool Y4MOutput::writePicture(const x265_picture& pic)
 {
-    fprintf(fp, "FRAME\n");
+    PPAStartCpuEventFunc(write_yuv);
+    ofs << "FRAME\n";
 
     if (pic.bitDepth > 8)
     {
@@ -61,7 +59,7 @@ bool Y4MOutput::writePicture(const x265_
         {
             for (int j = 0; j < width; j++)
                 buf[j] = (char) Y[j];
-            fwrite(buf, sizeof(char), width, fp);
+            ofs.write(buf, width);
             Y += pic.stride[0];
         }
         short *U = (short*)pic.planes[1];
@@ -69,7 +67,7 @@ bool Y4MOutput::writePicture(const x265_
         {
             for (int j = 0; j < width >> 1; j++)
                 buf[j] = (char) U[j];
-            fwrite(buf, sizeof(char), width >> 1, fp);
+            ofs.write(buf, width >> 1);
             U += pic.stride[1];
         }
         short *V = (short*)pic.planes[2];
@@ -77,7 +75,7 @@ bool Y4MOutput::writePicture(const x265_
         {
             for (int j = 0; j < width >> 1; j++)
                 buf[j] = (char) V[j];
-            fwrite(buf, sizeof(char), width >> 1, fp);
+            ofs.write(buf, width >> 1);
             V += pic.stride[2];
         }
     }
@@ -86,22 +84,23 @@ bool Y4MOutput::writePicture(const x265_
         char *Y = (char*)pic.planes[0];
         for (int i = 0; i < height; i++)
         {
-            fwrite(Y, sizeof(char), width, fp);
+            ofs.write(Y, width);
             Y += pic.stride[0];
         }
         char *U = (char*)pic.planes[1];
         for (int i = 0; i < height >> 1; i++)
         {
-            fwrite(U, sizeof(char), width >> 1, fp);
+            ofs.write(U, width >> 1);
             U += pic.stride[1];
         }
         char *V = (char*)pic.planes[2];
         for (int i = 0; i < height >> 1; i++)
         {
-            fwrite(V, sizeof(char), width >> 1, fp);
+            ofs.write(V, width >> 1);
             V += pic.stride[2];
         }
     }
 
+    PPAStopCpuEventFunc(write_yuv);
     return true;
 }
--- a/source/output/y4m.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/output/y4m.h	Wed Apr 24 12:00:28 2013 +0530
@@ -25,7 +25,7 @@
 #define _Y4M_H_
 
 #include "output.h"
-#include <stdio.h>
+#include <fstream>
 
 namespace x265 {
 // private x265 namespace
@@ -38,7 +38,7 @@ protected:
 
     int height;
 
-    FILE* fp;
+    std::ofstream ofs;
 
     char *buf;
 
@@ -46,10 +46,12 @@ protected:
 
 public:
 
-    Y4MOutput(const char *filename, int width, int height, int bitdepth);
+    Y4MOutput(const char *filename, int width, int height, int rate);
 
     virtual ~Y4MOutput();
 
+    bool isFail() const                           { return ofs.fail(); }
+
     void release()                                { delete this; }
 
     bool writePicture(const x265_picture& pic);
--- a/source/output/yuv.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/output/yuv.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -21,28 +21,31 @@
  * For more information, contact us at licensing@multicorewareinc.com.
  *****************************************************************************/
 
+#include "PPA/ppa.h"
 #include "output.h"
 #include "yuv.h"
 
 using namespace x265;
+using namespace std;
 
-YUVOutput::YUVOutput(const char *filename, int t_width, int t_height, int t_bitdepth)
+YUVOutput::YUVOutput(const char *filename, int w, int h, int d)
+    : width(w)
+    , height(h)
+    , depth(d)
 {
-    fp = fopen(filename, "wb");
-    width = t_width;
-    height = t_height;
-    depth = t_bitdepth;
+    ofs.open(filename, ios::binary | ios::out);
     buf = new char[width];
 }
 
 YUVOutput::~YUVOutput()
 {
-    if (fp) fclose(fp);
+    ofs.close();
     if (buf) delete [] buf;
 }
 
 bool YUVOutput::writePicture(const x265_picture& pic)
 {
+    PPAStartCpuEventFunc(write_yuv);
     int pixelbytes = (depth > 8) ? 2 : 1;
 
     if (pic.bitDepth > 8 && depth == 8)
@@ -53,7 +56,7 @@ bool YUVOutput::writePicture(const x265_
         {
             for (int j = 0; j < width; j++)
                 buf[j] = (char) Y[j];
-            fwrite(buf, sizeof(char), width, fp);
+            ofs.write(buf, width);
             Y += pic.stride[0];
         }
         short *U = (short*)pic.planes[1];
@@ -61,7 +64,7 @@ bool YUVOutput::writePicture(const x265_
         {
             for (int j = 0; j < width >> 1; j++)
                 buf[j] = (char) U[j];
-            fwrite(buf, sizeof(char), width >> 1, fp);
+            ofs.write(buf, width >> 1);
             U += pic.stride[1];
         }
         short *V = (short*)pic.planes[2];
@@ -69,31 +72,33 @@ bool YUVOutput::writePicture(const x265_
         {
             for (int j = 0; j < width >> 1; j++)
                 buf[j] = (char) V[j];
-            fwrite(buf, sizeof(char), width >> 1, fp);
+            ofs.write(buf, width >> 1);
             V += pic.stride[2];
         }
     }
     else
     {
-        // encoder gave us byte pixels, write them directly
+        // encoder pixels same size as output pixels, write them directly
         char *Y = (char*)pic.planes[0];
         for (int i = 0; i < height; i++)
         {
-            fwrite(Y, sizeof(char), width * pixelbytes, fp);
+            ofs.write(Y, width * pixelbytes);
             Y += pic.stride[0] * pixelbytes;
         }
         char *U = (char*)pic.planes[1];
         for (int i = 0; i < height >> 1; i++)
         {
-            fwrite(U, sizeof(char), (width>>1) * pixelbytes, fp);
+            ofs.write(U, (width >> 1) * pixelbytes);
             U += pic.stride[1] * pixelbytes;
         }
         char *V = (char*)pic.planes[2];
         for (int i = 0; i < height >> 1; i++)
         {
-            fwrite(V, sizeof(char), (width>>1) * pixelbytes, fp);
+            ofs.write(V, (width >> 1) * pixelbytes);
             V += pic.stride[2] * pixelbytes;
         }
     }
+
+    PPAStopCpuEventFunc(write_yuv);
     return true;
 }
--- a/source/output/yuv.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/output/yuv.h	Wed Apr 24 12:00:28 2013 +0530
@@ -25,6 +25,7 @@
 #define _YUV_H_
 
 #include "output.h"
+#include <fstream>
 #include <stdio.h>
 
 namespace x265 {
@@ -42,7 +43,7 @@ protected:
 
     char *buf;
 
-    FILE *fp;
+    std::ofstream ofs;
 
 public:
 
@@ -50,6 +51,8 @@ public:
 
     virtual ~YUVOutput();
 
+    bool isFail() const                           { return ofs.fail(); }
+
     void release()                                { delete this; }
 
     bool writePicture(const x265_picture& pic);
--- a/source/test/testbench.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/test/testbench.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -49,7 +49,8 @@ int main(int argc, char *argv[])
     }
 
     int seed = (int)time(NULL);
-    printf("Using random seed %X\n", seed);
+    const char *bpp[] = { "8bpp", "16bpp" };
+    printf("Using random seed %X %s\n", seed, bpp[HIGH_BIT_DEPTH]);
     srand(seed);
 
     PixelHarness  HPixel;
--- a/source/test/testpool.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/test/testpool.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -135,7 +135,7 @@ void MD5Frame::ProcessRow(int rownum)
     // Called by worker thread
     RowData &curRow = this->row[rownum];
 
-    assert(rownum < this->numrows);
+    assert(rownum < this->numrows && rownum >= 0);
     assert(curRow.curCol < this->numcols);
 
     while (curRow.curCol < this->numcols)
--- a/source/x265cfg.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/x265cfg.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -297,24 +297,17 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
         ("ReconFile,o",           cfg_ReconFile,     string(""), "Reconstructed YUV output file name")
         ("SourceWidth,-wdt",      m_iSourceWidth,      0, "Source picture width")
         ("SourceHeight,-hgt",     m_iSourceHeight,     0, "Source picture height")
-        ("InputBitDepth",         m_inputBitDepthY,    8, "Bit-depth of input file")
-        ("OutputBitDepth",        m_outputBitDepthY,   0, "Bit-depth of output file (default:InternalBitDepth)")
-        ("InternalBitDepth",      m_internalBitDepthY, 0, "Bit-depth the codec operates at. (default:InputBitDepth)"
-        "If different to InputBitDepth, source data will be converted")
-        ("InputBitDepthC",        m_inputBitDepthC,    0, "As per InputBitDepth but for chroma component. (default:InputBitDepth)")
-        ("OutputBitDepthC",       m_outputBitDepthC,   0, "As per OutputBitDepth but for chroma component. (default:InternalBitDepthC)")
-        ("InternalBitDepthC",     m_internalBitDepthC, 0, "As per InternalBitDepth but for chroma component. (default:IntrenalBitDepth)")
-        ("ConformanceMode",       m_conformanceMode,   0, "Window conformance mode (0: no window, 1:automatic padding, 2:padding, 3:conformance")
-        ("HorizontalPadding,-pdx", m_aiPad[0],         0, "Horizontal source padding for conformance window mode 2")
-        ("VerticalPadding,-pdy",  m_aiPad[1],          0, "Vertical source padding for conformance window mode 2")
-        ("ConfLeft",              m_confLeft,          0, "Left offset for window conformance mode 3")
-        ("ConfRight",             m_confRight,         0, "Right offset for window conformance mode 3")
-        ("ConfTop",               m_confTop,           0, "Top offset for window conformance mode 3")
-        ("ConfBottom",            m_confBottom,        0, "Bottom offset for window conformance mode 3")
         ("FrameRate,-fr",         m_iFrameRate,        0, "Frame rate")
         ("FrameSkip,-fs",         m_FrameSkip,         0u, "Number of frames to skip at start of input YUV")
         ("FramesToBeEncoded,f",   m_framesToBeEncoded, 0, "Number of frames to be encoded (default=all)")
 
+#if HIGH_BIT_DEPTH
+        ("InputBitDepth",         m_inputBitDepth,     8, "Bit-depth of input file")
+        ("OutputBitDepth",        m_outputBitDepth,    0, "Bit-depth of output file (default:InternalBitDepth)")
+        ("InternalBitDepth",      m_internalBitDepth,  0, "Bit-depth the codec operates at. (default:InputBitDepth)"
+         " If different to InputBitDepth, source data will be converted")
+#endif
+
         // Profile and level
         ("Profile", m_profile,   Profile::NONE, "Profile to be used when encoding (Incomplete)")
         ("Level",   m_level,     Level::NONE,   "Level limit to be used, eg 5.1 (Incomplete)")
@@ -632,36 +625,57 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
         printf("Unable to open source file\n");
         return 1;
     }
-    printf("Input          File          : %s\n", cfg_InputFile.c_str());
 
     if (m_input->getWidth())
     {
         m_iSourceWidth = m_input->getWidth();
         m_iSourceHeight = m_input->getHeight();
         m_iFrameRate = (int)m_input->getRate();
-        m_inputBitDepthC = m_inputBitDepthY = 8;
+#if HIGH_BIT_DEPTH
+        m_inputBitDepth = 8;
+#endif
     }
     else
     {
         m_input->setDimensions(m_iSourceWidth, m_iSourceHeight);
-        m_input->setBitDepth(m_inputBitDepthY);
+#if HIGH_BIT_DEPTH
+        m_input->setBitDepth(m_inputBitDepth);
+#else
+        m_input->setBitDepth(8);
+#endif
     }
 
+#if HIGH_BIT_DEPTH
     /* rules for input, output and internal bitdepths as per help text */
-    if (!m_internalBitDepthY) { m_internalBitDepthY = m_inputBitDepthY; }
-    if (!m_internalBitDepthC) { m_internalBitDepthC = m_internalBitDepthY; }
-    if (!m_inputBitDepthC) { m_inputBitDepthC = m_inputBitDepthY; }
-    if (!m_outputBitDepthY) { m_outputBitDepthY = m_internalBitDepthY; }
-    if (!m_outputBitDepthC) { m_outputBitDepthC = m_internalBitDepthC; }
+    if (!m_internalBitDepth) { m_internalBitDepth = m_inputBitDepth; }
+    if (!m_outputBitDepth) { m_outputBitDepth = m_internalBitDepth; }
+#endif
 
     if (m_FrameSkip && m_input)
     {
         m_input->skipFrames(m_FrameSkip);
     }
+
+    int numRemainingFrames = m_input->guessFrameCount();
+
+    m_framesToBeEncoded = m_framesToBeEncoded ? min(m_framesToBeEncoded, numRemainingFrames) : numRemainingFrames;
+
+    printf("Input          File          : %s (%d frames)\n", cfg_InputFile.c_str(), numRemainingFrames);
+
     if (!cfg_ReconFile.empty())
     {
         printf("Reconstruction File          : %s\n", cfg_ReconFile.c_str());
-        m_recon = x265::Output::Open(cfg_ReconFile.c_str(), m_iSourceWidth, m_iSourceHeight, m_outputBitDepthY);
+#if HIGH_BIT_DEPTH
+        m_recon = x265::Output::Open(cfg_ReconFile.c_str(), m_iSourceWidth, m_iSourceHeight, m_outputBitDepth, m_iFrameRate);
+#else
+        m_recon = x265::Output::Open(cfg_ReconFile.c_str(), m_iSourceWidth, m_iSourceHeight, 8, m_iFrameRate);
+#endif
+        if (m_recon->isFail())
+        {
+            printf("Unable to write reconstruction file\n");
+            m_recon->release();
+            m_recon = 0;
+        }
     }
 
     Char *pColumnWidth = cfg_ColumnWidth.empty() ? NULL : strdup(cfg_ColumnWidth.c_str());
@@ -733,67 +747,6 @@ Bool TAppEncCfg::parseCfg(Int argc, Char
 #endif
     m_scalingListFile = cfg_ScalingListFile.empty() ? NULL : strdup(cfg_ScalingListFile.c_str());
 
-    // TODO:ChromaFmt assumes 4:2:0 below
-    switch (m_conformanceMode)
-    {
-    case 0:
-    {
-        // no conformance or padding
-        m_confLeft = m_confRight = m_confTop = m_confBottom = 0;
-        m_aiPad[1] = m_aiPad[0] = 0;
-        break;
-    }
-    case 1:
-    {
-        // automatic padding to minimum CU size
-        Int minCuSize = m_uiMaxCUHeight >> (m_uiMaxCUDepth - 1);
-        if (m_iSourceWidth % minCuSize)
-        {
-            m_aiPad[0] = m_confRight  = ((m_iSourceWidth / minCuSize) + 1) * minCuSize - m_iSourceWidth;
-            m_iSourceWidth  += m_confRight;
-        }
-        if (m_iSourceHeight % minCuSize)
-        {
-            m_aiPad[1] = m_confBottom = ((m_iSourceHeight / minCuSize) + 1) * minCuSize - m_iSourceHeight;
-            m_iSourceHeight += m_confBottom;
-        }
-        if (m_aiPad[0] % TComSPS::getWinUnitX(CHROMA_420) != 0)
-        {
-            fprintf(stderr, "Error: picture width is not an integer multiple of the specified chroma subsampling\n");
-            exit(EXIT_FAILURE);
-        }
-        if (m_aiPad[1] % TComSPS::getWinUnitY(CHROMA_420) != 0)
-        {
-            fprintf(stderr, "Error: picture height is not an integer multiple of the specified chroma subsampling\n");
-            exit(EXIT_FAILURE);
-        }
-        break;
-    }
-    case 2:
-    {
-        //padding
-        m_iSourceWidth  += m_aiPad[0];
-        m_iSourceHeight += m_aiPad[1];
-        m_confRight  = m_aiPad[0];
-        m_confBottom = m_aiPad[1];
-        break;
-    }
-    case 3:
-    {
-        // conformance
-        if ((m_confLeft == 0) && (m_confRight == 0) && (m_confTop == 0) && (m_confBottom == 0))
-        {
-            fprintf(stderr, "Warning: Conformance window enabled, but all conformance window parameters set to zero\n");
-        }
-        if ((m_aiPad[1] != 0) || (m_aiPad[0] != 0))
-        {
-            fprintf(stderr, "Warning: Conformance window enabled, padding parameters will be ignored\n");
-        }
-        m_aiPad[1] = m_aiPad[0] = 0;
-        break;
-    }
-    }
-
     // allocate slice-based dQP values
     m_aidQP = new Int[m_framesToBeEncoded + m_iGOPSize + 1];
     ::memset(m_aidQP, 0, sizeof(Int) * (m_framesToBeEncoded + m_iGOPSize + 1));
@@ -990,15 +943,20 @@ Void TAppEncCfg::xCheckParameter()
     Bool check_failed = false; /* abort if there is a fatal configuration problem */
 #define xConfirmPara(a, b) check_failed |= confirmPara(a, b)
     // check range of parameters
-    xConfirmPara(m_inputBitDepthY < 8,                                                     "InputBitDepth must be at least 8");
-    xConfirmPara(m_inputBitDepthC < 8,                                                     "InputBitDepthC must be at least 8");
+#if HIGH_BIT_DEPTH
+    xConfirmPara(m_inputBitDepth < 8,                                                      "InputBitDepth must be at least 8");
+    // chroma shares m_inputBitDepth with luma, so the former InputBitDepthC check is covered by the line above
+    xConfirmPara(m_outputBitDepth > m_internalBitDepth,                                    "OutputBitDepth must be less than or equal to InternalBitDepth");
+    xConfirmPara(m_iQP <  -6 * (m_internalBitDepth - 8) || m_iQP > 51,                     "QP exceeds supported range (-QpBDOffsety to 51)");
+#else
+    xConfirmPara(m_iQP < 0 || m_iQP > 51,                                                  "QP exceeds supported range (-QpBDOffsety to 51)");
+#endif
     xConfirmPara(m_iFrameRate <= 0,                                                        "Frame rate must be more than 1");
     xConfirmPara(m_framesToBeEncoded <= 0,                                                 "Total Number Of Frames encoded must be more than 0");
     xConfirmPara(m_iGOPSize < 1,                                                           "GOP Size must be greater or equal to 1");
     xConfirmPara(m_iGOPSize > 1 &&  m_iGOPSize % 2,                                        "GOP Size must be a multiple of 2, if GOP Size is greater than 1");
     xConfirmPara((m_iIntraPeriod > 0 && m_iIntraPeriod < m_iGOPSize) || m_iIntraPeriod == 0, "Intra period must be more than GOP size, or -1 , not 0");
     xConfirmPara(m_iDecodingRefreshType < 0 || m_iDecodingRefreshType > 2,                 "Decoding Refresh Type must be equal to 0, 1 or 2");
-    xConfirmPara(m_iQP <  -6 * (m_internalBitDepthY - 8) || m_iQP > 51,                    "QP exceeds supported range (-QpBDOffsety to 51)");
     xConfirmPara(m_loopFilterBetaOffsetDiv2 < -13 || m_loopFilterBetaOffsetDiv2 > 13,      "Loop Filter Beta Offset div. 2 exceeds supported range (-13 to 13)");
     xConfirmPara(m_loopFilterTcOffsetDiv2 < -13 || m_loopFilterTcOffsetDiv2 > 13,          "Loop Filter Tc Offset div. 2 exceeds supported range (-13 to 13)");
     xConfirmPara(m_iFastSearch < 0 || m_iFastSearch > 2,                                   "Fast Search Mode is not supported value (0:Full search  1:Diamond  2:PMVFAST)");
@@ -1072,14 +1030,6 @@ Void TAppEncCfg::xCheckParameter()
     xConfirmPara(m_iSourceWidth  % TComSPS::getWinUnitX(CHROMA_420) != 0, "Picture width must be an integer multiple of the specified chroma subsampling");
     xConfirmPara(m_iSourceHeight % TComSPS::getWinUnitY(CHROMA_420) != 0, "Picture height must be an integer multiple of the specified chroma subsampling");
 
-    xConfirmPara(m_aiPad[0] % TComSPS::getWinUnitX(CHROMA_420) != 0, "Horizontal padding must be an integer multiple of the specified chroma subsampling");
-    xConfirmPara(m_aiPad[1] % TComSPS::getWinUnitY(CHROMA_420) != 0, "Vertical padding must be an integer multiple of the specified chroma subsampling");
-
-    xConfirmPara(m_confLeft   % TComSPS::getWinUnitX(CHROMA_420) != 0, "Left conformance window offset must be an integer multiple of the specified chroma subsampling");
-    xConfirmPara(m_confRight  % TComSPS::getWinUnitX(CHROMA_420) != 0, "Right conformance window offset must be an integer multiple of the specified chroma subsampling");
-    xConfirmPara(m_confTop    % TComSPS::getWinUnitY(CHROMA_420) != 0, "Top conformance window offset must be an integer multiple of the specified chroma subsampling");
-    xConfirmPara(m_confBottom % TComSPS::getWinUnitY(CHROMA_420) != 0, "Bottom conformance window offset must be an integer multiple of the specified chroma subsampling");
-
     // max CU width and height should be power of 2
     UInt ui = m_uiMaxCUWidth;
     while (ui)
@@ -1608,19 +1558,23 @@ Void TAppEncCfg::xSetGlobal()
     g_uiMaxCUDepth = m_uiMaxCUDepth;
 
     // set internal bit-depth and constants
-    g_bitDepthY = m_internalBitDepthY;
-    g_bitDepthC = m_internalBitDepthC;
+#if HIGH_BIT_DEPTH
+    g_bitDepthY = m_internalBitDepth;
+    g_bitDepthC = m_internalBitDepth;
 
-    g_uiPCMBitDepthLuma = m_bPCMInputBitDepthFlag ? m_inputBitDepthY : m_internalBitDepthY;
-    g_uiPCMBitDepthChroma = m_bPCMInputBitDepthFlag ? m_inputBitDepthC : m_internalBitDepthC;
+    g_uiPCMBitDepthLuma = m_bPCMInputBitDepthFlag ? m_inputBitDepth : m_internalBitDepth;
+    g_uiPCMBitDepthChroma = m_bPCMInputBitDepthFlag ? m_inputBitDepth : m_internalBitDepth;
+#else
+    g_bitDepthY = g_bitDepthC = 8;
+    g_uiPCMBitDepthLuma = g_uiPCMBitDepthChroma = 8;
+#endif
 }
 
 Void TAppEncCfg::xPrintParameter()
 {
     printf("\n");
     printf("Bitstream      File          : %s\n", m_pchBitstreamFile);
-    printf("Real     Format              : %dx%d %dHz\n", m_iSourceWidth - m_confLeft - m_confRight, m_iSourceHeight - m_confTop - m_confBottom, m_iFrameRate);
-    printf("Internal Format              : %dx%d %dHz\n", m_iSourceWidth, m_iSourceHeight, m_iFrameRate);
+    printf("Format                       : %dx%d %dHz\n", m_iSourceWidth, m_iSourceHeight, m_iFrameRate);
     printf("Frame index                  : %u - %d (%d frames)\n", m_FrameSkip, m_FrameSkip + m_framesToBeEncoded - 1, m_framesToBeEncoded);
     printf("CU size / depth              : %d / %d\n", m_uiMaxCUWidth, m_uiMaxCUDepth);
     printf("RQT trans. size (min / max)  : %d / %d\n", 1 << m_uiQuadtreeTULog2MinSize, 1 << m_uiQuadtreeTULog2MaxSize);
@@ -1638,7 +1592,9 @@ Void TAppEncCfg::xPrintParameter()
 
     printf("QP adaptation                : %d (range=%d)\n", m_bUseAdaptiveQP, (m_bUseAdaptiveQP ? m_iQPAdaptationRange : 0));
     printf("GOP size                     : %d\n", m_iGOPSize);
-    printf("Internal bit depth           : (Y:%d, C:%d)\n", m_internalBitDepthY, m_internalBitDepthC);
+#if HIGH_BIT_DEPTH
+    printf("Internal bit depth           : %d\n", m_internalBitDepth);
+#endif
     printf("PCM sample bit depth         : (Y:%d, C:%d)\n", g_uiPCMBitDepthLuma, g_uiPCMBitDepthChroma);
 #if RATE_CONTROL_LAMBDA_DOMAIN
     printf("RateControl                  : %d\n", m_RCEnableRateControl);
@@ -1663,7 +1619,9 @@ Void TAppEncCfg::xPrintParameter()
     printf("\n");
 
     printf("TOOL CFG: ");
-    printf("IBD:%d ", g_bitDepthY > m_inputBitDepthY || g_bitDepthC > m_inputBitDepthC);
+#if HIGH_BIT_DEPTH
+    printf("IBD:%d ", g_bitDepthY > m_inputBitDepth || g_bitDepthC > m_inputBitDepth);
+#endif
     printf("HAD:%d ", m_bUseHADME);
     printf("SRD:%d ", m_bUseSBACRD);
     printf("RDQ:%d ", m_useRDOQ);
--- a/source/x265cfg.h	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/x265cfg.h	Wed Apr 24 12:00:28 2013 +0530
@@ -66,18 +66,13 @@ protected:
     UInt      m_FrameSkip;                                    ///< number of skipped frames from the beginning
     Int       m_iSourceWidth;                                 ///< source width in pixel
     Int       m_iSourceHeight;                                ///< source height in pixel
-    Int       m_conformanceMode;
-    Int       m_confLeft;
-    Int       m_confRight;
-    Int       m_confTop;
-    Int       m_confBottom;
     Int       m_framesToBeEncoded;                            ///< number of encoded frames
-    Int       m_aiPad[2];                                     ///< number of padded pixels for width and height
 
     // profile/level
     Profile::Name m_profile;
     Level::Tier   m_levelTier;
     Level::Name   m_level;
+
 #if L0046_CONSTRAINT_FLAGS
     Bool m_progressiveSourceFlag;
     Bool m_interlacedSourceFlag;
@@ -122,7 +117,7 @@ protected:
     Bool      m_bUseAdaptiveQP;                               ///< Flag for enabling QP adaptation based on a psycho-visual model
     Int       m_iQPAdaptationRange;                           ///< dQP range by QP adaptation
 
-    Int       m_maxTempLayer;                                ///< Max temporal layer
+    Int       m_maxTempLayer;                                 ///< Max temporal layer
 
     // coding unit (CU) definition
     UInt      m_uiMaxCUWidth;                                 ///< max. CU width in pixel
@@ -136,13 +131,12 @@ protected:
     UInt      m_uiQuadtreeTUMaxDepthInter;
     UInt      m_uiQuadtreeTUMaxDepthIntra;
 
+#if HIGH_BIT_DEPTH
     // coding tools (bit-depth)
-    Int       m_inputBitDepthY;                             ///< bit-depth of input file (luma component)
-    Int       m_inputBitDepthC;                             ///< bit-depth of input file (chroma component)
-    Int       m_outputBitDepthY;                            ///< bit-depth of output file (luma component)
-    Int       m_outputBitDepthC;                            ///< bit-depth of output file (chroma component)
-    Int       m_internalBitDepthY;                          ///< bit-depth codec operates at in luma (input/output files will be converted)
-    Int       m_internalBitDepthC;                          ///< bit-depth codec operates at in chroma (input/output files will be converted)
+    Int       m_inputBitDepth;                               ///< bit-depth of input file (shared by luma and chroma)
+    Int       m_outputBitDepth;                              ///< bit-depth of output file (shared by luma and chroma)
+    Int       m_internalBitDepth;                            ///< bit-depth codec operates at, luma and chroma alike (input/output files will be converted)
+#endif
 
     // coding tools (PCM bit-depth)
     Bool      m_bPCMInputBitDepthFlag;                        ///< 0: PCM bit-depth is internal bit-depth. 1: PCM bit-depth is input bit-depth.
@@ -155,9 +149,9 @@ protected:
     Bool      m_saoLcuBasedOptimization;                      ///< SAO LCU-based optimization
     // coding tools (loop filter)
     Bool      m_bLoopFilterDisable;                           ///< flag for using deblocking filter
-    Bool      m_loopFilterOffsetInPPS;                       ///< offset for deblocking filter in 0 = slice header, 1 = PPS
-    Int       m_loopFilterBetaOffsetDiv2;                   ///< beta offset for deblocking filter
-    Int       m_loopFilterTcOffsetDiv2;                     ///< tc offset for deblocking filter
+    Bool      m_loopFilterOffsetInPPS;                        ///< offset for deblocking filter in 0 = slice header, 1 = PPS
+    Int       m_loopFilterBetaOffsetDiv2;                     ///< beta offset for deblocking filter
+    Int       m_loopFilterTcOffsetDiv2;                       ///< tc offset for deblocking filter
     Bool      m_DeblockingFilterControlPresent;               ///< deblocking filter control present flag in PPS
 #if L0386_DB_METRIC
     Bool      m_DeblockingFilterMetric;                       ///< blockiness metric in encoder
--- a/source/x265enc.cpp	Wed Apr 24 11:58:14 2013 +0530
+++ b/source/x265enc.cpp	Wed Apr 24 12:00:28 2013 +0530
@@ -97,8 +97,10 @@ Void TAppEncTop::xInitLibCfg()
     m_cTEncTop.setFrameSkip(m_FrameSkip);
     m_cTEncTop.setSourceWidth(m_iSourceWidth);
     m_cTEncTop.setSourceHeight(m_iSourceHeight);
-    m_cTEncTop.setConformanceWindow(m_confLeft, m_confRight, m_confTop, m_confBottom);
+    m_cTEncTop.setConformanceWindow(0, 0, 0, 0);
     m_cTEncTop.setFramesToBeEncoded(m_framesToBeEncoded);
+    int nullpad[2] = { 0, 0 };
+    m_cTEncTop.setPad(nullpad);
 
     //====== Coding Structure ========
     m_cTEncTop.setIntraPeriod(m_iIntraPeriod);
@@ -119,8 +121,6 @@ Void TAppEncTop::xInitLibCfg()
 
     m_cTEncTop.setQP(m_iQP);
 
-    m_cTEncTop.setPad(m_aiPad);
-
     m_cTEncTop.setMaxTempLayer(m_maxTempLayer);
     m_cTEncTop.setUseAMP(m_enableAMP);
 
@@ -450,20 +450,16 @@ Void TAppEncTop::encode()
 
         // read input YUV file
         x265_picture pic;
-        m_input->readPicture(pic);
-
-        // increase number of received frames
-        m_iFrameRcvd++;
-
-        bEos = (m_iFrameRcvd == m_framesToBeEncoded);
-
-        Bool flush = 0;
-        // if end of file (which is only detected on a read failure) flush the encoder of any queued pictures
-        if (m_input->isEof())
+        Bool flush = false;
+        if (m_input->readPicture(pic))
+        {
+            m_iFrameRcvd++;
+            bEos = (m_iFrameRcvd == m_framesToBeEncoded);
+        }
+        else
         {
             flush = true;
             bEos = true;
-            m_iFrameRcvd--;
             m_cTEncTop.setFramesToBeEncoded(m_iFrameRcvd);
         }