changeset 607:b95fa4555d92

Merged multicoreware/xhevc into default
author Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
date Fri, 19 Apr 2013 10:22:04 +0530
parents 53fe4175ecf6 (current diff) 7d9db6232717 (diff)
children f517af8fa3e1
files source/encoder/macroblock.cpp
diffstat 11 files changed, 159 insertions(+-), 80 deletions(-) [+]
line wrap: on
line diff
--- a/source/CMakeLists.txt	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/CMakeLists.txt	Fri Apr 19 10:22:04 2013 +0530
@@ -43,7 +43,6 @@ endif(HIGH_BIT_DEPTH)
 option(ENABLE_PRIMITIVES "Enable use of optimized encoder primitives" ON)
 if(ENABLE_PRIMITIVES)
     add_definitions(-DENABLE_PRIMITIVES)
-    set(CPRIMITIVES pixel.cpp macroblock.cpp)
 
     option(ENABLE_PRIMITIVES_VEC "Enable use of SIMD vector class primitives" ON)
     if(ENABLE_PRIMITIVES_VEC)
--- a/source/encoder/CMakeLists.txt	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/encoder/CMakeLists.txt	Fri Apr 19 10:22:04 2013 +0530
@@ -1,9 +1,14 @@
 if(GCC)
     if (NOT X64)
+        # force gcc to generate code for sync primitives
         set_source_files_properties(threadpool.cpp PROPERTIES COMPILE_FLAGS -march=i686)
     endif()
 endif(GCC)
 
+if(ENABLE_PRIMITIVES)
+    set(CPRIMITIVES pixel.cpp macroblock.cpp)
+endif(ENABLE_PRIMITIVES)
+
 add_library(x265 ../../COPYING
     ${CPRIMITIVES}
     primitives.cpp primitives.h
--- a/source/encoder/macroblock.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/encoder/macroblock.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -155,7 +155,7 @@ void CDECL filter_8_nonvertical(const sh
     }
 }
 
-template<int N>
+template<int N, int isFirst, int isLast>
 void CDECL filter_Vertical(const short *coeff,
                            pixel *      src,
                            int          srcStride,
@@ -173,9 +173,22 @@ void CDECL filter_Vertical(const short *
 
     int offset;
     short maxVal;
+    int headRoom = IF_INTERNAL_PREC - bitDepth;
     int shift = IF_FILTER_PREC;
-    offset = 1 << (shift - 1);
-    maxVal = (1 << bitDepth) - 1;
+
+    if (isLast)
+    {
+        shift += (isFirst) ? 0 : headRoom;
+        offset = 1 << (shift - 1);
+        offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
+        maxVal = (1 << bitDepth) - 1;
+    }
+    else
+    {
+        shift -= (isFirst) ? headRoom : 0;
+        offset = (isFirst) ? -IF_INTERNAL_OFFS << shift : 0;
+        maxVal = 0;
+    }
 
     for (row = 0; row < block_height; row++)
     {
@@ -202,10 +215,12 @@ void CDECL filter_Vertical(const short *
                 sum += src[col + 7 * cStride] * coeff[7];
             }
 
-            short val = (short)(sum + offset) >> shift;
-
-            val = (val < 0) ? 0 : val;
-            val = (val > maxVal) ? maxVal : val;
+            short val = (short) (sum + offset) >> shift;
+            if(isLast)
+            {
+                val = (val < 0) ? 0 : val;
+                val = (val > maxVal) ? maxVal : val;
+            }
 
             dst[col] = val;
         }
@@ -280,7 +295,7 @@ void Setup_C_MacroblockPrimitives(Encode
 {
     p.inversedst = inversedst;
 
-    p.filter[FILTER_H_4_0_0] = filter_8_nonvertical<4, 0, 0>;
+    /*p.filter[FILTER_H_4_0_0] = filter_8_nonvertical<4, 0, 0>;
     p.filter[FILTER_H_4_0_1] = filter_8_nonvertical<4, 0, 1>;
     p.filter[FILTER_H_4_1_0] = filter_8_nonvertical<4, 1, 0>;
     p.filter[FILTER_H_4_1_1] = filter_8_nonvertical<4, 1, 1>;
@@ -288,17 +303,17 @@ void Setup_C_MacroblockPrimitives(Encode
     p.filter[FILTER_H_8_0_0] = filter_8_nonvertical<8, 0, 0>;
     p.filter[FILTER_H_8_0_1] = filter_8_nonvertical<8, 0, 1>;
     p.filter[FILTER_H_8_1_0] = filter_8_nonvertical<8, 1, 0>;
-    p.filter[FILTER_H_8_1_1] = filter_8_nonvertical<8, 1, 1>;
+    p.filter[FILTER_H_8_1_1] = filter_8_nonvertical<8, 1, 1>;*/
 
-    p.filter[FILTER_V_4_0_0] = filter_Vertical<4>;
-    p.filter[FILTER_V_4_0_1] = filter_Vertical<4>;
-    p.filter[FILTER_V_4_1_0] = filter_Vertical<4>;
-    p.filter[FILTER_V_4_1_1] = filter_Vertical<4>;
+    p.filter[FILTER_V_4_0_0] = filter_Vertical<4,0,0>;
+    p.filter[FILTER_V_4_0_1] = filter_Vertical<4,0,1>;
+    p.filter[FILTER_V_4_1_0] = filter_Vertical<4,1,0>;
+    p.filter[FILTER_V_4_1_1] = filter_Vertical<4,1,1>;
 
-    p.filter[FILTER_V_8_0_0] = filter_Vertical<8>;
-    p.filter[FILTER_V_8_0_1] = filter_Vertical<8>;
-    p.filter[FILTER_V_8_1_0] = filter_Vertical<8>;
-    p.filter[FILTER_V_8_1_1] = filter_Vertical<8>;
+    p.filter[FILTER_V_8_0_0] = filter_Vertical<8,0,0>;
+    p.filter[FILTER_V_8_0_1] = filter_Vertical<8,0,1>;
+    p.filter[FILTER_V_8_1_0] = filter_Vertical<8,1,0>;
+    p.filter[FILTER_V_8_1_1] = filter_Vertical<8,1,1>;
 
     p.partial_butterfly[BUTTERFLY_16] = partialButterfly16;
 }
--- a/source/encoder/primitives.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/encoder/primitives.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -31,6 +31,7 @@
 namespace x265 {
 // x265 private namespace
 
+#if ENABLE_PRIMITIVES
 static int8_t psize[16][16] =
 {
     // 4, 8, 12, 16, 20, 24, 28, 32
@@ -61,7 +62,6 @@ static int8_t psize[16][16] =
 // else returns -1 (in which case you should use the slow path)
 int PartitionFromSizes(int Width, int Height)
 {
-#if ENABLE_PRIMITIVES
     if ((Width | Height) & ~(4 | 8 | 16 | 32)) // Check for bits in the wrong places
         return -1;
 
@@ -69,13 +69,9 @@ int PartitionFromSizes(int Width, int He
         return -1;
 
     return (int)psize[(Width >> 2) - 1][(Height >> 2) - 1];
-#else
-    return Width || Height ? -1 : -1;
-#endif
 }
 
 /* the "authoritative" set of encoder primitives */
-#if ENABLE_PRIMITIVES
 EncoderPrimitives primitives;
 
 void Setup_C_PixelPrimitives(EncoderPrimitives &p);
@@ -97,17 +93,26 @@ void SetupPrimitives(int cpuid)
         cpuid = CpuIDDetect();
     }
 
+    fprintf(stdout, "x265: performance primitives:");
+
 #if ENABLE_PRIMITIVES
     Setup_C_Primitives(primitives);
 
 #if ENABLE_VECTOR_PRIMITIVES
     Setup_Vector_Primitives(primitives, cpuid);
+    fprintf(stdout, " vector");
 #endif
 
 #if ENABLE_ASM_PRIMITIVES
     Setup_Assembly_Primitives(primitives, cpuid);
+    fprintf(stdout, " assembly");
 #endif
+
+#else
+    fprintf(stdout," disabled!");
 #endif // if ENABLE_PRIMITIVES
+
+    fprintf(stdout, "\n");
 }
 
 static const char *CpuType[] = {
--- a/source/encoder/threading.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/encoder/threading.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -47,12 +47,18 @@ bool Thread::Start()
     return threadId > 0;
 }
 
+void Thread::Stop()
+{
+    if (this->thread)
+    {
+        WaitForSingleObject(this->thread, INFINITE);
+    }
+}
+
 Thread::~Thread()
 {
     if (this->thread)
     {
-        WaitForSingleObject(this->thread, INFINITE);
-
         CloseHandle(this->thread);
     }
 }
@@ -65,6 +71,7 @@ static void *ThreadShim(void *opaque)
     Thread *instance = reinterpret_cast<Thread *>(opaque);
 
     instance->ThreadMain();
+
     return NULL;
 }
 
@@ -73,13 +80,14 @@ bool Thread::Start()
     if (pthread_create(&this->thread, NULL, ThreadShim, this))
     {
         this->thread = 0;
+
         return false;
     }
 
     return true;
 }
 
-Thread::~Thread()
+void Thread::Stop()
 {
     if (this->thread)
     {
@@ -87,6 +95,10 @@ Thread::~Thread()
     }
 }
 
+Thread::~Thread()
+{
+}
+
 #endif // if _WIN32
 
 Thread::Thread()
--- a/source/encoder/threading.h	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/encoder/threading.h	Fri Apr 19 10:22:04 2013 +0530
@@ -204,6 +204,8 @@ public:
 
     //< Returns true if thread was successfully created
     bool Start();
+
+    void Stop();
 };
 } // end namespace x265
 
--- a/source/encoder/threadpool.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/encoder/threadpool.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -79,6 +79,7 @@ namespace x265 {
 // x265 private namespace
 
 class ThreadPoolImpl;
+static int get_cpu_count();
 
 class PoolThread : public Thread
 {
@@ -151,6 +152,8 @@ public:
 
     void Release();
 
+    void Stop();
+
     bool IsValid() const
     {
         return m_ok;
@@ -195,40 +198,7 @@ void PoolThread::ThreadMain()
 
 void ThreadPoolImpl::PokeIdleThreads()
 {
-    int initialCount = PoolThread::s_sleepCount;
-    for (int i = 0; i < initialCount; i++)
-        PoolThread::s_wakeEvent.Trigger();
-}
-
-static int get_cpu_count()
-{
-#if WIN32
-    SYSTEM_INFO sysinfo;
-    GetSystemInfo(&sysinfo);
-    return sysinfo.dwNumberOfProcessors;
-#elif __unix__
-    return sysconf(_SC_NPROCESSORS_ONLN);
-#elif MACOS
-    int nm[2];
-    size_t len = 4;
-    uint32_t count;
-
-    nm[0] = CTL_HW;
-    nm[1] = HW_AVAILCPU;
-    sysctl(nm, 2, &count, &len, NULL, 0);
-
-    if (count < 1)
-    {
-        nm[1] = HW_NCPU;
-        sysctl(nm, 2, &count, &len, NULL, 0);
-        if (count < 1)
-            count = 1;
-    }
-
-    return count;
-#else // if WIN32
-    return 2; // default to 2 threads, everywhere else
-#endif // if WIN32
+    PoolThread::s_wakeEvent.Trigger();
 }
 
 ThreadPoolImpl *ThreadPoolImpl::instance;
@@ -249,6 +219,7 @@ void ThreadPoolImpl::Release()
     {
         assert(this == ThreadPoolImpl::instance);
         ThreadPoolImpl::instance = NULL;
+        this->Stop();
         delete this;
     }
 }
@@ -282,26 +253,34 @@ ThreadPoolImpl::ThreadPoolImpl(int numTh
     }
 }
 
-ThreadPoolImpl::~ThreadPoolImpl()
+void ThreadPoolImpl::Stop()
 {
-    if (m_ok && m_threads)
+    if (m_ok)
     {
+        // wait for all threads to idle
         while (PoolThread::s_sleepCount < m_numThreads)
             GIVE_UP_TIME();
 
+        // set invalid flag, then wake them up so they exit their main func
         m_ok = false;
-
-        // destructors will block for thread completions
         for (int i = 0; i < m_numThreads; i++)
-        {
             PokeIdleThreads();
+
+        // wait for each thread to exit
+        for (int i = 0; i < m_numThreads; i++)
+            m_threads[i].Stop();
+    }
+}
+
+ThreadPoolImpl::~ThreadPoolImpl()
+{
+    if (m_threads)
+    {
+        // cleanup thread handles
+        for (int i = 0; i < m_numThreads; i++)
             m_threads[i].~PoolThread();
-        }
-
         delete[] reinterpret_cast<char*>(m_threads);
     }
-
-    // leak threads on program exit if there were resource failures
 }
 
 void ThreadPoolImpl::EnqueueJobProvider(JobProvider &p)
@@ -445,4 +424,36 @@ bool QueueFrame::FindJob()
     // made it through the bitmap without finding any enqueued rows
     return false;
 }
+
+static int get_cpu_count()
+{
+#if WIN32
+    SYSTEM_INFO sysinfo;
+    GetSystemInfo(&sysinfo);
+    return sysinfo.dwNumberOfProcessors;
+#elif __unix__
+    return sysconf(_SC_NPROCESSORS_ONLN);
+#elif MACOS
+    int nm[2];
+    size_t len = 4;
+    uint32_t count;
+
+    nm[0] = CTL_HW;
+    nm[1] = HW_AVAILCPU;
+    sysctl(nm, 2, &count, &len, NULL, 0);
+
+    if (count < 1)
+    {
+        nm[1] = HW_NCPU;
+        sysctl(nm, 2, &count, &len, NULL, 0);
+        if (count < 1)
+            count = 1;
+    }
+
+    return count;
+#else // if WIN32
+    return 2; // default to 2 threads, everywhere else
+#endif // if WIN32
+}
+
 } // end namespace x265
--- a/source/test/mbdstharness.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/test/mbdstharness.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -26,10 +26,9 @@
 #include <string.h>
 #include <stdio.h>
 #include <malloc.h>
-
-#ifdef __MINGW32__ 
-#define _aligned_malloc __mingw_aligned_malloc 
-#define _aligned_free  __mingw_aligned_free 
+#ifdef __MINGW32__
+#define _aligned_malloc __mingw_aligned_malloc
+#define _aligned_free  __mingw_aligned_free
 #endif
 
 using namespace x265;
@@ -67,9 +66,9 @@ MBDstHarness::MBDstHarness()
     mbuf2 = (short*)_aligned_malloc(mb_t_size, 32);
     mbuf3 = (short*)_aligned_malloc(mb_t_size, 32);
 #else
-    posix_memalign((void **)&mbuf1, 32, 0x1e00 * sizeof(short));
-    posix_memalign((void **)&mbuf2, 32, mb_t_size);
-    posix_memalign((void **)&mbuf3, 32, mb_t_size);
+    posix_memalign((void**)&mbuf1, 32, 0x1e00 * sizeof(short));
+    posix_memalign((void**)&mbuf2, 32, mb_t_size);
+    posix_memalign((void**)&mbuf3, 32, mb_t_size);
 #endif
     if (!mbuf1 || !mbuf2 || !mbuf3)
     {
@@ -141,6 +140,27 @@ bool MBDstHarness::check_butterfly16_pri
     return true;
 }
 
+bool MBDstHarness::check_butterfly32_primitive(butterfly ref, butterfly opt)
+{
+    int j = 0;
+    int mem_cmp_size = 640; // 2*32*10 -> sizeof(short)*number of elements*number of lines
+
+    for (int i = 0; i <= 100; i++)
+    {
+        opt(mbuf1 + j, mbuf2, 3, 10);
+        ref(mbuf1 + j, mbuf3, 3, 10);
+
+        if (memcmp(mbuf2, mbuf3, mem_cmp_size))
+            return false;
+
+        j += 16;
+        memset(mbuf2, 0, mem_cmp_size);
+        memset(mbuf3, 0, mem_cmp_size);
+    }
+
+    return true;
+}
+
 bool MBDstHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.inversedst)
@@ -161,6 +181,15 @@ bool MBDstHarness::testCorrectness(const
         }
     }
 
+    if (opt.partial_butterfly[butterfly_32])
+    {
+        if (!check_butterfly32_primitive(ref.partial_butterfly[butterfly_32], opt.partial_butterfly[butterfly_32]))
+        {
+            printf("\npartialButterfly%s failed\n", ButterflyConf_names[butterfly_32]);
+            return false;
+        }
+    }
+
     return true;
 }
 
--- a/source/test/mbdstharness.h	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/test/mbdstharness.h	Fri Apr 19 10:22:04 2013 +0530
@@ -37,6 +37,7 @@ protected:
 
     bool check_mbdst_primitive(x265::mbdst ref, x265::mbdst opt);
     bool check_butterfly16_primitive(x265::butterfly ref, x265::butterfly opt);
+    bool check_butterfly32_primitive(x265::butterfly ref, x265::butterfly opt);
 
 public:
 
--- a/source/test/testbench.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/test/testbench.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -37,6 +37,7 @@ using namespace x265;
 
 int main(int argc, char *argv[])
 {
+#if ENABLE_PRIMITIVES
     int cpuid = CpuIDDetect();
 
     for (int i = 1; i < argc - 1; i += 2)
@@ -120,6 +121,8 @@ int main(int argc, char *argv[])
     }
 
     printf("\n");
-
+#else
+    printf("x265 is configured without performance primitives, nothing to test\n");
+#endif
     return 0;
 }
--- a/source/x265main.cpp	Fri Apr 19 10:21:09 2013 +0530
+++ b/source/x265main.cpp	Fri Apr 19 10:22:04 2013 +0530
@@ -75,9 +75,6 @@ int main(int argc, char *argv[])
 #else
     fprintf(stdout, "x265: 8bpp only\n");
 #endif
-#if ENABLE_PRIMITIVES
-    fprintf(stdout, "x265: performance primitives enabled\n");
-#endif
 
     // starting time
     long lBefore = clock();