changeset 9564:fa3fd0b3fe12

Merge with public
author Praveen Tiwari <praveen@multicorewareinc.com>
date Tue, 24 Feb 2015 15:34:32 +0530
parents 39f45f701936 (current diff) 7252c10278a1 (diff)
children f81fb7f458b8
files source/test/pixelharness.cpp
diffstat 32 files changed, 2368 insertions(+), 2040 deletions(-) [+]
line wrap: on
line diff
--- a/doc/reST/cli.rst	Mon Feb 23 15:28:01 2015 +0530
+++ b/doc/reST/cli.rst	Tue Feb 24 15:34:32 2015 +0530
@@ -173,19 +173,52 @@ Performance Options
 
 	**Values:** any value between 8 and 16. Default is 0, auto-detect
 
-.. option:: --threads <integer>
-
-	Number of threads to allocate for the worker thread pool  This pool
-	is used for WPP and for distributed analysis and motion search:
-	:option:`--wpp` :option:`--pmode` and :option:`--pme` respectively.
+.. option:: --pools <string>, --numa-pools <string>
 
-	If :option:`--threads` 1 is specified, then no thread pool is
-	created. When no thread pool is created, all the thread pool
-	features are implicitly disabled. If all the pool features are
-	disabled by the user, then the pool is implicitly disabled.
+	Comma separated list of threads per NUMA node. If "none", then no worker
+	pools are created and only frame parallelism is possible. If NULL or ""
+	(default) x265 will use all available threads on each NUMA node::
 
-	Default 0, one thread is allocated per detected hardware thread
-	(logical CPU cores)
+	'+'  is a special value indicating all cores detected on the node
+	'*'  is a special value indicating all cores detected on the node and all remaining nodes
+	'-'  is a special value indicating no cores on the node, same as '0'
+
+	example strings for a 4-node system::
+
+	""        - default, unspecified, all numa nodes are used for thread pools
+	"*"       - same as default
+	"none"    - no thread pools are created, only frame parallelism possible
+	"-"       - same as "none"
+	"10"      - allocate one pool, using up to 10 cores on node 0
+	"-,+"     - allocate one pool, using all cores on node 1
+	"+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
+	"+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
+	"-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+	"8,8,8,8" - allocate four pools with up to 8 threads in each pool
+
+	The total number of threads will be determined by the number of threads
+	assigned to all nodes. The worker threads will each be given affinity for
+	their node, they will not be allowed to migrate between nodes, but they
+	will be allowed to move between CPU cores within their node.
+
+	If the three pool features: :option:`--wpp` :option:`--pmode` and
+	:option:`--pme` are all disabled, then :option:`--pools` is ignored
+	and no thread pools are created.
+
+	If "none" is specified, then all three of the thread pool features are
+	implicitly disabled.
+
+	Multiple thread pools will be allocated for any NUMA node with more than
+	64 logical CPU cores. But any given thread pool will always use at most
+	one NUMA node.
+
+	Frame encoders are distributed between the available thread pools,
+	and the encoder will never generate more thread pools than
+	:option:`--frame-threads`.  The pools are used for WPP and for
+	distributed analysis and motion search.
+
+	Default "", one thread is allocated per detected hardware thread
+	(logical CPU cores) and one thread pool per NUMA node.
 
 .. option:: --wpp, --no-wpp
 
--- a/doc/reST/threading.rst	Mon Feb 23 15:28:01 2015 +0530
+++ b/doc/reST/threading.rst	Tue Feb 24 15:34:32 2015 +0530
@@ -2,41 +2,34 @@
 Threading
 *********
 
-Thread Pool
-===========
+Thread Pools
+============
 
-x265 creates a pool of worker threads and shares this thread pool
-with all encoders within the same process (it is process global, aka a
-singleton).  The number of threads within the thread pool is determined
-by the encoder which first allocates the pool, which by definition is
-the first encoder created within each process.
+x265 creates one or more thread pools per encoder, one pool per NUMA
+node (typically a CPU socket). :option:`--pools` specifies the number of
+pools and the number of threads per pool the encoder will allocate. By
+default x265 allocates one thread per (hyperthreaded) CPU core on each
+NUMA node.
 
-:option:`--threads` specifies the number of threads the encoder will
-try to allocate for its thread pool.  If the thread pool was already
-allocated this parameter is ignored.  By default x265 allocates one
-thread per (hyperthreaded) CPU core in your system.
+If you are running multiple encoders on a system with multiple NUMA
+nodes, it is recommended to isolate each of them to a single node in
+order to avoid the NUMA overhead of remote memory access.
 
-Work distribution is job based.  Idle worker threads ask their parent
-pool object for jobs to perform.  When no jobs are available, idle
-worker threads block and consume no CPU cycles.
+Work distribution is job based. Idle worker threads scan the job
+providers assigned to their thread pool for jobs to perform. When no
+jobs are available, the idle worker threads block and consume no CPU
+cycles.
 
 Objects which desire to distribute work to worker threads are known as
-job providers (and they derive from the JobProvider class).  When job
-providers have work they enqueue themselves into the pool's provider
-list (and dequeue themselves when they no longer have work).  The thread
+job providers (and they derive from the JobProvider class).  The thread
 pool has a method to **poke** awake a blocked idle thread, and job
 providers are recommended to call this method when they make new jobs
 available.
 
 Worker jobs are not allowed to block except when abosultely necessary
-for data locking. If a job becomes blocked, the worker thread is
-expected to drop that job and go back to the pool and find more work.
-
-.. note::
-
-	x265_cleanup() frees the process-global thread pool, allowing
-	it to be reallocated if necessary, but only if no encoders are
-	allocated at the time it is called.
+for data locking. If a job becomes blocked, the work function is
+expected to drop that job so the worker thread may go back to the pool
+and find more work.
 
 Wavefront Parallel Processing
 =============================
@@ -82,24 +75,35 @@ threading is not disabled, the encoder w
 thread count to be higher than if WPP was enabled.  The exact formulas
 are described in the next section.
 
+Bonded Task Groups
+==================
+
+If a worker thread job has work which can be performed in parallel by
+many threads, it may allocate a bonded task group and enlist the help of
+other idle worker threads in the same pool. Those threads will cooperate
+to complete the work of the bonded task group and then return to their
+idle states. The larger and more uniform those tasks are, the better the
+bonded task group will perform.
+
 Parallel Mode Analysis
-======================
+~~~~~~~~~~~~~~~~~~~~~~
 
 When :option:`--pmode` is enabled, each CU (at all depths from 64x64 to
-8x8) will distribute its analysis work to the thread pool. Each analysis
-job will measure the cost of one prediction for the CU: merge, skip,
-intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At slower presets, the amount
-of increased parallelism is often enough to be able to reduce frame
-parallelism while achieving the same overall CPU utilization. Reducing
-frame threads is often beneficial to ABR and VBV rate control.
+8x8) will distribute its analysis work to the thread pool via a bonded
+task group. Each analysis job will measure the cost of one prediction
+for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At
+slower presets, the amount of increased parallelism is often enough to
+be able to reduce frame parallelism while achieving the same overall CPU
+utilization. Reducing frame threads is often beneficial to ABR and VBV
+rate control.
 
 Parallel Motion Estimation
-==========================
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When :option:`--pme` is enabled all of the analysis functions which
 perform motion searches to reference frames will distribute those motion
-searches as jobs for worker threads (if more than two motion searches
-are required).
+searches as jobs for worker threads via a bonded task group (if more
+than two motion searches are required).
 
 Frame Threading
 ===============
@@ -138,8 +142,8 @@ The third extenuating circumstance is th
 becomes blocked by a reference frame row being available, that frame's
 wave-front becomes completely stalled and when the row becomes available
 again it can take quite some time for the wave to be restarted, if it
-ever does. This makes WPP many times less effective when frame
-parallelism is in use.
+ever does. This makes WPP less effective when frame parallelism is in
+use.
 
 :option:`--merange` can have a negative impact on frame parallelism. If
 the range is too large, more rows of CTU lag must be added to ensure
@@ -218,13 +222,12 @@ Lookahead
 
 The lookahead module of x265 (the lowres pre-encode which determines
 scene cuts and slice types) uses the thread pool to distribute the
-lowres cost analysis to worker threads. It follows the same wave-front
-pattern as the main encoder except it works in reverse-scan order.
+lowres cost analysis to worker threads. It will use bonded task groups
+to perform batches of frame cost estimates.
 
-The function slicetypeDecide() itself may also be performed by a worker
-thread if your system has enough CPU cores to make this a beneficial
-trade-off, else it runs within the context of the thread which calls the
-x265_encoder_encode().
+The function slicetypeDecide() itself is also performed by a worker
+thread if your encoder has a thread pool, else it runs within the
+context of the thread which calls the x265_encoder_encode().
 
 SAO
 ===
--- a/source/CMakeLists.txt	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/CMakeLists.txt	Tue Feb 24 15:34:32 2015 +0530
@@ -21,7 +21,7 @@ include(CheckSymbolExists)
 include(CheckCXXCompilerFlag)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 46)
+set(X265_BUILD 47)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
--- a/source/common/cudata.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/cudata.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -300,12 +300,12 @@ void CUData::initCTU(const Frame& frame,
 // initialize Sub partition
 void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom)
 {
-    m_absIdxInCTU   = cuGeom.encodeIdx;
+    m_absIdxInCTU   = cuGeom.absPartIdx;
     m_encData       = ctu.m_encData;
     m_slice         = ctu.m_slice;
     m_cuAddr        = ctu.m_cuAddr;
-    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx];
-    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx];
+    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
+    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
     m_cuLeft        = ctu.m_cuLeft;
     m_cuAbove       = ctu.m_cuAbove;
     m_cuAboveLeft   = ctu.m_cuAboveLeft;
@@ -392,7 +392,7 @@ void CUData::initLosslessCU(const CUData
     m_cuAbove      = cu.m_cuAbove;
     m_cuAboveLeft  = cu.m_cuAboveLeft;
     m_cuAboveRight = cu.m_cuAboveRight;
-    m_absIdxInCTU  = cuGeom.encodeIdx;
+    m_absIdxInCTU  = cuGeom.absPartIdx;
     m_numPartitions = cuGeom.numPartitions;
     memcpy(m_qp, cu.m_qp, BytesPerPartition * m_numPartitions);
     memcpy(m_mv[0],  cu.m_mv[0],  m_numPartitions * sizeof(MV));
@@ -462,9 +462,9 @@ void CUData::copyFromPic(const CUData& c
     m_encData       = ctu.m_encData;
     m_slice         = ctu.m_slice;
     m_cuAddr        = ctu.m_cuAddr;
-    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx];
-    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx];
-    m_absIdxInCTU   = cuGeom.encodeIdx;
+    m_cuPelX        = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx];
+    m_cuPelY        = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx];
+    m_absIdxInCTU   = cuGeom.absPartIdx;
     m_numPartitions = cuGeom.numPartitions;
 
     /* copy out all prediction info for this part */
@@ -2094,7 +2094,7 @@ void CUData::calcCTUGeoms(uint32_t ctuWi
                 CUGeom *cu = cuDataArray + cuIdx;
                 cu->log2CUSize = log2CUSize;
                 cu->childOffset = childIdx - cuIdx;
-                cu->encodeIdx = g_depthScanIdx[yOffset][xOffset] * 4;
+                cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4;
                 cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2));
                 cu->depth = g_log2Size[maxCUSize] - log2CUSize;
 
--- a/source/common/cudata.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/cudata.h	Tue Feb 24 15:34:32 2015 +0530
@@ -82,7 +82,7 @@ struct CUGeom
 
     uint32_t log2CUSize;    // Log of the CU size.
     uint32_t childOffset;   // offset of the first child CU from current CU
-    uint32_t encodeIdx;     // Encoding index of this CU in terms of 4x4 blocks.
+    uint32_t absPartIdx;    // Part index of this CU in terms of 4x4 blocks.
     uint32_t numPartitions; // Number of 4x4 blocks in the CU
     uint32_t depth;         // depth of this CU relative from CTU
     uint32_t flags;         // CU flags.
--- a/source/common/deblock.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/deblock.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -70,7 +70,7 @@ static inline uint8_t bsCuEdge(const CUD
  * param Edge the direction of the edge in block boundary (horizonta/vertical), which is added newly */
 void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[])
 {
-    uint32_t absPartIdx = cuGeom.encodeIdx;
+    uint32_t absPartIdx = cuGeom.absPartIdx;
     uint32_t depth = cuGeom.depth;
     if (cu->m_predMode[absPartIdx] == MODE_NONE)
         return;
--- a/source/common/framedata.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/framedata.h	Tue Feb 24 15:34:32 2015 +0530
@@ -32,6 +32,7 @@ namespace x265 {
 // private namespace
 
 class PicYuv;
+class JobProvider;
 
 /* Per-frame data that is used during encodes and referenced while the picture
  * is available for reference. A FrameData instance is attached to a Frame as it
@@ -52,6 +53,7 @@ public:
     PicYuv*        m_reconPic;
     bool           m_bHasReferences;   /* used during DPB/RPS updates */
     int            m_frameEncoderID;   /* the ID of the FrameEncoder encoding this frame */
+    JobProvider*   m_jobProvider;
 
     CUDataMemPool  m_cuMemPool;
     CUData*        m_picCTU;
--- a/source/common/lowres.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/lowres.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -56,12 +56,14 @@ bool Lowres::create(PicYuv *origPic, int
     CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
 
     /* allocate lowres buffers */
-    for (int i = 0; i < 4; i++)
-    {
-        CHECKED_MALLOC(buffer[i], pixel, planesize);
-        /* initialize the whole buffer to prevent valgrind warnings on right edge */
-        memset(buffer[i], 0, sizeof(pixel) * planesize);
-    }
+    CHECKED_MALLOC(buffer[0], pixel, 4 * planesize);
+
+    /* initialize the whole buffer to prevent valgrind warnings on right edge */
+    memset(buffer[0], 0, 4 * sizeof(pixel) * planesize);
+
+    buffer[1] = buffer[0] + planesize;
+    buffer[2] = buffer[1] + planesize;
+    buffer[3] = buffer[2] + planesize;
 
     lowresPlane[0] = buffer[0] + padoffset;
     lowresPlane[1] = buffer[1] + padoffset;
@@ -96,9 +98,7 @@ fail:
 
 void Lowres::destroy()
 {
-    for (int i = 0; i < 4; i++)
-        X265_FREE(buffer[i]);
-
+    X265_FREE(buffer[0]);
     X265_FREE(intraCost);
     X265_FREE(intraMode);
 
@@ -126,13 +126,11 @@ void Lowres::destroy()
 }
 
 // (re) initialize lowres state
-void Lowres::init(PicYuv *origPic, int poc, int type)
+void Lowres::init(PicYuv *origPic, int poc)
 {
-    bIntraCalculated = false;
     bLastMiniGopBFrame = false;
     bScenecut = true;  // could be a scene-cut, until ruled out by flash detection
     bKeyframe = false; // Not a keyframe unless identified by lookahead
-    sliceType = type;
     frameNum = poc;
     leadingBframes = 0;
     indB = 0;
@@ -158,8 +156,8 @@ void Lowres::init(PicYuv *origPic, int p
 
     /* downscale and generate 4 hpel planes for lookahead */
     primitives.frameInitLowres(origPic->m_picOrg[0],
-                                      lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
-                                      origPic->m_stride, lumaStride, width, lines);
+                               lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3],
+                               origPic->m_stride, lumaStride, width, lines);
 
     /* extend hpel planes for motion search */
     extendPicBorder(lowresPlane[0], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
--- a/source/common/lowres.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/lowres.h	Tue Feb 24 15:34:32 2015 +0530
@@ -114,7 +114,6 @@ struct Lowres : public ReferencePlanes
     int    lines;            // height of lowres frame in pixel lines
     int    leadingBframes;   // number of leading B frames for P or I
 
-    bool   bIntraCalculated;
     bool   bScenecut;        // Set to false if the frame cannot possibly be part of a real scenecut.
     bool   bKeyframe;
     bool   bLastMiniGopBFrame;
@@ -151,7 +150,7 @@ struct Lowres : public ReferencePlanes
 
     bool create(PicYuv *origPic, int _bframes, bool bAqEnabled);
     void destroy();
-    void init(PicYuv *origPic, int poc, int sliceType);
+    void init(PicYuv *origPic, int poc);
 };
 }
 
--- a/source/common/param.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/param.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -100,7 +100,6 @@ void x265_param_default(x265_param *para
     /* Applying default values to all elements in the param structure */
     param->cpuid = x265::cpu_detect();
     param->bEnableWavefront = 1;
-    param->poolNumThreads = 0;
     param->frameNumThreads = 0;
 
     param->logLevel = X265_LOG_INFO;
@@ -545,7 +544,6 @@ int x265_param_parse(x265_param *p, cons
             }
         }
     }
-    OPT("threads") p->poolNumThreads = atoi(value);
     OPT("frame-threads") p->frameNumThreads = atoi(value);
     OPT("pmode") p->bDistributeModeAnalysis = atobool(value);
     OPT("pme") p->bDistributeMotionEstimation = atobool(value);
@@ -821,6 +819,7 @@ int x265_param_parse(x265_param *p, cons
     OPT("stats") p->rc.statFileName = strdup(value);
     OPT("csv") p->csvfn = strdup(value);
     OPT("scaling-list") p->scalingLists = strdup(value);
+    OPT2("pools", "numa-pools") p->numaPools = strdup(value);
     OPT("lambda-file") p->rc.lambdaFileName = strdup(value);
     OPT("analysis-file") p->analysisFileName = strdup(value);
     else
--- a/source/common/predict.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/predict.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -117,7 +117,7 @@ void Predict::initMotionCompensation(con
     m_predSlice = cu.m_slice;
     cu.getPartIndexAndSize(partIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
     m_ctuAddr = cu.m_cuAddr;
-    m_cuAbsPartIdx = cuGeom.encodeIdx;
+    m_cuAbsPartIdx = cuGeom.absPartIdx;
 }
 
 void Predict::prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
@@ -620,7 +620,7 @@ void Predict::initAdiPattern(const CUDat
     int tuSize = intraNeighbors.tuSize;
     int tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
     intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
@@ -687,7 +687,7 @@ void Predict::initAdiPattern(const CUDat
 
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
     intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
--- a/source/common/threadpool.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/threadpool.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -27,115 +27,65 @@
 
 #include <new>
 
+#if X86_64
+
+#ifdef __GNUC__
+
+#define SLEEPBITMAP_CTZ(id, x)     id = (unsigned long)__builtin_ctzll(x)
+#define SLEEPBITMAP_OR(ptr, mask)  __sync_fetch_and_or(ptr, mask)
+#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
+
+#elif defined(_MSC_VER)
+
+#define SLEEPBITMAP_CTZ(id, x)     _BitScanForward64(&id, x)
+#define SLEEPBITMAP_OR(ptr, mask)  InterlockedOr64((volatile LONG64*)ptr, (LONG)mask)
+#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask)
+
+#endif // ifdef __GNUC__
+
+#else
+
+/* use 32-bit primitives defined in threading.h */
+#define SLEEPBITMAP_CTZ CTZ
+#define SLEEPBITMAP_OR  ATOMIC_OR
+#define SLEEPBITMAP_AND ATOMIC_AND
+
+#endif
+
 #if MACOS
 #include <sys/param.h>
 #include <sys/sysctl.h>
 #endif
+#if HAVE_LIBNUMA
+#include <numa.h>
+#endif
 
 namespace x265 {
 // x265 private namespace
 
-class ThreadPoolImpl;
-
-class PoolThread : public Thread
-{
-private:
-
-    ThreadPoolImpl &m_pool;
-
-    PoolThread& operator =(const PoolThread&);
-
-    int            m_id;
-
-    bool           m_dirty;
-
-    bool           m_exited;
-
-    Event          m_wakeEvent;
-
-public:
-
-    PoolThread(ThreadPoolImpl& pool, int id)
-        : m_pool(pool)
-        , m_id(id)
-        , m_dirty(false)
-        , m_exited(false)
-    {
-    }
-
-    bool isDirty() const  { return m_dirty; }
-
-    void markDirty()      { m_dirty = true; }
-
-    bool isExited() const { return m_exited; }
-
-    void poke()           { m_wakeEvent.trigger(); }
-
-    virtual ~PoolThread() {}
-
-    void threadMain();
-};
-
-class ThreadPoolImpl : public ThreadPool
+class WorkerThread : public Thread
 {
 private:
 
-    bool         m_ok;
-    int          m_referenceCount;
-    int          m_numThreads;
-    int          m_numSleepMapWords;
-    PoolThread  *m_threads;
-    volatile uint32_t *m_sleepMap;
+    ThreadPool&  m_pool;
+    int          m_id;
+    Event        m_wakeEvent;
 
-    /* Lock for write access to the provider lists.  Threads are
-     * always allowed to read m_firstProvider and follow the
-     * linked list.  Providers must zero their m_nextProvider
-     * pointers before removing themselves from this list */
-    Lock         m_writeLock;
-
-public:
-
-    static ThreadPoolImpl *s_instance;
-    static Lock s_createLock;
-
-    JobProvider *m_firstProvider;
-    JobProvider *m_lastProvider;
+    WorkerThread& operator =(const WorkerThread&);
 
 public:
 
-    ThreadPoolImpl(int numthreads);
-
-    virtual ~ThreadPoolImpl();
-
-    ThreadPoolImpl *AddReference()
-    {
-        m_referenceCount++;
-
-        return this;
-    }
-
-    void markThreadAsleep(int id);
-
-    void waitForAllIdle();
+    JobProvider*     m_curJobProvider;
+    BondedTaskGroup* m_bondMaster;
 
-    int getThreadCount() const { return m_numThreads; }
-
-    bool IsValid() const       { return m_ok; }
-
-    void release();
+    WorkerThread(ThreadPool& pool, int id) : m_pool(pool), m_id(id) {}
+    virtual ~WorkerThread() {}
 
-    void Stop();
-
-    void enqueueJobProvider(JobProvider &);
-
-    void dequeueJobProvider(JobProvider &);
-
-    void FlushProviderList();
-
-    void pokeIdleThread();
+    void threadMain();
+    void awaken()           { m_wakeEvent.trigger(); }
 };
 
-void PoolThread::threadMain()
+void WorkerThread::threadMain()
 {
     THREAD_NAME("Worker", m_id);
 
@@ -145,286 +95,350 @@ void PoolThread::threadMain()
     __attribute__((unused)) int val = nice(10);
 #endif
 
-    while (m_pool.IsValid())
+    m_pool.setCurrentThreadAffinity();
+
+    uint32_t idBit = 1 << m_id;
+    m_curJobProvider = m_pool.m_jpTable[0];
+    m_bondMaster = NULL;
+
+    SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
+    SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
+    m_wakeEvent.wait();
+
+    while (m_pool.m_isActive)
     {
-        /* Walk list of job providers, looking for work */
-        JobProvider *cur = m_pool.m_firstProvider;
-        while (cur)
+        if (m_bondMaster)
         {
-            // FindJob() may perform actual work and return true.  If
-            // it does we restart the job search
-            if (cur->findJob(m_id) == true)
-                break;
-
-            cur = cur->m_nextProvider;
+            m_bondMaster->processTasks(m_id);
+            m_bondMaster->m_exitedPeerCount.incr();
+            m_bondMaster = NULL;
         }
 
-        // this thread has reached the end of the provider list
-        m_dirty = false;
-
-        if (cur == NULL)
+        do
         {
-            m_pool.markThreadAsleep(m_id);
-            m_wakeEvent.wait();
-        }
-    }
-
-    m_exited = true;
-}
-
-void ThreadPoolImpl::markThreadAsleep(int id)
-{
-    int word = id >> 5;
-    uint32_t bit = 1 << (id & 31);
-
-    ATOMIC_OR(&m_sleepMap[word], bit);
-}
-
-void ThreadPoolImpl::pokeIdleThread()
-{
-    /* Find a bit in the sleeping thread bitmap and poke it awake, do
-     * not give up until a thread is awakened or all of them are awake */
-    for (int i = 0; i < m_numSleepMapWords; i++)
-    {
-        uint32_t oldval = m_sleepMap[i];
-        while (oldval)
-        {
-            unsigned long id;
-            CTZ(id, oldval);
-
-            uint32_t bit = 1 << id;
-            if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit)
-            {
-                m_threads[i * 32 + id].poke();
-                return;
-            }
+            /* do pending work for current job provider */
+            m_curJobProvider->findJob(m_id);
 
-            oldval = m_sleepMap[i];
+            /* if the current job provider still wants help, only switch to a
+             * higher priority provider (lower slice type). Else take the first
+             * available job provider with the highest priority */
+            int curPriority = (m_curJobProvider->m_helpWanted) ? m_curJobProvider->m_sliceType :
+                                                                 INVALID_SLICE_PRIORITY + 1;
+            int nextProvider = -1;
+            for (int i = 0; i < m_pool.m_numProviders; i++)
+            {
+                if (m_pool.m_jpTable[i]->m_helpWanted &&
+                    m_pool.m_jpTable[i]->m_sliceType < curPriority)
+                {
+                    nextProvider = i;
+                    curPriority = m_pool.m_jpTable[i]->m_sliceType;
+                }
+            }
+            if (nextProvider != -1 && m_curJobProvider != m_pool.m_jpTable[nextProvider])
+            {
+                SLEEPBITMAP_AND(&m_curJobProvider->m_ownerBitmap, ~idBit);
+                m_curJobProvider = m_pool.m_jpTable[nextProvider];
+                SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit);
+            }
         }
-    }
-}
-
-ThreadPoolImpl *ThreadPoolImpl::s_instance;
-Lock ThreadPoolImpl::s_createLock;
-
-/* static */
-ThreadPool *ThreadPool::allocThreadPool(int numthreads)
-{
-    if (ThreadPoolImpl::s_instance)
-        return ThreadPoolImpl::s_instance->AddReference();
-
-    /* acquire the lock to create the instance */
-    ThreadPoolImpl::s_createLock.acquire();
+        while (m_curJobProvider->m_helpWanted);
 
-    if (ThreadPoolImpl::s_instance)
-        /* pool was allocated while we waited for the lock */
-        ThreadPoolImpl::s_instance->AddReference();
-    else
-        ThreadPoolImpl::s_instance = new ThreadPoolImpl(numthreads);
-    ThreadPoolImpl::s_createLock.release();
-
-    return ThreadPoolImpl::s_instance;
-}
-
-ThreadPool *ThreadPool::getThreadPool()
-{
-    X265_CHECK(ThreadPoolImpl::s_instance, "getThreadPool() called prior to allocThreadPool()\n");
-    return ThreadPoolImpl::s_instance;
-}
-
-void ThreadPoolImpl::release()
-{
-    if (--m_referenceCount == 0)
-    {
-        X265_CHECK(this == ThreadPoolImpl::s_instance, "multiple thread pool instances detected\n");
-        ThreadPoolImpl::s_instance = NULL;
-        this->Stop();
-        delete this;
+        /* While the worker sleeps, a job-provider or bond-group may acquire this
+         * worker's sleep bitmap bit. Once acquired, that thread may modify 
+         * m_bondMaster or m_curJobProvider, then waken the thread */
+        SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit);
+        m_wakeEvent.wait();
     }
 }
 
-ThreadPoolImpl::ThreadPoolImpl(int numThreads)
-    : m_ok(false)
-    , m_referenceCount(1)
-    , m_firstProvider(NULL)
-    , m_lastProvider(NULL)
+void JobProvider::tryWakeOne()
 {
-    m_numSleepMapWords = (numThreads + 31) >> 5;
-    m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords);
-
-    char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
-    m_threads = reinterpret_cast<PoolThread*>(buffer);
-    m_numThreads = numThreads;
-
-    if (m_threads && m_sleepMap)
+    int id = m_pool->tryAcquireSleepingThread(m_ownerBitmap, ALL_POOL_THREADS);
+    if (id < 0)
     {
-        for (int i = 0; i < m_numSleepMapWords; i++)
-            m_sleepMap[i] = 0;
-
-        m_ok = true;
-        int i;
-        for (i = 0; i < numThreads; i++)
-        {
-            new (buffer)PoolThread(*this, i);
-            buffer += sizeof(PoolThread);
-            if (!m_threads[i].start())
-            {
-                m_ok = false;
-                break;
-            }
-        }
+        m_helpWanted = true;
+        return;
+    }
 
-        if (m_ok)
-            waitForAllIdle();
-        else
-        {
-            // stop threads that did start up
-            for (int j = 0; j < i; j++)
-            {
-                m_threads[j].poke();
-                m_threads[j].stop();
-            }
-        }
-    }
-}
-
-void ThreadPoolImpl::waitForAllIdle()
-{
-    if (!m_ok)
-        return;
-
-    int id = 0;
-    do
+    WorkerThread& worker = m_pool->m_workers[id];
+    if (worker.m_curJobProvider != this) /* poaching */
     {
-        int word = id >> 5;
-        uint32_t bit = 1 << (id & 31);
-        if (m_sleepMap[word] & bit)
-            id++;
-        else
-        {
-            GIVE_UP_TIME();
-        }
+        uint32_t bit = 1 << id;
+        SLEEPBITMAP_AND(&worker.m_curJobProvider->m_ownerBitmap, ~bit);
+        worker.m_curJobProvider = this;
+        SLEEPBITMAP_OR(&worker.m_curJobProvider->m_ownerBitmap, bit);
     }
-    while (id < m_numThreads);
-}
-
-void ThreadPoolImpl::Stop()
-{
-    if (m_ok)
-    {
-        waitForAllIdle();
-
-        // set invalid flag, then wake them up so they exit their main func
-        m_ok = false;
-        for (int i = 0; i < m_numThreads; i++)
-        {
-            m_threads[i].poke();
-            m_threads[i].stop();
-        }
-    }
+    worker.awaken();
 }
 
-ThreadPoolImpl::~ThreadPoolImpl()
+int ThreadPool::tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap)
 {
-    X265_FREE((void*)m_sleepMap);
+    unsigned long id;
 
-    if (m_threads)
+    sleepbitmap_t masked = m_sleepBitmap & firstTryBitmap;
+    while (masked)
     {
-        // cleanup thread handles
-        for (int i = 0; i < m_numThreads; i++)
-            m_threads[i].~PoolThread();
+        SLEEPBITMAP_CTZ(id, masked);
 
-        X265_FREE(reinterpret_cast<char*>(m_threads));
+        uint32_t bit = 1 << id;
+        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
+            return (int)id;
+
+        masked = m_sleepBitmap & firstTryBitmap;
     }
-}
-
-void ThreadPoolImpl::enqueueJobProvider(JobProvider &p)
-{
-    // only one list writer at a time
-    ScopedLock l(m_writeLock);
 
-    p.m_nextProvider = NULL;
-    p.m_prevProvider = m_lastProvider;
-    m_lastProvider = &p;
+    masked = m_sleepBitmap & secondTryBitmap;
+    while (masked)
+    {
+        SLEEPBITMAP_CTZ(id, masked);
 
-    if (p.m_prevProvider)
-        p.m_prevProvider->m_nextProvider = &p;
-    else
-        m_firstProvider = &p;
+        uint32_t bit = 1 << id;
+        if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit)
+            return (int)id;
+
+        masked = m_sleepBitmap & secondTryBitmap;
+    }
+
+    return -1;
 }
 
-void ThreadPoolImpl::dequeueJobProvider(JobProvider &p)
+int ThreadPool::tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master)
 {
-    // only one list writer at a time
-    ScopedLock l(m_writeLock);
-
-    // update pool entry pointers first
-    if (m_firstProvider == &p)
-        m_firstProvider = p.m_nextProvider;
-
-    if (m_lastProvider == &p)
-        m_lastProvider = p.m_prevProvider;
-
-    // extract self from doubly linked lists
-    if (p.m_nextProvider)
-        p.m_nextProvider->m_prevProvider = p.m_prevProvider;
-
-    if (p.m_prevProvider)
-        p.m_prevProvider->m_nextProvider = p.m_nextProvider;
-
-    p.m_nextProvider = NULL;
-    p.m_prevProvider = NULL;
-}
-
-/* Ensure all threads have made a full pass through the provider list, ensuring
- * dequeued providers are safe for deletion. */
-void ThreadPoolImpl::FlushProviderList()
-{
-    for (int i = 0; i < m_numThreads; i++)
-    {
-        m_threads[i].markDirty();
-        m_threads[i].poke();
-    }
-
-    int i;
+    int bondCount = 0;
     do
     {
-        for (i = 0; i < m_numThreads; i++)
-        {
-            if (m_threads[i].isDirty())
-            {
-                GIVE_UP_TIME();
-                break;
-            }
-        }
+        int id = tryAcquireSleepingThread(peerBitmap, 0);
+        if (id < 0)
+            return bondCount;
+
+        m_workers[id].m_bondMaster = &master;
+        m_workers[id].awaken();
+        bondCount++;
     }
-    while (i < m_numThreads);
+    while (bondCount < maxPeers);
+
+    return bondCount;
 }
 
-void JobProvider::flush()
+ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools)
 {
-    if (m_nextProvider || m_prevProvider)
-        dequeue();
-    dynamic_cast<ThreadPoolImpl*>(m_pool)->FlushProviderList();
+    enum { MAX_NODE_NUM = 127 };
+    int cpusPerNode[MAX_NODE_NUM + 1];
+
+    memset(cpusPerNode, 0, sizeof(cpusPerNode));
+    int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM);
+    int cpuCount = getCpuCount();
+    bool bNumaSupport = false;
+
+#if _WIN32_WINNT >= 0x0601
+    bNumaSupport = true;
+#elif HAVE_LIBNUMA
+    bNumaSupport = numa_available() >= 0;
+#endif
+
+
+    for (int i = 0; i < cpuCount; i++)
+    {
+#if _WIN32_WINNT >= 0x0601
+        UCHAR node;
+        if (GetNumaProcessorNode((UCHAR)i, &node))
+            cpusPerNode[X265_MIN(node, MAX_NODE_NUM)]++;
+        else
+#elif HAVE_LIBNUMA
+        if (bNumaSupport >= 0)
+            cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++;
+        else
+#endif
+            cpusPerNode[0]++;
+    }
+
+    if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG)
+        for (int i = 0; i < numNumaNodes; i++)
+            x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]);
+
+    /* limit nodes based on param->numaPools */
+    if (p->numaPools && *p->numaPools)
+    {
+        char *nodeStr = p->numaPools;
+        for (int i = 0; i < numNumaNodes; i++)
+        {
+            if (!*nodeStr)
+            {
+                cpusPerNode[i] = 0;
+                continue;
+            }
+            else if (*nodeStr == '-')
+                cpusPerNode[i] = 0;
+            else if (*nodeStr == '*')
+                break;
+            else if (*nodeStr == '+')
+                ;
+            else
+            {
+                int count = atoi(nodeStr);
+                cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]);
+            }
+
+            /* consume current node string, comma, and white-space */
+            while (*nodeStr && *nodeStr != ',')
+               ++nodeStr;
+            if (*nodeStr == ',' || *nodeStr == ' ')
+               ++nodeStr;
+        }
+    }
+
+    numPools = 0;
+    for (int i = 0; i < numNumaNodes; i++)
+    {
+        if (bNumaSupport)
+            x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]);
+        if (cpusPerNode[i])
+            numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS;
+    }
+
+    if (!numPools)
+        return NULL;
+
+    if (numPools > p->frameNumThreads)
+    {
+        x265_log(p, X265_LOG_DEBUG, "Reducing number of thread pools for frame thread count\n");
+        numPools = X265_MAX(p->frameNumThreads / 2, 1);
+    }
+
+    ThreadPool *pools = new ThreadPool[numPools];
+    if (pools)
+    {
+        int maxProviders = (p->frameNumThreads + 1 + numPools - 1) / numPools; /* +1 is Lookahead */
+        int node = 0;
+        for (int i = 0; i < numPools; i++)
+        {
+            while (!cpusPerNode[node])
+                node++;
+            int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]);
+            if (!pools[i].create(cores, maxProviders, node))
+            {
+                X265_FREE(pools);
+                numPools = 0;
+                return NULL;
+            }
+            if (bNumaSupport)
+                x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node);
+            else
+                x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores);
+            cpusPerNode[node] -= cores;
+        }
+    }
+    else
+        numPools = 0;
+    return pools;
 }
 
-void JobProvider::enqueue()
+ThreadPool::ThreadPool()
 {
-    // Add this provider to the end of the thread pool's job provider list
-    X265_CHECK(!m_nextProvider && !m_prevProvider && m_pool, "job provider was already queued\n");
-    m_pool->enqueueJobProvider(*this);
-    m_pool->pokeIdleThread();
+    memset(this, 0, sizeof(*this));
 }
 
-void JobProvider::dequeue()
+bool ThreadPool::create(int numThreads, int maxProviders, int node)
 {
-    // Remove this provider from the thread pool's job provider list
-    m_pool->dequeueJobProvider(*this);
-    // Ensure no jobs were missed while the provider was being removed
-    m_pool->pokeIdleThread();
+    X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n");
+
+    m_numaNode = node;
+    m_numWorkers = numThreads;
+
+    m_workers = X265_MALLOC(WorkerThread, numThreads);
+    /* placement new initialization */
+    if (m_workers)
+        for (int i = 0; i < numThreads; i++)
+            new (m_workers + i)WorkerThread(*this, i);
+
+    m_jpTable = X265_MALLOC(JobProvider*, maxProviders);
+    m_numProviders = 0;
+
+    return m_workers && m_jpTable;
 }
 
-int getCpuCount()
+bool ThreadPool::start()
+{
+    m_isActive = true;
+    for (int i = 0; i < m_numWorkers; i++)
+    {
+        if (!m_workers[i].start())
+        {
+            m_isActive = false;
+            return false;
+        }
+    }
+    return true;
+}
+
+ThreadPool::~ThreadPool()
+{
+    if (m_workers)
+    {
+        m_isActive = false;
+        for (int i = 0; i < m_numWorkers; i++)
+        {
+            m_workers[i].awaken();
+            m_workers[i].stop();
+            m_workers[i].~WorkerThread();
+        }
+
+        X265_FREE(m_workers);
+    }
+
+    X265_FREE(m_jpTable);
+}
+
+void ThreadPool::setCurrentThreadAffinity()
+{
+    setThreadNodeAffinity(m_numaNode);
+}
+
+/* static */
+void ThreadPool::setThreadNodeAffinity(int numaNode)
+{
+#if _WIN32_WINNT >= 0x0601
+    GROUP_AFFINITY groupAffinity;
+    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
+    {
+        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
+            return;
+    }
+    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
+#elif HAVE_LIBNUMA
+    if (numa_available() >= 0)
+    {
+        numa_run_on_node(numaNode);
+        numa_set_preferred(numaNode);
+        numa_set_localalloc();
+        return;
+    }
+    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
+#else
+    (void)numaNode;
+#endif
+}
+
+/* static */
+int ThreadPool::getNumaNodeCount()
+{
+#if _WIN32_WINNT >= 0x0601
+    ULONG num = 1;
+    if (GetNumaHighestNodeNumber(&num))
+        num++;
+    return (int)num;
+#elif HAVE_LIBNUMA
+    if (numa_available() >= 0)
+        return numa_max_node() + 1;
+    else
+        return 1;
+#else
+    return 1;
+#endif
+}
+
+/* static */
+int ThreadPool::getCpuCount()
 {
 #if _WIN32
     SYSTEM_INFO sysinfo;
@@ -450,8 +464,9 @@ int getCpuCount()
     }
 
     return count;
-#else // if _WIN32
+#else
     return 2; // default to 2 threads, everywhere else
-#endif // if _WIN32
+#endif
 }
+
 } // end namespace x265
--- a/source/common/threadpool.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/threadpool.h	Tue Feb 24 15:34:32 2015 +0530
@@ -25,85 +25,147 @@
 #define X265_THREADPOOL_H
 
 #include "common.h"
+#include "threading.h"
 
 namespace x265 {
 // x265 private namespace
 
 class ThreadPool;
+class WorkerThread;
+class BondedTaskGroup;
 
-int getCpuCount();
+#if X86_64
+typedef uint64_t sleepbitmap_t;
+#else
+typedef uint32_t sleepbitmap_t;
+#endif
 
-// Any class that wants to distribute work to the thread pool must
-// derive from JobProvider and implement FindJob().
+static const sleepbitmap_t ALL_POOL_THREADS = (sleepbitmap_t)-1;
+enum { MAX_POOL_THREADS = sizeof(sleepbitmap_t) * 8 };
+enum { INVALID_SLICE_PRIORITY = 10 }; // a value larger than any X265_TYPE_* macro
+
+// Frame level job providers. FrameEncoder and Lookahead derive from
+// this class and implement findJob()
 class JobProvider
 {
-protected:
-
-    ThreadPool   *m_pool;
-
-    JobProvider  *m_nextProvider;
-    JobProvider  *m_prevProvider;
-
 public:
 
-    JobProvider(ThreadPool *p) : m_pool(p), m_nextProvider(0), m_prevProvider(0) {}
+    ThreadPool*   m_pool;
+    sleepbitmap_t m_ownerBitmap;
+    int           m_jpId;
+    int           m_sliceType;
+    bool          m_helpWanted;
+    bool          m_isFrameEncoder; /* rather ugly hack, but nothing better presents itself */
+
+    JobProvider()
+        : m_pool(NULL)
+        , m_ownerBitmap(0)
+        , m_jpId(-1)
+        , m_sliceType(INVALID_SLICE_PRIORITY)
+        , m_helpWanted(false)
+        , m_isFrameEncoder(false)
+    {}
 
     virtual ~JobProvider() {}
 
-    void setThreadPool(ThreadPool *p) { m_pool = p; }
-
-    // Register this job provider with the thread pool, jobs are available
-    void enqueue();
-
-    // Remove this job provider from the thread pool, all jobs complete
-    void dequeue();
+    // Worker threads will call this method to perform work
+    virtual void findJob(int workerThreadId) = 0;
 
-    // Worker threads will call this method to find a job.  Must return true if
-    // work was completed.  False if no work was available.
-    virtual bool findJob(int threadId) = 0;
-
-    // All derived objects that call Enqueue *MUST* call flush before allowing
-    // their object to be destroyed, otherwise you will see random crashes involving
-    // partially freed vtables and you will be unhappy
-    void flush();
-
-    friend class ThreadPoolImpl;
-    friend class PoolThread;
+    // Will awaken one idle thread, preferring a thread which most recently
+    // performed work for this provider.
+    void tryWakeOne();
 };
 
-// Abstract interface to ThreadPool.  Each encoder instance should call
-// AllocThreadPool() to get a handle to the singleton object and then make
-// it available to their job provider structures (wave-front frame encoders,
-// etc).
 class ThreadPool
 {
-protected:
-
-    // Destructor is inaccessable, force the use of reference counted Release()
-    ~ThreadPool() {}
-
-    virtual void enqueueJobProvider(JobProvider &) = 0;
-
-    virtual void dequeueJobProvider(JobProvider &) = 0;
-
 public:
 
-    // When numthreads == 0, a default thread count is used. A request may grow
-    // an existing pool but it will never shrink.
-    static ThreadPool *allocThreadPool(int numthreads = 0);
-
-    static ThreadPool *getThreadPool();
-
-    virtual void pokeIdleThread() = 0;
+    sleepbitmap_t m_sleepBitmap;
+    int           m_numProviders;
+    int           m_numWorkers;
+    int           m_numaNode;
+    bool          m_isActive;
 
-    // The pool is reference counted so all calls to AllocThreadPool() should be
-    // followed by a call to Release()
-    virtual void release() = 0;
+    JobProvider** m_jpTable;
+    WorkerThread* m_workers;
 
-    virtual int  getThreadCount() const = 0;
+    ThreadPool();
+    ~ThreadPool();
 
-    friend class JobProvider;
+    bool create(int numThreads, int maxProviders, int node);
+    bool start();
+    void setCurrentThreadAffinity();
+    int  tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap);
+    int  tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master);
+
+    static ThreadPool* allocThreadPools(x265_param* p, int& numPools);
+
+    static int  getCpuCount();
+    static int  getNumaNodeCount();
+    static void setThreadNodeAffinity(int node);
 };
+
+/* Any worker thread may enlist the help of idle worker threads from the same
+ * job provider. They must derive from this class and implement the
+ * processTasks() method.  To use, an instance must be instantiated by a worker
+ * thread (referred to as the master thread) and then tryBondPeers() must be
+ * called. If it returns non-zero then some number of slave worker threads are
+ * already in the process of calling your processTasks() function. The master
+ * thread should participate and call processTasks() itself. When
+ * waitForExit() returns, all bonded peer threads are guaranteed to have
+ * exited processTasks(). Since the thread count is small, it uses explicit
+ * locking instead of atomic counters and bitmasks */
+class BondedTaskGroup
+{
+public:
+
+    Lock              m_lock;
+    ThreadSafeInteger m_exitedPeerCount;
+    int               m_bondedPeerCount;
+    int               m_jobTotal;
+    int               m_jobAcquired;
+
+    BondedTaskGroup()  { m_bondedPeerCount = m_jobTotal = m_jobAcquired = 0; }
+
+    /* Do not allow the instance to be destroyed before all bonded peers have
+     * exited processTasks() */
+    ~BondedTaskGroup() { waitForExit(); }
+
+    /* Try to enlist the help of idle worker threads most recently associated
+     * with the given job provider and "bond" them to work on your tasks. Up to
+     * maxPeers worker threads will call your processTasks() method. */
+    int tryBondPeers(JobProvider& jp, int maxPeers)
+    {
+        int count = jp.m_pool->tryBondPeers(maxPeers, jp.m_ownerBitmap, *this);
+        m_bondedPeerCount += count;
+        return count;
+    }
+
+    /* Try to enlist the help of any idle worker threads and "bond" them to work
+     * on your tasks. Up to maxPeers worker threads will call your
+     * processTasks() method. */
+    int tryBondPeers(ThreadPool& pool, int maxPeers)
+    {
+        int count = pool.tryBondPeers(maxPeers, ALL_POOL_THREADS, *this);
+        m_bondedPeerCount += count;
+        return count;
+    }
+
+    /* Returns when all bonded peers have exited processTasks(). It does *NOT*
+     * ensure all tasks are completed (but this is generally implied). */
+    void waitForExit()
+    {
+        int exited = m_exitedPeerCount.get();
+        while (m_bondedPeerCount != exited)
+            exited = m_exitedPeerCount.waitForChange(exited);
+    }
+
+    /* Derived classes must define this method. The worker thread ID may be
+     * used to index into thread local data, or ignored.  The ID will be between
+     * 0 and ThreadPool::m_numWorkers - 1 */
+    virtual void processTasks(int workerThreadId) = 0;
+};
+
 } // end namespace x265
 
 #endif // ifndef X265_THREADPOOL_H
--- a/source/common/wavefront.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/wavefront.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -60,7 +60,6 @@ void WaveFront::enqueueRow(int row)
 {
     uint32_t bit = 1 << (row & 31);
     ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
-    if (m_pool) m_pool->pokeIdleThread();
 }
 
 void WaveFront::enableRow(int row)
@@ -80,11 +79,11 @@ bool WaveFront::dequeueRow(int row)
     return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit);
 }
 
-bool WaveFront::findJob(int threadId)
+void WaveFront::findJob(int threadId)
 {
     unsigned long id;
 
-    // thread safe
+    /* Loop over each word until all available rows are finished */
     for (int w = 0; w < m_numWords; w++)
     {
         uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
@@ -97,15 +96,14 @@ bool WaveFront::findJob(int threadId)
             {
                 /* we cleared the bit, we get to process the row */
                 processRow(w * 32 + id, threadId);
-                return true;
+                m_helpWanted = true;
+                return; /* check for a higher priority task */
             }
 
-            // some other thread cleared the bit, try another bit
             oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
         }
     }
 
-    // made it through the bitmap without finding any enqueued rows
-    return false;
+    m_helpWanted = false;
 }
 }
--- a/source/common/wavefront.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/common/wavefront.h	Tue Feb 24 15:34:32 2015 +0530
@@ -53,10 +53,9 @@ private:
 
 public:
 
-    WaveFront(ThreadPool *pool)
-        : JobProvider(pool)
-        , m_internalDependencyBitmap(0)
-        , m_externalDependencyBitmap(0)
+    WaveFront()
+        : m_internalDependencyBitmap(NULL)
+        , m_externalDependencyBitmap(NULL)
     {}
 
     virtual ~WaveFront();
@@ -86,8 +85,8 @@ public:
 
     // WaveFront's implementation of JobProvider::findJob. Consults
     // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row
-    // or returns false
-    bool findJob(int threadId);
+    // processes available rows and returns when no work remains
+    void findJob(int threadId);
 
     // Start or resume encode processing of this row, must be implemented by
     // derived classes.
--- a/source/encoder/analysis.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/analysis.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -71,7 +71,6 @@ using namespace x265;
 
 Analysis::Analysis()
 {
-    m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0;
     m_reuseIntraDataCTU = NULL;
     m_reuseInterDataCTU = NULL;
 }
@@ -231,7 +230,7 @@ void Analysis::compressIntraCU(const CUD
         char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
         uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
 
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx)
+        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx)
         {
             m_quant.setQPforQuant(parentCTU);
 
@@ -291,7 +290,7 @@ void Analysis::compressIntraCU(const CUD
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
             if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
                 compressIntraCU(parentCTU, childGeom, zOrder);
 
@@ -321,201 +320,165 @@ void Analysis::compressIntraCU(const CUD
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
-}
-
-bool Analysis::findJob(int threadId)
-{
-    /* try to acquire a CU mode to analyze */
-    m_pmodeLock.acquire();
-    if (m_totalNumJobs > m_numAcquiredJobs)
-    {
-        int id = m_numAcquiredJobs++;
-        m_pmodeLock.release();
-
-        {
-            ProfileScopeEvent(pmode);
-            ProfileCUScope(m_modeDepth[m_curGeom->depth].pred[PRED_2Nx2N].cu, pmodeTime, countPModeTasks);
-            parallelModeAnalysis(threadId, id);
-        }
-
-        m_pmodeLock.acquire();
-        if (++m_numCompletedJobs == m_totalNumJobs)
-            m_modeCompletionEvent.trigger();
-        m_pmodeLock.release();
-        return true;
-    }
-    else
-        m_pmodeLock.release();
-
-    m_meLock.acquire();
-    if (m_totalNumME > m_numAcquiredME)
-    {
-        int id = m_numAcquiredME++;
-        m_meLock.release();
-
-        {
-            ProfileScopeEvent(pme);
-            ProfileCUScope(m_curInterMode->cu, pmeTime, countPMETasks);
-            parallelME(threadId, id);
-        }
-
-        m_meLock.acquire();
-        if (++m_numCompletedME == m_totalNumME)
-            m_meCompletionEvent.trigger();
-        m_meLock.release();
-        return true;
-    }
-    else
-        m_meLock.release();
-
-    return false;
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
 }
 
-void Analysis::parallelME(int threadId, int meId)
+void Analysis::PMODE::processTasks(int workerThreadId)
 {
-    Analysis* slave;
-
-    if (threadId == -1)
-        slave = this;
-    else
-    {
-        slave = &m_tld[threadId].analysis;
-        slave->setQP(*m_slice, m_rdCost.m_qp);
-        slave->m_slice = m_slice;
-        slave->m_frame = m_frame;
-
-        slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
-        slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart);
-    }
-
-    if (meId < m_slice->m_numRefIdx[0])
-        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0, meId);
-    else
-        slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]);
+#if DETAILED_CU_STATS
+    int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID;
+    master.m_stats[fe].countPModeTasks++;
+    ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime);
+#endif
+    ProfileScopeEvent(pmode);
+    master.processPmode(*this, master.m_tld[workerThreadId].analysis);
 }
 
-void Analysis::parallelModeAnalysis(int threadId, int jobId)
+/* process pmode jobs until none remain; may be called by the master thread or by
+ * a bonded peer (slave) thread via pmodeTasks() */
+void Analysis::processPmode(PMODE& pmode, Analysis& slave)
 {
-    Analysis* slave;
-
-    if (threadId == -1)
-        slave = this;
-    else
-    {
-        slave = &m_tld[threadId].analysis;
-        slave->m_slice = m_slice;
-        slave->m_frame = m_frame;
-        slave->setQP(*m_slice, m_rdCost.m_qp);
-        slave->invalidateContexts(0);
-    }
-
-    ModeDepth& md = m_modeDepth[m_curGeom->depth];
-
-    if (m_param->rdLevel <= 4)
+    /* acquire a mode task, else exit early */
+    int task;
+    pmode.m_lock.acquire();
+    if (pmode.m_jobTotal > pmode.m_jobAcquired)
     {
-        switch (jobId)
-        {
-        case 0:
-            if (slave != this)
-                slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
-            slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
-            if (m_param->rdLevel > 2)
-                slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom);
-            break;
-
-        case 1:
-            slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N);
-            if (m_slice->m_sliceType == B_SLICE)
-                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
-            break;
-
-        case 2:
-            slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N);
-            break;
-
-        case 3:
-            slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN);
-            break;
-
-        case 4:
-            slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU);
-            break;
-
-        case 5:
-            slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD);
-            break;
-
-        case 6:
-            slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N);
-            break;
-
-        case 7:
-            slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N);
-            break;
-
-        default:
-            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
-            break;
-        }
+        task = pmode.m_jobAcquired++;
+        pmode.m_lock.release();
     }
     else
     {
-        bool bMergeOnly = m_curGeom->log2CUSize == 6;
-        if (slave != this)
-        {
-            slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur);
-            slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
-        }
-        
-        switch (jobId)
-        {
-        case 0:
-            slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL, NULL);
-            if (m_curGeom->log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
-                slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL, NULL);
-            break;
-
-        case 1:
-            slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false);
-            md.pred[PRED_BIDIR].rdCost = MAX_INT64;
-            if (m_slice->m_sliceType == B_SLICE)
-            {
-                slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom);
-                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
-                    slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom);
-            }
-            break;
+        pmode.m_lock.release();
+        return;
+    }
 
-        case 2:
-            slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false);
-            break;
-
-        case 3:
-            slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false);
-            break;
-
-        case 4:
-            slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly);
-            break;
+    ModeDepth& md = m_modeDepth[pmode.cuGeom.depth];
+    bool bMergeOnly = pmode.cuGeom.log2CUSize == 6;
 
-        case 5:
-            slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly);
-            break;
+    /* setup slave Analysis */
+    if (&slave != this)
+    {
+        slave.m_slice = m_slice;
+        slave.m_frame = m_frame;
+        slave.setQP(*m_slice, m_rdCost.m_qp);
+        slave.invalidateContexts(0);
 
-        case 6:
-            slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly);
-            break;
-
-        case 7:
-            slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly);
-            break;
-
-        default:
-            X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
-            break;
+        if (m_param->rdLevel >= 5)
+        {
+            slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
+            slave.m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu);
         }
     }
+
+
+    /* perform Mode task, repeat until no more work is available */
+    do
+    {
+        if (m_param->rdLevel <= 4)
+        {
+            switch (pmode.modes[task])
+            {
+            case PRED_INTRA:
+                if (&slave != this)
+                    slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur);
+                slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
+                if (m_param->rdLevel > 2)
+                    slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom);
+                break;
+
+            case PRED_2Nx2N:
+                slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N);
+                if (m_slice->m_sliceType == B_SLICE)
+                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
+                break;
+
+            case PRED_Nx2N:
+                slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N);
+                break;
+
+            case PRED_2NxN:
+                slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN);
+                break;
+
+            case PRED_2NxnU:
+                slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU);
+                break;
+
+            case PRED_2NxnD:
+                slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD);
+                break;
+
+            case PRED_nLx2N:
+                slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N);
+                break;
+
+            case PRED_nRx2N:
+                slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N);
+                break;
+
+            default:
+                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
+                break;
+            }
+        }
+        else
+        {
+            switch (pmode.modes[task])
+            {
+            case PRED_INTRA:
+                slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N, NULL, NULL);
+                if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
+                    slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN, NULL, NULL);
+                break;
+
+            case PRED_2Nx2N:
+                slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, false);
+                md.pred[PRED_BIDIR].rdCost = MAX_INT64;
+                if (m_slice->m_sliceType == B_SLICE)
+                {
+                    slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom);
+                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
+                        slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom);
+                }
+                break;
+
+            case PRED_Nx2N:
+                slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, false);
+                break;
+
+            case PRED_2NxN:
+                slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, false);
+                break;
+
+            case PRED_2NxnU:
+                slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, bMergeOnly);
+                break;
+
+            case PRED_2NxnD:
+                slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, bMergeOnly);
+                break;
+
+            case PRED_nLx2N:
+                slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, bMergeOnly);
+                break;
+
+            case PRED_nRx2N:
+                slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, bMergeOnly);
+                break;
+
+            default:
+                X265_CHECK(0, "invalid job ID for parallel mode analysis\n");
+                break;
+            }
+        }
+
+        task = -1;
+        pmode.m_lock.acquire();
+        if (pmode.m_jobTotal > pmode.m_jobAcquired)
+            task = pmode.m_jobAcquired++;
+        pmode.m_lock.release();
+    }
+    while (task >= 0);
 }
 
 void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -536,59 +499,37 @@ void Analysis::compressInterCU_dist(cons
         int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4);
         int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames;
 
+        PMODE pmode(*this, cuGeom);
+
         /* Initialize all prediction CUs based on parentCTU */
-        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom);
-        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
         md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
         md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
-        if (m_param->bEnableRectInter)
-        {
-            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom);
-        }
-        if (bTryAmp)
-        {
-            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom);
-            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom);
-        }
         if (bTryIntra)
         {
             md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom);
-            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
+            if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
                 md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom);
+            pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
+        }
+        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
+        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom);
+        if (m_param->bEnableRectInter)
+        {
+            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
+            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
+        }
+        if (bTryAmp)
+        {
+            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
+            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
+            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
+            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
         }
 
-        m_pmodeLock.acquire();
-        m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4;
-        m_numAcquiredJobs = !bTryIntra;
-        m_numCompletedJobs = m_numAcquiredJobs;
-        m_curGeom = &cuGeom;
-        m_bJobsQueued = true;
-        JobProvider::enqueue();
-        m_pmodeLock.release();
-
-        for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++)
-            m_pool->pokeIdleThread();
+        pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal);
 
         /* participate in processing jobs, until all are distributed */
-        m_pmodeLock.acquire();
-        while (m_totalNumJobs > m_numAcquiredJobs)
-        {
-            int id = m_numAcquiredJobs++;
-            m_pmodeLock.release();
-
-            parallelModeAnalysis(-1, id);
-
-            m_pmodeLock.acquire();
-            if (++m_numCompletedJobs == m_totalNumJobs)
-                m_modeCompletionEvent.trigger();
-        }
-        m_pmodeLock.release();
-
-        JobProvider::dequeue();
-        m_bJobsQueued = false;
+        processPmode(pmode, *this);
 
         /* the master worker thread (this one) does merge analysis. By doing
          * merge after all the other jobs are at least started, we usually avoid
@@ -600,7 +541,7 @@ void Analysis::compressInterCU_dist(cons
 
             {
                 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
-                m_modeCompletionEvent.wait();
+                pmode.waitForExit();
             }
 
             /* select best inter mode based on sa8d cost */
@@ -681,7 +622,7 @@ void Analysis::compressInterCU_dist(cons
             checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false);
             {
                 ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters);
-                m_modeCompletionEvent.wait();
+                pmode.waitForExit();
             }
 
             checkBestMode(md.pred[PRED_2Nx2N], depth);
@@ -749,7 +690,7 @@ void Analysis::compressInterCU_dist(cons
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
             if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
                 compressInterCU_dist(parentCTU, childGeom);
 
@@ -788,7 +729,7 @@ void Analysis::compressInterCU_dist(cons
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
 }
 
 void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom)
@@ -984,7 +925,7 @@ void Analysis::compressInterCU_rd0_4(con
                         residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         getBestIntraModeChroma(*md.bestMode, cuGeom);
                         residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
-                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO:
+                        md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
                     }
                 }
             }
@@ -1022,7 +963,7 @@ void Analysis::compressInterCU_rd0_4(con
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
             if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
                 compressInterCU_rd0_4(parentCTU, childGeom);
 
@@ -1072,7 +1013,7 @@ void Analysis::compressInterCU_rd0_4(con
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel)
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
 }
 
 void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder)
@@ -1088,7 +1029,7 @@ void Analysis::compressInterCU_rd5_6(con
     {
         uint8_t* reuseDepth  = &m_reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
         uint8_t* reuseModes  = &m_reuseInterDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions];
-        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx && reuseModes[zOrder] == MODE_SKIP)
+        if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx && reuseModes[zOrder] == MODE_SKIP)
         {
             md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom);
             md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom);
@@ -1221,7 +1162,7 @@ void Analysis::compressInterCU_rd5_6(con
             const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
             if (childGeom.flags & CUGeom::PRESENT)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx);
+                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
                 m_rqt[nextDepth].cur.load(*nextContext);
                 compressInterCU_rd5_6(parentCTU, childGeom, zOrder);
 
@@ -1251,7 +1192,7 @@ void Analysis::compressInterCU_rd5_6(con
     /* Copy best data to encData CTU and recon */
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
-        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
 }
 
 /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
@@ -1521,38 +1462,33 @@ void Analysis::checkInter_rd0_4(Mode& in
             }
         }
     }
-    if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d))
+
+    predInterSearch(interMode, cuGeom, false, m_bChromaSa8d);
+
+    /* predInterSearch sets interMode.sa8dBits */
+    const Yuv& fencYuv = *interMode.fencYuv;
+    Yuv& predYuv = interMode.predYuv;
+    int part = partitionFromLog2Size(cuGeom.log2CUSize);
+    interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
+    if (m_bChromaSa8d)
     {
-        /* predInterSearch sets interMode.sa8dBits */
-        const Yuv& fencYuv = *interMode.fencYuv;
-        Yuv& predYuv = interMode.predYuv;
-        int part = partitionFromLog2Size(cuGeom.log2CUSize);
-        interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
-        if (m_bChromaSa8d)
+        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
+        interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
+    }
+    interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
+
+    if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+    {
+        for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
         {
-            interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
-            interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
-        }
-        interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits);
-
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
-        {
-            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+            MotionData* bestME = interMode.bestME[puIdx];
+            for (int32_t i = 0; i < numPredDir; i++)
             {
-                MotionData* bestME = interMode.bestME[puIdx];
-                for (int32_t i = 0; i < numPredDir; i++)
-                {
-                    *m_reuseRef = bestME[i].ref;
-                    m_reuseRef++;
-                }
+                *m_reuseRef = bestME[i].ref;
+                m_reuseRef++;
             }
         }
     }
-    else
-    {
-        interMode.distortion = MAX_UINT;
-        interMode.sa8dCost = MAX_INT64;
-    }
 }
 
 void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly)
@@ -1574,29 +1510,24 @@ void Analysis::checkInter_rd5_6(Mode& in
             }
         }
     }
-    if (predInterSearch(interMode, cuGeom, bMergeOnly, true))
+
+    predInterSearch(interMode, cuGeom, bMergeOnly, true);
+
+    /* predInterSearch sets interMode.sa8dBits, but this is ignored */
+    encodeResAndCalcRdInterCU(interMode, cuGeom);
+
+    if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
     {
-        /* predInterSearch sets interMode.sa8dBits, but this is ignored */
-        encodeResAndCalcRdInterCU(interMode, cuGeom);
-
-        if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU)
+        for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
         {
-            for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++)
+            MotionData* bestME = interMode.bestME[puIdx];
+            for (int32_t i = 0; i < numPredDir; i++)
             {
-                MotionData* bestME = interMode.bestME[puIdx];
-                for (int32_t i = 0; i < numPredDir; i++)
-                {
-                    *m_reuseRef = bestME[i].ref;
-                    m_reuseRef++;
-                }
+                *m_reuseRef = bestME[i].ref;
+                m_reuseRef++;
             }
         }
     }
-    else
-    {
-        interMode.distortion = MAX_UINT;
-        interMode.rdCost = MAX_INT64;
-    }
 }
 
 void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom)
@@ -1690,8 +1621,8 @@ void Analysis::checkBidir2Nx2N(Mode& int
         }
         else
         {
-            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
-            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx);
+            pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx);
+            pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx);
             intptr_t refStride = m_slice->m_mref[0][0].lumaStride;
 
             primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, refStride, fref1, refStride, 32);
@@ -1742,7 +1673,7 @@ void Analysis::checkBidir2Nx2N(Mode& int
 
 void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom)
 {
-    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth)
+    if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < g_maxCUDepth)
     {
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
@@ -1753,7 +1684,7 @@ void Analysis::encodeResidue(const CUDat
         return;
     }
 
-    uint32_t absPartIdx = cuGeom.encodeIdx;
+    uint32_t absPartIdx = cuGeom.absPartIdx;
     int sizeIdx = cuGeom.log2CUSize - 2;
 
     /* reuse the bestMode data structures at the current depth */
@@ -1912,11 +1843,11 @@ uint32_t Analysis::topSkipMinDepth(const
         numRefs++;
         const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
         previousQP = cu.m_qp[0];
-        if (!cu.m_cuDepth[cuGeom.encodeIdx])
+        if (!cu.m_cuDepth[cuGeom.absPartIdx])
             return 0;
         for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4)
         {
-            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
+            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
             minDepth0 = X265_MIN(d, minDepth0);
             sum += d;
         }
@@ -1925,11 +1856,11 @@ uint32_t Analysis::topSkipMinDepth(const
     {
         numRefs++;
         const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr);
-        if (!cu.m_cuDepth[cuGeom.encodeIdx])
+        if (!cu.m_cuDepth[cuGeom.absPartIdx])
             return 0;
         for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4)
         {
-            uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i];
+            uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i];
             minDepth1 = X265_MIN(d, minDepth1);
             sum += d;
         }
--- a/source/encoder/analysis.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/analysis.h	Tue Feb 24 15:34:32 2015 +0530
@@ -70,6 +70,25 @@ public:
         CUDataMemPool  cuMemPool;
     };
 
+    class PMODE : public BondedTaskGroup
+    {
+    public:
+
+        Analysis&     master;
+        const CUGeom& cuGeom;
+        int           modes[MAX_PRED_TYPES];
+
+        PMODE(Analysis& m, const CUGeom& g) : master(m), cuGeom(g) {}
+
+        void processTasks(int workerThreadId);
+
+    protected:
+
+        PMODE operator=(const PMODE&);
+    };
+
+    void processPmode(PMODE& pmode, Analysis& slave);
+
     ModeDepth m_modeDepth[NUM_CU_DEPTH];
     bool      m_bTryLossless;
     bool      m_bChromaSa8d;
@@ -83,16 +102,6 @@ public:
 
 protected:
 
-    /* mode analysis distribution */
-    int           m_totalNumJobs;
-    volatile int  m_numAcquiredJobs;
-    volatile int  m_numCompletedJobs;
-    Lock          m_pmodeLock;
-    Event         m_modeCompletionEvent;
-    bool findJob(int threadId);
-    void parallelModeAnalysis(int threadId, int jobId);
-    void parallelME(int threadId, int meId);
-
     /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
     analysis_intra_data* m_reuseIntraDataCTU;
     analysis_inter_data* m_reuseInterDataCTU;
--- a/source/encoder/encoder.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/encoder.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -43,7 +43,7 @@ namespace x265 {
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
 }
 
-static const char *summaryCSVHeader =
+static const char* summaryCSVHeader =
     "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
     "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
     "I count, I ave-QP, I kpbs, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
@@ -51,7 +51,7 @@ static const char *summaryCSVHeader =
     "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
     "Version\n";
 
-const char* defaultAnalysisFileName = "x265_analysis.dat";
+static const char* defaultAnalysisFileName = "x265_analysis.dat";
 
 using namespace x265;
 
@@ -78,7 +78,6 @@ Encoder::Encoder()
     m_buOffsetY = NULL;
     m_buOffsetC = NULL;
     m_threadPool = NULL;
-    m_numThreadLocalData = 0;
     m_analysisFile = NULL;
     for (int i = 0; i < X265_MAX_FRAME_THREADS; i++)
         m_frameEncoder[i] = NULL;
@@ -102,38 +101,16 @@ void Encoder::create()
     if (rows == 1 || cols < 3)
         p->bEnableWavefront = 0;
 
-    int poolThreadCount = p->poolNumThreads ? p->poolNumThreads : getCpuCount();
+    bool allowPools = !p->numaPools || strcmp(p->numaPools, "none");
 
     // Trim the thread pool if --wpp, --pme, and --pmode are disabled
     if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation)
-        poolThreadCount = 0;
-
-    if (poolThreadCount > 1)
-    {
-        m_threadPool = ThreadPool::allocThreadPool(poolThreadCount);
-        poolThreadCount = m_threadPool->getThreadCount();
-    }
-    else
-        poolThreadCount = 0;
-
-    if (!poolThreadCount)
-    {
-        // issue warnings if any of these features were requested
-        if (p->bEnableWavefront)
-            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --wpp disabled\n");
-        if (p->bDistributeMotionEstimation)
-            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pme disabled\n");
-        if (p->bDistributeModeAnalysis)
-            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pmode disabled\n");
-
-        // disable all pool features if the thread pool is disabled or unusable.
-        p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = 0;
-    }
+        allowPools = false;
 
     if (!p->frameNumThreads)
     {
         // auto-detect frame threads
-        int cpuCount = getCpuCount();
+        int cpuCount = ThreadPool::getCpuCount();
         if (!p->bEnableWavefront)
             p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2);
         else if (cpuCount >= 32)
@@ -148,14 +125,51 @@ void Encoder::create()
             p->frameNumThreads = 1;
     }
 
-    x265_log(p, X265_LOG_INFO, "WPP streams / frame threads / pool  : %d / %d / %d%s%s\n", 
-             p->bEnableWavefront ? rows : 0, p->frameNumThreads, poolThreadCount,
-             p->bDistributeMotionEstimation ? " / pme" : "", p->bDistributeModeAnalysis ? " / pmode" : "");
+    m_numPools = 0;
+    if (allowPools)
+        m_threadPool = ThreadPool::allocThreadPools(p, m_numPools);
+
+    if (!m_numPools)
+    {
+        // issue warnings if any of these features were requested
+        if (p->bEnableWavefront)
+            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --wpp disabled\n");
+        if (p->bDistributeMotionEstimation)
+            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pme disabled\n");
+        if (p->bDistributeModeAnalysis)
+            x265_log(p, X265_LOG_WARNING, "No thread pool allocated, --pmode disabled\n");
+
+        // disable all pool features if the thread pool is disabled or unusable.
+        p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = 0;
+    }
+
+    char buf[128];
+    int len = 0;
+    if (p->bEnableWavefront)
+        len += sprintf(buf + len, "wpp(%d rows)", rows);
+    if (p->bDistributeModeAnalysis)
+        len += sprintf(buf + len, "%spmode", len ? "+" : "");
+    if (p->bDistributeMotionEstimation)
+        len += sprintf(buf + len, "%spme ", len ? "+" : "");
+    if (!len)
+        strcpy(buf, "none");
+
+    x265_log(p, X265_LOG_INFO, "frame threads / pool features       : %d / %s\n", p->frameNumThreads, buf);
 
     for (int i = 0; i < m_param->frameNumThreads; i++)
+        m_frameEncoder[i] = new FrameEncoder;
+
+    if (m_numPools)
     {
-        m_frameEncoder[i] = new FrameEncoder;
-        m_frameEncoder[i]->setThreadPool(m_threadPool);
+        for (int i = 0; i < m_param->frameNumThreads; i++)
+        {
+            int pool = i % m_numPools;
+            m_frameEncoder[i]->m_pool = &m_threadPool[pool];
+            m_frameEncoder[i]->m_jpId = m_threadPool[pool].m_numProviders++;
+            m_threadPool[pool].m_jpTable[m_frameEncoder[i]->m_jpId] = m_frameEncoder[i];
+        }
+        for (int i = 0; i < m_numPools; i++)
+            m_threadPool[i].start();
     }
 
     if (!m_scalingList.init())
@@ -171,24 +185,13 @@ void Encoder::create()
         m_aborted = true;
     m_scalingList.setupQuantMatrices();
 
-    /* Allocate thread local data, one for each thread pool worker and
-     * if --no-wpp, one for each frame encoder */
-    m_numThreadLocalData = poolThreadCount;
-    if (!m_param->bEnableWavefront)
-        m_numThreadLocalData += m_param->frameNumThreads;
-    m_threadLocalData = new ThreadLocalData[m_numThreadLocalData];
-    for (int i = 0; i < m_numThreadLocalData; i++)
+    m_lookahead = new Lookahead(m_param, m_threadPool);
+    if (m_numPools)
     {
-        m_threadLocalData[i].analysis.setThreadPool(m_threadPool);
-        m_threadLocalData[i].analysis.initSearch(*m_param, m_scalingList);
-        m_threadLocalData[i].analysis.create(m_threadLocalData);
+        m_lookahead->m_jpId = m_threadPool[0].m_numProviders++;
+        m_threadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead;
     }
 
-    if (!m_param->bEnableWavefront)
-        for (int i = 0; i < m_param->frameNumThreads; i++)
-            m_frameEncoder[i]->m_tld = &m_threadLocalData[poolThreadCount + i];
-
-    m_lookahead = new Lookahead(m_param, m_threadPool);
     m_dpb = new DPB(m_param);
     m_rateControl = new RateControl(m_param);
 
@@ -236,19 +239,19 @@ void Encoder::create()
     int numCols = (m_param->sourceWidth  + g_maxCUSize - 1) / g_maxCUSize;
     for (int i = 0; i < m_param->frameNumThreads; i++)
     {
-        if (!m_frameEncoder[i]->init(this, numRows, numCols, i))
+        if (!m_frameEncoder[i]->init(this, numRows, numCols))
         {
             x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n");
             m_aborted = true;
         }
+        m_frameEncoder[i]->start();
     }
-
     if (m_param->bEmitHRDSEI)
         m_rateControl->initHRD(&m_sps);
     if (!m_rateControl->init(&m_sps))
         m_aborted = true;
-
-    m_lookahead->init();
+    if (!m_lookahead->create())
+        m_aborted = true;
 
     if (m_param->analysisMode)
     {
@@ -282,24 +285,28 @@ void Encoder::destroy()
     if (m_rateControl)
         m_rateControl->terminate(); // unblock all blocked RC calls
 
+    if (m_lookahead)
+        m_lookahead->stop();
+
     for (int i = 0; i < m_param->frameNumThreads; i++)
-    {
+        if (m_frameEncoder[i]) m_frameEncoder[i]->getEncodedPicture(m_nalList);
+
+    for (int i = 0; i < m_param->frameNumThreads; i++)
         if (m_frameEncoder[i])
         {
-            // Ensure frame encoder is idle before destroying it
-            m_frameEncoder[i]->getEncodedPicture(m_nalList);
             m_frameEncoder[i]->destroy();
             delete m_frameEncoder[i];
         }
-    }
 
-    for (int i = 0; i < m_numThreadLocalData; i++)
-        m_threadLocalData[i].destroy();
-
-    delete [] m_threadLocalData;
+    // thread pools can be cleaned up now that all the JobProviders are
+    // known to be shut down
+    delete [] m_threadPool;
 
     if (m_lookahead)
-        m_lookahead->stop();
+    {
+        m_lookahead->destroy();
+        delete m_lookahead;
+    }
 
     delete m_dpb;
     if (m_rateControl)
@@ -308,16 +315,6 @@ void Encoder::destroy()
         delete m_rateControl;
     }
 
-    // thread pool release should always happen last
-    if (m_threadPool)
-        m_threadPool->release();
-
-    if (m_lookahead)
-    {
-        m_lookahead->destroy();
-        delete m_lookahead;
-    }
-
     X265_FREE(m_cuOffsetY);
     X265_FREE(m_cuOffsetC);
     X265_FREE(m_buOffsetY);
@@ -445,9 +442,9 @@ int Encoder::encode(const x265_picture* 
             inFrame = m_dpb->m_freeList.popBack();
 
         /* Copy input picture into a Frame and PicYuv, send to lookahead */
-        inFrame->m_poc = ++m_pocLast;
         inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset);
 
+        inFrame->m_poc       = ++m_pocLast;
         inFrame->m_userData  = pic_in->userData;
         inFrame->m_pts       = pic_in->pts;
         inFrame->m_forceqp   = pic_in->forceqp;
@@ -459,21 +456,14 @@ int Encoder::encode(const x265_picture* 
 
         /* Encoder holds a reference count until stats collection is finished */
         ATOMIC_INC(&inFrame->m_countRefEncoders);
-        bool bEnableWP = m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred;
-        if (m_param->rc.aqMode || bEnableWP)
+
+        if ((m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred) &&
+            (m_param->rc.cuTree && m_param->rc.bStatRead))
         {
-            if (m_param->rc.cuTree && m_param->rc.bStatRead)
+            if (!m_rateControl->cuTreeReadFor2Pass(inFrame))
             {
-                if (!m_rateControl->cuTreeReadFor2Pass(inFrame))
-                {
-                    m_aborted = 1;
-                    return -1;
-                }
-            }
-            else
-            {
-                ProfileScopeEvent(prelookahead);
-                m_rateControl->calcAdaptiveQuantFrame(inFrame);
+                m_aborted = 1;
+                return -1;
             }
         }
 
@@ -496,7 +486,7 @@ int Encoder::encode(const x265_picture* 
             sliceType = inputPic->analysisData.sliceType;
         }
 
-        m_lookahead->addPicture(inFrame, sliceType);
+        m_lookahead->addPicture(*inFrame, sliceType);
         m_numDelayedPic++;
     }
     else
@@ -822,17 +812,19 @@ void Encoder::printSummary()
     CUStats cuStats;
     for (int i = 0; i < m_param->frameNumThreads; i++)
         cuStats.accumulate(m_frameEncoder[i]->m_cuStats);
-    
+
     if (!cuStats.totalCTUTime)
         return;
 
-#define ELAPSED_SEC(val)  ((double)(val) / 1000000)
-#define ELAPSED_MSEC(val) ((double)(val) / 1000)
+    int totalWorkerCount = 0;
+    for (int i = 0; i < m_numPools; i++)
+        totalWorkerCount += m_threadPool[i].m_numWorkers;
 
-    int64_t lookaheadWorkerTime = m_lookahead->m_slicetypeDecideElapsedTime;
-    if (m_lookahead->usingWorkerThreads())
-        /* if the lookahead is not using worker threads, processRow() time is already included in slicetypeDecide time */
-        lookaheadWorkerTime += m_lookahead->m_est.m_processRowElapsedTime;
+    int64_t  batchElapsedTime, coopSliceElapsedTime;
+    uint64_t batchCount, coopSliceCount;
+    m_lookahead->getWorkerStats(batchElapsedTime, batchCount, coopSliceElapsedTime, coopSliceCount);
+    int64_t lookaheadWorkerTime = m_lookahead->m_slicetypeDecideElapsedTime + m_lookahead->m_preLookaheadElapsedTime +
+                                  batchElapsedTime + coopSliceElapsedTime;
 
     int64_t totalWorkerTime = cuStats.totalCTUTime + cuStats.loopFilterElapsedTime + cuStats.pmodeTime + cuStats.pmeTime + lookaheadWorkerTime;
     int64_t elapsedEncodeTime = x265_mdate() - m_encodeStartTime;
@@ -851,6 +843,9 @@ void Encoder::printSummary()
     int64_t unaccounted = (cuStats.totalCTUTime + cuStats.pmodeTime) -
                           (cuStats.intraAnalysisElapsedTime + cuStats.motionEstimationElapsedTime + interRDOTotalTime + intraRDOTotalTime);
 
+#define ELAPSED_SEC(val)  ((double)(val) / 1000000)
+#define ELAPSED_MSEC(val) ((double)(val) / 1000)
+
     if (m_param->bDistributeMotionEstimation && cuStats.countPMEMasters)
     {
         x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in motion estimation, averaging %.3lf CU inter modes per CTU\n",
@@ -891,10 +886,10 @@ void Encoder::printSummary()
                  ELAPSED_MSEC(cuStats.pmodeTime) / cuStats.countPModeTasks);
     }
 
-    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in slicetypeDecide (avg %.3lfms) and lookahead row cost (avg %.3lfns)\n",
+    x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in slicetypeDecide (avg %.3lfms) and prelookahead (avg %.3lfms)\n",
              100.0 * lookaheadWorkerTime / totalWorkerTime,
              ELAPSED_MSEC(m_lookahead->m_slicetypeDecideElapsedTime) / m_lookahead->m_countSlicetypeDecide,
-             (double)m_lookahead->m_est.m_processRowElapsedTime / m_lookahead->m_est.m_countProcessRow);
+             ELAPSED_MSEC(m_lookahead->m_preLookaheadElapsedTime) / m_lookahead->m_countPreLookahead);
 
     x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in other tasks\n",
              100.0 * unaccounted / totalWorkerTime);
@@ -927,9 +922,9 @@ void Encoder::printSummary()
              cuStats.totalCTUs / ELAPSED_SEC(totalWorkerTime));
 
     if (m_threadPool)
-        x265_log(m_param, X265_LOG_INFO, "CU: %.3lf average worker occupancy, %%%05.2lf of theoretical maximum occupancy\n",
+        x265_log(m_param, X265_LOG_INFO, "CU: %.3lf average worker utilization, %%%05.2lf of theoretical maximum utilization\n",
                  (double)totalWorkerTime / elapsedEncodeTime,
-                 100.0 * totalWorkerTime / (elapsedEncodeTime * m_threadPool->getThreadCount()));
+                 100.0 * totalWorkerTime / (elapsedEncodeTime * totalWorkerCount));
 
 #undef ELAPSED_SEC
 #undef ELAPSED_MSEC
--- a/source/encoder/encoder.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/encoder.h	Tue Feb 24 15:34:32 2015 +0530
@@ -70,7 +70,6 @@ class DPB;
 class Lookahead;
 class RateControl;
 class ThreadPool;
-struct ThreadLocalData;
 
 class Encoder : public x265_encoder
 {
@@ -91,6 +90,7 @@ public:
 
     Frame*             m_exportedPic;
 
+    int                m_numPools;
     int                m_curEncoder;
 
     /* cached PicYuv offset arrays, shared by all instances of
@@ -120,14 +120,12 @@ public:
     PPS                m_pps;
     NALList            m_nalList;
     ScalingList        m_scalingList;      // quantization matrix information
-    int                m_numThreadLocalData;
 
     int                m_lastBPSEI;
     uint32_t           m_numDelayedPic;
 
     x265_param*        m_param;
     RateControl*       m_rateControl;
-    ThreadLocalData*   m_threadLocalData;
     Lookahead*         m_lookahead;
     Window             m_conformanceWindow;
 
--- a/source/encoder/frameencoder.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/frameencoder.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -39,13 +39,11 @@ namespace x265 {
 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param);
 
 FrameEncoder::FrameEncoder()
-    : WaveFront(NULL)
-    , m_threadActive(true)
 {
     m_prevOutputTime = x265_mdate();
-    m_totalWorkerElapsedTime = 0;
+    m_isFrameEncoder = true;
+    m_threadActive = true;
     m_slicetypeWaitTime = 0;
-    m_frameEncoderID = 0;
     m_activeWorkerCount = 0;
     m_bAllRowsStop = false;
     m_vbvResetTriggerRow = -1;
@@ -66,7 +64,22 @@ FrameEncoder::FrameEncoder()
 void FrameEncoder::destroy()
 {
     if (m_pool)
-        JobProvider::flush();  // ensure no worker threads are using this frame
+    {
+        if (!m_jpId)
+        {
+            int numTLD = m_pool->m_numWorkers;
+            if (!m_param->bEnableWavefront)
+                numTLD += m_pool->m_numProviders;
+            for (int i = 0; i < numTLD; i++)
+                m_tld[i].destroy();
+            delete [] m_tld;
+        }
+    }
+    else
+    {
+        m_tld->destroy();
+        delete m_tld;
+    }
 
     m_threadActive = false;
     m_enable.trigger();
@@ -90,7 +103,7 @@ void FrameEncoder::destroy()
     stop();
 }
 
-bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id)
+bool FrameEncoder::init(Encoder *top, int numRows, int numCols)
 {
     m_top = top;
     m_param = top->m_param;
@@ -99,7 +112,6 @@ bool FrameEncoder::init(Encoder *top, in
     m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ?
                         2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 1 : 0);
     m_filterRowDelayCus = m_filterRowDelay * numCols;
-    m_frameEncoderID = id;
     m_rows = new CTURow[m_numRows];
     bool ok = !!m_numRows;
 
@@ -134,7 +146,6 @@ bool FrameEncoder::init(Encoder *top, in
     else
         m_param->noiseReductionIntra = m_param->noiseReductionInter = 0;
 
-    start();
     return ok;
 }
 
@@ -205,7 +216,9 @@ bool FrameEncoder::startCompressFrame(Fr
 {
     m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime;
     m_frame = curFrame;
-    curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it
+    m_sliceType = curFrame->m_lowres.sliceType;
+    curFrame->m_encData->m_frameEncoderID = m_jpId;
+    curFrame->m_encData->m_jobProvider = this;
     curFrame->m_encData->m_slice->m_mref = m_mref;
 
     if (!m_cuGeoms)
@@ -220,19 +233,61 @@ bool FrameEncoder::startCompressFrame(Fr
 
 void FrameEncoder::threadMain()
 {
-    THREAD_NAME("Frame", m_frameEncoderID);
+    THREAD_NAME("Frame", m_jpId);
 
-    // worker thread routine for FrameEncoder
-    do
+    if (m_pool)
     {
-        m_enable.wait(); // Encoder::encode() triggers this event
-        if (m_threadActive)
+        m_pool->setCurrentThreadAffinity();
+
+        /* the first FE on each NUMA node is responsible for allocating thread
+         * local data for all worker threads in that pool. If WPP is disabled, then
+         * each FE also needs a TLD instance */
+        if (!m_jpId)
         {
-            compressFrame();
-            m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event
+            int numTLD = m_pool->m_numWorkers;
+            if (!m_param->bEnableWavefront)
+                numTLD += m_pool->m_numProviders;
+
+            m_tld = new ThreadLocalData[numTLD];
+            for (int i = 0; i < numTLD; i++)
+            {
+                m_tld[i].analysis.m_pool = m_pool;
+                m_tld[i].analysis.initSearch(*m_param, m_top->m_scalingList);
+                m_tld[i].analysis.create(m_tld);
+            }
+
+            for (int i = 0; i < m_pool->m_numProviders; i++)
+            {
+                if (m_pool->m_jpTable[i]->m_isFrameEncoder) /* ugh; over-allocation and other issues here */
+                {
+                    FrameEncoder *peer = dynamic_cast<FrameEncoder*>(m_pool->m_jpTable[i]);
+                    peer->m_tld = m_tld;
+                }
+            }
         }
+
+        if (m_param->bEnableWavefront)
+            m_localTldIdx = -1; // cause exception if used
+        else
+            m_localTldIdx = m_pool->m_numWorkers + m_jpId;
     }
-    while (m_threadActive);
+    else
+    {
+        m_tld = new ThreadLocalData;
+        m_tld->analysis.m_pool = NULL;
+        m_tld->analysis.initSearch(*m_param, m_top->m_scalingList);
+        m_tld->analysis.create(NULL);
+        m_localTldIdx = 0;
+    }
+
+    m_enable.wait();      /* Encoder::encode() triggers this event */
+
+    while (m_threadActive)
+    {
+        compressFrame();
+        m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
+        m_enable.wait();
+    }
 }
 
 void FrameEncoder::compressFrame()
@@ -488,15 +543,31 @@ void FrameEncoder::compressFrame()
     if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0)
         m_top->m_aborted = true;
 
-    /* Accumulate NR statistics from all worker threads */
+    /* Decrement referenced frame reference counts, allow them to be recycled */
+    for (int l = 0; l < numPredDir; l++)
+    {
+        for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
+        {
+            Frame *refpic = slice->m_refPicList[l][ref];
+            ATOMIC_DEC(&refpic->m_countRefEncoders);
+        }
+    }
+
+    int numTLD;
+    if (m_pool)
+        numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders;
+    else
+        numTLD = 1;
+
     if (m_nr)
     {
-        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
+        /* Accumulate NR statistics from all worker threads */
+        for (int i = 0; i < numTLD; i++)
         {
-            NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
+            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
             for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++)
             {
-                for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
+                for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++)
                     m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff];
             
                 m_nr->count[cat] += nr->count[cat];
@@ -506,30 +577,20 @@ void FrameEncoder::compressFrame()
         noiseReductionUpdate();
 
         /* Copy updated NR coefficients back to all worker threads */
-        for (int i = 0; i < m_top->m_numThreadLocalData; i++)
+        for (int i = 0; i < numTLD; i++)
         {
-            NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID];
+            NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId];
             memcpy(nr->offsetDenoise, m_nr->offsetDenoise, sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
             memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES);
             memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS);
         }
     }
 
-    // Decrement referenced frame reference counts, allow them to be recycled
-    for (int l = 0; l < numPredDir; l++)
-    {
-        for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
-        {
-            Frame *refpic = slice->m_refPicList[l][ref];
-            ATOMIC_DEC(&refpic->m_countRefEncoders);
-        }
-    }
-
 #if DETAILED_CU_STATS
     /* Accumulate CU statistics from each worker thread, we could report
      * per-frame stats here, but currently we do not. */
-    for (int i = 0; i < m_top->m_numThreadLocalData; i++)
-        m_cuStats.accumulate(m_top->m_threadLocalData[i].analysis.m_stats[m_frameEncoderID]);
+    for (int i = 0; i < numTLD; i++)
+        m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId]);
 #endif
 
     m_endFrameTime = x265_mdate();
@@ -621,10 +682,9 @@ void FrameEncoder::compressCTURows()
     int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
 
     m_rows[0].active = true;
-    if (m_pool && m_param->bEnableWavefront)
+    if (m_param->bEnableWavefront)
     {
         WaveFront::clearEnabledRowMask();
-        WaveFront::enqueue();
 
         for (uint32_t row = 0; row < m_numRows; row++)
         {
@@ -645,19 +705,17 @@ void FrameEncoder::compressCTURows()
             }
 
             enableRowEncoder(row);
-            if (row)
-                m_pool->pokeIdleThread();
-            else
+            if (!row)
             {
                 m_row0WaitTime = x265_mdate();
                 enqueueRowEncoder(0);
             }
+            tryWakeOne();
         }
 
         m_allRowsAvailableTime = x265_mdate();
+        tryWakeOne(); /* ensure one thread is active or help-wanted flag is set prior to blocking */
         m_completionEvent.wait();
-
-        WaveFront::dequeue();
     }
     else
     {
@@ -687,10 +745,9 @@ void FrameEncoder::compressCTURows()
                     m_row0WaitTime = x265_mdate();
                 else if (i == m_numRows - 1)
                     m_allRowsAvailableTime = x265_mdate();
-                processRowEncoder(i, *m_tld);
+                processRowEncoder(i, m_tld[m_localTldIdx]);
             }
 
-            // Filter
             if (i >= m_filterRowDelay)
                 m_frameFilter.processRow(i - m_filterRowDelay);
         }
@@ -706,10 +763,8 @@ void FrameEncoder::processRow(int row, i
     const uint32_t realRow = row >> 1;
     const uint32_t typeNum = row & 1;
 
-    ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld;
-    
     if (!typeNum)
-        processRowEncoder(realRow, tld);
+        processRowEncoder(realRow, m_tld[threadId]);
     else
     {
         m_frameFilter.processRow(realRow);
@@ -932,21 +987,21 @@ void FrameEncoder::processRowEncoder(int
             }
         }
 
-        // NOTE: do CU level Filter
+        /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
         if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
-            // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas
             m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
 
-        // NOTE: active next row
-        if (curRow.completed >= 2 && row < m_numRows - 1)
+        if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 &&
+            (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
         {
+            /* activate next row */
             ScopedLock below(m_rows[row + 1].lock);
             if (m_rows[row + 1].active == false &&
-                m_rows[row + 1].completed + 2 <= curRow.completed &&
-                (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow))
+                m_rows[row + 1].completed + 2 <= curRow.completed)
             {
                 m_rows[row + 1].active = true;
                 enqueueRowEncoder(row + 1);
+                tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */
             }
         }
 
@@ -961,11 +1016,7 @@ void FrameEncoder::processRowEncoder(int
         }
     }
 
-    /* *this row of CTUs has been encoded* */
-
-    /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
-    if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
-        rowCoder.finishSlice();
+    /** this row of CTUs has been compressed **/
 
     /* If encoding with ABR, update update bits and complexity in rate control
      * after a number of rows so the next frame's rateControlStart has more
@@ -994,6 +1045,10 @@ void FrameEncoder::processRowEncoder(int
         m_top->m_rateControl->rateControlUpdateStats(&m_rce);
     }
 
+    /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
+    if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1))
+        rowCoder.finishSlice();
+
     if (m_param->bEnableWavefront)
     {
         /* trigger row-wise loop filters */
@@ -1004,11 +1059,13 @@ void FrameEncoder::processRowEncoder(int
             /* NOTE: Activate filter if first row (row 0) */
             if (row == m_filterRowDelay)
                 enqueueRowFilter(0);
+            tryWakeOne();
         }
         if (row == m_numRows - 1)
         {
             for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++)
                 enableRowFilter(i);
+            tryWakeOne();
         }
     }
 
--- a/source/encoder/frameencoder.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/frameencoder.h	Tue Feb 24 15:34:32 2015 +0530
@@ -122,7 +122,7 @@ public:
 
     virtual ~FrameEncoder() {}
 
-    virtual bool init(Encoder *top, int numRows, int numCols, int id);
+    virtual bool init(Encoder *top, int numRows, int numCols);
 
     void destroy();
 
@@ -136,7 +136,7 @@ public:
     Event                    m_done;
     Event                    m_completionEvent;
     bool                     m_threadActive;
-    int                      m_frameEncoderID;
+    int                      m_localTldIdx;
 
     uint32_t                 m_numRows;
     uint32_t                 m_numCols;
--- a/source/encoder/framefilter.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/framefilter.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -426,7 +426,7 @@ static void restoreOrigLosslessYuv(const
 /* Original YUV restoration for CU in lossless coding */
 static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
 {
-    uint32_t absPartIdx = cuGeom.encodeIdx;
+    uint32_t absPartIdx = cuGeom.absPartIdx;
     if (cu->m_cuDepth[absPartIdx] > cuGeom.depth)
     {
         for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++)
--- a/source/encoder/ratecontrol.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/ratecontrol.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -145,30 +145,6 @@ inline void copyRceData(RateControlEntry
 }
 
 }  // end anonymous namespace
-/* Compute variance to derive AC energy of each block */
-static inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int i)
-{
-    uint32_t sum = (uint32_t)sum_ssd;
-    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
-
-    curFrame->m_lowres.wp_sum[i] += sum;
-    curFrame->m_lowres.wp_ssd[i] += ssd;
-    return ssd - ((uint64_t)sum * sum >> shift);
-}
-
-/* Find the energy of each block in Y/Cb/Cr plane */
-static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int bChroma, int colorFormat)
-{
-    if ((colorFormat != X265_CSP_I444) && bChroma)
-    {
-        ALIGN_VAR_8(pixel, pix[8 * 8]);
-        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, bChroma);
-    }
-    else
-        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, bChroma);
-}
-
 /* Returns the zone for the current frame */
 x265_zone* RateControl::getZone()
 {
@@ -181,134 +157,6 @@ x265_zone* RateControl::getZone()
     return NULL;
 }
 
-/* Find the total AC energy of each block in all planes */
-uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y)
-{
-    intptr_t stride = curFrame->m_fencPic->m_stride;
-    intptr_t cStride = curFrame->m_fencPic->m_strideC;
-    intptr_t blockOffsetLuma = block_x + (block_y * stride);
-    int colorFormat = m_param->internalCsp;
-    int hShift = CHROMA_H_SHIFT(colorFormat);
-    int vShift = CHROMA_V_SHIFT(colorFormat);
-    intptr_t blockOffsetChroma = (block_x >> hShift) + ((block_y >> vShift) * cStride);
-
-    uint32_t var;
-
-    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat);
-    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat);
-    x265_emms();
-    return var;
-}
-
-void RateControl::calcAdaptiveQuantFrame(Frame *curFrame)
-{
-    /* Actual adaptive quantization */
-    int maxCol = curFrame->m_fencPic->m_picWidth;
-    int maxRow = curFrame->m_fencPic->m_picHeight;
-
-    for (int y = 0; y < 3; y++)
-    {
-        curFrame->m_lowres.wp_ssd[y] = 0;
-        curFrame->m_lowres.wp_sum[y] = 0;
-    }
-
-    /* Calculate Qp offset for each 16x16 block in the frame */
-    int block_xy = 0;
-    int block_x = 0, block_y = 0;
-    double strength = 0.f;
-    if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
-    {
-        /* Need to init it anyways for CU tree */
-        int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-        int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-        int cuCount = cuWidth * cuHeight;
-
-        if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
-        {
-            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
-            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
-            for (int cuxy = 0; cuxy < cuCount; cuxy++)
-                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
-        }
-
-        /* Need variance data for weighted prediction */
-        if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred)
-        {
-            for (block_y = 0; block_y < maxRow; block_y += 16)
-                for (block_x = 0; block_x < maxCol; block_x += 16)
-                    acEnergyCu(curFrame, block_x, block_y);
-        }
-    }
-    else
-    {
-        block_xy = 0;
-        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
-        if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
-        {
-            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
-            for (block_y = 0; block_y < maxRow; block_y += 16)
-            {
-                for (block_x = 0; block_x < maxCol; block_x += 16)
-                {
-                    uint32_t energy = acEnergyCu(curFrame, block_x, block_y);
-                    qp_adj = pow(energy + 1, 0.1);
-                    curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
-                    avg_adj += qp_adj;
-                    avg_adj_pow2 += qp_adj * qp_adj;
-                    block_xy++;
-                }
-            }
-
-            avg_adj /= m_ncu;
-            avg_adj_pow2 /= m_ncu;
-            strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
-        }
-        else
-            strength = m_param->rc.aqStrength * 1.0397f;
-
-        block_xy = 0;
-        for (block_y = 0; block_y < maxRow; block_y += 16)
-        {
-            for (block_x = 0; block_x < maxCol; block_x += 16)
-            {
-                if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
-                {
-                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[block_xy];
-                    qp_adj = strength * (qp_adj - avg_adj);
-                }
-                else
-                {
-                    uint32_t energy = acEnergyCu(curFrame, block_x, block_y);
-                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
-                }
-                curFrame->m_lowres.qpAqOffset[block_xy] = qp_adj;
-                curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
-                curFrame->m_lowres.invQscaleFactor[block_xy] = x265_exp2fix8(qp_adj);
-                block_xy++;
-            }
-        }
-    }
-
-    if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred)
-    {
-        int hShift = CHROMA_H_SHIFT(m_param->internalCsp);
-        int vShift = CHROMA_V_SHIFT(m_param->internalCsp);
-        maxCol = ((maxCol + 8) >> 4) << 4;
-        maxRow = ((maxRow + 8) >> 4) << 4;
-        int width[3]  = { maxCol, maxCol >> hShift, maxCol >> hShift };
-        int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
-
-        for (int i = 0; i < 3; i++)
-        {
-            uint64_t sum, ssd;
-            sum = curFrame->m_lowres.wp_sum[i];
-            ssd = curFrame->m_lowres.wp_ssd[i];
-            curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
-        }
-    }
-}
 
 RateControl::RateControl(x265_param *p)
 {
@@ -1160,6 +1008,7 @@ int RateControl::rateControlStart(Frame*
             m_currentSatd = curFrame->m_lowres.satdCost >> (X265_DEPTH - 8);
             /* Update rce for use in rate control VBV later */
             rce->lastSatd = m_currentSatd;
+            X265_CHECK(rce->lastSatd, "satdcost cannot be zero\n");
         }
         double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce));
         q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q);
@@ -1351,6 +1200,8 @@ bool RateControl::cuTreeReadFor2Pass(Fra
 
     if (m_rce2Pass[frame->m_poc].keptAsRef)
     {
+        /* TODO: We don't need pre-lookahead to measure AQ offsets, but there is currently
+         * no way to signal this */
         uint8_t type;
         if (m_cuTreeStats.qpBufPos < 0)
         {
@@ -1379,8 +1230,6 @@ bool RateControl::cuTreeReadFor2Pass(Fra
         }
         m_cuTreeStats.qpBufPos--;
     }
-    else
-        calcAdaptiveQuantFrame(frame);
     return true;
 
 fail:
--- a/source/encoder/ratecontrol.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/ratecontrol.h	Tue Feb 24 15:34:32 2015 +0530
@@ -226,7 +226,6 @@ public:
 
     // to be called for each curFrame to process RateControl and set QP
     int rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
-    void calcAdaptiveQuantFrame(Frame *curFrame);
     void rateControlUpdateStats(RateControlEntry* rce);
     int rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats);
     int rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
@@ -253,7 +252,6 @@ protected:
     double getQScale(RateControlEntry *rce, double rateFactor);
     double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
     void accumPQpUpdate();
-    uint32_t acEnergyCu(Frame* pic, uint32_t block_x, uint32_t block_y);
 
     void updateVbv(int64_t bits, RateControlEntry* rce);
     void updatePredictor(Predictor *p, double q, double var, double bits);
--- a/source/encoder/search.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/search.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -30,6 +30,9 @@
 #include "entropy.h"
 #include "rdcost.h"
 
+#include "analysis.h"  // TLD
+#include "framedata.h"
+
 using namespace x265;
 
 #if _MSC_VER
@@ -42,7 +45,7 @@ using namespace x265;
 
 ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 };
 
-Search::Search() : JobProvider(NULL)
+Search::Search()
 {
     memset(m_rqt, 0, sizeof(m_rqt));
 
@@ -53,11 +56,16 @@ Search::Search() : JobProvider(NULL)
     }
 
     m_numLayers = 0;
+    m_intraPred = NULL;
+    m_intraPredAngs = NULL;
+    m_fencScaled = NULL;
+    m_fencTransposed = NULL;
+    m_tsCoeff = NULL;
+    m_tsResidual = NULL;
+    m_tsRecon = NULL;
     m_param = NULL;
     m_slice = NULL;
     m_frame = NULL;
-    m_bJobsQueued = false;
-    m_totalNumME = m_numAcquiredME = m_numCompletedME = 0;
 }
 
 bool Search::initSearch(const x265_param& param, ScalingList& scalingList)
@@ -115,6 +123,15 @@ bool Search::initSearch(const x265_param
     m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
     m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
 
+    CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
+    m_fencScaled = m_intraPred + 32 * 32;
+    m_fencTransposed = m_fencScaled + 32 * 32;
+    m_intraPredAngs = m_fencTransposed + 32 * 32;
+
+    CHECKED_MALLOC(m_tsCoeff,    coeff_t, MAX_TS_SIZE * MAX_TS_SIZE);
+    CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE);
+    CHECKED_MALLOC(m_tsRecon,    pixel,   MAX_TS_SIZE * MAX_TS_SIZE);
+
     return ok;
 
 fail:
@@ -140,6 +157,10 @@ Search::~Search()
 
     X265_FREE(m_qtTempCbf[0]);
     X265_FREE(m_qtTempTransformSkipFlag[0]);
+    X265_FREE(m_intraPred);
+    X265_FREE(m_tsCoeff);
+    X265_FREE(m_tsResidual);
+    X265_FREE(m_tsRecon);
 }
 
 void Search::setQP(const Slice& slice, int qp)
@@ -420,7 +441,7 @@ void Search::codeIntraLumaQT(Mode& mode,
     }
 
     // set reconstruction for next intra prediction blocks if full TU prediction won
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
     intptr_t picStride = m_frame->m_reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
@@ -476,17 +497,14 @@ void Search::codeIntraLumaTSkip(Mode& mo
     if (m_bEnableRDOQ)
         m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
-    ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
-    ALIGN_VAR_32(pixel,   tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]);
-
     int checkTransformSkip = 1;
     for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
     {
         uint64_t tmpCost;
         uint32_t tmpEnergy = 0;
 
-        coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY);
-        pixel*   tmpRecon = (useTSkip ? tsReconY : reconQt);
+        coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY);
+        pixel*   tmpRecon = (useTSkip ? m_tsRecon : reconQt);
         uint32_t tmpReconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
@@ -577,8 +595,8 @@ void Search::codeIntraLumaTSkip(Mode& mo
 
     if (bTSkip)
     {
-        memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
-        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, tsReconY, tuSize);
+        memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2));
+        primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize);
     }
     else if (checkTransformSkip)
     {
@@ -588,7 +606,7 @@ void Search::codeIntraLumaTSkip(Mode& mo
     }
 
     // set reconstruction for next intra prediction blocks
-    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+    pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
     intptr_t picStride = m_frame->m_reconPic->m_stride;
     primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
 
@@ -638,7 +656,7 @@ void Search::residualTransformQuantIntra
         uint32_t sizeIdx   = log2TrSize - 2;
         primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride);
 
-        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+        pixel*   picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
         intptr_t picStride = m_frame->m_reconPic->m_stride;
 
         uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
@@ -798,7 +816,7 @@ uint32_t Search::codeIntraChromaQt(Mode&
             coeff_t* coeffC        = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC;
             pixel*   reconQt       = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize;
-            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
             intptr_t picStride = m_frame->m_reconPic->m_strideC;
 
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
@@ -863,9 +881,6 @@ uint32_t Search::codeIntraChromaTSkip(Mo
      * condition as it arrived, and to do all bit estimates from the same state. */
     m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
 
-    ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
-    ALIGN_VAR_32(pixel,   tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
-
     uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2;
     const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT;
 
@@ -913,8 +928,8 @@ uint32_t Search::codeIntraChromaTSkip(Mo
             int checkTransformSkip = 1;
             for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++)
             {
-                coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC);
-                pixel*   recon = (useTSkip ? tskipReconC : reconQt);
+                coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC);
+                pixel*   recon = (useTSkip ? m_tsRecon : reconQt);
                 uint32_t reconStride = (useTSkip ? MAX_TS_SIZE : reconQtStride);
 
                 primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
@@ -971,14 +986,14 @@ uint32_t Search::codeIntraChromaTSkip(Mo
 
             if (bTSkip)
             {
-                memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
-                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE);
+                memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2));
+                primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE);
             }
 
             cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep);
             cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
-            pixel*   reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            pixel*   reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
             intptr_t picStride = m_frame->m_reconPic->m_strideC;
             primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride);
 
@@ -1088,7 +1103,7 @@ void Search::residualQTIntraChroma(Mode&
             int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC);
             uint32_t coeffOffsetC  = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift));
             coeff_t* coeffC        = cu.m_trCoeff[ttype] + coeffOffsetC;
-            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC);
+            pixel*   picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC);
             intptr_t picStride = m_frame->m_reconPic->m_strideC;
 
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
@@ -1203,9 +1218,6 @@ void Search::checkIntraInInter(Mode& int
     uint64_t cost, bcost;
 
     // 33 Angle modes once
-    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
-    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);
-    ALIGN_VAR_32(pixel, preds[33 * 32 * 32]);
     int scaleTuSize = tuSize;
     int scaleStride = stride;
     int costShift = 0;
@@ -1213,15 +1225,15 @@ void Search::checkIntraInInter(Mode& int
 
     if (tuSize > 32)
     {
-        // origin is 64x64, we scale to 32x32 and setup required parameters
-        primitives.scale2D_64to32(bufScale, fenc, stride);
-        fenc = bufScale;
+        // CU is 64x64, we scale to 32x32 and adjust required parameters
+        primitives.scale2D_64to32(m_fencScaled, fenc, stride);
+        fenc = m_fencScaled;
 
         pixel nScale[129];
         intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
         primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1, 0);
 
-        //TO DO: primitive
+        // TODO: primitive
         for (int x = 1; x < 65; x++)
         {
             intraNeighbourBuf[0][x] = nScale[x];           // Top pixel
@@ -1250,8 +1262,8 @@ void Search::checkIntraInInter(Mode& int
     uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
 
     // DC
-    primitives.cu[sizeIdx].intra_pred[DC_IDX](preds, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
-    bsad = sa8d(fenc, scaleStride, preds, scaleStride) << costShift;
+    primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
+    bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
     bmode = mode = DC_IDX;
     bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
     bcost = m_rdCost.calcRdSADCost(bsad, bbits);
@@ -1261,8 +1273,8 @@ void Search::checkIntraInInter(Mode& int
     if (tuSize & (8 | 16 | 32))
         planar = intraNeighbourBuf[1];
 
-    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](preds, scaleStride, planar, 0, 0);
-    sad = sa8d(fenc, scaleStride, preds, scaleStride) << costShift;
+    primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0);
+    sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift;
     mode = PLANAR_IDX;
     bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
     cost = m_rdCost.calcRdSADCost(sad, bits);
@@ -1271,8 +1283,8 @@ void Search::checkIntraInInter(Mode& int
     bool allangs = true;
     if (primitives.cu[sizeIdx].intra_pred_allangs)
     {
-        primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
-        primitives.cu[sizeIdx].intra_pred_allangs(preds, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
+        primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
+        primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
     }
     else
         allangs = false;
@@ -1280,15 +1292,15 @@ void Search::checkIntraInInter(Mode& int
 #define TRY_ANGLE(angle) \
     if (allangs) { \
         if (angle < 18) \
-            sad = sa8d(bufTrans, scaleTuSize, &preds[(angle - 2) * predsize], scaleTuSize) << costShift; \
+            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
         else \
-            sad = sa8d(fenc, scaleStride, &preds[(angle - 2) * predsize], scaleTuSize) << costShift; \
+            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \
         bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
         cost = m_rdCost.calcRdSADCost(sad, bits); \
     } else { \
         int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \
-        primitives.cu[sizeIdx].intra_pred[angle](preds, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
-        sad = sa8d(fenc, scaleStride, preds, scaleTuSize) << costShift; \
+        primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \
+        sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \
         bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \
         cost = m_rdCost.calcRdSADCost(sad, bits); \
     }
@@ -1422,8 +1434,6 @@ uint32_t Search::estIntraPredQT(Mode &in
             bmode = sharedModes[puIdx];
         else
         {
-            ALIGN_VAR_32(pixel, pred[32 * 32]);
-
             uint64_t candCostList[MAX_RD_INTRA_MODES];
             uint32_t rdModeList[MAX_RD_INTRA_MODES];
             uint64_t bcost;
@@ -1448,9 +1458,8 @@ uint32_t Search::estIntraPredQT(Mode &in
                 if (tuSize > 32)
                 {
                     // origin is 64x64, we scale to 32x32 and setup required parameters
-                    ALIGN_VAR_32(pixel, bufScale[32 * 32]);
-                    primitives.scale2D_64to32(bufScale, fenc, stride);
-                    fenc = bufScale;
+                    primitives.scale2D_64to32(m_fencScaled, fenc, stride);
+                    fenc = m_fencScaled;
 
                     pixel nScale[129];
                     intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
@@ -1485,9 +1494,9 @@ uint32_t Search::estIntraPredQT(Mode &in
                 uint64_t modeCosts[35];
 
                 // DC
-                primitives.cu[sizeIdx].intra_pred[DC_IDX](pred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
+                primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
                 uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
-                uint32_t sad = sa8d(fenc, scaleStride, pred, scaleStride) << costShift;
+                uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                 modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
 
                 // PLANAR
@@ -1495,27 +1504,24 @@ uint32_t Search::estIntraPredQT(Mode &in
                 if (tuSize >= 8 && tuSize <= 32)
                     planar = intraNeighbourBuf[1];
 
-                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](pred, scaleStride, planar, 0, 0);
+                primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
                 bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
-                sad = sa8d(fenc, scaleStride, pred, scaleStride) << costShift;
+                sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
                 modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
                 COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
 
                 // angular predictions
                 if (primitives.cu[sizeIdx].intra_pred_allangs)
                 {
-                    ALIGN_VAR_32(pixel, bufTrans[32 * 32]);      // TODO: Use aligned mallocs
-                    ALIGN_VAR_32(pixel, allPreds[33 * 32 * 32]);
-
-                    primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
-                    primitives.cu[sizeIdx].intra_pred_allangs(allPreds, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
+                    primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
+                    primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                     for (int mode = 2; mode < 35; mode++)
                     {
                         bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                         if (mode < 18)
-                            sad = sa8d(bufTrans, scaleTuSize, &allPreds[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+                            sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                         else
-                            sad = sa8d(fenc, scaleStride, &allPreds[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+                            sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
                         modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                         COPY1_IF_LT(bcost, modeCosts[mode]);
                     }
@@ -1526,8 +1532,8 @@ uint32_t Search::estIntraPredQT(Mode &in
                     {
                         bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
                         int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
-                        primitives.cu[sizeIdx].intra_pred[mode](pred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
-                        sad = sa8d(fenc, scaleStride, pred, scaleTuSize) << costShift;
+                        primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
+                        sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
                         modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
                         COPY1_IF_LT(bcost, modeCosts[mode]);
                     }
@@ -1589,7 +1595,7 @@ uint32_t Search::estIntraPredQT(Mode &in
              * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also
              * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. I think
              * that the contexts should be tracked through each PU */
-            pixel*   dst         = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
+            pixel*   dst         = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
             uint32_t dststride   = m_frame->m_reconPic->m_stride;
             const pixel*   src   = reconYuv->getLumaAddr(absPartIdx);
             uint32_t srcstride   = reconYuv->m_size;
@@ -1754,7 +1760,7 @@ uint32_t Search::estIntraPredChromaQT(Mo
 
         if (!tuIterator.isLastSection())
         {
-            uint32_t zorder    = cuGeom.encodeIdx + absPartIdxC;
+            uint32_t zorder    = cuGeom.absPartIdx + absPartIdxC;
             uint32_t dststride = m_frame->m_reconPic->m_strideC;
             const pixel* src;
             pixel* dst;
@@ -1860,6 +1866,60 @@ uint32_t Search::mergeEstimation(CUData&
     return outCost;
 }
 
+void Search::PME::processTasks(int workerThreadId)
+{
+#if DETAILED_CU_STATS
+    int fe = mode.cu.m_encData->m_frameEncoderID;
+    master.m_stats[fe].countPMETasks++;
+    ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime);
+#endif
+    ProfileScopeEvent(pme);
+    master.processPME(*this, master.m_tld[workerThreadId].analysis);
+}
+
+void Search::processPME(PME& pme, Search& slave)
+{
+    /* acquire a motion estimation job, else exit early */
+    int meId;
+    pme.m_lock.acquire();
+    if (pme.m_jobTotal > pme.m_jobAcquired)
+    {
+        meId = pme.m_jobAcquired++;
+        pme.m_lock.release();
+    }
+    else
+    {
+        pme.m_lock.release();
+        return;
+    }
+
+    /* Setup slave Search instance for ME for master's CU */
+    if (&slave != this)
+    {
+        slave.setQP(*m_slice, m_rdCost.m_qp);
+        slave.m_slice = m_slice;
+        slave.m_frame = m_frame;
+        slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.mode.cu.m_cuAddr, pme.cuGeom.absPartIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+        slave.prepMotionCompensation(pme.mode.cu, pme.cuGeom, pme.puIdx);
+    }
+
+    /* Perform ME, repeat until no more work is available */
+    do
+    {
+        if (meId < m_slice->m_numRefIdx[0])
+            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.puIdx, 0, meId);
+        else
+            slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]);
+
+        meId = -1;
+        pme.m_lock.acquire();
+        if (pme.m_jobTotal > pme.m_jobAcquired)
+            meId = pme.m_jobAcquired++;
+        pme.m_lock.release();
+    }
+    while (meId >= 0);
+}
+
 /* this function assumes the caller has configured its MotionEstimation engine with the
  * correct source plane and source PU, and has called prepMotionCompensation() to set
  * m_puAbsPartIdx, m_puWidth, and m_puHeight */
@@ -1926,9 +1986,8 @@ void Search::singleMotionEstimation(Sear
     }
 }
 
-/* search of the best candidate for inter prediction
- * returns true if predYuv was filled with a motion compensated prediction */
-bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
+/* find the best inter prediction for each PU of specified mode */
+void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D)
 {
     ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);
 
@@ -1943,7 +2002,8 @@ bool Search::predInterSearch(Mode& inter
     const int* numRefIdx = slice->m_numRefIdx;
     uint32_t lastMode = 0;
     int      totalmebits = 0;
-    bool     bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2;
+    int      numME = numRefIdx[0] + numRefIdx[1];
+    bool     bTryDistributed = m_param->bDistributeMotionEstimation && numME > 2;
     MV       mvzero(0, 0);
     Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;
 
@@ -1957,7 +2017,7 @@ bool Search::predInterSearch(Mode& inter
         /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */
         initMotionCompensation(cu, cuGeom, puIdx);
 
-        m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
+        m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.absPartIdx, m_puAbsPartIdx, m_puWidth, m_puHeight);
 
         uint32_t mrgCost = MAX_UINT;
 
@@ -1969,15 +2029,8 @@ bool Search::predInterSearch(Mode& inter
             merge.height     = m_puHeight;
             mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge);
 
-            if (bMergeOnly)
+            if (bMergeOnly && mrgCost != MAX_UINT)
             {
-                if (mrgCost == MAX_UINT)
-                {
-                    /* No valid merge modes were found, there is no possible way to
-                     * perform a valid motion compensation prediction, so early-exit */
-                    return false;
-                }
-                // set merge result
                 cu.m_mergeFlag[m_puAbsPartIdx] = true;
                 cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx
                 cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx);
@@ -1997,6 +2050,7 @@ bool Search::predInterSearch(Mode& inter
         bestME[1].cost = MAX_UINT;
 
         getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);
+        bool bDoUnidir = true;
 
         /* Uni-directional prediction */
         if (m_param->analysisMode == X265_ANALYSIS_LOAD)
@@ -2058,62 +2112,30 @@ bool Search::predInterSearch(Mode& inter
                     bestME[l].bits = bits;
                 }
             }
+            bDoUnidir = false;
         }
-        else if (bDistributed)
+        else if (bTryDistributed)
         {
-            m_meLock.acquire();
-            m_curInterMode = &interMode;
-            m_curGeom = &cuGeom;
-            m_curPart = puIdx;
-            m_totalNumME = 0;
-            m_numAcquiredME = 1;
-            m_numCompletedME = 0;
-            m_totalNumME = numRefIdx[0] + numRefIdx[1];
-            m_meLock.release();
-
-            if (!m_bJobsQueued)
-                JobProvider::enqueue();
-
-            for (int i = 1; i < m_totalNumME; i++)
-                m_pool->pokeIdleThread();
-
-            do
+            PME pme(*this, interMode, cuGeom, puIdx);
+            pme.m_jobTotal = numME;
+            pme.m_jobAcquired = 1; /* reserve L0-0 */
+
+            if (pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, numME - 1))
             {
-                m_meLock.acquire();
-                if (m_totalNumME > m_numAcquiredME)
-                {
-                    int id = m_numAcquiredME++;
-                    m_meLock.release();
-
-                    if (id < numRefIdx[0])
-                        singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id);
-                    else
-                        singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]);
-
-                    m_meLock.acquire();
-                    m_numCompletedME++;
-                    m_meLock.release();
-                }
-                else
-                    m_meLock.release();
+                processPME(pme, *this);
+
+                singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0); /* L0-0 */
+
+                bDoUnidir = false;
+
+                ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
+                pme.waitForExit();
             }
-            while (m_totalNumME > m_numAcquiredME);
-
-            if (!m_bJobsQueued)
-                JobProvider::dequeue();
-
-            /* we saved L0-0 for ourselves */
-            singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0);
-
-            m_meLock.acquire();
-            if (++m_numCompletedME == m_totalNumME)
-                m_meCompletionEvent.trigger();
-            m_meLock.release();
-
-            ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);
-            m_meCompletionEvent.wait();
+
+            /* if no peer threads were bonded, fall back to doing unidirectional
+             * searches ourselves without overhead of singleMotionEstimation() */
         }
-        else
+        if (bDoUnidir)
         {
             for (int l = 0; l < numPredDir; l++)
             {
@@ -2254,8 +2276,8 @@ bool Search::predInterSearch(Mode& inter
                 }
                 else
                 {
-                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
-                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx);
+                    const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + m_puAbsPartIdx);
+                    const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + m_puAbsPartIdx);
                     intptr_t refStride = slice->m_mref[0][0].lumaStride;
 
                     primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);
@@ -2359,7 +2381,6 @@ bool Search::predInterSearch(Mode& inter
     }
 
     interMode.sa8dBits += totalmebits;
-    return true;
 }
 
 void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3])
@@ -3028,9 +3049,6 @@ void Search::estimateResidualQT(Mode& mo
             uint32_t nonZeroPsyEnergyY = 0;
             uint64_t singleCostY = MAX_INT64;
 
-            ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]);
-            ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]);
-
             m_entropyCoder.load(m_rqt[depth].rqtRoot);
 
             cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth);
@@ -3040,22 +3058,22 @@ void Search::estimateResidualQT(Mode& mo
 
             fenc = fencYuv->getLumaAddr(absPartIdx);
             resi = resiYuv.getLumaAddr(absPartIdx);
-            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true);
+            uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true);
 
             if (numSigTSkipY)
             {
                 m_entropyCoder.resetBits();
                 m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth);
-                m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA);
+                m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA);
                 const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits();
 
-                m_quant.invtransformNxN(tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
-
-                nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
+                m_quant.invtransformNxN(m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY);
+
+                nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
 
                 if (m_rdCost.m_psyRd)
                 {
-                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize);
+                    nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize);
                     singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY);
                 }
                 else
@@ -3071,8 +3089,8 @@ void Search::estimateResidualQT(Mode& mo
                 cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY;
                 bestTransformMode[TEXT_LUMA][0] = 1;
                 uint32_t numCoeffY = 1 << (log2TrSize << 1);
-                memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY);
-                primitives.cu[partSize].copy_ss(curResiY, strideResiY, tsResiY, trSize);
+                memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY);
+                primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize);
             }
 
             cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
@@ -3099,9 +3117,6 @@ void Search::estimateResidualQT(Mode& mo
 
                     int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC);
 
-                    ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
-                    ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]);
-
                     cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
 
                     if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V))
@@ -3109,7 +3124,7 @@ void Search::estimateResidualQT(Mode& mo
 
                     fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC);
                     resi = resiYuv.getChromaAddr(chromaId, absPartIdxC);
-                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
+                    uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true);
 
                     m_entropyCoder.resetBits();
                     singleBits[chromaId][tuIterator.section] = 0;
@@ -3117,16 +3132,16 @@ void Search::estimateResidualQT(Mode& mo
                     if (numSigTSkipC)
                     {
                         m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth);
-                        m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId);
+                        m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId);
                         singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits();
 
-                        m_quant.invtransformNxN(tsResiC, trSizeC, tsCoeffC,
+                        m_quant.invtransformNxN(m_tsResidual, trSizeC, m_tsCoeff,
                                                 log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC);
-                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
+                        uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
                         nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist);
                         if (m_rdCost.m_psyRd)
                         {
-                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC);
+                            nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC);
                             singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC);
                         }
                         else
@@ -3142,8 +3157,8 @@ void Search::estimateResidualQT(Mode& mo
                         cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC;
                         bestTransformMode[chromaId][tuIterator.section] = 1;
                         uint32_t numCoeffC = 1 << (log2TrSizeC << 1);
-                        memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC);
-                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, tsResiC, trSizeC);
+                        memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC);
+                        primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC);
                     }
 
                     cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep);
--- a/source/encoder/search.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/search.h	Tue Feb 24 15:34:32 2015 +0530
@@ -210,7 +210,7 @@ inline int getTUBits(int idx, int numIdx
     return idx + (idx < numIdx - 1);
 }
 
-class Search : public JobProvider, public Predict
+class Search : public Predict
 {
 public:
 
@@ -220,6 +220,7 @@ public:
     Quant           m_quant;
     RDCost          m_rdCost;
     const x265_param* m_param;
+    ThreadPool*     m_pool;
     Frame*          m_frame;
     const Slice*    m_slice;
 
@@ -229,13 +230,22 @@ public:
     uint8_t*        m_qtTempCbf[3];
     uint8_t*        m_qtTempTransformSkipFlag[3];
 
+    pixel*          m_fencScaled;     /* 32x32 buffer for down-scaled version of 64x64 CU fenc */
+    pixel*          m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */
+    pixel*          m_intraPred;      /* 32x32 buffer for individual intra predictions */
+    pixel*          m_intraPredAngs;  /* allocation for 33 consecutive (all angular) 32x32 intra predictions */
+
+    coeff_t*        m_tsCoeff;        /* transform skip coeff 32x32 */
+    int16_t*        m_tsResidual;     /* transform skip residual 32x32 */
+    pixel*          m_tsRecon;        /* transform skip reconstructed pixels 32x32 */
+
     bool            m_bFrameParallel;
     bool            m_bEnableRDOQ;
     uint32_t        m_numLayers;
     uint32_t        m_refLagPixels;
 
 #if DETAILED_CU_STATS
-    /* Accumulate CU statistics seperately for each frame encoder */
+    /* Accumulate CU statistics separately for each frame encoder */
     CUStats         m_stats[X265_MAX_FRAME_THREADS];
 #endif
 
@@ -257,7 +267,7 @@ public:
     void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
 
     // estimation inter prediction (non-skip)
-    bool     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma);
+    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma);
 
     // encode residual and compute rd-cost for inter mode
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
@@ -271,21 +281,34 @@ public:
     // pick be chroma mode from available using just sa8d costs
     void     getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
 
+    class PME : public BondedTaskGroup
+    {
+    public:
+
+        Search&       master;
+        Mode&         mode;
+        const CUGeom& cuGeom;
+        int           puIdx;
+
+        PME(Search& s, Mode& m, const CUGeom& g, int p) : master(s), mode(m), cuGeom(g), puIdx(p) {}
+
+        void processTasks(int workerThreadId);
+
+    protected:
+
+        PME operator=(const PME&);
+    };
+
+    void     processPME(PME& pme, Search& slave);
+    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
+
 protected:
 
     /* motion estimation distribution */
     ThreadLocalData* m_tld;
-    Mode*         m_curInterMode;
-    const CUGeom* m_curGeom;
-    int           m_curPart;
+
     uint32_t      m_listSelBits[3];
-    int           m_totalNumME;
-    volatile int  m_numAcquiredME;
-    volatile int  m_numCompletedME;
-    Event         m_meCompletionEvent;
     Lock          m_meLock;
-    bool          m_bJobsQueued;
-    void     singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref);
 
     void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
 
--- a/source/encoder/slicetype.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/slicetype.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -34,11 +34,11 @@
 #include "motion.h"
 #include "ratecontrol.h"
 
-#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU)
-
 using namespace x265;
 
-static inline int16_t median(int16_t a, int16_t b, int16_t c)
+namespace {
+
+inline int16_t median(int16_t a, int16_t b, int16_t c)
 {
     int16_t t = (a - b) & ((a - b) >> 31);
 
@@ -49,61 +49,521 @@ static inline int16_t median(int16_t a, 
     return b;
 }
 
-static inline void median_mv(MV &dst, MV a, MV b, MV c)
+inline void median_mv(MV &dst, MV a, MV b, MV c)
 {
     dst.x = median(a.x, b.x, c.x);
     dst.y = median(a.y, b.y, c.y);
 }
 
-Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
-    : JobProvider(pool)
-    , m_est(pool)
+/* Compute variance to derive AC energy of each block */
+inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane)
 {
-    m_bReady = false;
-    m_bBusy = false;
+    uint32_t sum = (uint32_t)sum_ssd;
+    uint32_t ssd = (uint32_t)(sum_ssd >> 32);
+
+    curFrame->m_lowres.wp_sum[plane] += sum;
+    curFrame->m_lowres.wp_ssd[plane] += ssd;
+    return ssd - ((uint64_t)sum * sum >> shift);
+}
+
+/* Find the energy of each block in Y/Cb/Cr plane */
+inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat)
+{
+    if ((colorFormat != X265_CSP_I444) && plane)
+    {
+        ALIGN_VAR_8(pixel, pix[8 * 8]);
+        primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride);
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane);
+    }
+    else
+        return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane);
+}
+
+} // end anonymous namespace
+
+/* Find the total AC energy of each block in all planes */
+uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp)
+{
+    intptr_t stride = curFrame->m_fencPic->m_stride;
+    intptr_t cStride = curFrame->m_fencPic->m_strideC;
+    intptr_t blockOffsetLuma = blockX + (blockY * stride);
+    int hShift = CHROMA_H_SHIFT(csp);
+    int vShift = CHROMA_V_SHIFT(csp);
+    intptr_t blockOffsetChroma = (blockX >> hShift) + ((blockY >> vShift) * cStride);
+
+    uint32_t var;
+
+    var  = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp);
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp);
+    var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp);
+    x265_emms();
+    return var;
+}
+
+void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param)
+{
+    /* Actual adaptive quantization */
+    int maxCol = curFrame->m_fencPic->m_picWidth;
+    int maxRow = curFrame->m_fencPic->m_picHeight;
+
+    for (int y = 0; y < 3; y++)
+    {
+        curFrame->m_lowres.wp_ssd[y] = 0;
+        curFrame->m_lowres.wp_sum[y] = 0;
+    }
+
+    /* Calculate Qp offset for each 16x16 block in the frame */
+    int blockXY = 0;
+    int blockX = 0, blockY = 0;
+    double strength = 0.f;
+    if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
+    {
+        /* Need to init it anyways for CU tree */
+        int cuCount = widthInCU * heightInCU;
+
+        if (param->rc.aqMode && param->rc.aqStrength == 0)
+        {
+            memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
+            memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
+            for (int cuxy = 0; cuxy < cuCount; cuxy++)
+                curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+        }
+
+        /* Need variance data for weighted prediction */
+        if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
+        {
+            for (blockY = 0; blockY < maxRow; blockY += 16)
+                for (blockX = 0; blockX < maxCol; blockX += 16)
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+        }
+    }
+    else
+    {
+        blockXY = 0;
+        double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
+        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+        {
+            double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
+            for (blockY = 0; blockY < maxRow; blockY += 16)
+            {
+                for (blockX = 0; blockX < maxCol; blockX += 16)
+                {
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    qp_adj = pow(energy + 1, 0.1);
+                    curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                    avg_adj += qp_adj;
+                    avg_adj_pow2 += qp_adj * qp_adj;
+                    blockXY++;
+                }
+            }
+
+            avg_adj /= ncu;
+            avg_adj_pow2 /= ncu;
+            strength = param->rc.aqStrength * avg_adj / bit_depth_correction;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
+        }
+        else
+            strength = param->rc.aqStrength * 1.0397f;
+
+        blockXY = 0;
+        for (blockY = 0; blockY < maxRow; blockY += 16)
+        {
+            for (blockX = 0; blockX < maxCol; blockX += 16)
+            {
+                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+                {
+                    qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                    qp_adj = strength * (qp_adj - avg_adj);
+                }
+                else
+                {
+                    uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp);
+                    qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8)));
+                }
+                curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
+                curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
+                blockXY++;
+            }
+        }
+    }
+
+    if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
+    {
+        int hShift = CHROMA_H_SHIFT(param->internalCsp);
+        int vShift = CHROMA_V_SHIFT(param->internalCsp);
+        maxCol = ((maxCol + 8) >> 4) << 4;
+        maxRow = ((maxRow + 8) >> 4) << 4;
+        int width[3]  = { maxCol, maxCol >> hShift, maxCol >> hShift };
+        int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift };
+
+        for (int i = 0; i < 3; i++)
+        {
+            uint64_t sum, ssd;
+            sum = curFrame->m_lowres.wp_sum[i];
+            ssd = curFrame->m_lowres.wp_ssd[i];
+            curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
+        }
+    }
+}
+
+void LookaheadTLD::lowresIntraEstimate(Lowres& fenc)
+{
+    ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+    ALIGN_VAR_32(pixel, fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+    pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
+
+    const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
+    const int intraPenalty = 5 * lookAheadLambda;
+    const int lowresPenalty = 4; /* fixed CU cost overhead */
+
+    const int cuSize  = X265_LOWRES_CU_SIZE;
+    const int cuSize2 = cuSize << 1;
+    const int sizeIdx = X265_LOWRES_CU_BITS - 2;
+
+    pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
+    pixelcmp_t satd = primitives.pu[sizeIdx].satd;
+
+    for (int cuY = 0; cuY < heightInCU; cuY++)
+    {
+        fenc.rowSatds[0][0][cuY] = 0;
+
+        for (int cuX = 0; cuX < widthInCU; cuX++)
+        {
+            const int cuXY = cuX + cuY * widthInCU;
+            const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc.lumaStride;
+
+            /* Prep reference pixels */
+            pixel *pixCur = fenc.lowresPlane[0] + pelOffset;
+            primitives.cu[sizeIdx].copy_pp(fencIntra, cuSize, pixCur, fenc.lumaStride);
+
+            memcpy(neighbours[0], pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
+            for (int i = 1; i < cuSize + 1; i++)
+                neighbours[0][i + cuSize2] = pixCur[-1 - fenc.lumaStride + i * fenc.lumaStride]; /* todo: fixme */
+
+            for (int i = 0; i < cuSize; i++)
+            {
+                neighbours[0][i + cuSize + 1] = neighbours[0][cuSize];                     // Copy above-last pixel
+                neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; // Copy left-last pixel
+            }
+
+            neighbours[1][0]  = neighbours[0][0];                      // Copy top-left pixel 
+            neighbours[1][cuSize2] = neighbours[0][cuSize2];           // Copy top-right pixel
+            neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
+
+            // Filter neighbour pixels with [1-2-1]
+            neighbours[1][1]           = (neighbours[0][0] + (neighbours[0][1] << 1)           + neighbours[0][2] + 2)               >> 2;
+            neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
+            for (int i = 2; i < cuSize2; i++)
+            {
+                neighbours[1][i]           = (neighbours[0][i - 1]           + (neighbours[0][i] << 1)           + neighbours[0][i + 1]      + 2) >> 2;
+                neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
+            }
+
+            int cost, icost = me.COST_MAX;
+            uint32_t ilowmode = 0;
+
+            /* DC and planar */
+            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
+            cost = satd(fencIntra, cuSize, prediction, cuSize);
+            COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
+
+            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
+            cost = satd(fencIntra, cuSize, prediction, cuSize);
+            COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
+
+            /* scan angular predictions */
+            int filter, acost = me.COST_MAX;
+            uint32_t mode, alowmode = 4;
+            for (mode = 5; mode < 35; mode += 5)
+            {
+                filter = !!(g_intraFilterFlags[mode] & cuSize);
+                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+                cost = satd(fencIntra, cuSize, prediction, cuSize);
+                COPY2_IF_LT(acost, cost, alowmode, mode);
+            }
+            for (uint32_t dist = 2; dist >= 1; dist--)
+            {
+                int minusmode = alowmode - dist;
+                int plusmode = alowmode + dist;
+
+                mode = minusmode;
+                filter = !!(g_intraFilterFlags[mode] & cuSize);
+                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+                cost = satd(fencIntra, cuSize, prediction, cuSize);
+                COPY2_IF_LT(acost, cost, alowmode, mode);
+
+                mode = plusmode;
+                filter = !!(g_intraFilterFlags[mode] & cuSize);
+                primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
+                cost = satd(fencIntra, cuSize, prediction, cuSize);
+                COPY2_IF_LT(acost, cost, alowmode, mode);
+            }
+            COPY2_IF_LT(icost, acost, ilowmode, alowmode);
+
+            icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
+
+            fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT));
+            fenc.intraCost[cuXY] = icost;
+            fenc.intraMode[cuXY] = (uint8_t)ilowmode;
+            fenc.rowSatds[0][0][cuY] += icost;
+            fenc.costEst[0][0] += icost;
+        }
+    }
+}
+
+uint32_t LookaheadTLD::weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp)
+{
+    pixel *src = ref.fpelPlane[0];
+    intptr_t stride = fenc.lumaStride;
+
+    if (wp.bPresentFlag)
+    {
+        int offset = wp.inputOffset << (X265_DEPTH - 8);
+        int scale = wp.inputWeight;
+        int denom = wp.log2WeightDenom;
+        int round = denom ? 1 << (denom - 1) : 0;
+        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+        int widthHeight = (int)stride;
+
+        primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines,
+            scale, round << correction, denom + correction, offset);
+        src = weightedRef.fpelPlane[0];
+    }
+
+    uint32_t cost = 0;
+    intptr_t pixoff = 0;
+    int mb = 0;
+
+    for (int y = 0; y < fenc.lines; y += 8, pixoff = y * stride)
+    {
+        for (int x = 0; x < fenc.width; x += 8, mb++, pixoff += 8)
+        {
+            int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc.fpelPlane[0] + pixoff, stride);
+            cost += X265_MIN(satd, fenc.intraCost[mb]);
+        }
+    }
+
+    return cost;
+}
+
+bool LookaheadTLD::allocWeightedRef(Lowres& fenc)
+{
+    intptr_t planesize = fenc.buffer[1] - fenc.buffer[0];
+    intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0];
+    paddedLines = (int)(planesize / fenc.lumaStride);
+
+    wbuffer[0] = X265_MALLOC(pixel, 4 * planesize);
+    if (wbuffer[0])
+    {
+        wbuffer[1] = wbuffer[0] + planesize;
+        wbuffer[2] = wbuffer[1] + planesize;
+        wbuffer[3] = wbuffer[2] + planesize;
+    }
+    else
+        return false;
+
+    for (int i = 0; i < 4; i++)
+        weightedRef.lowresPlane[i] = wbuffer[i] + padoffset;
+
+    weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0];
+    weightedRef.lumaStride = fenc.lumaStride;
+    weightedRef.isLowres = true;
+    weightedRef.isWeighted = false;
+
+    return true;
+}
+
+void LookaheadTLD::weightsAnalyse(Lowres& fenc, Lowres& ref)
+{
+    static const float epsilon = 1.f / 128.f;
+    int deltaIndex = fenc.frameNum - ref.frameNum;
+
+    WeightParam wp;
+    wp.bPresentFlag = false;
+
+    if (!wbuffer[0])
+    {
+        if (!allocWeightedRef(fenc))
+            return;
+    }
+
+    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
+    float guessScale, fencMean, refMean;
+    x265_emms();
+    if (fenc.wp_ssd[0] && ref.wp_ssd[0])
+        guessScale = sqrtf((float)fenc.wp_ssd[0] / ref.wp_ssd[0]);
+    else
+        guessScale = 1.0f;
+    fencMean = (float)fenc.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8));
+    refMean = (float)ref.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8));
+
+    /* Early termination */
+    if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
+        return;
+
+    int minoff = 0, minscale, mindenom;
+    unsigned int minscore = 0, origscore = 1;
+    int found = 0;
+
+    wp.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
+    mindenom = wp.log2WeightDenom;
+    minscale = wp.inputWeight;
+
+    origscore = minscore = weightCostLuma(fenc, ref, wp);
+
+    if (!minscore)
+        return;
+
+    unsigned int s = 0;
+    int curScale = minscale;
+    int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
+    if (curOffset < -128 || curOffset > 127)
+    {
+        /* Rescale considering the constraints on curOffset. We do it in this order
+        * because scale has a much wider range than offset (because of denom), so
+        * it should almost never need to be clamped. */
+        curOffset = x265_clip3(-128, 127, curOffset);
+        curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
+        curScale = x265_clip3(0, 127, curScale);
+    }
+    SET_WEIGHT(wp, true, curScale, mindenom, curOffset);
+    s = weightCostLuma(fenc, ref, wp);
+    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
+
+    /* Use a smaller denominator if possible */
+    while (mindenom > 0 && !(minscale & 1))
+    {
+        mindenom--;
+        minscale >>= 1;
+    }
+
+    if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
+        return;
+    else
+    {
+        SET_WEIGHT(wp, true, minscale, mindenom, minoff);
+
+        // set weighted delta cost
+        fenc.weightedCostDelta[deltaIndex] = minscore / origscore;
+
+        int offset = wp.inputOffset << (X265_DEPTH - 8);
+        int scale = wp.inputWeight;
+        int denom = wp.log2WeightDenom;
+        int round = denom ? 1 << (denom - 1) : 0;
+        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+        intptr_t stride = ref.lumaStride;
+        int widthHeight = (int)stride;
+
+        for (int i = 0; i < 4; i++)
+            primitives.weight_pp(ref.buffer[i], wbuffer[i], stride, widthHeight, paddedLines,
+            scale, round << correction, denom + correction, offset);
+
+        weightedRef.isWeighted = true;
+    }
+}
+
+Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
+{
     m_param = param;
+    m_pool  = pool;
+
+    m_lastNonB = NULL;
+    m_scratch  = NULL;
+    m_tld      = NULL;
+    m_filled   = false;
+    m_outputSignalRequired = false;
+    m_isActive = true;
+
+    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_ncu = m_widthInCU > 2 && m_heightInCU > 2 ? (m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU;
+
     m_lastKeyframe = -m_param->keyframeMax;
-    m_lastNonB = NULL;
-    m_bFilled = false;
-    m_bFlushed = false;
-    m_bFlush = false;
+    memset(m_preframes, 0, sizeof(m_preframes));
+    m_preTotal = m_preAcquired = m_preCompleted = 0;
+    m_sliceTypeBusy = false;
+    m_fullQueueSize = m_param->lookaheadDepth;
+    m_bAdaptiveQuant = m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred;
+
+    /* If we have a thread pool and are using --b-adapt 2, it is generally
+     * preferable to perform all motion searches for each lowres frame in large
+     * batches; this will create one job per --bframe per lowres frame, and
+     * these jobs are performed by workers bonded to the thread running
+     * slicetypeDecide() */
+    m_bBatchMotionSearch = m_pool && m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS;
+
+    /* It is also beneficial to pre-calculate all possible frame cost estimates
+     * using worker threads bonded to the worker thread running
+     * slicetypeDecide(). This creates bframes * bframes jobs which take less
+     * time than the motion search batches but there are many of them. This may
+     * do much unnecessary work, some frame cost estimates are not needed, so if
+     * the thread pool is small we disable this feature after the initial burst
+     * of work */
+    m_bBatchFrameCosts = m_bBatchMotionSearch;
+
+    if (m_bBatchMotionSearch && m_pool->m_numWorkers > 12)
+    {
+        m_numRowsPerSlice = m_heightInCU / (m_pool->m_numWorkers - 1);   // default to numWorkers - 1 slices
+        m_numRowsPerSlice = X265_MAX(m_numRowsPerSlice, 10);             // at least 10 rows per slice
+        m_numRowsPerSlice = X265_MIN(m_numRowsPerSlice, m_heightInCU);   // but no more than the full picture
+        m_numCoopSlices = m_heightInCU / m_numRowsPerSlice;
+    }
+    else
+    {
+        m_numRowsPerSlice = m_heightInCU;
+        m_numCoopSlices = 1;
+    }
 
 #if DETAILED_CU_STATS
     m_slicetypeDecideElapsedTime = 0;
+    m_preLookaheadElapsedTime = 0;
     m_countSlicetypeDecide = 0;
+    m_countPreLookahead = 0;
 #endif
 
-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
     memset(m_histogram, 0, sizeof(m_histogram));
 }
 
-Lookahead::~Lookahead() { }
-
-void Lookahead::init()
+#if DETAILED_CU_STATS
+void Lookahead::getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount)
 {
-    if (m_pool && m_pool->getThreadCount() >= 4 &&
-        ((m_param->bFrameAdaptive && m_param->bframes) ||
-         m_param->rc.cuTree || m_param->scenecutThreshold ||
-         (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
+    batchElapsedTime = coopSliceElapsedTime = 0;
+    coopSliceCount = batchCount = 0;
+    int tldCount = m_pool ? m_pool->m_numWorkers : 1;
+    for (int i = 0; i < tldCount; i++)
     {
-        JobProvider::enqueue();
+        batchElapsedTime += m_tld[i].batchElapsedTime;
+        coopSliceElapsedTime += m_tld[i].coopSliceElapsedTime;
+        batchCount += m_tld[i].countBatches;
+        coopSliceCount += m_tld[i].countCoopSlices;
     }
-    else
-        m_pool = NULL; /* disable use of worker thread */
+}
+#endif
+
+bool Lookahead::create()
+{
+    int numTLD = 1 + (m_pool ? m_pool->m_numWorkers : 0);
+    m_tld = new LookaheadTLD[numTLD];
+    for (int i = 0; i < numTLD; i++)
+        m_tld[i].init(m_widthInCU, m_heightInCU, m_ncu);
+    m_scratch = X265_MALLOC(int, m_tld[0].widthInCU);
+
+    return m_tld && m_scratch;
 }
 
 void Lookahead::stop()
 {
-    /* do not allow slicetypeDecide() to get started again */
-    m_bReady = false;
-    m_bFlushed = false;
-    m_bFlush = false;
-    m_bBusy = false;
+    if (m_pool && !m_inputQueue.empty())
+    {
+        m_preLookaheadLock.acquire();
+        m_isActive = false;
+        bool wait = m_outputSignalRequired = m_sliceTypeBusy;
+        m_preLookaheadLock.release();
 
-    if (m_pool)
-        JobProvider::flush(); // flush will dequeue, if it is necessary
+        if (wait)
+            m_outputSignal.wait();
+    }
 }
 
 void Lookahead::destroy()
@@ -123,132 +583,165 @@ void Lookahead::destroy()
         delete curFrame;
     }
 
-    x265_free(m_scratch);
+    X265_FREE(m_scratch);
+
+    delete [] m_tld;
 }
 
+/* The synchronization of slicetypeDecide is managed here.  The findJob() method
+ * polls the occupancy of the input queue. If the queue is
+ * full, it will run slicetypeDecide() and output a mini-gop of frames to the
+ * output queue. If the flush() method has been called (implying no new pictures
+ * will be received) then the input queue is considered full if it has even one
+ * picture left. getDecidedPicture() removes pictures from the output queue and
+ * only blocks as a last resort. It does not start removing pictures until
+ * m_filled is true, which occurs after *more than* the lookahead depth of
+ * pictures have been input so slicetypeDecide() should have started prior to
+ * output pictures being withdrawn. The first slicetypeDecide() will obviously
+ * still require a blocking wait, but after this slicetypeDecide() will maintain
+ * its lead over the encoder (because one picture is added to the input queue
+ * each time one is removed from the output) and decides slice types of pictures
+ * just ahead of when the encoder needs them */
+
 /* Called by API thread */
-void Lookahead::addPicture(Frame *curFrame, int sliceType)
+void Lookahead::addPicture(Frame& curFrame, int sliceType)
 {
-    {
-        ProfileScopeEvent(prelookahead);
-        PicYuv *orig = curFrame->m_fencPic;
-        curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType);
-    }
-
-    m_inputQueueLock.acquire();
-    m_inputQueue.pushBack(*curFrame);
-
-    if (m_inputQueue.size() >= m_param->lookaheadDepth)
-    {
-        if (m_pool)
-        {
-            m_bReady = !m_bBusy;
-            m_inputQueueLock.release();
-            m_pool->pokeIdleThread();
-        }
-        else
-            slicetypeDecide();
-    }
-    else
-        m_inputQueueLock.release();
+    curFrame.m_lowres.sliceType = sliceType;
 
     /* determine if the lookahead is (over) filled enough for frames to begin to
      * be consumed by frame encoders */
-    if (!m_bFilled)
+    if (!m_filled)
     {
         if (!m_param->bframes & !m_param->lookaheadDepth)
-            m_bFilled = true; /* zero-latency */
-        else if (curFrame->m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
-            m_bFilled = true; /* full capacity plus mini-gop lag */
+            m_filled = true; /* zero-latency */
+        else if (curFrame.m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
+            m_filled = true; /* full capacity plus mini-gop lag */
     }
+
+    m_preLookaheadLock.acquire();
+
+    m_inputLock.acquire();
+    m_inputQueue.pushBack(curFrame);
+    m_inputLock.release();
+
+    m_preframes[m_preTotal++] = &curFrame;
+    X265_CHECK(m_preTotal <= X265_LOOKAHEAD_MAX, "prelookahead overflow\n");
+    
+    m_preLookaheadLock.release();
+
+    if (m_pool)
+        tryWakeOne();
 }
 
 /* Called by API thread */
 void Lookahead::flush()
 {
-    m_bFlush = true;
-    m_bFilled = true;
-
-    /* just in case the input queue is never allowed to fill */
-    m_inputQueueLock.acquire();
-    if (m_inputQueue.empty())
-    {
-        m_bFlushed = true;
-        m_inputQueueLock.release();
-    }
-    else
-    {
-        if (m_pool)
-        {
-            m_bReady = !m_bBusy;
-            m_inputQueueLock.release();
-            m_pool->pokeIdleThread();
-        }
-        else
-            slicetypeDecide();
-    }
+    /* force slicetypeDecide to run until the input queue is empty */
+    m_fullQueueSize = 1;
+    m_filled = true;
 }
 
-/* Called by API thread. If the lookahead queue has not yet been filled the
- * first time, it immediately returns NULL.  Else the function blocks until
- * outputs are available and then pops the first frame from the output queue. If
- * flush() has been called and the output queue is empty, NULL is returned. */
-Frame* Lookahead::getDecidedPicture()
+void Lookahead::findJob(int workerThreadID)
 {
-    if (!m_bFilled)
-        return NULL;
+    Frame* preFrame;
+    bool   doDecide;
 
-    m_outputQueueLock.acquire();
-    Frame *fenc = m_outputQueue.popFront();
-    m_outputQueueLock.release();
+    if (!m_isActive)
+        return;
 
-    if (fenc || m_bFlushed)
-        return fenc;
+    int tld = workerThreadID;
+    if (workerThreadID < 0)
+        tld = m_pool ? m_pool->m_numWorkers : 0;
 
+    m_preLookaheadLock.acquire();
     do
     {
-        m_outputAvailable.wait();
+        preFrame = NULL;
+        doDecide = false;
 
-        m_outputQueueLock.acquire();
-        fenc = m_outputQueue.popFront();
-        m_outputQueueLock.release();
+        if (m_preTotal > m_preAcquired)
+            preFrame = m_preframes[m_preAcquired++];
+        else
+        {
+            if (m_preTotal == m_preCompleted)
+                m_preAcquired = m_preTotal = m_preCompleted = 0;
+
+            /* the worker thread that performs the last pre-lookahead will generally get to run
+             * slicetypeDecide() */
+            m_inputLock.acquire();
+            if (!m_sliceTypeBusy && !m_preTotal && m_inputQueue.size() >= m_fullQueueSize && m_isActive)
+                 doDecide = m_sliceTypeBusy = true;
+            m_inputLock.release();
+        }
+        m_preLookaheadLock.release();
+
+        if (preFrame)
+        {
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(m_preLookaheadElapsedTime);
+            m_countPreLookahead++;
+#endif
+            ProfileScopeEvent(prelookahead);
+
+            preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc);
+            if (m_bAdaptiveQuant)
+                m_tld[tld].calcAdaptiveQuantFrame(preFrame, m_param);
+            m_tld[tld].lowresIntraEstimate(preFrame->m_lowres);
+
+            m_preLookaheadLock.acquire(); /* re-acquire for next pass */
+            m_preCompleted++;
+        }
+        else if (doDecide)
+        {
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(m_slicetypeDecideElapsedTime);
+            m_countSlicetypeDecide++;
+#endif
+            ProfileScopeEvent(slicetypeDecideEV);
+
+            slicetypeDecide();
+
+            m_preLookaheadLock.acquire(); /* re-acquire for next pass */
+            if (m_outputSignalRequired)
+            {
+                m_outputSignal.trigger();
+                m_outputSignalRequired = false;
+            }
+            m_sliceTypeBusy = false;
+        }
     }
-    while (!fenc);
+    while (preFrame || doDecide);
 
-    return fenc;
+    m_helpWanted = false;
 }
 
-/* Called by pool worker threads */
-bool Lookahead::findJob(int)
+/* Called by API thread */
+Frame* Lookahead::getDecidedPicture()
 {
-    if (!m_bReady)
-        return false;
-
-    m_inputQueueLock.acquire();
-    if (!m_bReady)
+    if (m_filled)
     {
-        m_inputQueueLock.release();
-        return false;
-    }
+        m_outputLock.acquire();
+        Frame *out = m_outputQueue.popFront();
+        m_outputLock.release();
 
-    m_bReady = false;
-    m_bBusy = true;
-
-    do
-    {
-        slicetypeDecide(); // releases input queue lock
-
-        m_inputQueueLock.acquire();
+        if (out)
+            return out;
 
-        if (!m_bBusy)
-            break;
+        /* process all pending pre-lookahead frames and run slicetypeDecide() if
+         * necessary */
+        findJob(-1);
+
+        m_preLookaheadLock.acquire();
+        bool wait = m_outputSignalRequired = m_sliceTypeBusy || m_preTotal;
+        m_preLookaheadLock.release();
+
+        if (wait)
+            m_outputSignal.wait();
+
+        return m_outputQueue.popFront();
     }
-    while (m_inputQueue.size() >= m_param->lookaheadDepth ||
-           (m_bFlush && m_inputQueue.size()));
-
-    m_bBusy = false;
-    m_inputQueueLock.release();
-    return true;
+    else
+        return NULL;
 }
 
 /* Called by rate-control to calculate the estimated SATD cost for a given
@@ -339,12 +832,6 @@ void Lookahead::getEstimatedPictureCost(
 /* called by API thread or worker thread with inputQueueLock acquired */
 void Lookahead::slicetypeDecide()
 {
-    ProfileScopeEvent(slicetypeDecideEV);
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_slicetypeDecideElapsedTime);
-    m_countSlicetypeDecide++;
-#endif
-
     Lowres *frames[X265_LOOKAHEAD_MAX];
     Frame *list[X265_LOOKAHEAD_MAX];
     int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
@@ -352,6 +839,7 @@ void Lookahead::slicetypeDecide()
     memset(frames, 0, sizeof(frames));
     memset(list, 0, sizeof(list));
     {
+        ScopedLock lock(m_inputLock);
         Frame *curFrame = m_inputQueue.first();
         int j;
         for (j = 0; j < m_param->bframes + 2; j++)
@@ -373,11 +861,6 @@ void Lookahead::slicetypeDecide()
         maxSearch = j;
     }
 
-    m_inputQueueLock.release();
-
-    if (!m_est.m_rows && list[0])
-        m_est.init(m_param, list[0]);
-
     if (m_lastNonB && !m_param->rc.bStatRead &&
         ((m_param->bFrameAdaptive && m_param->bframes) ||
          m_param->rc.cuTree || m_param->scenecutThreshold ||
@@ -399,7 +882,7 @@ void Lookahead::slicetypeDecide()
         }
 
         /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available.
-           smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/
+         * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */
         else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs &&
                  m_param->maxNumReferences <= (brefs + 3))
         {
@@ -408,7 +891,7 @@ void Lookahead::slicetypeDecide()
                      frm.sliceType, m_param->maxNumReferences);
         }
 
-        if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
+        if (/* (!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax)
         {
             if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I)
                 frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR;
@@ -484,7 +967,10 @@ void Lookahead::slicetypeDecide()
         /* estimate new non-B cost */
         p1 = b = bframes + 1;
         p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? b : 0;
-        m_est.estimateFrameCost(frames, p0, p1, b, 0);
+
+        CostEstimateGroup estGroup(*this, frames);
+
+        estGroup.singleCost(p0, p1, b);
 
         if (bframes)
         {
@@ -497,7 +983,7 @@ void Lookahead::slicetypeDecide()
                 else
                     p1 = bframes + 1;
 
-                m_est.estimateFrameCost(frames, p0, p1, b, 0);
+                estGroup.singleCost(p0, p1, b);
 
                 if (frames[b]->sliceType == X265_TYPE_BREF)
                     p0 = b;
@@ -505,8 +991,7 @@ void Lookahead::slicetypeDecide()
         }
     }
 
-    m_inputQueueLock.acquire();
-
+    m_inputLock.acquire();
     /* dequeue all frames from inputQueue that are about to be enqueued
      * in the output queue. The order is important because Frame can
      * only be in one list at a time */
@@ -518,10 +1003,9 @@ void Lookahead::slicetypeDecide()
         pts[i] = curFrame->m_pts;
         maxSearch--;
     }
+    m_inputLock.release();
 
-    m_inputQueueLock.release();
-
-    m_outputQueueLock.acquire();
+    m_outputLock.acquire();
     /* add non-B to output queue */
     int idx = 0;
     list[bframes]->m_reorderedPts = pts[idx++];
@@ -543,18 +1027,19 @@ void Lookahead::slicetypeDecide()
     /* add B frames to output queue */
     for (int i = 0; i < bframes; i++)
     {
-        /* push all the B frames into output queue except B-ref, which already pushed into output queue*/
+        /* push all the B frames into output queue except B-ref, which already pushed into output queue */
         if (list[i]->m_lowres.sliceType != X265_TYPE_BREF)
         {
             list[i]->m_reorderedPts = pts[idx++];
             m_outputQueue.pushBack(*list[i]);
         }
     }
+    m_outputLock.release();
 
     bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead;
     if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType))
     {
-        m_inputQueueLock.acquire();
+        m_inputLock.acquire();
         Frame *curFrame = m_inputQueue.first();
         frames[0] = m_lastNonB;
         int j;
@@ -563,14 +1048,11 @@ void Lookahead::slicetypeDecide()
             frames[j + 1] = &curFrame->m_lowres;
             curFrame = curFrame->m_next;
         }
+        m_inputLock.release();
 
         frames[j + 1] = NULL;
-        m_inputQueueLock.release();
         slicetypeAnalyse(frames, true);
     }
-
-    m_outputQueueLock.release();
-    m_outputAvailable.trigger();
 }
 
 void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe)
@@ -592,6 +1074,7 @@ void Lookahead::vbvLookahead(Lowres **fr
             int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB;
             frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB);
             frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType;
+
             /* Save the nextNonB Cost in each B frame of the current miniGop */
             if (curNonB > miniGopEnd)
             {
@@ -603,13 +1086,15 @@ void Lookahead::vbvLookahead(Lowres **fr
             }
             idx++;
         }
+
         /* Handle the B-frames: coded order */
         if (m_param->bBPyramid && curNonB - prevNonB > 1)
             nextBRef = (prevNonB + curNonB + 1) / 2;
 
         for (int i = prevNonB + 1; i < curNonB; i++, idx++)
         {
-            int64_t satdCost = 0; int type = X265_TYPE_B;
+            int64_t satdCost = 0;
+            int type = X265_TYPE_B;
             if (nextBRef)
             {
                 if (i == nextBRef)
@@ -649,7 +1134,8 @@ void Lookahead::vbvLookahead(Lowres **fr
 
 int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b)
 {
-    int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0);
+    CostEstimateGroup estGroup(*this, frames);
+    int64_t cost = estGroup.singleCost(p0, p1, b);
 
     if (m_param->rc.aqMode)
     {
@@ -658,6 +1144,7 @@ int64_t Lookahead::vbvFrameCost(Lowres *
         else
             return frames[b]->costEstAq[b - p0][p1 - b];
     }
+
     return cost;
 }
 
@@ -665,7 +1152,7 @@ void Lookahead::slicetypeAnalyse(Lowres 
 {
     int numFrames, origNumFrames, keyintLimit, framecnt;
     int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX);
-    int cuCount = NUM_CUS;
+    int cuCount = m_ncu;
     int resetStart;
     bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth;
 
@@ -699,6 +1186,46 @@ void Lookahead::slicetypeAnalyse(Lowres 
         return;
     }
 
+    if (m_bBatchMotionSearch)
+    {
+        /* pre-calculate all motion searches, using many worker threads */
+        CostEstimateGroup estGroup(*this, frames);
+        for (int b = 2; b < numFrames; b++)
+        {
+            for (int i = 1; i <= m_param->bframes + 1; i++)
+            {
+                if (b >= i && frames[b]->lowresMvs[0][i - 1][0].x == 0x7FFF)
+                    estGroup.add(b - i, b + i < numFrames ? b + i : b, b);
+            }
+        }
+        /* auto-disable after the first batch if pool is small */
+        m_bBatchMotionSearch &= m_pool->m_numWorkers >= 4;
+        estGroup.finishBatch();
+
+        if (m_bBatchFrameCosts)
+        {
+            /* pre-calculate all frame cost estimates, using many worker threads */
+            for (int b = 2; b < numFrames; b++)
+            {
+                for (int i = 1; i <= m_param->bframes + 1; i++)
+                {
+                    if (b < i)
+                        continue;
+
+                    for (int j = 0; j <= m_param->bframes; j++)
+                    {
+                        if (b + j < numFrames && frames[b]->costEst[i][j] < 0)
+                            estGroup.add(b - i, b + j, b);
+                    }
+                }
+            }
+
+            /* auto-disable after the first batch if the pool is not large */
+            m_bBatchFrameCosts &= m_pool->m_numWorkers > 12;
+            estGroup.finishBatch();
+        }
+    }
+
     int numBFrames = 0;
     int numAnalyzed = numFrames;
     if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch))
@@ -716,29 +1243,27 @@ void Lookahead::slicetypeAnalyse(Lowres 
                 char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" };
                 int best_path_index = numFrames % (X265_BFRAME_MAX + 1);
 
-                /* Perform the frametype analysis. */
+                /* Perform the frame type analysis. */
                 for (int j = 2; j <= numFrames; j++)
-                {
                     slicetypePath(frames, j, best_paths);
-                }
 
                 numBFrames = (int)strspn(best_paths[best_path_index], "B");
 
                 /* Load the results of the analysis into the frame types. */
                 for (int j = 1; j < numFrames; j++)
-                {
                     frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P;
-                }
             }
             frames[numFrames]->sliceType = X265_TYPE_P;
         }
         else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST)
         {
+            CostEstimateGroup estGroup(*this, frames);
+
             int64_t cost1p0, cost2p0, cost1b1, cost2p1;
 
             for (int i = 0; i <= numFrames - 2; )
             {
-                cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1);
+                cost2p1 = estGroup.singleCost(i + 0, i + 2, i + 2, true);
                 if (frames[i + 2]->intraMbs[2] > cuCount / 2)
                 {
                     frames[i + 1]->sliceType = X265_TYPE_P;
@@ -747,9 +1272,9 @@ void Lookahead::slicetypeAnalyse(Lowres 
                     continue;
                 }
 
-                cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0);
-                cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0);
-                cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0);
+                cost1b1 = estGroup.singleCost(i + 0, i + 2, i + 1);
+                cost1p0 = estGroup.singleCost(i + 0, i + 1, i + 1);
+                cost2p0 = estGroup.singleCost(i + 1, i + 2, i + 2);
 
                 if (cost1p0 + cost2p0 < cost1b1 + cost2p1)
                 {
@@ -767,7 +1292,7 @@ void Lookahead::slicetypeAnalyse(Lowres 
                 for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++)
                 {
                     int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10);
-                    int64_t pcost = m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1);
+                    int64_t pcost = estGroup.singleCost(i + 0, j + 1, j + 1, true);
                     if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3)
                         break;
                     frames[j]->sliceType = X265_TYPE_B;
@@ -779,20 +1304,17 @@ void Lookahead::slicetypeAnalyse(Lowres 
             frames[numFrames]->sliceType = X265_TYPE_P;
             numBFrames = 0;
             while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B)
-            {
                 numBFrames++;
-            }
         }
         else
         {
             numBFrames = X265_MIN(numFrames - 1, m_param->bframes);
             for (int j = 1; j < numFrames; j++)
-            {
                 frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P;
-            }
 
             frames[numFrames]->sliceType = X265_TYPE_P;
         }
+
         /* Check scenecut on the first minigop. */
         for (int j = 1; j < numBFrames + 1; j++)
         {
@@ -809,9 +1331,7 @@ void Lookahead::slicetypeAnalyse(Lowres 
     else
     {
         for (int j = 1; j <= numFrames; j++)
-        {
             frames[j]->sliceType = X265_TYPE_P;
-        }
 
         resetStart = bKeyframe ? 1 : 2;
     }
@@ -829,11 +1349,9 @@ void Lookahead::slicetypeAnalyse(Lowres 
     if (bIsVbvLookahead)
         vbvLookahead(frames, numFrames, bKeyframe);
 
-    /* Restore frametypes for all frames that haven't actually been decided yet. */
+    /* Restore frame types for all frames that haven't actually been decided yet. */
     for (int j = resetStart; j <= numFrames; j++)
-    {
         frames[j]->sliceType = X265_TYPE_AUTO;
-    }
 }
 
 bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch)
@@ -858,9 +1376,7 @@ bool Lookahead::scenecut(Lowres **frames
             if (!scenecutInternal(frames, p0, cp1, false))
                 /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */
                 for (int i = cp1; i > p0; i--)
-                {
                     frames[i]->bScenecut = false;
-                }
         }
 
         /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF
@@ -886,7 +1402,8 @@ bool Lookahead::scenecutInternal(Lowres 
 {
     Lowres *frame = frames[p1];
 
-    m_est.estimateFrameCost(frames, p0, p1, p1, 0);
+    CostEstimateGroup estGroup(*this, frames);
+    estGroup.singleCost(p0, p1, p1);
 
     int64_t icost = frame->costEst[0][0];
     int64_t pcost = frame->costEst[p1 - p0][0];
@@ -915,7 +1432,7 @@ bool Lookahead::scenecutInternal(Lowres 
     if (res && bRealScenecut)
     {
         int imb = frame->intraMbs[p1 - p0];
-        int pmb = NUM_CUS - imb;
+        int pmb = m_ncu - imb;
         x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n",
                  frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb);
     }
@@ -957,18 +1474,19 @@ int64_t Lookahead::slicetypePathCost(Low
     int loc = 1;
     int cur_p = 0;
 
+    CostEstimateGroup estGroup(*this, frames);
+
     path--; /* Since the 1st path element is really the second frame */
     while (path[loc])
     {
         int next_p = loc;
         /* Find the location of the next P-frame. */
         while (path[next_p] != 'P')
-        {
             next_p++;
-        }
 
         /* Add the cost of the P-frame found above */
-        cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0);
+        cost += estGroup.singleCost(cur_p, next_p, next_p);
+
         /* Early terminate if the cost we have found is larger than the best path cost so far */
         if (cost > threshold)
             break;
@@ -976,23 +1494,18 @@ int64_t Lookahead::slicetypePathCost(Low
         if (m_param->bBPyramid && next_p - cur_p > 2)
         {
             int middle = cur_p + (next_p - cur_p) / 2;
-            cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0);
+            cost += estGroup.singleCost(cur_p, next_p, middle);
+
             for (int next_b = loc; next_b < middle && cost < threshold; next_b++)
-            {
-                cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0);
-            }
+                cost += estGroup.singleCost(cur_p, middle, next_b);
 
             for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++)
-            {
-                cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0);
-            }
+                cost += estGroup.singleCost(middle, next_p, next_b);
         }
         else
         {
             for (int next_b = loc; next_b < next_p && cost < threshold; next_b++)
-            {
-                cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0);
-            }
+                cost += estGroup.singleCost(cur_p, next_p, next_b);
         }
 
         loc = next_p + 1;
@@ -1018,9 +1531,6 @@ void Lookahead::cuTree(Lowres **frames, 
     int i = numframes;
     int cuCount = m_widthInCU * m_heightInCU;
 
-    if (bIntra)
-        m_est.estimateFrameCost(frames, 0, 0, 0, 0);
-
     while (i > 0 && frames[i]->sliceType == X265_TYPE_B)
         i--;
 
@@ -1047,6 +1557,8 @@ void Lookahead::cuTree(Lowres **frames, 
         memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
     }
 
+    CostEstimateGroup estGroup(*this, frames);
+
     while (i-- > idx)
     {
         curnonb = i;
@@ -1056,13 +1568,14 @@ void Lookahead::cuTree(Lowres **frames, 
         if (curnonb < idx)
             break;
 
-        m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0);
+        estGroup.singleCost(curnonb, lastnonb, lastnonb);
+
         memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t));
         bframes = lastnonb - curnonb - 1;
         if (m_param->bBPyramid && bframes > 1)
         {
             int middle = (bframes + 1) / 2 + curnonb;
-            m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0);
+            estGroup.singleCost(curnonb, lastnonb, middle);
             memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t));
             while (i > curnonb)
             {
@@ -1070,7 +1583,7 @@ void Lookahead::cuTree(Lowres **frames, 
                 int p1 = i < middle ? middle : lastnonb;
                 if (i != middle)
                 {
-                    m_est.estimateFrameCost(frames, p0, p1, i, 0);
+                    estGroup.singleCost(p0, p1, i);
                     estimateCUPropagate(frames, averageDuration, p0, p1, i, 0);
                 }
                 i--;
@@ -1082,7 +1595,7 @@ void Lookahead::cuTree(Lowres **frames, 
         {
             while (i > curnonb)
             {
-                m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0);
+                estGroup.singleCost(curnonb, lastnonb, i);
                 estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0);
                 i--;
             }
@@ -1093,7 +1606,7 @@ void Lookahead::cuTree(Lowres **frames, 
 
     if (!m_param->lookaheadDepth)
     {
-        m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0);
+        estGroup.singleCost(0, lastnonb, lastnonb);
         estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1);
         std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost);
     }
@@ -1118,7 +1631,7 @@ void Lookahead::estimateCUPropagate(Lowr
     x265_emms();
     double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);
 
-    /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */
+    /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
     if (!referenced)
         memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t));
 
@@ -1132,6 +1645,7 @@ void Lookahead::estimateCUPropagate(Lowr
 
         if (referenced)
             propagateCost += m_widthInCU;
+
         for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++)
         {
             int32_t propagate_amount = m_scratch[blockx];
@@ -1211,8 +1725,8 @@ void Lookahead::cuTreeFinish(Lowres *fra
     if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0)
         weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]);
 
-    /* Allow the strength to be adjusted via qcompress, since the two
-     * concepts are very similar. */
+    /* Allow the strength to be adjusted via qcompress, since the two concepts
+     * are very similar. */
 
     int cuCount = m_widthInCU * m_heightInCU;
     double strength = 5.0 * (1.0 - m_param->rc.qCompress);
@@ -1260,557 +1774,307 @@ int64_t Lookahead::frameCostRecalculate(
     return score;
 }
 
-CostEstimate::CostEstimate(ThreadPool *p)
-    : WaveFront(p)
+
+int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty)
 {
-    m_param = NULL;
-    m_curframes = NULL;
-    m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0;
-    m_rows = NULL;
-    m_paddedLines = m_widthInCU = m_heightInCU = 0;
-    m_bDoSearch[0] = m_bDoSearch[1] = false;
-    m_curb = m_curp0 = m_curp1 = 0;
-    m_bFrameCompleted = false;
-}
-
-CostEstimate::~CostEstimate()
-{
-    for (int i = 0; i < 4; i++)
-        X265_FREE(m_wbuffer[i]);
-
-    delete[] m_rows;
+    LookaheadTLD& tld = m_lookahead.m_tld[m_lookahead.m_pool ? m_lookahead.m_pool->m_numWorkers : 0];
+    return estimateFrameCost(tld, p0, p1, b, intraPenalty);
 }
 
-void CostEstimate::init(x265_param *_param, Frame *curFrame)
+void CostEstimateGroup::add(int p0, int p1, int b, bool intraPenalty)
 {
-    m_param = _param;
-    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
-    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    X265_CHECK(m_batchMode || !m_jobTotal, "single CostEstimateGroup instance cannot mix batch modes\n");
+    m_batchMode = true;
 
-#if DETAILED_CU_STATS
-    m_processRowElapsedTime = 0;
-    m_countProcessRow = 0;
-#endif
+    Estimate& e = m_estimates[m_jobTotal++];
+    e.p0 = p0;
+    e.p1 = p1;
+    e.b = b;
+    e.bIntraPenalty = intraPenalty;
 
-    m_rows = new EstimateRow[m_heightInCU];
-    for (int i = 0; i < m_heightInCU; i++)
+    if (m_jobTotal == MAX_BATCH_SIZE)
+        finishBatch();
+}
+
+void CostEstimateGroup::finishBatch()
+{
+    if (m_lookahead.m_pool)
+        tryBondPeers(*m_lookahead.m_pool, m_jobTotal);
+    processTasks(-1);
+    waitForExit();
+    m_jobTotal = m_jobAcquired = 0;
+}
+
+void CostEstimateGroup::processTasks(int workerThreadID)
+{
+    ThreadPool* pool = m_lookahead.m_pool;
+    int id = workerThreadID;
+    if (workerThreadID < 0)
+        id = pool ? pool->m_numWorkers : 0;
+    LookaheadTLD& tld = m_lookahead.m_tld[id];
+
+    m_lock.acquire();
+    while (m_jobAcquired < m_jobTotal)
     {
-        m_rows[i].m_widthInCU = m_widthInCU;
-        m_rows[i].m_heightInCU = m_heightInCU;
-        m_rows[i].m_param = m_param;
-    }
-
-    if (WaveFront::init(m_heightInCU))
-        WaveFront::enableAllRows();
-    else
-        m_pool = NULL;
+        int i = m_jobAcquired++;
+        m_lock.release();
 
-    if (m_param->bEnableWeightedPred)
-    {
-        PicYuv *orig = curFrame->m_fencPic;
-        m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY;
-        intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX;
+        if (m_batchMode)
+        {
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(tld.batchElapsedTime);
+            tld.countBatches++;
+#endif
+            ProfileScopeEvent(estCostSingle);
+            Estimate& e = m_estimates[i];
 
-        /* allocate weighted lowres buffers */
-        for (int i = 0; i < 4; i++)
+            estimateFrameCost(tld, e.p0, e.p1, e.b, e.bIntraPenalty);
+        }
+        else
         {
-            m_wbuffer[i] = X265_MALLOC(pixel, curFrame->m_lowres.lumaStride * m_paddedLines);
-            m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset;
+#if DETAILED_CU_STATS
+            ScopedElapsedTime filterPerfScope(tld.coopSliceElapsedTime);
+            tld.countCoopSlices++;
+#endif
+            ProfileScopeEvent(estCostCoop);
+            X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");
+
+            int firstY = m_lookahead.m_numRowsPerSlice * i;
+            int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_heightInCU - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
+
+            bool lastRow = true;
+            for (int cuY = lastY; cuY >= firstY; cuY--)
+            {
+                m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
+
+                for (int cuX = m_lookahead.m_widthInCU - 1; cuX >= 0; cuX--)
+                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);
+
+                lastRow = false;
+            }
         }
 
-        m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0];
-        m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride;
-        m_weightedRef.isLowres = true;
-        m_weightedRef.isWeighted = false;
+        m_lock.acquire();
     }
+    m_lock.release();
 }
 
-int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty)
+int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty)
 {
-    int64_t score = 0;
-    Lowres *fenc = frames[b];
+    Lowres*     fenc  = m_frames[b];
+    x265_param* param = m_lookahead.m_param;
+    int64_t     score = 0;
 
     if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
         score = fenc->costEst[b - p0][p1 - b];
     else
     {
-        m_weightedRef.isWeighted = false;
-        if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF)
-        {
-            if (!fenc->bIntraCalculated)
-                estimateFrameCost(frames, b, b, b, 0);
-            weightsAnalyse(frames, b, p0);
-        }
+        X265_CHECK(p0 != b, "I frame estimates should always be pre-calculated\n");
 
-        /* For each list, check to see whether we have lowres motion-searched this reference */
-        m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
-        m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
+        bool bDoSearch[2];
+        bDoSearch[0] = p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
+        bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
 
-        if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
-        if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
+        tld.weightedRef.isWeighted = false;
+        if (param->bEnableWeightedPred && bDoSearch[0])
+            tld.weightsAnalyse(*m_frames[b], *m_frames[p0]);
 
-        m_curb = b;
-        m_curp0 = p0;
-        m_curp1 = p1;
-        m_curframes = frames;
         fenc->costEst[b - p0][p1 - b] = 0;
         fenc->costEstAq[b - p0][p1 - b] = 0;
 
-        for (int i = 0; i < m_heightInCU; i++)
+        if (!m_batchMode && m_lookahead.m_numCoopSlices > 1 && ((p1 > b) || bDoSearch[0] || bDoSearch[1]))
         {
-            m_rows[i].init();
-            if (!fenc->bIntraCalculated)
-                fenc->rowSatds[0][0][i] = 0;
-            fenc->rowSatds[b - p0][p1 - b][i] = 0;
-#if DETAILED_CU_STATS
-            m_rows[i].m_processRowElapsedTime = 0;
-            m_rows[i].m_countProcessRow = 0;
-#endif
-        }
+            /* Use cooperative mode if a thread pool is available and the cost estimate is
+             * going to need motion searches or bidir measurements */
 
-        m_bFrameCompleted = false;
-
-        if (m_pool)
-        {
-            WaveFront::enqueue();
+            memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices);
 
-            // enableAllRows must be already called
-            enqueueRow(0);
-            while (!m_bFrameCompleted)
-                WaveFront::findJob(-1);
+            m_lock.acquire();
+            X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n");
+            m_coop.p0 = p0;
+            m_coop.p1 = p1;
+            m_coop.b = b;
+            m_coop.bDoSearch[0] = bDoSearch[0];
+            m_coop.bDoSearch[1] = bDoSearch[1];
+            m_jobTotal = m_lookahead.m_numCoopSlices;
+            m_jobAcquired = 0;
+            m_lock.release();
 
-            WaveFront::dequeue();
+            tryBondPeers(*m_lookahead.m_pool, m_jobTotal);
+
+            processTasks(-1);
+
+            waitForExit();
+
+            for (int i = 0; i < m_lookahead.m_numCoopSlices; i++)
+            {
+                fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst;
+                fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq;
+                if (p1 == b)
+                    fenc->intraMbs[b - p0] += m_slice[i].intraMbs;
+            }
         }
         else
         {
-            for (int row = 0; row < m_heightInCU; row++)
-                processRow(row, -1);
+            bool lastRow = true;
+            for (int cuY = m_lookahead.m_heightInCU - 1; cuY >= 0; cuY--)
+            {
+                fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
 
-            x265_emms();
+                for (int cuX = m_lookahead.m_widthInCU - 1; cuX >= 0; cuX--)
+                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);
+
+                lastRow = false;
+            }
         }
 
-        // Accumulate cost from each row
-        for (int row = 0; row < m_heightInCU; row++)
-        {
-#if DETAILED_CU_STATS
-            m_processRowElapsedTime += m_rows[row].m_processRowElapsedTime;
-            m_countProcessRow += m_rows[row].m_countProcessRow;
-#endif
-            score += m_rows[row].m_costEst;
-            fenc->costEst[0][0] += m_rows[row].m_costIntra;
-            if (m_param->rc.aqMode)
-            {
-                fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq;
-                fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq;
-            }
-            fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs;
-        }
-
-        fenc->bIntraCalculated = true;
+        score = fenc->costEst[b - p0][p1 - b];
 
         if (b != p1)
-            score = (uint64_t)score * 100 / (130 + m_param->bFrameBias);
-        if (b != p0 || b != p1) //Not Intra cost
-            fenc->costEst[b - p0][p1 - b] = score;
+            score = score * 100 / (130 + param->bFrameBias);
+
+        fenc->costEst[b - p0][p1 - b] = score;
     }
 
     if (bIntraPenalty)
-    {
         // arbitrary penalty for I-blocks after B-frames
-        int ncu = NUM_CUS;
-        score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8);
-    }
+        score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8);
+
     return score;
 }
 
-uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp)
-{
-    Lowres *fenc = frames[b];
-    Lowres *ref  = frames[p0];
-    pixel *src = ref->fpelPlane[0];
-    intptr_t stride = fenc->lumaStride;
-
-    if (wp)
-    {
-        int offset = wp->inputOffset << (X265_DEPTH - 8);
-        int scale = wp->inputWeight;
-        int denom = wp->log2WeightDenom;
-        int round = denom ? 1 << (denom - 1) : 0;
-        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-        int widthHeight = (int)stride;
-
-        primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines,
-                             scale, round << correction, denom + correction, offset);
-        src = m_weightedRef.fpelPlane[0];
-    }
-
-    uint32_t cost = 0;
-    intptr_t pixoff = 0;
-    int mb = 0;
-
-    for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride)
-    {
-        for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8)
-        {
-            int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride);
-            cost += X265_MIN(satd, fenc->intraCost[mb]);
-        }
-    }
-
-    return cost;
-}
-
-void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0)
+void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
 {
-    static const float epsilon = 1.f / 128.f;
-    Lowres *fenc, *ref;
-
-    fenc = frames[b];
-    ref  = frames[p0];
-    int deltaIndex = fenc->frameNum - ref->frameNum;
-
-    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
-    float guessScale, fencMean, refMean;
-    x265_emms();
-    if (fenc->wp_ssd[0] && ref->wp_ssd[0])
-        guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]);
-    else
-        guessScale = 1.0f;
-    fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
-    refMean  = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8));
-
-    /* Early termination */
-    if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon)
-        return;
-
-    int minoff = 0, minscale, mindenom;
-    unsigned int minscore = 0, origscore = 1;
-    int found = 0;
-
-    m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true);
-    mindenom = m_w.log2WeightDenom;
-    minscale = m_w.inputWeight;
-
-    origscore = minscore = weightCostLuma(frames, b, p0, NULL);
-
-    if (!minscore)
-        return;
-
-    unsigned int s = 0;
-    int curScale = minscale;
-    int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f);
-    if (curOffset < -128 || curOffset > 127)
-    {
-        /* Rescale considering the constraints on curOffset. We do it in this order
-         * because scale has a much wider range than offset (because of denom), so
-         * it should almost never need to be clamped. */
-        curOffset = x265_clip3(-128, 127, curOffset);
-        curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f);
-        curScale = x265_clip3(0, 127, curScale);
-    }
-    SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset);
-    s = weightCostLuma(frames, b, p0, &m_w);
-    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1);
+    Lowres *fref0 = m_frames[p0];
+    Lowres *fref1 = m_frames[p1];
+    Lowres *fenc  = m_frames[b];
 
-    /* Use a smaller denominator if possible */
-    while (mindenom > 0 && !(minscale & 1))
-    {
-        mindenom--;
-        minscale >>= 1;
-    }
-
-    if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f)
-        return;
-    else
-    {
-        SET_WEIGHT(m_w, 1, minscale, mindenom, minoff);
-        // set weighted delta cost
-        fenc->weightedCostDelta[deltaIndex] = minscore / origscore;
-
-        int offset = m_w.inputOffset << (X265_DEPTH - 8);
-        int scale = m_w.inputWeight;
-        int denom = m_w.log2WeightDenom;
-        int round = denom ? 1 << (denom - 1) : 0;
-        int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-        intptr_t stride = ref->lumaStride;
-        int widthHeight = (int)stride;
-
-        for (int i = 0; i < 4; i++)
-            primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines,
-                                 scale, round << correction, denom + correction, offset);
-
-        m_weightedRef.isWeighted = true;
-    }
-}
-
-void CostEstimate::processRow(int row, int /*threadId*/)
-{
-    ProfileScopeEvent(costEstimateRow);
-#if DETAILED_CU_STATS
-    ScopedElapsedTime filterPerfScope(m_processRowElapsedTime);
-    m_countProcessRow++;
-#endif
-
-    int realrow = m_heightInCU - 1 - row;
-    Lowres **frames = m_curframes;
-    ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0];
+    ReferencePlanes *wfref0 = tld.weightedRef.isWeighted ? &tld.weightedRef : fref0;
 
-    /* Lowres lookahead goes backwards because the MVs are used as
-     * predictors in the main encode.  This considerably improves MV
-     * prediction overall. */
-    for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--)
-    {
-        // TODO: use lowres MVs as motion candidates in full-res search
-        m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch);
-        m_rows[row].m_completed++;
-
-        if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1)
-        {
-            ScopedLock below(m_rows[row + 1].m_lock);
-            if (m_rows[row + 1].m_active == false &&
-                m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed)
-            {
-                m_rows[row + 1].m_active = true;
-                enqueueRow(row + 1);
-            }
-        }
-
-        ScopedLock self(m_rows[row].m_lock);
-        if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 &&
-            m_rows[row - 1].m_completed < m_rows[row].m_completed + 2)
-        {
-            m_rows[row].m_active = false;
-            return;
-        }
-    }
+    const int widthInCU = m_lookahead.m_widthInCU;
+    const int heightInCU = m_lookahead.m_heightInCU;
+    const int bBidir = (b < p1);
+    const int cuXY = cuX + cuY * widthInCU;
+    const int cuSize = X265_LOWRES_CU_SIZE;
+    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
 
-    if (row == m_heightInCU - 1)
-        m_bFrameCompleted = true;
-}
-
-void EstimateRow::init()
-{
-    m_costEst = 0;
-    m_costEstAq = 0;
-    m_costIntra = 0;
-    m_costIntraAq = 0;
-    m_intraMbs = 0;
-    m_active = false;
-    m_completed = 0;
-}
-
-void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2])
-{
-    Lowres *fref1 = frames[p1];
-    Lowres *fenc  = frames[b];
-
-    const int bBidir = (b < p1);
-    const int cuXY = cux + cuy * m_widthInCU;
-    const int cuSize = X265_LOWRES_CU_SIZE;
-    const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride;
-
-    // should this CU's cost contribute to the frame cost?
-    const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 &&
-                                cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2;
-
-    m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
+    if (bBidir || bDoSearch[0] || bDoSearch[1])
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize);
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;
 
-    MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
-                         &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
-    int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
-                            &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
+    MV(*fencMVs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY],
+                        &fenc->lowresMvs[1][p1 - b - 1][cuXY] };
+    int(*fencCosts[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY],
+                           &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] };
 
     MV mvmin, mvmax;
-    int bcost = m_me.COST_MAX;
+    int bcost = tld.me.COST_MAX;
     int listused = 0;
 
     // establish search bounds that don't cross extended frame boundaries
-    mvmin.x = (int16_t)(-cux * cuSize - 8);
-    mvmin.y = (int16_t)(-cuy * cuSize - 8);
-    mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8);
-    mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8);
+    mvmin.x = (int16_t)(-cuX * cuSize - 8);
+    mvmin.y = (int16_t)(-cuY * cuSize - 8);
+    mvmax.x = (int16_t)((widthInCU - cuX - 1) * cuSize + 8);
+    mvmax.y = (int16_t)((heightInCU - cuY - 1) * cuSize + 8);
 
-    if (p0 != p1)
+    for (int i = 0; i < 1 + bBidir; i++)
     {
-        for (int i = 0; i < 1 + bBidir; i++)
+        if (!bDoSearch[i])
         {
-            if (!bDoSearch[i])
-            {
-                /* Use previously calculated cost */
-                COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
-                continue;
-            }
-            int numc = 0;
-            MV mvc[4], mvp;
-            MV *fenc_mv = fenc_mvs[i];
+            COPY2_IF_LT(bcost, *fencCosts[i], listused, i + 1);
+            continue;
+        }
 
-            /* Reverse-order MV prediction. */
-            mvc[0] = 0;
-            mvc[2] = 0;
+        int numc = 0;
+        MV mvc[4], mvp;
+        MV *fencMV = fencMVs[i];
+
+        /* Reverse-order MV prediction */
+        mvc[0] = 0;
+        mvc[2] = 0;
 #define MVC(mv) mvc[numc++] = mv;
-            if (cux < m_widthInCU - 1)
-                MVC(fenc_mv[1]);
-            if (cuy < m_heightInCU - 1)
-            {
-                MVC(fenc_mv[m_widthInCU]);
-                if (cux > 0)
-                    MVC(fenc_mv[m_widthInCU - 1]);
-                if (cux < m_widthInCU - 1)
-                    MVC(fenc_mv[m_widthInCU + 1]);
-            }
+        if (cuX < widthInCU - 1)
+            MVC(fencMV[1]);
+        if (!lastRow)
+        {
+            MVC(fencMV[widthInCU]);
+            if (cuX > 0)
+                MVC(fencMV[widthInCU - 1]);
+            if (cuX < widthInCU - 1)
+                MVC(fencMV[widthInCU + 1]);
+        }
 #undef MVC
-            if (numc <= 1)
-                mvp = mvc[0];
-            else
-            {
-                median_mv(mvp, mvc[0], mvc[1], mvc[2]);
-            }
+        if (numc <= 1)
+            mvp = mvc[0];
+        else
+            median_mv(mvp, mvc[0], mvc[1], mvc[2]);
 
-            *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]);
-            COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1);
-        }
-        if (bBidir)
-        {
-            ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-            ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-            intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
-            pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0);
-            pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1);
-
-            ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-            primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
-            int bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
-            COPY2_IF_LT(bcost, bicost, listused, 3);
-
-            // Try 0,0 candidates
-            src0 = wfref0->lowresPlane[0] + pelOffset;
-            src1 = fref1->lowresPlane[0] + pelOffset;
-            primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32);
-            bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE);
-            COPY2_IF_LT(bcost, bicost, listused, 3);
-        }
+        *fencCosts[i] = tld.me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, s_merange, *fencMVs[i]);
+        COPY2_IF_LT(bcost, *fencCosts[i], listused, i + 1);
     }
 
-    if (!fenc->bIntraCalculated)
+    if (bBidir) /* B, also consider bidir */
     {
-        ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
-        pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
-        const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
-        const int cuSize2 = cuSize << 1;
-
-        pixel *pixCur = fenc->lowresPlane[0] + pelOffset;
-
-        // Copy Above
-        memcpy(neighbours[0], pixCur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
-
-        // Copy Left
-        for (int i = 1; i < cuSize + 1; i++)
-            neighbours[0][i + cuSize2] = pixCur[-1 - fenc->lumaStride + i * fenc->lumaStride];
-
-        for (int i = 0; i < cuSize; i++)
-        {
-            // Copy above-last pixel
-            neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; //neighbours[0][i + 9] = neighbours[0][8]
-            // Copy left-last pixel
-            neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; //neighbours[0][i + 25] = neighbours[0][24]
-        }
-
-        // Filter neighbour pixels with [1-2-1]
-        neighbours[1][0]  = neighbours[0][0];  // Copy top-left pixel 
-        neighbours[1][cuSize2] = neighbours[0][cuSize2]; //Copy top-right pixel
-        neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
-
-        neighbours[1][1]           = (neighbours[0][0] + (neighbours[0][1] << 1)           + neighbours[0][2] + 2)               >> 2;
-        neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
-        for (int i = 2; i < cuSize2; i++)
-        {
-            neighbours[1][i]           = (neighbours[0][i - 1]      + (neighbours[0][i] << 1)      + neighbours[0][i + 1]      + 2) >> 2;
-            neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
-        }
-
-        int icost = m_me.COST_MAX, ilowmode;
-        primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16));
-        int cost = m_me.bufSATD(prediction, cuSize);
-        COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
-
-        pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
-        primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
-        cost = m_me.bufSATD(prediction, cuSize);
-        COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
+        /* NOTE: the wfref0 (weightp) is not used for BIDIR */
 
-        uint32_t mode, lowmode = 4;
-        int acost = m_me.COST_MAX, filter;
-        for (mode = 5; mode < 35; mode += 5)
-        {
-            filter = !!(g_intraFilterFlags[mode] & cuSize);
-            primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
-            cost = m_me.bufSATD(prediction, cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-        }
-        for (uint32_t dist = 2; dist >= 1; dist--)
-        {
-            int minusmode = lowmode - dist;
-            int plusmode = lowmode + dist;
-
-            mode = minusmode;
-            filter = !!(g_intraFilterFlags[mode] & cuSize);
-            primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
-            cost = m_me.bufSATD(prediction, cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
+        /* avg(l0-mv, l1-mv) candidate */
+        ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+        ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+        intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
+        pixel *src0 = fref0->lowresMC(pelOffset, *fencMVs[0], subpelbuf0, stride0);
+        pixel *src1 = fref1->lowresMC(pelOffset, *fencMVs[1], subpelbuf1, stride1);
 
-            mode = plusmode;
-            filter = !!(g_intraFilterFlags[mode] & cuSize);
-            primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16);
-            cost = m_me.bufSATD(prediction, cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-        }
-        COPY2_IF_LT(icost, acost, ilowmode, lowmode);
-
-        const int intraPenalty = 5 * m_lookAheadLambda;
-        icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
-        fenc->intraCost[cuXY] = icost;
-        fenc->intraMode[cuXY] = (uint8_t)ilowmode;
+        ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
+        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
+        int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
+        COPY2_IF_LT(bcost, bicost, listused, 3);
 
-        int icostAq = icost;
-        if (bFrameScoreCU)
-        {
-            m_costIntra += icost;
-            if (fenc->invQscaleFactor)
-            {
-                icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
-                m_costIntraAq += icostAq;
-            }
-        }
-        fenc->rowSatds[0][0][cuy] += icostAq;
+        /* coloc candidate */
+        src0 = fref0->lowresPlane[0] + pelOffset;
+        src1 = fref1->lowresPlane[0] + pelOffset;
+        primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32);
+        bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
+        COPY2_IF_LT(bcost, bicost, listused, 3);
+
+        bcost += lowresPenalty;
     }
-    bcost += lowresPenalty;
-    if (!bBidir)
+    else /* P, also consider intra */
     {
+        bcost += lowresPenalty;
+
         if (fenc->intraCost[cuXY] < bcost)
         {
-            if (bFrameScoreCU) m_intraMbs++;
             bcost = fenc->intraCost[cuXY];
             listused = 0;
         }
     }
 
-    /* For I frames these costs were accumulated earlier */
-    if (p0 != p1)
+    /* do not include edge blocks in the frame cost estimates, they are not very accurate */
+    const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
+                                cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
+
+    int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8) : bcost;
+
+    if (bFrameScoreCU)
     {
-        int bcostAq = bcost;
-        if (bFrameScoreCU)
+        if (slice < 0)
         {
-            m_costEst += bcost;
-            if (fenc->invQscaleFactor)
-            {
-                bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8;
-                m_costEstAq += bcostAq;
-            }
+            fenc->costEst[b - p0][p1 - b] += bcost;
+            fenc->costEstAq[b - p0][p1 - b] += bcostAq;
+            if (!listused && !bBidir)
+                fenc->intraMbs[b - p0]++;
         }
-        fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq;
+        else
+        {
+            m_slice[slice].costEst += bcost;
+            m_slice[slice].costEstAq += bcostAq;
+            if (!listused && !bBidir)
+                m_slice[slice].intraMbs++;
+        }
     }
+
+    fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq;
     fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT));
 }
--- a/source/encoder/slicetype.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/encoder/slicetype.h	Tue Feb 24 15:34:32 2015 +0530
@@ -28,148 +28,135 @@
 #include "slice.h"
 #include "motion.h"
 #include "piclist.h"
-#include "wavefront.h"
+#include "threadpool.h"
 
 namespace x265 {
 // private namespace
 
 struct Lowres;
 class Frame;
+class Lookahead;
 
 #define LOWRES_COST_MASK  ((1 << 14) - 1)
 #define LOWRES_COST_SHIFT 14
 
-class EstimateRow
+/* Thread local data for lookahead tasks */
+struct LookaheadTLD
 {
-public:
-    x265_param*         m_param;
-    MotionEstimate      m_me;
-    Lock                m_lock;
-
-    volatile uint32_t   m_completed;      // Number of CUs in this row for which cost estimation is completed
-    volatile bool       m_active;
-
-    uint64_t            m_costEst;        // Estimated cost for all CUs in a row
-    uint64_t            m_costEstAq;      // Estimated weight Aq cost for all CUs in a row
-    uint64_t            m_costIntraAq;    // Estimated weighted Aq Intra cost for all CUs in a row
-    int                 m_intraMbs;       // Number of Intra CUs
-    int                 m_costIntra;      // Estimated Intra cost for all CUs in a row
-
-    int                 m_merange;
-    int                 m_lookAheadLambda;
-
-    int                 m_widthInCU;
-    int                 m_heightInCU;
+    MotionEstimate  me;
+    ReferencePlanes weightedRef;
+    pixel*          wbuffer[4];
+    int             widthInCU;
+    int             heightInCU;
+    int             ncu;
+    int             paddedLines;
 
 #if DETAILED_CU_STATS
-    int64_t             m_processRowElapsedTime;
-    uint64_t            m_countProcessRow;
+    int64_t         batchElapsedTime;
+    int64_t         coopSliceElapsedTime;
+    uint64_t        countBatches;
+    uint64_t        countCoopSlices;
 #endif
 
-    EstimateRow()
+    LookaheadTLD()
     {
-        m_me.setQP(X265_LOOKAHEAD_QP);
-        m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
-        m_merange = 16;
-        m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
+        me.setQP(X265_LOOKAHEAD_QP);
+        me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
+        for (int i = 0; i < 4; i++)
+            wbuffer[i] = NULL;
+        widthInCU = heightInCU = ncu = paddedLines = 0;
+
+#if DETAILED_CU_STATS
+        batchElapsedTime = 0;
+        coopSliceElapsedTime = 0;
+        countBatches = 0;
+        countCoopSlices = 0;
+#endif
     }
 
-    void init();
-
-    void estimateCUCost(Lowres * *frames, ReferencePlanes * wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]);
-};
-
-/* CostEstimate manages the cost estimation of a single frame, ie:
- * estimateFrameCost() and everything below it in the call graph */
-class CostEstimate : public WaveFront
-{
-public:
-    CostEstimate(ThreadPool *p);
-    ~CostEstimate();
-    void init(x265_param *, Frame *);
+    void init(int w, int h, int n)
+    {
+        widthInCU = w;
+        heightInCU = h;
+        ncu = n;
+    }
 
-    x265_param      *m_param;
-    EstimateRow     *m_rows;
-    pixel           *m_wbuffer[4];
-    Lowres         **m_curframes;
-
-    ReferencePlanes  m_weightedRef;
-    WeightParam      m_w;
+    ~LookaheadTLD() { X265_FREE(wbuffer[0]); }
 
-    int              m_paddedLines;     // number of lines in padded frame
-    int              m_widthInCU;       // width of lowres frame in downscale CUs
-    int              m_heightInCU;      // height of lowres frame in downscale CUs
+    void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void lowresIntraEstimate(Lowres& fenc);
 
-    bool             m_bDoSearch[2];
-    volatile bool    m_bFrameCompleted;
-    int              m_curb, m_curp0, m_curp1;
-
-#if DETAILED_CU_STATS
-    int64_t          m_processRowElapsedTime;
-    uint64_t         m_countProcessRow;
-#endif
-
-    void     processRow(int row, int threadId);
-    int64_t  estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty);
+    void weightsAnalyse(Lowres& fenc, Lowres& ref);
 
 protected:
 
-    void     weightsAnalyse(Lowres **frames, int b, int p0);
-    uint32_t weightCostLuma(Lowres **frames, int b, int p0, WeightParam *w);
+    uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp);
+    uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
+    bool     allocWeightedRef(Lowres& fenc);
 };
 
 class Lookahead : public JobProvider
 {
 public:
 
-    Lookahead(x265_param *param, ThreadPool *pool);
-    ~Lookahead();
-    void init();
-    void destroy();
+    PicList       m_inputQueue;      // input pictures in order received
+    PicList       m_outputQueue;     // pictures to be encoded, in encode order
+    Lock          m_inputLock;
+    Lock          m_outputLock;
 
-    CostEstimate     m_est;             // Frame cost estimator
-    PicList          m_inputQueue;      // input pictures in order received
-    PicList          m_outputQueue;     // pictures to be encoded, in encode order
+    /* pre-lookahead */
+    Frame*        m_preframes[X265_LOOKAHEAD_MAX];
+    int           m_preTotal, m_preAcquired, m_preCompleted;
+    int           m_fullQueueSize;
+    bool          m_isActive;
+    bool          m_sliceTypeBusy;
+    bool          m_bAdaptiveQuant;
+    bool          m_outputSignalRequired;
+    bool          m_bBatchMotionSearch;
+    bool          m_bBatchFrameCosts;
+    Lock          m_preLookaheadLock;
+    Event         m_outputSignal;
 
-    x265_param      *m_param;
-    Lowres          *m_lastNonB;
-    int             *m_scratch;         // temp buffer
+    LookaheadTLD* m_tld;
+    x265_param*   m_param;
+    Lowres*       m_lastNonB;
+    int*          m_scratch;         // temp buffer for cutree propagate
+    
+    int           m_histogram[X265_BFRAME_MAX + 1];
+    int           m_lastKeyframe;
+    int           m_widthInCU;
+    int           m_heightInCU;
+    int           m_ncu;
+    int           m_numCoopSlices;
+    int           m_numRowsPerSlice;
+    bool          m_filled;
 
-    int              m_widthInCU;       // width of lowres frame in downscale CUs
-    int              m_heightInCU;      // height of lowres frame in downscale CUs
-    int              m_lastKeyframe;
-    int              m_histogram[X265_BFRAME_MAX + 1];
+    Lookahead(x265_param *param, ThreadPool *pool);
 
 #if DETAILED_CU_STATS
-    int64_t          m_slicetypeDecideElapsedTime;
-    uint64_t         m_countSlicetypeDecide;
-    bool             usingWorkerThreads() const { return !!m_pool; }
+    int64_t       m_slicetypeDecideElapsedTime;
+    int64_t       m_preLookaheadElapsedTime;
+    uint64_t      m_countSlicetypeDecide;
+    uint64_t      m_countPreLookahead;
+    void          getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount);
 #endif
 
-    void addPicture(Frame*, int sliceType);
-    void flush();
-    void stop();
-    Frame* getDecidedPicture();
+    bool    create();
+    void    destroy();
+    void    stop();
 
-    void getEstimatedPictureCost(Frame *pic);
+    void    addPicture(Frame&, int sliceType);
+    void    flush();
+    Frame*  getDecidedPicture();
+
+    void    getEstimatedPictureCost(Frame *pic);
+
 
 protected:
 
-    Lock  m_inputQueueLock;
-    Lock  m_outputQueueLock;
-    Event m_outputAvailable;
-
-    bool  m_bReady;   /* input lock - slicetypeDecide() can be started */
-    bool  m_bBusy;    /* input lock - slicetypeDecide() is running */
-    bool  m_bFilled;  /* enough frames in lookahead for output to be available */
-    bool  m_bFlushed; /* all frames have been decided, lookahead is finished */
-    bool  m_bFlush;   /* no more frames will be received, empty the input queue */
-
-    bool  findJob(int);
-
-    /* called by addPicture() or flush() to trigger slice decisions */
-    void slicetypeDecide();
-    void slicetypeAnalyse(Lowres **frames, bool bKeyframe);
+    void    findJob(int workerThreadID);
+    void    slicetypeDecide();
+    void    slicetypeAnalyse(Lowres **frames, bool bKeyframe);
 
     /* called by slicetypeAnalyse() to make slice decisions */
     bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch);
@@ -181,13 +168,64 @@ protected:
 
     /* called by slicetypeAnalyse() to effect cuTree adjustments to adaptive
      * quant offsets */
-    void cuTree(Lowres **frames, int numframes, bool bintra);
-    void estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
-    void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
+    void    cuTree(Lowres **frames, int numframes, bool bintra);
+    void    estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
+    void    cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
 
     /* called by getEstimatedPictureCost() to finalize cuTree costs */
     int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
 };
+
+class CostEstimateGroup : public BondedTaskGroup
+{
+public:
+
+    Lookahead& m_lookahead;
+    Lowres**   m_frames;
+    bool       m_batchMode;
+
+    CostEstimateGroup(Lookahead& l, Lowres** f) : m_lookahead(l), m_frames(f), m_batchMode(false) {}
+
+    /* Cooperative cost estimate using multiple slices of downscaled frame */
+    struct Coop
+    {
+        int  p0, b, p1;
+        bool bDoSearch[2];
+    } m_coop;
+
+    enum { MAX_COOP_SLICES = 32 };
+    struct Slice
+    {
+        int  costEst;
+        int  costEstAq;
+        int  intraMbs;
+    } m_slice[MAX_COOP_SLICES];
+
+    int64_t singleCost(int p0, int p1, int b, bool intraPenalty = false);
+
+    /* Batch cost estimates, using one worker thread per estimateFrameCost() call */
+    enum { MAX_BATCH_SIZE = 2048 };
+    struct Estimate
+    {
+        int  p0, b, p1;
+        bool bIntraPenalty;
+    } m_estimates[MAX_BATCH_SIZE];
+
+    void add(int p0, int p1, int b, bool intraPenalty = false);
+    void finishBatch();
+
+protected:
+
+    static const int s_merange = 16;
+
+    void    processTasks(int workerThreadID);
+
+    int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);
+    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
+
+    CostEstimateGroup& operator=(const CostEstimateGroup&);
+};
+
 }
 
 #endif // ifndef X265_SLICETYPE_H
--- a/source/profile/cpuEvents.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/profile/cpuEvents.h	Tue Feb 24 15:34:32 2015 +0530
@@ -5,6 +5,7 @@ CPU_EVENT(encodeCTU)
 CPU_EVENT(filterCTURow)
 CPU_EVENT(slicetypeDecideEV)
 CPU_EVENT(prelookahead)
-CPU_EVENT(costEstimateRow)
+CPU_EVENT(estCostSingle)
+CPU_EVENT(estCostCoop)
 CPU_EVENT(pmode)
 CPU_EVENT(pme)
--- a/source/test/pixelharness.cpp	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/test/pixelharness.cpp	Tue Feb 24 15:34:32 2015 +0530
@@ -1696,7 +1696,7 @@ void PixelHarness::measurePartition(int 
     if (opt.pu[part].copy_pp)
     {
         HEADER("copy_pp[%s]", lumaPartStr[part]);
-        REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 128);
+        REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 64);
     }
 
     if (opt.pu[part].addAvg)
@@ -1731,7 +1731,7 @@ void PixelHarness::measurePartition(int 
         if (opt.cu[part].copy_ss)
         {
             HEADER("copy_ss[%s]", lumaPartStr[part]);
-            REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 64, sbuf2, 128);
+            REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 128, sbuf2, 128);
         }
         if (opt.cu[part].copy_sp)
         {
@@ -1741,7 +1741,7 @@ void PixelHarness::measurePartition(int 
         if (opt.cu[part].copy_ps)
         {
             HEADER("copy_ps[%s]", lumaPartStr[part]);
-            REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 64, pbuf1, 128);
+            REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 128, pbuf1, 64);
         }
     }
 
--- a/source/x265.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/x265.h	Tue Feb 24 15:34:32 2015 +0530
@@ -369,13 +369,53 @@ typedef struct x265_param
      * rate-control can be negatively impacted by increases to the number of
      * frame threads because the extra concurrency adds uncertainty to the
      * bitrate estimations. Frame parallelism is generally limited by the the
-     * number of CTU rows */
+     * number of CU rows.
+     *
+     * When thread pools are used, each frame thread is assigned to a single
+     * pool and the frame thread itself is given the node affinity of its pool.
+     * But when no thread pools are used no node affinity is assigned. */
     int       frameNumThreads;
 
-    /* Number of threads to allocate for the process global thread pool, if no
-     * thread pool has yet been created. 0 implies auto-detection. By default
-     * x265 will try to allocate one worker thread per CPU core */
-    int       poolNumThreads;
+    /* Comma separated list of threads per NUMA node. If "none", then no worker
+     * pools are created and only frame parallelism is possible. If NULL or ""
+     * (default) x265 will use all available threads on each NUMA node.
+     *
+     * '+'  is a special value indicating all cores detected on the node
+     * '*'  is a special value indicating all cores detected on the node and all
+     *      remaining nodes.
+     * '-'  is a special value indicating no cores on the node, same as '0'
+     *
+     * example strings for a 4-node system:
+     *   ""        - default, unspecified, all numa nodes are used for thread pools
+     *   "*"       - same as default
+     *   "none"    - no thread pools are created, only frame parallelism possible
+     *   "-"       - same as "none"
+     *   "10"      - allocate one pool, using up to 10 cores on node 0
+     *   "-,+"     - allocate one pool, using all cores on node 1
+     *   "+,-,+"   - allocate two pools, using all cores on nodes 0 and 2
+     *   "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2
+     *   "-,*"     - allocate three pools, using all cores on nodes 1, 2 and 3
+     *   "8,8,8,8" - allocate four pools with up to 8 threads in each pool
+     *
+     * The total number of threads will be determined by the number of threads
+     * assigned to all nodes. The worker threads will each be given affinity for
+     * their node, they will not be allowed to migrate between nodes, but they
+     * will be allowed to move between CPU cores within their node.
+     *
+     * If the three pool features: bEnableWavefront, bDistributeModeAnalysis and
+     * bDistributeMotionEstimation are all disabled, then numaPools is ignored
+     * and no thread pools are created.
+     *
+     * If "none" is specified, then all three of the thread pool features are
+     * implicitly disabled.
+     *
+     * Multiple thread pools will be allocated for any NUMA node with more than
+     * 64 logical CPU cores. But any given thread pool will always use at most
+     * one NUMA node.
+     *
+     * Frame encoders are distributed between the available thread pools, and
+     * the encoder will never generate more thread pools than frameNumThreads */
+    char*     numaPools;
 
     /* Enable wavefront parallel processing, greatly increases parallelism for
      * less than 1% compression efficiency loss. Requires a thread pool, enabled
--- a/source/x265cli.h	Mon Feb 23 15:28:01 2015 +0530
+++ b/source/x265cli.h	Tue Feb 24 15:34:32 2015 +0530
@@ -37,7 +37,8 @@ static const struct option long_options[
     { "version",              no_argument, NULL, 'V' },
     { "asm",            required_argument, NULL, 0 },
     { "no-asm",               no_argument, NULL, 0 },
-    { "threads",        required_argument, NULL, 0 },
+    { "pools",          required_argument, NULL, 0 },
+    { "numa-pools",     required_argument, NULL, 0 },
     { "preset",         required_argument, NULL, 'p' },
     { "tune",           required_argument, NULL, 't' },
     { "frame-threads",  required_argument, NULL, 'F' },
@@ -253,7 +254,8 @@ static void showHelp(x265_param *param)
     H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
     H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
     H0("\nThreading, performance:\n");
-    H0("   --threads <integer>           Number of threads for thread pool (0: detect CPU core count, default)\n");
+    H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
+    H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
     H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
     H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
     H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));