changeset 12556:f5d756344566 stable

Merge with default
author Aruna Matheswaran <aruna@multicorewareinc.com>
date Thu, 12 Sep 2019 16:34:00 +0530
parents 9c5c9cef2a86 (current diff) c4b098f973e6 (diff)
children 5f18017a2a08
files
diffstat 29 files changed, 887 insertions(+-), 247 deletions(-) [+]
line wrap: on
line diff
--- a/doc/reST/cli.rst	Mon Jul 08 16:43:46 2019 +0530
+++ b/doc/reST/cli.rst	Thu Sep 12 16:34:00 2019 +0530
@@ -1,3 +1,4 @@
+
 *********************
 Command Line Options
 *********************
@@ -996,11 +997,14 @@ will not reuse analysis if slice type pa
 	the encoder settings. It is recommended to use :option:`--refine-intra` 4 with dynamic 
 	refinement. Default disabled.
 
-.. option:: --refine-mv
-	
+.. option:: --refine-mv <0..3>
+
 	Enables refinement of motion vector for scaled video. Evaluates the best 
-	motion vector by searching the surrounding eight integer and subpel pixel
-	positions.
+	motion vector based on the level selected. Default 0 - disabled.
+
+	| Level 1 - Search around scaled MV.
+	| Level 2 - Level 1 + Search around best AMVP candidate.
+	| Level 3 - Level 2 + Search around the other AMVP candidate.
 
 Options which affect the transform unit quad-tree, sometimes referred to
 as the residual quad-tree (RQT).
@@ -1261,6 +1265,18 @@ Temporal / motion search options
 	Enable motion estimation with source frame pixels, in this mode, 
 	motion estimation can be computed independently. Default disabled.
 
+.. option:: --hme, --no-hme
+
+       Enable 3-level hierarchical motion estimation (HME) at one-sixteenth,
+       quarter and full resolution. Default disabled.
+
+.. option:: --hme-search <integer|string>,<integer|string>,<integer|string>
+
+       Motion search method for HME Level 0, 1 and 2. Refer to :option:`--me` for values.
+       Specify search method for each level. Alternatively, specify a single value
+       which will apply to all levels. Default is hex,umh,umh for 
+       levels 0,1,2 respectively.
+
 Spatial/intra options
 =====================
 
@@ -1633,7 +1649,7 @@ Quality, rate control and rate distortio
 	ignored. Slower presets will generally achieve better compression
 	efficiency (and generate smaller bitstreams). Default disabled.
 
-.. option:: --aq-mode <0|1|2|3>
+.. option:: --aq-mode <0|1|2|3|4>
 
 	Adaptive Quantization operating mode. Raise or lower per-block
 	quantization based on complexity analysis of the source image. The
@@ -1647,6 +1663,7 @@ Quality, rate control and rate distortio
 	3. AQ enabled with auto-variance and bias to dark scenes. This is 
 	recommended for 8-bit encodes or low-bitrate 10-bit encodes, to 
 	prevent color banding/blocking. 
+	4. AQ enabled with auto-variance and edge information.
 
 .. option:: --aq-strength <float>
 
@@ -1979,6 +1996,24 @@ Loop filters
 	on inter prediction mode, CTU spatial-domain correlations, and relations
 	between luma and chroma.
 	Default disabled
+	
+.. option:: --selective-sao <0..4>
+
+	Toggles SAO at slice level. Default 4.
+
+	+--------------+---------------------------------------+
+	|     Level    |              Description              |     
+	+==============+=======================================+
+	|      0       | Disable SAO for all slices            |
+	+--------------+---------------------------------------+
+	|      1       | Enable SAO only for I-slices          |
+	+--------------+---------------------------------------+
+	|      2       | Enable SAO for I-slices & P-slices    |
+	+--------------+---------------------------------------+
+	|      3       | Enable SAO for all reference slices   |
+	+--------------+---------------------------------------+
+	|      4       | Enable SAO for all slices             |
+	+--------------+---------------------------------------+
 
 VUI (Video Usability Information) options
 =========================================
--- a/source/CMakeLists.txt	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/CMakeLists.txt	Thu Sep 12 16:34:00 2019 +0530
@@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CP
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 176)
+set(X265_BUILD 179)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
--- a/source/common/lowres.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/lowres.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -55,6 +55,7 @@ bool Lowres::create(x265_param* param, P
     heightFullRes = origPic->m_picHeight;
     width = origPic->m_picWidth / 2;
     lines = origPic->m_picHeight / 2;
+    bEnableHME = param->bEnableHME ? 1 : 0;
     lumaStride = width + 2 * origPic->m_lumaMarginX;
     if (lumaStride & 31)
         lumaStride += 32 - (lumaStride & 31);
@@ -64,6 +65,7 @@ bool Lowres::create(x265_param* param, P
     maxBlocksInColFullRes = maxBlocksInCol * 2;
     int cuCount = maxBlocksInRow * maxBlocksInCol;
     int cuCountFullRes = (qgSize > 8) ? cuCount : cuCount << 2;
+    isHMELowres = param->bEnableHME ? 1 : 0;
 
     /* rounding the width to multiple of lowres CU size */
     width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -78,6 +80,7 @@ bool Lowres::create(x265_param* param, P
         CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes);
         if (qgSize == 8)
             CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount);
+        CHECKED_MALLOC_ZERO(edgeInclined, int, cuCountFullRes);
     }
 
     if (origPic->m_param->bAQMotion)
@@ -137,6 +140,26 @@ bool Lowres::create(x265_param* param, P
     lowresPlane[2] = buffer[2] + padoffset;
     lowresPlane[3] = buffer[3] + padoffset;
 
+    if (bEnableHME)
+    {
+        intptr_t lumaStrideHalf = lumaStride / 2;
+        if (lumaStrideHalf & 31)
+            lumaStrideHalf += 32 - (lumaStrideHalf & 31);
+        size_t planesizeHalf = planesize / 2;
+        size_t padoffsetHalf = padoffset / 2;
+        /* allocate lower-res buffers */
+        CHECKED_MALLOC_ZERO(lowerResBuffer[0], pixel, 4 * planesizeHalf);
+
+        lowerResBuffer[1] = lowerResBuffer[0] + planesizeHalf;
+        lowerResBuffer[2] = lowerResBuffer[1] + planesizeHalf;
+        lowerResBuffer[3] = lowerResBuffer[2] + planesizeHalf;
+
+        lowerResPlane[0] = lowerResBuffer[0] + padoffsetHalf;
+        lowerResPlane[1] = lowerResBuffer[1] + padoffsetHalf;
+        lowerResPlane[2] = lowerResBuffer[2] + padoffsetHalf;
+        lowerResPlane[3] = lowerResBuffer[3] + padoffsetHalf;
+    }
+
     CHECKED_MALLOC(intraCost, int32_t, cuCount);
     CHECKED_MALLOC(intraMode, uint8_t, cuCount);
 
@@ -155,6 +178,16 @@ bool Lowres::create(x265_param* param, P
         CHECKED_MALLOC(lowresMvs[1][i], MV, cuCount);
         CHECKED_MALLOC(lowresMvCosts[0][i], int32_t, cuCount);
         CHECKED_MALLOC(lowresMvCosts[1][i], int32_t, cuCount);
+        if (bEnableHME)
+        {
+            int maxBlocksInRowLowerRes = ((width/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+            int maxBlocksInColLowerRes = ((lines/2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+            int cuCountLowerRes = maxBlocksInRowLowerRes * maxBlocksInColLowerRes;
+            CHECKED_MALLOC(lowerResMvs[0][i], MV, cuCountLowerRes);
+            CHECKED_MALLOC(lowerResMvs[1][i], MV, cuCountLowerRes);
+            CHECKED_MALLOC(lowerResMvCosts[0][i], int32_t, cuCountLowerRes);
+            CHECKED_MALLOC(lowerResMvCosts[1][i], int32_t, cuCountLowerRes);
+        }
     }
 
     return true;
@@ -166,6 +199,8 @@ fail:
 void Lowres::destroy()
 {
     X265_FREE(buffer[0]);
+    if(bEnableHME)
+        X265_FREE(lowerResBuffer[0]);
     X265_FREE(intraCost);
     X265_FREE(intraMode);
 
@@ -184,12 +219,20 @@ void Lowres::destroy()
         X265_FREE(lowresMvs[1][i]);
         X265_FREE(lowresMvCosts[0][i]);
         X265_FREE(lowresMvCosts[1][i]);
+        if (bEnableHME)
+        {
+            X265_FREE(lowerResMvs[0][i]);
+            X265_FREE(lowerResMvs[1][i]);
+            X265_FREE(lowerResMvCosts[0][i]);
+            X265_FREE(lowerResMvCosts[1][i]);
+        }
     }
     X265_FREE(qpAqOffset);
     X265_FREE(invQscaleFactor);
     X265_FREE(qpCuTreeOffset);
     X265_FREE(propagateCost);
     X265_FREE(invQscaleFactor8x8);
+    X265_FREE(edgeInclined);
     X265_FREE(qpAqMotionOffset);
     X265_FREE(blockVariance);
     if (maxAQDepth > 0)
@@ -253,5 +296,18 @@ void Lowres::init(PicYuv *origPic, int p
     extendPicBorder(lowresPlane[1], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
     extendPicBorder(lowresPlane[2], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
     extendPicBorder(lowresPlane[3], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY);
+    
+    if (origPic->m_param->bEnableHME)
+    {
+        primitives.frameInitLowerRes(lowresPlane[0],
+            lowerResPlane[0], lowerResPlane[1], lowerResPlane[2], lowerResPlane[3],
+            lumaStride, lumaStride/2, (width / 2), (lines / 2));
+        extendPicBorder(lowerResPlane[0], lumaStride/2, width/2, lines/2, origPic->m_lumaMarginX/2, origPic->m_lumaMarginY/2);
+        extendPicBorder(lowerResPlane[1], lumaStride/2, width/2, lines/2, origPic->m_lumaMarginX/2, origPic->m_lumaMarginY/2);
+        extendPicBorder(lowerResPlane[2], lumaStride/2, width/2, lines/2, origPic->m_lumaMarginX/2, origPic->m_lumaMarginY/2);
+        extendPicBorder(lowerResPlane[3], lumaStride/2, width/2, lines/2, origPic->m_lumaMarginX/2, origPic->m_lumaMarginY/2);
+        fpelLowerResPlane[0] = lowerResPlane[0];
+    }
+
     fpelPlane[0] = lowresPlane[0];
 }
--- a/source/common/lowres.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/lowres.h	Thu Sep 12 16:34:00 2019 +0530
@@ -40,8 +40,13 @@ struct ReferencePlanes
     pixel*   lowresPlane[4];
     PicYuv*  reconPic;
 
+    /* 1/16th resolution : Level-0 HME planes */
+    pixel*   fpelLowerResPlane[3];
+    pixel*   lowerResPlane[4];
+
     bool     isWeighted;
     bool     isLowres;
+    bool     isHMELowres;
 
     intptr_t lumaStride;
     intptr_t chromaStride;
@@ -59,46 +64,58 @@ struct ReferencePlanes
 
     /* lowres motion compensation, you must provide a buffer and stride for QPEL averaged pixels
      * in case QPEL is required.  Else it returns a pointer to the HPEL pixels */
-    inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride)
+    inline pixel *lowresMC(intptr_t blockOffset, const MV& qmv, pixel *buf, intptr_t& outstride, bool hme)
     {
+        intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
+        pixel *plane[4];
+        for (int i = 0; i < 4; i++)
+        {
+            plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
+        }
         if ((qmv.x | qmv.y) & 1)
         {
             int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+            pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
             int qmvx = qmv.x + (qmv.x & 1);
             int qmvy = qmv.y + (qmv.y & 1);
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
-            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
-            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (lumaStride % 64 == 0)](buf, outstride, frefA, lumaStride, frefB, lumaStride, 32);
+            pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;
+            primitives.pu[LUMA_8x8].pixelavg_pp[(outstride % 64 == 0) && (YStride % 64 == 0)](buf, outstride, frefA, YStride, frefB, YStride, 32);
             return buf;
         }
         else
         {
-            outstride = lumaStride;
+            outstride = YStride;
             int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            return lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+            return plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
         }
     }
 
-    inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp)
+    inline int lowresQPelCost(pixel *fenc, intptr_t blockOffset, const MV& qmv, pixelcmp_t comp, bool hme)
     {
+        intptr_t YStride = hme ? lumaStride / 2 : lumaStride;
+        pixel *plane[4];
+        for (int i = 0; i < 4; i++)
+        {
+            plane[i] = hme ? lowerResPlane[i] : lowresPlane[i];
+        }
         if ((qmv.x | qmv.y) & 1)
         {
             ALIGN_VAR_16(pixel, subpelbuf[8 * 8]);
             int hpelA = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            pixel *frefA = lowresPlane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
+            pixel *frefA = plane[hpelA] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
             int qmvx = qmv.x + (qmv.x & 1);
             int qmvy = qmv.y + (qmv.y & 1);
             int hpelB = (qmvy & 2) | ((qmvx & 2) >> 1);
-            pixel *frefB = lowresPlane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * lumaStride;
-            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, lumaStride, frefB, lumaStride, 32);
+            pixel *frefB = plane[hpelB] + blockOffset + (qmvx >> 2) + (qmvy >> 2) * YStride;
+            primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](subpelbuf, 8, frefA, YStride, frefB, YStride, 32);
             return comp(fenc, FENC_STRIDE, subpelbuf, 8);
         }
         else
         {
             int hpel = (qmv.y & 2) | ((qmv.x & 2) >> 1);
-            pixel *fref = lowresPlane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * lumaStride;
-            return comp(fenc, FENC_STRIDE, fref, lumaStride);
+            pixel *fref = plane[hpel] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * YStride;
+            return comp(fenc, FENC_STRIDE, fref, YStride);
         }
     }
 };
@@ -150,6 +167,7 @@ struct PicQPAdaptationLayer
 struct Lowres : public ReferencePlanes
 {
     pixel *buffer[4];
+    pixel *lowerResBuffer[4]; // Level-0 buffer
 
     int    frameNum;         // Presentation frame number
     int    sliceType;        // Slice type decided by lookahead
@@ -181,6 +199,11 @@ struct Lowres : public ReferencePlanes
     uint32_t  maxBlocksInRowFullRes;
     uint32_t  maxBlocksInColFullRes;
 
+    /* Hierarchical Motion Estimation */
+    bool      bEnableHME;
+    int32_t*  lowerResMvCosts[2][X265_BFRAME_MAX + 2];
+    MV*       lowerResMvs[2][X265_BFRAME_MAX + 2];
+
     /* used for vbvLookahead */
     int       plannedType[X265_LOOKAHEAD_MAX + 1];
     int64_t   plannedSatd[X265_LOOKAHEAD_MAX + 1];
@@ -197,6 +220,8 @@ struct Lowres : public ReferencePlanes
     uint64_t  wp_ssd[3];       // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
     uint64_t  wp_sum[3];
     double    frameVariance;
+    int* edgeInclined;
+
 
     /* cutree intermediate data */
     PicQPAdaptationLayer* pAQLayer;
--- a/source/common/param.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/param.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -201,6 +201,9 @@ void x265_param_default(x265_param* para
     param->bEnableTSkipFast = 0;
     param->maxNumReferences = 3;
     param->bEnableTemporalMvp = 1;
+    param->bEnableHME = 0;
+    param->hmeSearchMethod[0] = X265_HEX_SEARCH;
+    param->hmeSearchMethod[1] = param->hmeSearchMethod[2] = X265_UMH_SEARCH;
     param->bSourceReferenceEstimation = 0;
     param->limitTU = 0;
     param->dynamicRd = 0;
@@ -212,6 +215,7 @@ void x265_param_default(x265_param* para
     param->bEnableSAO = 1;
     param->bSaoNonDeblocked = 0;
     param->bLimitSAO = 0;
+    param->selectiveSAO = 4;
 
     /* Coding Quality */
     param->cbQpOffset = 0;
@@ -372,6 +376,7 @@ int x265_param_default_preset(x265_param
             param->subpelRefine = 0;
             param->searchMethod = X265_DIA_SEARCH;
             param->bEnableSAO = 0;
+            param->selectiveSAO = 0;
             param->bEnableSignHiding = 0;
             param->bEnableWeightedPred = 0;
             param->rdLevel = 2;
@@ -401,6 +406,7 @@ int x265_param_default_preset(x265_param
             param->rc.hevcAq = 0;
             param->rc.qgSize = 32;
             param->bEnableSAO = 0;
+            param->selectiveSAO = 0;
             param->bEnableFastIntra = 1;
         }
         else if (!strcmp(preset, "veryfast"))
@@ -548,6 +554,7 @@ int x265_param_default_preset(x265_param
         {
             param->bEnableLoopFilter = 0;
             param->bEnableSAO = 0;
+            param->selectiveSAO = 0;
             param->bEnableWeightedPred = 0;
             param->bEnableWeightedBiPred = 0;
             param->bIntraInBFrames = 0;
@@ -575,6 +582,7 @@ int x265_param_default_preset(x265_param
             param->psyRd = 4.0;
             param->psyRdoq = 10.0;
             param->bEnableSAO = 0;
+            param->selectiveSAO = 0;
             param->rc.bEnableConstVbv = 1;
         }
         else if (!strcmp(tune, "animation"))
@@ -1206,7 +1214,7 @@ int x265_param_parse(x265_param* p, cons
         OPT("scale-factor") p->scaleFactor = atoi(value);
         OPT("refine-intra")p->intraRefine = atoi(value);
         OPT("refine-inter")p->interRefine = atoi(value);
-        OPT("refine-mv")p->mvRefine = atobool(value);
+        OPT("refine-mv")p->mvRefine = atoi(value);
         OPT("force-flush")p->forceFlush = atoi(value);
         OPT("splitrd-skip") p->bEnableSplitRdSkip = atobool(value);
         OPT("lowpass-dct") p->bLowPassDct = atobool(value);
@@ -1279,9 +1287,34 @@ int x265_param_parse(x265_param* p, cons
         OPT("svt-pred-struct") x265_log(p, X265_LOG_WARNING, "Option %s is SVT-HEVC Encoder specific; Disabling it here \n", name);
         OPT("svt-fps-in-vps") x265_log(p, X265_LOG_WARNING, "Option %s is SVT-HEVC Encoder specific; Disabling it here \n", name);
 #endif
+        OPT("selective-sao")
+        {
+            p->selectiveSAO = atoi(value);
+        }
         OPT("fades") p->bEnableFades = atobool(value);
         OPT("field") p->bField = atobool( value );
         OPT("cll") p->bEmitCLL = atobool(value);
+        OPT("hme") p->bEnableHME = atobool(value);
+        OPT("hme-search")
+        {
+            char search[3][5];
+            memset(search, '\0', 15 * sizeof(char));
+            if(3 == sscanf(value, "%d,%d,%d", &p->hmeSearchMethod[0], &p->hmeSearchMethod[1], &p->hmeSearchMethod[2]) ||
+               3 == sscanf(value, "%4[^,],%4[^,],%4[^,]", search[0], search[1], search[2]))
+            {
+                if(search[0][0])
+                    for(int level = 0; level < 3; level++)
+                        p->hmeSearchMethod[level] = parseName(search[level], x265_motion_est_names, bError);
+            }
+            else if (sscanf(value, "%d", &p->hmeSearchMethod[0]) || sscanf(value, "%s", search[0]))
+            {
+                if (search[0][0]) {
+                    p->hmeSearchMethod[0] = parseName(search[0], x265_motion_est_names, bError);
+                    p->hmeSearchMethod[1] = p->hmeSearchMethod[2] = p->hmeSearchMethod[0];
+                }
+            }
+            p->bEnableHME = true;
+        }
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1522,7 +1555,7 @@ int x265_check_params(x265_param* param)
           "Lookahead depth must be less than 256");
     CHECK(param->lookaheadSlices > 16 || param->lookaheadSlices < 0,
           "Lookahead slices must between 0 and 16");
-    CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE_BIASED < param->rc.aqMode,
+    CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_EDGE < param->rc.aqMode,
           "Aq-Mode is out of range");
     CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
           "Aq-Strength is out of range");
@@ -1626,6 +1659,8 @@ int x265_check_params(x265_param* param)
           "Strict-cbr cannot be applied without specifying target bitrate or vbv bufsize");
     CHECK((param->analysisSave || param->analysisLoad) && (param->analysisReuseLevel < 1 || param->analysisReuseLevel > 10),
         "Invalid analysis refine level. Value must be between 1 and 10 (inclusive)");
+    CHECK(param->analysisLoad && (param->mvRefine < 0 || param->mvRefine > 3),
+        "Invalid mv refinement level. Value must be between 0 and 3 (inclusive)");
     CHECK(param->scaleFactor > 2, "Invalid scale-factor. Supports factor <= 2");
     CHECK(param->rc.qpMax < QP_MIN || param->rc.qpMax > QP_MAX_MAX,
         "qpmax exceeds supported range (0 to 69)");
@@ -1660,6 +1695,8 @@ int x265_check_params(x265_param* param)
         CHECK( (param->bFrameAdaptive==0), "Adaptive B-frame decision method should be closed for field feature.\n" );
         // to do
     }
+    CHECK(param->selectiveSAO < 0 || param->selectiveSAO > 4,
+        "Invalid SAO tune level. Value must be between 0 and 4 (inclusive)");
 #if !X86_64
     CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
         "SEA motion search does not support resolutions greater than 480p in 32 bit build");
@@ -1732,8 +1769,13 @@ void x265_print_params(x265_param* param
     x265_log(param, X265_LOG_INFO, "Residual QT: max TU size, max depth : %d / %d inter / %d intra\n",
              param->maxTUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth);
 
-    x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge         : %s / %d / %d / %d\n",
-             x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand);
+    if (param->bEnableHME)
+        x265_log(param, X265_LOG_INFO, "HME L0,1,2 / range / subpel / merge : %s, %s, %s / %d / %d / %d\n",
+            x265_motion_est_names[param->hmeSearchMethod[0]], x265_motion_est_names[param->hmeSearchMethod[1]], x265_motion_est_names[param->hmeSearchMethod[2]], param->searchRange, param->subpelRefine, param->maxNumMergeCand);
+    else
+        x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge         : %s / %d / %d / %d\n",
+            x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand);
+
     if (param->keyframeMax != INT_MAX || param->scenecutThreshold)
         x265_log(param, X265_LOG_INFO, "Keyframe min / max / scenecut / bias: %d / %d / %d / %.2lf\n", param->keyframeMin, param->keyframeMax, param->scenecutThreshold, param->scenecutBias * 100);
     else
@@ -1831,6 +1873,8 @@ void x265_print_params(x265_param* param
     }
     TOOLOPT(param->bSaoNonDeblocked, "sao-non-deblock");
     TOOLOPT(!param->bSaoNonDeblocked && param->bEnableSAO, "sao");
+    if (param->selectiveSAO != 4)
+        TOOLOPT(param->selectiveSAO, "selective-sao");
     TOOLOPT(param->rc.bStatWrite, "stats-write");
     TOOLOPT(param->rc.bStatRead,  "stats-read");
     TOOLOPT(param->bSingleSeiNal, "single-sei");
@@ -1928,6 +1972,9 @@ char *x265_param2string(x265_param* p, i
     s += sprintf(s, " subme=%d", p->subpelRefine);
     s += sprintf(s, " merange=%d", p->searchRange);
     BOOL(p->bEnableTemporalMvp, "temporal-mvp");
+    BOOL(p->bEnableHME, "hme");
+    if (p->bEnableHME)
+        s += sprintf(s, " Level 0,1,2=%d,%d,%d", p->hmeSearchMethod[0], p->hmeSearchMethod[1], p->hmeSearchMethod[2]);
     BOOL(p->bEnableWeightedPred, "weightp");
     BOOL(p->bEnableWeightedBiPred, "weightb");
     BOOL(p->bSourceReferenceEstimation, "analyze-src-pics");
@@ -1937,6 +1984,7 @@ char *x265_param2string(x265_param* p, i
     BOOL(p->bEnableSAO, "sao");
     BOOL(p->bSaoNonDeblocked, "sao-non-deblock");
     s += sprintf(s, " rd=%d", p->rdLevel);
+    s += sprintf(s, "selective-sao=%d", p->selectiveSAO);
     BOOL(p->bEnableEarlySkip, "early-skip");
     BOOL(p->bEnableRecursionSkip, "rskip");
     BOOL(p->bEnableFastIntra, "fast-intra");
@@ -2215,6 +2263,12 @@ void x265_copy_params(x265_param* dst, x
     dst->subpelRefine = src->subpelRefine;
     dst->searchRange = src->searchRange;
     dst->bEnableTemporalMvp = src->bEnableTemporalMvp;
+    dst->bEnableHME = src->bEnableHME;
+    if (src->bEnableHME)
+    {
+        for (int level = 0; level < 3; level++)
+            dst->hmeSearchMethod[level] = src->hmeSearchMethod[level];
+    }
     dst->bEnableWeightedBiPred = src->bEnableWeightedBiPred;
     dst->bEnableWeightedPred = src->bEnableWeightedPred;
     dst->bSourceReferenceEstimation = src->bSourceReferenceEstimation;
@@ -2372,7 +2426,7 @@ void x265_copy_params(x265_param* dst, x
     dst->bLowPassDct = src->bLowPassDct;
     dst->vbvBufferEnd = src->vbvBufferEnd;
     dst->vbvEndFrameAdjust = src->vbvEndFrameAdjust;
-
+    dst->bAnalysisType = src->bAnalysisType;
     dst->bCopyPicToFrame = src->bCopyPicToFrame;
     if (src->analysisSave) dst->analysisSave=strdup(src->analysisSave);
     else dst->analysisSave = NULL;
@@ -2380,6 +2434,7 @@ void x265_copy_params(x265_param* dst, x
     else dst->analysisLoad = NULL;
     dst->gopLookahead = src->gopLookahead;
     dst->radl = src->radl;
+    dst->selectiveSAO = src->selectiveSAO;
     dst->maxAUSizeFactor = src->maxAUSizeFactor;
     dst->bEmitIDRRecoverySEI = src->bEmitIDRRecoverySEI;
     dst->bDynamicRefine = src->bDynamicRefine;
--- a/source/common/pixel.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/pixel.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -1309,6 +1309,7 @@ void setupPixelPrimitives_c(EncoderPrimi
     p.scale1D_128to64[NONALIGNED] = p.scale1D_128to64[ALIGNED] = scale1D_128to64;
     p.scale2D_64to32 = scale2D_64to32;
     p.frameInitLowres = frame_init_lowres_core;
+    p.frameInitLowerRes = frame_init_lowres_core;
     p.ssim_4x4x2_core = ssim_4x4x2_core;
     p.ssim_end_4 = ssim_end_4;
 
--- a/source/common/primitives.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/primitives.h	Thu Sep 12 16:34:00 2019 +0530
@@ -349,6 +349,7 @@ struct EncoderPrimitives
     saoCuStatsE3_t        saoCuStatsE3;
 
     downscale_t           frameInitLowres;
+    downscale_t           frameInitLowerRes;
     cutree_propagate_cost propagateCost;
     cutree_fix8_unpack    fix8Unpack;
     cutree_fix8_pack      fix8Pack;
--- a/source/common/slice.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/slice.h	Thu Sep 12 16:34:00 2019 +0530
@@ -356,6 +356,7 @@ public:
     bool        m_bCheckLDC;       // TODO: is this necessary?
     bool        m_sLFaseFlag;      // loop filter boundary flag
     bool        m_colFromL0Flag;   // collocated picture from List0 or List1 flag
+    int         m_bUseSao;
 
     int         m_iPPSQpMinus26;
     int         numRefIdxDefault[2];
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -1090,6 +1090,7 @@ void setupAssemblyPrimitives(EncoderPrim
         LUMA_VSS_FILTERS(sse2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it 
         //p.planecopy_sp = PFX(downShift_16_sse2);
         p.planecopy_sp_shl = PFX(upShift_16_sse2);
@@ -1132,6 +1133,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_8x8].idct = PFX(idct8_ssse3);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_ssse3);
 
         ALL_LUMA_PU(convert_p2s[ALIGNED], filterPixelToShort, ssse3);
         ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, ssse3);
@@ -1453,6 +1455,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
 
         p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx);
         p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
@@ -1469,6 +1472,7 @@ void setupAssemblyPrimitives(EncoderPrim
 #endif
         LUMA_VAR(xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
     }
     if (cpuMask & X265_CPU_AVX2)
     {
@@ -2296,6 +2300,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
         p.fix8Pack = PFX(cutree_fix8_pack_avx2);
@@ -3294,6 +3299,7 @@ void setupAssemblyPrimitives(EncoderPrim
 
         //p.frameInitLowres = PFX(frame_init_lowres_core_mmx2);
         p.frameInitLowres = PFX(frame_init_lowres_core_sse2);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_sse2);
 
         ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, sse2);
         ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, sse2);
@@ -3414,6 +3420,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_8x8].luma_hvpp = PFX(interp_8tap_hv_pp_8x8_ssse3);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_ssse3);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_ssse3);
         ASSIGN2(p.scale1D_128to64, scale1D_128to64_ssse3);
         p.scale2D_64to32 = PFX(scale2D_64to32_ssse3);
 
@@ -3682,6 +3689,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_avx);
         p.propagateCost = PFX(mbtree_propagate_cost_avx);
     }
     if (cpuMask & X265_CPU_XOP)
@@ -3693,6 +3701,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_8x8].sse_pp = PFX(pixel_ssd_8x8_xop);
         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_xop);
+
     }
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
@@ -4667,6 +4677,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+        p.frameInitLowerRes = PFX(frame_init_lowres_core_avx2);
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
         p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
--- a/source/dynamicHDR10/json11/json11.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/dynamicHDR10/json11/json11.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -43,11 +43,20 @@ using std::make_shared;
 using std::initializer_list;
 using std::move;
 
+    /* Helper for representing null - just a do-nothing struct, plus comparison
+     * operators so the helpers in JsonValue work. We can't use nullptr_t because
+     * it may not be orderable.
+     */
+    struct NullStruct {
+        bool operator==(NullStruct) const { return true; }
+        bool operator<(NullStruct) const { return false; }
+    };
+
 /* * * * * * * * * * * * * * * * * * * *
  * Serialization
  */
 
-static void dump(std::nullptr_t, string &out) {
+static void dump(NullStruct, string &out) {
     out += "null";
 }
 
@@ -214,9 +223,9 @@ public:
     explicit JsonObject(Json::object &&value)      : Value(move(value)) {}
 };
 
-class JsonNull final : public Value<Json::NUL, std::nullptr_t> {
+class JsonNull final : public Value<Json::NUL, NullStruct> {
 public:
-    JsonNull() : Value(nullptr) {}
+    JsonNull() : Value({}) {}
 };
 
 /* * * * * * * * * * * * * * * * * * * *
--- a/source/encoder/analysis.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/analysis.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -2488,14 +2488,18 @@ void Analysis::recodeCU(const CUData& pa
                             MV mvp;
 
                             int numMvc = mode.cu.getPMV(mode.interNeighbours, list, ref, mode.amvpCand[list][ref], mvc);
-                            if (m_param->interRefine != 1)
-                                mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
-                            else
-                                mvp = interDataCTU->mv[list][cuIdx + part].word;
+                            mvp = mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]];
                             if (m_param->mvRefine || m_param->interRefine == 1)
                             {
-                                MV outmv;
-                                searchMV(mode, pu, list, ref, outmv, mvp, numMvc, mvc);
+                                MV outmv, mvpSelect[3];
+                                mvpSelect[0] = interDataCTU->mv[list][cuIdx + part].word;
+                                switch (m_param->mvRefine)
+                                {
+                                case 3: mvpSelect[2] = mode.amvpCand[list][ref][!(mode.cu.m_mvpIdx[list][pu.puAbsPartIdx])];
+                                case 2: mvpSelect[1] = mvp;
+                                default: break;
+                                }
+                                searchMV(mode, list, ref, outmv, mvpSelect, numMvc, mvc);
                                 mode.cu.setPUMv(list, outmv, pu.puAbsPartIdx, part);
                             }
                             mode.cu.m_mvd[list][pu.puAbsPartIdx] = mode.cu.m_mv[list][pu.puAbsPartIdx] - mode.amvpCand[list][ref][mode.cu.m_mvpIdx[list][pu.puAbsPartIdx]]/*mvp*/;
--- a/source/encoder/api.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/api.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -96,7 +96,12 @@ x265_encoder *x265_encoder_open(x265_par
     x265_param* param = PARAM_NS::x265_param_alloc();
     x265_param* latestParam = PARAM_NS::x265_param_alloc();
     x265_param* zoneParam = PARAM_NS::x265_param_alloc();
-    if (!param || !latestParam)
+
+    if(param) PARAM_NS::x265_param_default(param);
+    if(latestParam) PARAM_NS::x265_param_default(latestParam);
+    if(zoneParam) PARAM_NS::x265_param_default(zoneParam);
+
+    if (!param || !latestParam || !zoneParam)
         goto fail;
     if (p->rc.zoneCount || p->rc.zonefileCount)
     {
@@ -106,6 +111,8 @@ x265_encoder *x265_encoder_open(x265_par
     }
 
     x265_copy_params(param, p);
+    x265_copy_params(latestParam, p);
+    x265_copy_params(zoneParam, p);
     x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", PFX(version_str));
     x265_log(param, X265_LOG_INFO, "build info %s\n", PFX(build_info_str));
 
@@ -212,6 +219,7 @@ fail:
     delete encoder;
     PARAM_NS::x265_param_free(param);
     PARAM_NS::x265_param_free(latestParam);
+    PARAM_NS::x265_param_free(zoneParam);
     return NULL;
 }
 
@@ -944,11 +952,11 @@ x265_zone *x265_zone_alloc(int zoneCount
 
 void x265_zone_free(x265_param *param)
 {
-    if (param->rc.zonefileCount) {
+    if (param && param->rc.zonefileCount) {  // NULL param is tolerated; each of the zonefileCount entries has its zoneParam freed below
         for (int i = 0; i < param->rc.zonefileCount; i++)
             x265_free(param->rc.zones[i].zoneParam);
     }
-    if (param->rc.zoneCount || param->rc.zonefileCount)
+    if (param && (param->rc.zoneCount || param->rc.zonefileCount))  // the zones array itself is freed when either count is non-zero
         x265_free(param->rc.zones);
 }
 
--- a/source/encoder/encoder.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/encoder.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -1621,6 +1621,28 @@ int Encoder::encode(const x265_picture* 
             }
             /* determine references, setup RPS, etc */
             m_dpb->prepareEncode(frameEnc);
+            if (!!m_param->selectiveSAO)
+            {
+                Slice* slice = frameEnc->m_encData->m_slice;
+                slice->m_bUseSao = curEncoder->m_frameFilter.m_useSao = 1;
+                switch (m_param->selectiveSAO)
+                {
+                case 3: if (!IS_REFERENCED(frameEnc))
+                            slice->m_bUseSao = curEncoder->m_frameFilter.m_useSao = 0;
+                        break;
+                case 2: if (!!m_param->bframes && slice->m_sliceType == B_SLICE)
+                            slice->m_bUseSao = curEncoder->m_frameFilter.m_useSao = 0;
+                        break;
+                case 1: if (slice->m_sliceType != I_SLICE)
+                            slice->m_bUseSao = curEncoder->m_frameFilter.m_useSao = 0;
+                        break;
+                }
+            }
+            else
+            {
+                Slice* slice = frameEnc->m_encData->m_slice;
+                slice->m_bUseSao = curEncoder->m_frameFilter.m_useSao = 0;
+            }
 
             if (m_param->rc.rateControlMode != X265_RC_CQP)
                 m_lookahead->getEstimatedPictureCost(frameEnc);
@@ -2891,6 +2913,14 @@ void Encoder::configure(x265_param *p)
 
     }
 
+    if (p->selectiveSAO && !p->bEnableSAO)
+    {
+        p->bEnableSAO = 1;
+        x265_log(p, X265_LOG_WARNING, "SAO turned ON when selective-sao is ON\n");
+    }
+
+    if (!p->selectiveSAO && p->bEnableSAO)
+        p->selectiveSAO = 4;
 
     if (p->interlaceMode)
         x265_log(p, X265_LOG_WARNING, "Support for interlaced video is experimental\n");
@@ -2983,11 +3013,11 @@ void Encoder::configure(x265_param *p)
             x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10. Disabling MV refine.\n");
             p->mvRefine = 0;
         }
-        else if (p->interRefine >= 2)
-        {
-            x265_log(p, X265_LOG_WARNING, "MVs are recomputed when refine-inter >= 2. MV refinement not applicable. Disabling MV refine\n");
-            p->mvRefine = 0;
-        }
+    }
+    if (p->scaleFactor && p->analysisLoad && p->interRefine && p->analysisReuseLevel == 10 && !p->mvRefine)
+    {
+        x265_log(p, X265_LOG_WARNING, "Enabling MV refinement level 1 with scaling and analysis-reuse-level=10.\n");
+        p->mvRefine = 1;
     }
 
     if (p->ctuDistortionRefine == CTU_DISTORTION_INTERNAL)
@@ -3379,6 +3409,19 @@ void Encoder::configure(x265_param *p)
         p->bRepeatHeaders = 1;
         x265_log(p, X265_LOG_WARNING, "Turning on repeat - headers for zone encoding\n");
     }
+
+    if (m_param->bEnableHME)
+    {
+        if (m_param->sourceHeight < 540)
+        {
+            x265_log(p, X265_LOG_WARNING, "Source height < 540p is too low for HME. Disabling HME.\n");
+            p->bEnableHME = 0;
+        }
+        if (m_param->bEnableHME && m_param->searchMethod != m_param->hmeSearchMethod[2])
+        {
+            m_param->searchMethod = m_param->hmeSearchMethod[2];
+        }
+    }
 }
 
 void Encoder::readAnalysisFile(x265_analysis_data* analysis, int curPoc, const x265_picture* picIn, int paramBytes)
--- a/source/encoder/entropy.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/entropy.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -530,6 +530,10 @@ void Entropy::codeScalingList(const Scal
     for (int i = 0; i < coefNum; i++)
     {
         data = src[scan[i]] - nextCoef;
+        if (data < -128)
+            data += 256;
+        if (data > 127)
+            data -= 256;
         nextCoef = (nextCoef + data + 256) % 256;
         WRITE_SVLC(data,  "scaling_list_delta_coef");
     }
@@ -637,12 +641,18 @@ void Entropy::codeSliceHeader(const Slic
             WRITE_FLAG(1, "slice_temporal_mvp_enable_flag");
     }
     const SAOParam *saoParam = encData.m_saoParam;
-    if (slice.m_sps->bUseSAO)
+    if (slice.m_bUseSao)
     {
         WRITE_FLAG(saoParam->bSaoFlag[0], "slice_sao_luma_flag");
         if (encData.m_param->internalCsp != X265_CSP_I400)
             WRITE_FLAG(saoParam->bSaoFlag[1], "slice_sao_chroma_flag");
     }
+    else if(encData.m_param->selectiveSAO)
+    {
+        WRITE_FLAG(0, "slice_sao_luma_flag");
+        if (encData.m_param->internalCsp != X265_CSP_I400)
+            WRITE_FLAG(0, "slice_sao_chroma_flag");
+    }
 
     // check if numRefIdx match the defaults (1, hard-coded in PPS). If not, override
     // TODO: this might be a place to optimize a few bits per slice, by using param->refs for L0 default
@@ -702,7 +712,7 @@ void Entropy::codeSliceHeader(const Slic
 
     if (encData.m_param->maxSlices <= 1)
     {
-        bool isSAOEnabled = slice.m_sps->bUseSAO ? saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1] : false;
+        bool isSAOEnabled = slice.m_sps->bUseSAO && slice.m_bUseSao ? saoParam->bSaoFlag[0] || saoParam->bSaoFlag[1] : false;
         bool isDBFEnabled = !slice.m_pps->bPicDisableDeblockingFilter;
 
         if (isSAOEnabled || isDBFEnabled)
--- a/source/encoder/frameencoder.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/frameencoder.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -634,14 +634,22 @@ void FrameEncoder::compressFrame()
         if (!m_param->bEnableWavefront)
             m_backupStreams = new Bitstream[numSubstreams];
         m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
-        if (!m_param->bEnableSAO)
+        if (!slice->m_bUseSao)
+        {
             for (uint32_t i = 0; i < numSubstreams; i++)
                 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
+        }
     }
     else
     {
         for (uint32_t i = 0; i < numSubstreams; i++)
+        {
             m_outStreams[i].resetBits();
+            if (!slice->m_bUseSao)
+                m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
+            else
+                m_rows[i].rowGoOnCoder.setBitstream(NULL);
+        }
     }
 
     m_rce.encodeOrder = m_frame->m_encodeOrder;
@@ -981,7 +989,7 @@ void FrameEncoder::compressFrame()
     m_entropyCoder.setBitstream(&m_bs);
 
     // finish encode of each CTU row, only required when SAO is enabled
-    if (m_param->bEnableSAO)
+    if (slice->m_bUseSao)
         encodeSlice(0);
 
     m_entropyCoder.setBitstream(&m_bs);
@@ -1221,7 +1229,7 @@ void FrameEncoder::encodeSlice(uint32_t 
     const uint32_t lastCUAddr = (slice->m_endCUAddr + m_param->num4x4Partitions - 1) / m_param->num4x4Partitions;
     const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1;
 
-    SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL;
+    SAOParam* saoParam = slice->m_sps->bUseSAO && slice->m_bUseSao ? m_frame->m_encData->m_saoParam : NULL;
     for (uint32_t cuAddr = sliceAddr; cuAddr < lastCUAddr; cuAddr++)
     {
         uint32_t col = cuAddr % widthInLCUs;
@@ -1515,11 +1523,11 @@ void FrameEncoder::processRowEncoder(int
             curRow.bufferedEntropy.loadContexts(rowCoder);
 
         /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */
-        if (m_param->bEnableSAO && m_param->bSaoNonDeblocked)
+        if (slice->m_bUseSao && m_param->bSaoNonDeblocked)
             m_frameFilter.m_parallelFilter[row].m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row);
 
         /* Deblock with idle threading */
-        if (m_param->bEnableLoopFilter | m_param->bEnableSAO)
+        if (m_param->bEnableLoopFilter | slice->m_bUseSao)
         {
             // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
             if (!bIsVbv)
@@ -1833,12 +1841,12 @@ void FrameEncoder::processRowEncoder(int
 
     /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */
     /* end_of_sub_stream_one_bit / end_of_slice_segment_flag */
-    if (!m_param->bEnableSAO && (m_param->bEnableWavefront || bLastRowInSlice))
-        rowCoder.finishSlice();
+       if (!slice->m_bUseSao && (m_param->bEnableWavefront || bLastRowInSlice))
+               rowCoder.finishSlice();
 
 
     /* Processing left Deblock block with current threading */
-    if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (rowInSlice >= 2))
+    if ((m_param->bEnableLoopFilter | slice->m_bUseSao) & (rowInSlice >= 2))
     {
         /* Check conditional to start previous row process with current threading */
         if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
--- a/source/encoder/frameencoder.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/frameencoder.h	Thu Sep 12 16:34:00 2019 +0530
@@ -150,6 +150,7 @@ public:
     uint32_t                 m_filterRowDelay;
     uint32_t                 m_filterRowDelayCus;
     uint32_t                 m_refLagRows;
+    bool                     m_bUseSao;
 
     CTURow*                  m_rows;
     uint16_t                 m_sliceAddrBits;
--- a/source/encoder/framefilter.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/framefilter.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -163,7 +163,7 @@ void FrameFilter::destroy()
 
     if (m_parallelFilter)
     {
-        if (m_param->bEnableSAO)
+        if (m_useSao)
         {
             for(int row = 0; row < m_numRows; row++)
                 m_parallelFilter[row].m_sao.destroy((row == 0 ? 1 : 0));
@@ -178,6 +178,7 @@ void FrameFilter::init(Encoder *top, Fra
 {
     m_param = frame->m_param;
     m_frameEncoder = frame;
+    m_useSao = 1;
     m_numRows = numRows;
     m_numCols = numCols;
     m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
@@ -196,12 +197,12 @@ void FrameFilter::init(Encoder *top, Fra
 
     if (m_parallelFilter)
     {
-        if (m_param->bEnableSAO)
+        if (m_useSao)
         {
             for(int row = 0; row < numRows; row++)
             {
                 if (!m_parallelFilter[row].m_sao.create(m_param, (row == 0 ? 1 : 0)))
-                    m_param->bEnableSAO = 0;
+                    m_useSao = 0;
                 else
                 {
                     if (row != 0)
@@ -235,7 +236,7 @@ void FrameFilter::start(Frame *frame, En
     {
         for(int row = 0; row < m_numRows; row++)
         {
-            if (m_param->bEnableSAO)
+            if (m_useSao)
                 m_parallelFilter[row].m_sao.startSlice(frame, initState);
 
             m_parallelFilter[row].m_lastCol.set(0);
@@ -245,7 +246,7 @@ void FrameFilter::start(Frame *frame, En
         }
 
         // Reset SAO common statistics
-        if (m_param->bEnableSAO)
+        if (m_useSao)
             m_parallelFilter[0].m_sao.resetStats();
     }
 }
@@ -472,11 +473,11 @@ void FrameFilter::ParallelFilter::proces
                 deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr - 1]], Deblock::EDGE_HOR);
 
                 // When SAO Disable, setting column counter here
-                if (!m_frameFilter->m_param->bEnableSAO & !ctuPrev->m_bFirstRowInSlice)
+                if (!m_frameFilter->m_useSao & !ctuPrev->m_bFirstRowInSlice)
                     m_prevRow->processPostCu(col - 1);
             }
 
-            if (m_frameFilter->m_param->bEnableSAO)
+            if (m_frameFilter->m_useSao)
             {
                 // Save SAO bottom row reference pixels
                 copySaoAboveRef(ctuPrev, reconPic, cuAddr - 1, col - 1);
@@ -514,12 +515,12 @@ void FrameFilter::ParallelFilter::proces
             deblockCTU(ctuPrev, cuGeoms[ctuGeomMap[cuAddr]], Deblock::EDGE_HOR);
 
             // When SAO Disable, setting column counter here
-            if (!m_frameFilter->m_param->bEnableSAO & !ctuPrev->m_bFirstRowInSlice)
+            if (!m_frameFilter->m_useSao & !ctuPrev->m_bFirstRowInSlice)
                 m_prevRow->processPostCu(numCols - 1);
         }
 
         // TODO: move processPostCu() into processSaoUnitCu()
-        if (m_frameFilter->m_param->bEnableSAO)
+        if (m_frameFilter->m_useSao)
         {
             const CUData* ctu = m_encData->getPicCTU(m_rowAddr + numCols - 2);
 
@@ -570,7 +571,7 @@ void FrameFilter::processRow(int row)
     m_frameEncoder->m_cuStats.countLoopFilter++;
 #endif
 
-    if (!m_param->bEnableLoopFilter && !m_param->bEnableSAO)
+    if (!m_param->bEnableLoopFilter && !m_useSao)
     {
         processPostRow(row);
         return;
@@ -596,7 +597,7 @@ void FrameFilter::processRow(int row)
                 x265_log(m_param, X265_LOG_WARNING, "detected ParallelFilter race condition on last row\n");
 
             /* Apply SAO on last row of CUs, because we always apply SAO on row[X-1] */
-            if (m_param->bEnableSAO)
+            if (m_useSao)
             {
                 for(int col = 0; col < m_numCols; col++)
                 {
@@ -634,7 +635,7 @@ void FrameFilter::processRow(int row)
 
     if (numRowFinished == m_numRows)
     {
-        if (m_param->bEnableSAO)
+        if (m_useSao)
         {
             // Merge numNoSao into RootNode (Node0)
             for(int i = 1; i < m_numRows; i++)
--- a/source/encoder/framefilter.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/framefilter.h	Thu Sep 12 16:34:00 2019 +0530
@@ -46,6 +46,7 @@ public:
 
     x265_param*   m_param;
     Frame*        m_frame;
+    int           m_useSao;
     FrameEncoder* m_frameEncoder;
     int           m_hChromaShift;
     int           m_vChromaShift;
--- a/source/encoder/motion.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/motion.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -104,6 +104,8 @@ MotionEstimate::MotionEstimate()
     ctuAddr = -1;
     absPartIdx = -1;
     searchMethod = X265_HEX_SEARCH;
+    searchMethodL0 = X265_HEX_SEARCH;
+    searchMethodL1 = X265_HEX_SEARCH;
     subpelRefine = 2;
     blockwidth = blockheight = 0;
     blockOffset = 0;
@@ -162,7 +164,7 @@ MotionEstimate::~MotionEstimate()
 }
 
 /* Called by lookahead, luma only, no use of PicYuv */
-void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int refine)
+void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int method, const int searchL0, const int searchL1, const int refine)
 {
     partEnum = partitionFromSizes(pwidth, pheight);
     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
@@ -179,6 +181,8 @@ void MotionEstimate::setSourcePU(pixel *
 
     /* Search params */
     searchMethod = method;
+    searchMethodL0 = searchL0;
+    searchMethodL1 = searchL1;
     subpelRefine = refine;
 
     /* copy PU block into cache */
@@ -363,12 +367,13 @@ void MotionEstimate::StarPatternSearch(R
                                        int &            bPointNr,
                                        int &            bDistance,
                                        int              earlyExitIters,
-                                       int              merange)
+                                       int              merange,
+                                       int              hme)
 {
     ALIGN_VAR_16(int, costs[16]);
     pixel* fenc = fencPUYuv.m_buf[0];
-    pixel* fref = ref->fpelPlane[0] + blockOffset;
-    intptr_t stride = ref->lumaStride;
+    pixel* fref = (hme? ref->fpelLowerResPlane[0] : ref->fpelPlane[0]) + blockOffset;
+    intptr_t stride = hme? ref->lumaStride / 2 : ref->lumaStride;
 
     MV omv = bmv;
     int saved = bcost;
@@ -743,9 +748,10 @@ int MotionEstimate::motionEstimate(Refer
                                    pixel *          srcReferencePlane)
 {
     ALIGN_VAR_16(int, costs[16]);
+    bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];
     if (ctuAddr >= 0)
         blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
-    intptr_t stride = ref->lumaStride;
+    intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;
     pixel* fenc = fencPUYuv.m_buf[0];
     pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;
 
@@ -767,7 +773,7 @@ int MotionEstimate::motionEstimate(Refer
     int bprecost;
 
     if (ref->isLowres)
-        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
+        bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);
     else
         bprecost = subpelCompare(ref, pmv, sad);
 
@@ -808,7 +814,8 @@ int MotionEstimate::motionEstimate(Refer
     pmv = pmv.roundToFPel();
     MV omv = bmv;  // current search origin or starting point
 
-    switch (searchMethod)
+    int search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;
+    switch (search)
     {
     case X265_DIA_SEARCH:
     {
@@ -1128,7 +1135,7 @@ me_hex2:
         int bDistance = 0;
 
         const int EarlyExitIters = 3;
-        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
+        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);
         if (bDistance == 1)
         {
             // if best distance was only 1, check two missing points.  If no new point is found, stop
@@ -1201,7 +1208,7 @@ me_hex2:
             bDistance = 0;
             bPointNr = 0;
             const int MaxIters = 32;
-            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
+            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);
 
             if (bDistance == 1)
             {
@@ -1391,11 +1398,20 @@ me_hex2:
     {
         // dead slow exhaustive search, but at least it uses sad_x4()
         MV tmv;
-        for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
+        int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;
+        if (ref->isHMELowres)
         {
-            for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
+            merange = (merange < 0 ? -merange : merange);
+            mvmin_y = X265_MAX(mvmin.y, -merange);
+            mvmin_x = X265_MAX(mvmin.x, -merange);
+            mvmax_y = X265_MIN(mvmax.y, merange);
+            mvmax_x = X265_MIN(mvmax.x, merange);
+        }
+        for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++)
+        {
+            for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++)
             {
-                if (tmv.x + 3 <= mvmax.x)
+                if (tmv.x + 3 <= mvmax_x)
                 {
                     pixel *pix_base = fref + tmv.y * stride + tmv.x;
                     sad_x4(fenc,
@@ -1463,12 +1479,12 @@ me_hex2:
             if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                 continue;
 
-            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
+            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);
             COPY2_IF_LT(bcost, cost, bdir, i);
         }
 
         bmv += square1[bdir] * 2;
-        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
+        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);
 
         bdir = 0;
         for (int i = 1; i <= wl.qpel_dirs; i++)
@@ -1479,7 +1495,7 @@ me_hex2:
             if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))
                 continue;
 
-            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
+            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);
             COPY2_IF_LT(bcost, cost, bdir, i);
         }
 
--- a/source/encoder/motion.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/motion.h	Thu Sep 12 16:34:00 2019 +0530
@@ -44,6 +44,8 @@ protected:
     int absPartIdx;  // part index of PU, including CU offset within CTU
 
     int searchMethod;
+    int searchMethodL0;
+    int searchMethodL1;
     int subpelRefine;
 
     int blockwidth;
@@ -76,7 +78,7 @@ public:
 
     /* Methods called at slice setup */
 
-    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int subpelRefine);
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight, const int searchMethod, const int searchL0, const int searchL1, const int subpelRefine);
     void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight, const int searchMethod, const int subpelRefine, bool bChroma);
 
     /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
@@ -107,7 +109,8 @@ protected:
                                   int &            bPointNr,
                                   int &            bDistance,
                                   int              earlyExitIters,
-                                  int              merange);
+                                  int              merange,
+                                  int              hme);
 };
 }
 
--- a/source/encoder/ratecontrol.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/ratecontrol.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -2116,6 +2116,9 @@ void RateControl::checkAndResetABR(RateC
             if ((underflow < epsilon || rce->isFadeEnd) && !isFrameDone)
             {
                 init(*m_curSlice->m_sps);
+                // Reduce tune complexity factor for scenes that follow blank frames
+                double tuneCplxFactor = (m_ncu > 3600 && m_param->rc.cuTree && !m_param->rc.hevcAq) ? 2.5 : m_param->rc.hevcAq ? 1.5 : m_isGrainEnabled ? 1.9 : 1.0;
+                m_cplxrSum /= tuneCplxFactor;
                 m_shortTermCplxSum = rce->lastSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION);
                 m_shortTermCplxCount = 1;
                 m_isAbrReset = true;
--- a/source/encoder/search.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/search.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -2096,13 +2096,16 @@ void Search::singleMotionEstimation(Sear
 
     const MV* amvp = interMode.amvpCand[list][ref];
     int mvpIdx = selectMVP(interMode.cu, pu, amvp, list, ref);
-    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
+    bool bLowresMVP = false;
+    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
 
     if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging if lowresMV is not available */
     {
         MV lmv = getLowresMV(interMode.cu, pu, list, ref);
         if (lmv.notZero())
             mvc[numMvc++] = lmv;
+        if (m_param->bEnableHME)
+            mvp_lowres = lmv;
     }
 
     setSearchRange(interMode.cu, mvp, m_param->searchRange, mvmin, mvmax);
@@ -2110,11 +2113,28 @@ void Search::singleMotionEstimation(Sear
     int satdCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
+    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
+    {
+        MV outmv_lowres;
+        setSearchRange(interMode.cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+        int lowresMvCost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+        if (lowresMvCost < satdCost)
+        {
+            outmv = outmv_lowres;
+            satdCost = lowresMvCost;
+            bLowresMVP = true;
+        }
+    }
     /* Get total cost of partition, but only include MV bit cost once */
     bits += m_me.bitcost(outmv);
     uint32_t mvCost = m_me.mvcost(outmv);
     uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
 
+    /* Update LowresMVP to best AMVP cand*/
+    if (bLowresMVP)
+        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
+
     /* Refine MVP selection, updates: mvpIdx, bits, cost */
     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
 
@@ -2132,23 +2152,27 @@ void Search::singleMotionEstimation(Sear
         bestME[list].mvCost  = mvCost;
     }
 }
-void Search::searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)
+void Search::searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc)  // runs motionEstimate() once per candidate in mvp[] and writes the MV of the cheapest run to outmv
 {
     CUData& cu = interMode.cu;
-    const Slice *slice = m_slice;
-    MV mv;
-    if (m_param->interRefine == 1)
-        mv = mvp;
-    else
-        mv = cu.m_mv[list][pu.puAbsPartIdx];
+    MV mv, mvmin, mvmax;  // mv: candidate currently searched; mvmin/mvmax: search bounds filled in by setSearchRange()
     cu.clipMv(mv);
-    MV mvmin, mvmax;
-    setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);
-    if (m_param->interRefine == 1)
-        m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
+    int cand = 0, bestcost = INT_MAX;  // cand indexes mvp[]; bestcost holds the lowest motionEstimate() return value seen so far
+    do
+    {
+        if (cand && (mvp[cand] == mvp[cand - 1] || (cand == 2 && mvp[cand] == mvp[cand - 2])))  // candidate equals an earlier one: nothing new to search
+            continue;  // in a do/while this jumps to the loop condition, so cand is still incremented
+        MV bestMV;  // receives the result of this candidate's search
+        mv = mvp[cand];
+        setSearchRange(cu, mv, m_param->searchRange, mvmin, mvmax);  // bounds are recomputed for every candidate
+        int cost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mv, numMvc, mvc, m_param->searchRange, bestMV, m_param->maxSlices,
         m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
-    else
-        m_me.refineMV(&slice->m_mref[list][ref], mvmin, mvmax, mv, outmv);
+        if (bestcost > cost)  // strict comparison: on a tie the earlier candidate's result is kept
+        {
+            bestcost = cost;
+            outmv = bestMV;
+        }
+    }while (++cand < m_param->mvRefine);  // body runs at least once, so mvp[0] is searched even when mvRefine is 0
 }
 /* find the best inter prediction for each PU of specified mode */
 void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
@@ -2209,7 +2233,6 @@ void Search::predInterSearch(Mode& inter
                 int ref = -1;
                 if (useAsMVP)
                     ref = interDataCTU->refIdx[list][cuIdx + puIdx];
-
                 else
                     ref = bestME[list].ref;
                 if (ref < 0)
@@ -2223,7 +2246,7 @@ void Search::predInterSearch(Mode& inter
                 const MV* amvp = interMode.amvpCand[list][ref];
                 int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
                 MV mvmin, mvmax, outmv, mvp;
-                if (useAsMVP)
+                if (useAsMVP && !m_param->mvRefine)
                 {
                     mvp = interDataCTU->mv[list][cuIdx + puIdx].word;
                     mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];
@@ -2239,11 +2262,44 @@ void Search::predInterSearch(Mode& inter
                 }
                 setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
                 MV mvpIn = mvp;
+                int satdCost;
                 if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)
                     mvpIn = bestME[list].mv;
-                    
-                int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
-                  m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+                if (useAsMVP && m_param->mvRefine)
+                {
+                    MV bestmv, mvpSel[3];
+                    int mvpIdxSel[3];
+                    satdCost = m_me.COST_MAX;
+                    switch (m_param->mvRefine)
+                    {
+                    case 3: mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];
+                            mvpIdxSel[2] = !mvpIdx;
+                    case 2: mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];
+                            mvpIdxSel[1] = mvpIdx;
+                    case 1: mvpSel[0] = interDataCTU->mv[list][cuIdx + puIdx].word;
+                            mvpIdxSel[0] = interDataCTU->mvpIdx[list][cuIdx + puIdx];
+                    }
+                    for (int cand = 0; cand < m_param->mvRefine; cand++)
+                    {
+                        if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))
+                            continue;
+                        setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);
+                        int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,
+                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+                        if (satdCost > bcost)
+                        {
+                            satdCost = bcost;
+                            outmv = bestmv;
+                            mvp = mvpSel[cand];
+                            mvpIdx = mvpIdxSel[cand];
+                        }
+                    }
+                }
+                else
+                {
+                    satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,
+                        m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+                }
 
                 /* Get total cost of partition, but only include MV bit cost once */
                 bits += m_me.bitcost(outmv);
@@ -2346,13 +2402,16 @@ void Search::predInterSearch(Mode& inter
 
                     const MV* amvp = interMode.amvpCand[list][ref];
                     int mvpIdx = selectMVP(cu, pu, amvp, list, ref);
-                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx];
+                    MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;
+                    bool bLowresMVP = false;
 
                     if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */
                     {
                         MV lmv = getLowresMV(cu, pu, list, ref);
                         if (lmv.notZero())
                             mvc[numMvc++] = lmv;
+                        if (m_param->bEnableHME)
+                            mvp_lowres = lmv;
                     }
                     if (m_param->searchMethod == X265_SEA)
                     {
@@ -2365,10 +2424,27 @@ void Search::predInterSearch(Mode& inter
                     int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, 
                       m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
 
+                    if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp)
+                    {
+                        MV outmv_lowres;
+                        setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);
+                        int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,
+                            m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
+                        if (lowresMvCost < satdCost)
+                        {
+                            outmv = outmv_lowres;
+                            satdCost = lowresMvCost;
+                            bLowresMVP = true;
+                        }
+                    }
+
                     /* Get total cost of partition, but only include MV bit cost once */
                     bits += m_me.bitcost(outmv);
                     uint32_t mvCost = m_me.mvcost(outmv);
                     uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);
+                    /* Update LowresMVP to best AMVP cand*/
+                    if (bLowresMVP)
+                        updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);
 
                     /* Refine MVP selection, updates: mvpIdx, bits, cost */
                     mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);
@@ -2631,6 +2707,15 @@ const MV& Search::checkBestMVP(const MV*
     return amvpCand[mvpIdx];
 }
 
+/* Replace the bit cost of mv measured against alterMVP by its bit cost against amvp, in both outBits and outCost */
+void Search::updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP)
+{
+    const uint32_t bitsBefore = outBits;
+    const int bitDelta = m_me.bitcost(mv, amvp) - m_me.bitcost(mv, alterMVP); // signed: the new predictor may be cheaper
+    outBits += bitDelta;
+    outCost = (outCost - m_rdCost.getCost(bitsBefore)) + m_rdCost.getCost(outBits); // swap only the rate term of the cost
+}
+
 void Search::setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const
 {
     MV dist((int32_t)merange << 2, (int32_t)merange << 2);
--- a/source/encoder/search.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/search.h	Thu Sep 12 16:34:00 2019 +0530
@@ -310,7 +310,7 @@ public:
 
     // estimation inter prediction (non-skip)
     void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
-    void     searchMV(Mode& interMode, const PredictionUnit& pu, int list, int ref, MV& outmv, MV mvp, int numMvc, MV* mvc);
+    void     searchMV(Mode& interMode, int list, int ref, MV& outmv, MV mvp[3], int numMvc, MV* mvc);
     // encode residual and compute rd-cost for inter mode
     void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
     void     encodeResAndCalcRdSkipCU(Mode& interMode);
@@ -425,6 +425,7 @@ protected:
     void     setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
     uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
     static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
+    void      updateMVP(const MV amvp, const MV& mv, uint32_t& outBits, uint32_t& outCost, const MV& alterMVP);
 
     /* intra helper functions */
     enum { MAX_RD_INTRA_MODES = 16 };
--- a/source/encoder/slicetype.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/slicetype.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -85,6 +85,140 @@ inline uint32_t acEnergyPlane(Frame *cur
 
 } // end anonymous namespace
 
+/* Builds the edge maps used by edge-based AQ from the luma plane of curFrame.
+ * pic1 receives the thresholded edge image, pic2 the Gaussian-smoothed luma and pic3
+ * the per-pixel gradient angle in degrees (0..180). Every buffer is addressed from its
+ * start plus the luma margins of curFrame->m_fencPic and shares stride with the source.
+ * Border pixels a filter cannot cover keep the copied source value in pic1/pic2 and an
+ * angle of 0 in pic3. height and width bound the region that is processed. */
+void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3, intptr_t stride, int height, int width)
+{
+    pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
+    pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    //Seed both outputs with the source rows; the filters below overwrite interior pixels only
+    for (int i = 0; i < height; i++)
+    {
+        memcpy(edgePic, src, width * sizeof(pixel));
+        memcpy(refPic, src, width * sizeof(pixel));
+        src += stride;
+        edgePic += stride;
+        refPic += stride;
+    }
+
+    //Applying Gaussian filter on the picture (the row cursors were advanced above, so rewind them)
+    src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
+    refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    pixel pixelValue = 0;
+
+    for (int rowNum = 0; rowNum < height; rowNum++)
+    {
+        for (int colNum = 0; colNum < width; colNum++)
+        {
+            if ((rowNum >= 2) && (colNum >= 2) && (rowNum < height - 2) && (colNum < width - 2)) //Skip the two-pixel border where the 5x5 window would leave the picture
+            {
+                /*  5x5 Gaussian filter
+                    [2   4   5   4   2]
+                 1  [4   9   12  9   4]
+                --- [5   12  15  12  5]
+                159 [4   9   12  9   4]
+                    [2   4   5   4   2]*/
+                const intptr_t rowOne = (rowNum - 2)*stride, colOne = colNum - 2;
+                const intptr_t rowTwo = (rowNum - 1)*stride, colTwo = colNum - 1;
+                const intptr_t rowThree = rowNum * stride, colThree = colNum;
+                const intptr_t rowFour = (rowNum + 1)*stride, colFour = colNum + 1;
+                const intptr_t rowFive = (rowNum + 2)*stride, colFive = colNum + 2;
+                const intptr_t index = (rowNum*stride) + colNum;
+                pixelValue = ((2 * src[rowOne + colOne] + 4 * src[rowOne + colTwo] + 5 * src[rowOne + colThree] + 4 * src[rowOne + colFour] + 2 * src[rowOne + colFive] +
+                    4 * src[rowTwo + colOne] + 9 * src[rowTwo + colTwo] + 12 * src[rowTwo + colThree] + 9 * src[rowTwo + colFour] + 4 * src[rowTwo + colFive] +
+                    5 * src[rowThree + colOne] + 12 * src[rowThree + colTwo] + 15 * src[rowThree + colThree] + 12 * src[rowThree + colFour] + 5 * src[rowThree + colFive] +
+                    4 * src[rowFour + colOne] + 9 * src[rowFour + colTwo] + 12 * src[rowFour + colThree] + 9 * src[rowFour + colFour] + 4 * src[rowFour + colFive] +
+                    2 * src[rowFive + colOne] + 4 * src[rowFive + colTwo] + 5 * src[rowFive + colThree] + 4 * src[rowFive + colFour] + 2 * src[rowFive + colFive]) / 159);
+                refPic[index] = pixelValue;
+            }
+        }
+    }
+
+#if HIGH_BIT_DEPTH //10-bit build
+    float threshold = 1023;
+    pixel whitePixel = 1023;
+#else
+    float threshold = 255;
+    pixel whitePixel = 255;
+#endif
+#define PI 3.14159265
+
+    float gradientH = 0, gradientV = 0, radians = 0, theta = 0;
+    float gradientMagnitude = 0;
+    pixel blackPixel = 0;
+    edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+    //Applying Sobel filter on the gaussian filtered picture
+    for (int rowNum = 0; rowNum < height; rowNum++)
+    {
+        for (int colNum = 0; colNum < width; colNum++)
+        {
+            edgeTheta[(rowNum*stride) + colNum] = 0; //Default angle; stays 0 on the one-pixel border
+            if ((rowNum != 0) && (colNum != 0) && (rowNum != height - 1) && (colNum != width - 1)) //Ignoring the border pixels of the picture
+            {
+                /*Horizontal and vertical gradients
+                       [ -3   0   3 ]        [-3   -10  -3 ]
+                  gH = [ -10  0   10]   gV = [ 0    0    0 ]
+                       [ -3   0   3 ]        [ 3    10   3 ]*/
+                const intptr_t rowOne = (rowNum - 1)*stride, colOne = colNum -1;
+                const intptr_t rowTwo = rowNum * stride, colTwo = colNum;
+                const intptr_t rowThree = (rowNum + 1)*stride, colThree = colNum + 1;
+                const intptr_t index = (rowNum*stride) + colNum;
+                gradientH = (float)(-3 * refPic[rowOne + colOne] + 3 * refPic[rowOne + colThree] - 10 * refPic[rowTwo + colOne] + 10 * refPic[rowTwo + colThree] - 3 * refPic[rowThree + colOne] + 3 * refPic[rowThree + colThree]);
+                gradientV = (float)(-3 * refPic[rowOne + colOne] - 10 * refPic[rowOne + colTwo] - 3 * refPic[rowOne + colThree] + 3 * refPic[rowThree + colOne] + 10 * refPic[rowThree + colTwo] + 3 * refPic[rowThree + colThree]);
+                gradientMagnitude = sqrtf(gradientH * gradientH + gradientV * gradientV);
+                radians = atan2(gradientV, gradientH);
+                theta = (float)((radians * 180) / PI);
+                if (theta < 0) //Fold negative angles so the stored value lies in 0..180
+                    theta = 180 + theta;
+                edgeTheta[(rowNum*stride) + colNum] = (pixel)theta;
+                edgePic[index] = gradientMagnitude >= threshold ? whitePixel : blackPixel; //Binarize on the gradient magnitude
+            }
+        }
+    }
+}
+
+//Compute the mean of the per-pixel angles over a size x size block (integer division)
+inline void findAvgAngle(const pixel* block, intptr_t stride, uint32_t size, uint32_t &angle)
+{
+    int total = 0;
+    const pixel* row = block;
+    for (uint32_t rowIdx = 0; rowIdx < size; rowIdx++, row += stride)
+    {
+        //Accumulate one row; consecutive rows are stride pixels apart
+        for (uint32_t colIdx = 0; colIdx < size; colIdx++)
+            total += row[colIdx];
+    }
+    //size*size is unsigned, so the quotient is evaluated in unsigned arithmetic
+    angle = total / (size*size);
+}
+
+uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame,pixel *edgeImage, pixel *edgeTheta, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize)
+{
+    /* Writes the mean of edgeTheta over the block at (blockX, blockY) to avgAngle and
+     * returns acEnergyVar() applied to the var primitive's result for the same block of
+     * edgeImage. Both images are read with the stride of curFrame->m_fencPic. */
+    const int lumaPlane = 0; // Sobel filter is applied only on Y component
+    const intptr_t pitch = curFrame->m_fencPic->m_stride;
+    const intptr_t offset = blockX + (blockY * pitch);
+    const bool is8x8 = (qgSize == 8); // any other qgSize is handled as a 16x16 block
+    const uint32_t blockDim = is8x8 ? 8 : 16;
+    const int varArg = is8x8 ? 6 : 8; // third argument of acEnergyVar for each block size
+
+    findAvgAngle(edgeTheta + offset, pitch, blockDim, avgAngle);
+    const uint32_t density = is8x8
+        ? acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(edgeImage + offset, pitch), varArg, lumaPlane)
+        : acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(edgeImage + offset, pitch), varArg, lumaPlane);
+
+    x265_emms();
+    return density;
+}
+
 /* Find the total AC energy of each block in all planes */
 uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize)
 {
@@ -303,146 +437,203 @@ void LookaheadTLD::calcAdaptiveQuantFram
         curFrame->m_lowres.wp_sum[y] = 0;
     }
 
-    /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */    
-    if ((param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) || (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
+    if (!(param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame)))
     {
-        if (param->rc.aqMode && param->rc.aqStrength == 0)
+        /* Calculate Qp offset for each 16x16 or 8x8 block in the frame */
+        if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0)
         {
-            if (quantOffsets)
+            if (param->rc.aqMode && param->rc.aqStrength == 0)
             {
-                for (int cuxy = 0; cuxy < blockCount; cuxy++)
+                if (quantOffsets)
                 {
-                    curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
-                    curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
+                    for (int cuxy = 0; cuxy < blockCount; cuxy++)
+                    {
+                        curFrame->m_lowres.qpCuTreeOffset[cuxy] = curFrame->m_lowres.qpAqOffset[cuxy] = quantOffsets[cuxy];
+                        curFrame->m_lowres.invQscaleFactor[cuxy] = x265_exp2fix8(curFrame->m_lowres.qpCuTreeOffset[cuxy]);
+                    }
                 }
+                else
+                {
+                    memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
+                    memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
+                    for (int cuxy = 0; cuxy < blockCount; cuxy++)
+                        curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
+                }
+            }
+
+            /* Need variance data for weighted prediction and dynamic refinement*/
+            if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
+            {
+                for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
+                    for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+                        acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+            }
+        }
+        else
+        {
+            if (param->rc.hevcAq)
+            {
+                // New method for calculating variance and qp offset
+                xPreanalyze(curFrame);
             }
             else
             {
-               memset(curFrame->m_lowres.qpCuTreeOffset, 0, blockCount * sizeof(double));
-               memset(curFrame->m_lowres.qpAqOffset, 0, blockCount * sizeof(double));
-               for (int cuxy = 0; cuxy < blockCount; cuxy++)
-                   curFrame->m_lowres.invQscaleFactor[cuxy] = 256;
-            }
-        }
-
-        /* Need variance data for weighted prediction and dynamic refinement*/
-        if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
-        {
-            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
-                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
-                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
-        }
-    }
-    else
-    {
-        if (param->rc.hevcAq)
-        {
-            // New method for calculating variance and qp offset
-            xPreanalyze(curFrame);
-        }
-        else
-        {
-            int blockXY = 0;
-            double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
-            double bias_strength = 0.f;
-            double strength = 0.f;
-            if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
-            {
-                double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));
-
+#define AQ_EDGE_BIAS 0.5
+#define EDGE_INCLINATION 45
+                uint32_t numCuInHeight = (maxRow + param->maxCUSize - 1) / param->maxCUSize;
+                int maxHeight = numCuInHeight * param->maxCUSize;
+                intptr_t stride = curFrame->m_fencPic->m_stride;
+                pixel *edgePic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)));
+                pixel *gaussianPic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)));
+                pixel *thetaPic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)));
+                memset(edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+                memset(gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+                memset(thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+                if (param->rc.aqMode == X265_AQ_EDGE)
+                    edgeFilter(curFrame, edgePic, gaussianPic, thetaPic, stride, maxRow, maxCol);
+
+                int blockXY = 0, inclinedEdge = 0;
+                double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
+                double bias_strength = 0.f;
+                double strength = 0.f;
+                if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
+                {
+                    double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));
+                    for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
+                    {
+                        for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+                        {
+                            uint32_t energy, edgeDensity, avgAngle;
+                            energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+                            if (param->rc.aqMode == X265_AQ_EDGE)
+                            {
+                                pixel *edgeImage = edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+                                pixel *edgeTheta = thetaPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX;
+                                edgeDensity = edgeDensityCu(curFrame, edgeImage, edgeTheta, avgAngle, blockX, blockY, param->rc.qgSize);
+                                if (edgeDensity)
+                                {
+                                    qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1);
+                                    //Increasing the QP of a block if its edge orientation lies around the multiples of 45 degree
+                                    if ((avgAngle >= EDGE_INCLINATION - 15 && avgAngle <= EDGE_INCLINATION + 15) || (avgAngle >= EDGE_INCLINATION + 75 && avgAngle <= EDGE_INCLINATION + 105))
+                                        curFrame->m_lowres.edgeInclined[blockXY] = 1;
+                                    else
+                                        curFrame->m_lowres.edgeInclined[blockXY] = 0;
+                                }
+                                else
+                                {
+                                    qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
+                                    curFrame->m_lowres.edgeInclined[blockXY] = 0;
+                                }
+                            }
+                            else
+                                qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
+                            curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
+                            avg_adj += qp_adj;
+                            avg_adj_pow2 += qp_adj * qp_adj;
+                            blockXY++;
+                        }
+                    }
+                    avg_adj /= blockCount;
+                    avg_adj_pow2 /= blockCount;
+                    strength = param->rc.aqStrength * avg_adj;
+                    avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
+                    bias_strength = param->rc.aqStrength;
+                }
+                else
+                    strength = param->rc.aqStrength * 1.0397f;
+
+                X265_FREE(edgePic);
+                X265_FREE(gaussianPic);
+                X265_FREE(thetaPic);
+                blockXY = 0;
                 for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
                 {
                     for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
                     {
-                        uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
-                        qp_adj = pow(energy * bit_depth_correction + 1, 0.1);
+                        if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
+                        {
+                            qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                            qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
+                        }
+                        else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
+                        {
+                            qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                            qp_adj = strength * (qp_adj - avg_adj);
+                        }
+                        else if (param->rc.aqMode == X265_AQ_EDGE)
+                        {
+                            inclinedEdge = curFrame->m_lowres.edgeInclined[blockXY];
+                            qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
+                            if(inclinedEdge && (qp_adj - avg_adj > 0))
+                                qp_adj = ((strength + AQ_EDGE_BIAS) * (qp_adj - avg_adj));
+                            else
+                                qp_adj = strength * (qp_adj - avg_adj);
+                        }
+                        else
+                        {
+                            uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+                            qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
+                        }
+
+                        if (param->bHDROpt)
+                        {
+                            uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
+                            uint32_t lumaAvg = sum / (loopIncr * loopIncr);
+                            if (lumaAvg < 301)
+                                qp_adj += 3;
+                            else if (lumaAvg >= 301 && lumaAvg < 367)
+                                qp_adj += 2;
+                            else if (lumaAvg >= 367 && lumaAvg < 434)
+                                qp_adj += 1;
+                            else if (lumaAvg >= 501 && lumaAvg < 567)
+                                qp_adj -= 1;
+                            else if (lumaAvg >= 567 && lumaAvg < 634)
+                                qp_adj -= 2;
+                            else if (lumaAvg >= 634 && lumaAvg < 701)
+                                qp_adj -= 3;
+                            else if (lumaAvg >= 701 && lumaAvg < 767)
+                                qp_adj -= 4;
+                            else if (lumaAvg >= 767 && lumaAvg < 834)
+                                qp_adj -= 5;
+                            else if (lumaAvg >= 834)
+                                qp_adj -= 6;
+                        }
+                        if (quantOffsets != NULL)
+                            qp_adj += quantOffsets[blockXY];
+                        curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
                         curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
-                        avg_adj += qp_adj;
-                        avg_adj_pow2 += qp_adj * qp_adj;
+                        curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
                         blockXY++;
                     }
                 }
-                avg_adj /= blockCount;
-                avg_adj_pow2 /= blockCount;
-                strength = param->rc.aqStrength * avg_adj;
-                avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - modeTwoConst) / avg_adj;
-                bias_strength = param->rc.aqStrength;
-            }
-            else
-                strength = param->rc.aqStrength * 1.0397f;
-
-            blockXY = 0;
-            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
-            {
-                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
-                {
-                    if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED)
-                    {
-                        qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                        qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - modeTwoConst / (qp_adj * qp_adj));
-                    }
-                    else if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
-                    {
-                        qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY];
-                        qp_adj = strength * (qp_adj - avg_adj);
-                    }
-                    else
-                    {
-                        uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
-                        qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8)));
-                    }
-
-                    if (param->bHDROpt)
-                    {
-                        uint32_t sum = lumaSumCu(curFrame, blockX, blockY, param->rc.qgSize);
-                        uint32_t lumaAvg = sum / (loopIncr * loopIncr);
-                        if (lumaAvg < 301)
-                            qp_adj += 3;
-                        else if (lumaAvg >= 301 && lumaAvg < 367)
-                            qp_adj += 2;
-                        else if (lumaAvg >= 367 && lumaAvg < 434)
-                            qp_adj += 1;
-                        else if (lumaAvg >= 501 && lumaAvg < 567)
-                            qp_adj -= 1;
-                        else if (lumaAvg >= 567 && lumaAvg < 634)
-                            qp_adj -= 2;
-                        else if (lumaAvg >= 634 && lumaAvg < 701)
-                            qp_adj -= 3;
-                        else if (lumaAvg >= 701 && lumaAvg < 767)
-                            qp_adj -= 4;
-                        else if (lumaAvg >= 767 && lumaAvg < 834)
-                            qp_adj -= 5;
-                        else if (lumaAvg >= 834)
-                            qp_adj -= 6;
-                    }
-                    if (quantOffsets != NULL)
-                        qp_adj += quantOffsets[blockXY];
-                    curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj;
-                    curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj;
-                    curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj);
-                    blockXY++;
-                }
             }
         }
-    }
-
-    if (param->rc.qgSize == 8)
-    {
-        for (int cuY = 0; cuY < heightInCU; cuY++)
+
+        if (param->rc.qgSize == 8)
         {
-            for (int cuX = 0; cuX < widthInCU; cuX++)
+            for (int cuY = 0; cuY < heightInCU; cuY++)
             {
-                const int cuXY = cuX + cuY * widthInCU;
-                curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
-                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
-                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
-                                                               curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+                for (int cuX = 0; cuX < widthInCU; cuX++)
+                {
+                    const int cuXY = cuX + cuY * widthInCU;
+                    curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+                        curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+                        curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+                        curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+                }
             }
         }
     }
 
     if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
     {
+        if (param->rc.bStatRead && param->rc.cuTree && IS_REFERENCED(curFrame))
+        {
+            for (int blockY = 0; blockY < maxRow; blockY += loopIncr)
+                for (int blockX = 0; blockX < maxCol; blockX += loopIncr)
+                    acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize);
+        }
+
         int hShift = CHROMA_H_SHIFT(param->internalCsp);
         int vShift = CHROMA_V_SHIFT(param->internalCsp);
         maxCol = ((maxCol + 8) >> 4) << 4;
@@ -664,6 +855,7 @@ void LookaheadTLD::weightsAnalyse(Lowres
     weightedRef.lumaStride = fenc.lumaStride;
     weightedRef.isLowres = true;
     weightedRef.isWeighted = false;
+    weightedRef.isHMELowres = ref.bEnableHME;
 
     /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
     float guessScale, fencMean, refMean;
@@ -759,6 +951,8 @@ Lookahead::Lookahead(x265_param *param, 
     m_extendGopBoundary = false;
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_4x4Height = ((m_param->sourceHeight / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+    m_4x4Width = ((m_param->sourceWidth / 4) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_cuCount = m_8x8Width * m_8x8Height;
     m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_cuCount + 4 - 2 * (m_8x8Width + m_8x8Height)) : m_cuCount;
     m_isFadeIn = false;
@@ -1172,9 +1366,7 @@ void PreLookaheadGroup::processTasks(int
         ProfileScopeEvent(prelookahead);
         m_lock.release();
         preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc);
-        if (m_lookahead.m_param->rc.bStatRead && m_lookahead.m_param->rc.cuTree && IS_REFERENCED(preFrame))
-            /* cu-tree offsets were read from stats file */;
-        else if (m_lookahead.m_bAdaptiveQuant)
+        if (m_lookahead.m_bAdaptiveQuant)
             tld.calcAdaptiveQuantFrame(preFrame, m_lookahead.m_param);
         tld.lowresIntraEstimate(preFrame->m_lowres, m_lookahead.m_param->rc.qgSize);
         preFrame->m_lowresInit = true;
@@ -2782,16 +2974,32 @@ void CostEstimateGroup::processTasks(int
 
             X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n");
 
-            int firstY = m_lookahead.m_numRowsPerSlice * i;
-            int lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
-
-            bool lastRow = true;
+            int firstY, lastY;
+            bool lastRow;
+            if (m_lookahead.m_param->bEnableHME)
+            {
+                int numRowsPerSlice = m_lookahead.m_4x4Height / m_lookahead.m_param->lookaheadSlices;
+                numRowsPerSlice = X265_MIN(X265_MAX(numRowsPerSlice, 5), m_lookahead.m_4x4Height);
+                firstY = numRowsPerSlice * i;
+                lastY = (i == m_jobTotal - 1) ? m_lookahead.m_4x4Height - 1 : numRowsPerSlice * (i + 1) - 1;
+                lastRow = true;
+                for (int cuY = lastY; cuY >= firstY; cuY--)
+                {
+                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
+                        estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 1);
+                    lastRow = false;
+                }
+            }
+
+            firstY = m_lookahead.m_numRowsPerSlice * i;
+            lastY = (i == m_jobTotal - 1) ? m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1;
+            lastRow = true;
             for (int cuY = lastY; cuY >= firstY; cuY--)
             {
                 m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0;
 
                 for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
-                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i);
+                    estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i, 0);
 
                 lastRow = false;
             }
@@ -2864,13 +3072,25 @@ int64_t CostEstimateGroup::estimateFrame
         }
         else
         {
-            bool lastRow = true;
+            /* Calculate MVs for 1/16th resolution*/
+            bool lastRow;
+            if (param->bEnableHME)
+            {
+                lastRow = true;
+                for (int cuY = m_lookahead.m_4x4Height - 1; cuY >= 0; cuY--)
+                {
+                    for (int cuX = m_lookahead.m_4x4Width - 1; cuX >= 0; cuX--)
+                        estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 1);
+                    lastRow = false;
+                }
+            }
+            lastRow = true;
             for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--)
             {
                 fenc->rowSatds[b - p0][p1 - b][cuY] = 0;
 
                 for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--)
-                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1);
+                    estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1, 0);
 
                 lastRow = false;
             }
@@ -2891,23 +3111,27 @@ int64_t CostEstimateGroup::estimateFrame
     return score;
 }
 
-void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice)
+void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme)
 {
     Lowres *fref0 = m_frames[p0];
     Lowres *fref1 = m_frames[p1];
     Lowres *fenc  = m_frames[b];
 
-    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted ? &fenc->weightedRef[b - p0] : fref0;
-
-    const int widthInCU = m_lookahead.m_8x8Width;
-    const int heightInCU = m_lookahead.m_8x8Height;
+    ReferencePlanes *wfref0 = fenc->weightedRef[b - p0].isWeighted && !hme ? &fenc->weightedRef[b - p0] : fref0;
+
+    const int widthInCU = hme ? m_lookahead.m_4x4Width : m_lookahead.m_8x8Width;
+    const int heightInCU = hme ? m_lookahead.m_4x4Height : m_lookahead.m_8x8Height;
     const int bBidir = (b < p1);
     const int cuXY = cuX + cuY * widthInCU;
+    const int cuXY_4x4 = (cuX / 2) + (cuY / 2) * widthInCU / 2;
     const int cuSize = X265_LOWRES_CU_SIZE;
-    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride;
-
-    if (bBidir || bDoSearch[0] || bDoSearch[1])
-        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, 1);
+    const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * (hme ? fenc->lumaStride/2 : fenc->lumaStride);
+
+    if ((bBidir || bDoSearch[0] || bDoSearch[1]) && hme)
+        tld.me.setSourcePU(fenc->lowerResPlane[0], fenc->lumaStride / 2, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
+    else if((bBidir || bDoSearch[0] || bDoSearch[1]) && !hme)
+        tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize, X265_HEX_SEARCH, m_lookahead.m_param->hmeSearchMethod[0], m_lookahead.m_param->hmeSearchMethod[1], 1);
+
 
     /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */
     int lowresPenalty = 4;
@@ -2926,7 +3150,7 @@ void CostEstimateGroup::estimateCUCost(L
 
     for (int i = 0; i < 1 + bBidir; i++)
     {
-        int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY];
+        int& fencCost = hme ? fenc->lowerResMvCosts[i][listDist[i]][cuXY] : fenc->lowresMvCosts[i][listDist[i]][cuXY];
         int skipCost = INT_MAX;
 
         if (!bDoSearch[i])
@@ -2936,8 +3160,8 @@ void CostEstimateGroup::estimateCUCost(L
         }
 
         int numc = 0;
-        MV mvc[4], mvp;
-        MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY];
+        MV mvc[5], mvp;
+        MV* fencMV = hme ? &fenc->lowerResMvs[i][listDist[i]][cuXY] : &fenc->lowresMvs[i][listDist[i]][cuXY];
         ReferencePlanes* fref = i ? fref1 : wfref0;
 
         /* Reverse-order MV prediction */
@@ -2952,6 +3176,10 @@ void CostEstimateGroup::estimateCUCost(L
             if (cuX < widthInCU - 1)
                 MVC(fencMV[widthInCU + 1]);
         }
+        if (fenc->lowerResMvs[0][0] && !hme && fenc->lowerResMvCosts[i][listDist[i]][cuXY_4x4] > 0)
+        {
+            MVC((fenc->lowerResMvs[i][listDist[i]][cuXY_4x4]) * 2);
+        }
 #undef MVC
 
         if (!numc)
@@ -2967,7 +3195,7 @@ void CostEstimateGroup::estimateCUCost(L
             for (int idx = 0; idx < numc; idx++)
             {
                 intptr_t stride = X265_LOWRES_CU_SIZE;
-                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride);
+                pixel *src = fref->lowresMC(pelOffset, mvc[idx], subpelbuf, stride, hme);
                 int cost = tld.me.bufSATD(src, stride);
                 COPY2_IF_LT(mvpcost, cost, mvp, mvc[idx]);
                 /* Except for mv0 case, everyting else is likely to have enough residual to not trigger the skip. */
@@ -2978,7 +3206,10 @@ void CostEstimateGroup::estimateCUCost(L
 
         /* ME will never return a cost larger than the cost @MVP, so we do not
          * have to check that ME cost is more than the estimated merge cost */
-        fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
+        if(!hme)
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices);
+        else
+            fencCost = tld.me.motionEstimate(fref, mvmin, mvmax, mvp, 0, NULL, s_merange, *fencMV, m_lookahead.m_param->maxSlices, fref->lowerResPlane[0]);
         if (skipCost < 64 && skipCost < fencCost && bBidir)
         {
             fencCost = skipCost;
@@ -2986,6 +3217,8 @@ void CostEstimateGroup::estimateCUCost(L
         }
         COPY2_IF_LT(bcost, fencCost, listused, i + 1);
     }
+    if (hme)
+        return;
 
     if (bBidir) /* B, also consider bidir */
     {
@@ -2995,8 +3228,8 @@ void CostEstimateGroup::estimateCUCost(L
         ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE;
-        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0);
-        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1);
+        pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0, 0);
+        pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1, 0);
         ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
         primitives.pu[LUMA_8x8].pixelavg_pp[NONALIGNED](ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32);
         int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE);
--- a/source/encoder/slicetype.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/slicetype.h	Thu Sep 12 16:34:00 2019 +0530
@@ -92,6 +92,7 @@ struct LookaheadTLD
 protected:
 
     uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
+    uint32_t edgeDensityCu(Frame*curFrame, pixel *edgeImage, pixel *edgeTheta, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
     bool     allocWeightedRef(Lowres& fenc);
@@ -124,6 +125,10 @@ public:
     int           m_inputCount;
     double        m_cuTreeStrength;
 
+    /* HME */
+    int           m_4x4Width;
+    int           m_4x4Height;
+
     bool          m_isActive;
     bool          m_sliceTypeBusy;
     bool          m_bAdaptiveQuant;
@@ -246,7 +251,7 @@ protected:
     void    processTasks(int workerThreadID);
 
     int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);
-    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
+    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice, bool hme);
 
     CostEstimateGroup& operator=(const CostEstimateGroup&);
 };
--- a/source/encoder/weightPrediction.cpp	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/encoder/weightPrediction.cpp	Thu Sep 12 16:34:00 2019 +0530
@@ -82,7 +82,7 @@ void mcLuma(pixel* mcout, Lowres& ref, c
             /* clip MV to available pixels */
             MV mv = mvs[cu];
             mv = mv.clipped(mvmin, mvmax);
-            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
+            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
             primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
         }
     }
--- a/source/test/regression-tests.txt	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/test/regression-tests.txt	Thu Sep 12 16:34:00 2019 +0530
@@ -153,6 +153,9 @@ Kimono1_1920x1080_24_400.yuv,--preset pl
 big_buck_bunny_360p24.y4m, --keyint 60 --min-keyint 40 --gop-lookahead 14
 BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 --vbv-maxrate 5000 --vbv-bufsize 5000
 big_buck_bunny_360p24.y4m, --bitrate 500 --fades
+720p50_parkrun_ter.y4m,--preset medium --bitrate 400 --hme
+ducks_take_off_420_1_720p50.y4m,--preset medium --aq-mode 4 --crf 22 --no-cutree
+ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20
 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
--- a/source/x265.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/x265.h	Thu Sep 12 16:34:00 2019 +0530
@@ -561,6 +561,7 @@ typedef enum
 #define X265_AQ_VARIANCE             1
 #define X265_AQ_AUTO_VARIANCE        2
 #define X265_AQ_AUTO_VARIANCE_BIASED 3
+#define X265_AQ_EDGE                 4
 #define x265_ADAPT_RD_STRENGTH   4
 #define X265_REFINE_INTER_LEVELS 3
 /* NOTE! For this release only X265_CSP_I420 and X265_CSP_I444 are supported */
@@ -1172,6 +1173,14 @@ typedef struct x265_param
     /* Enable availability of temporal motion vector for AMVP, default is enabled */
     int       bEnableTemporalMvp;
 
+    /* Enable 3-level Hierarchical motion estimation at One-Sixteenth, Quarter and Full resolution.
+     * Default is disabled */
+    int       bEnableHME;
+
+    /* Motion search method (DIA, HEX, UMH, STAR, SEA, FULL) for HME levels 0, 1 and 2.
+     * Default is hex, umh, umh for L0, L1 and L2 respectively. */
+    int       hmeSearchMethod[3];
+
     /* Enable weighted prediction in P slices.  This enables weighting analysis
      * in the lookahead, which influences slice decisions, and enables weighting
      * analysis in the main encoder which allows P reference samples to have a
@@ -1214,6 +1223,12 @@ typedef struct x265_param
      * non-deblocked pixels are used entirely. Default is disabled */
     int       bSaoNonDeblocked;
 
+    /* Selects the slice types on which the SAO filter is applied.
+    1 - Filtering applied only on I-frames (I) [Light tune]
+    2 - No filtering on B-frames (I, P) [Medium tune]
+    3 - No filtering on non-ref b-frames (I, P, B) [Strong tune] */
+    int       selectiveSAO;
+
     /*== Analysis tools ==*/
 
     /* A value between 1 and 6 (both inclusive) which determines the level of 
--- a/source/x265cli.h	Mon Jul 08 16:43:46 2019 +0530
+++ b/source/x265cli.h	Thu Sep 12 16:34:00 2019 +0530
@@ -95,6 +95,9 @@ static const struct option long_options[
     { "max-merge",      required_argument, NULL, 0 },
     { "no-temporal-mvp",      no_argument, NULL, 0 },
     { "temporal-mvp",         no_argument, NULL, 0 },
+    { "hme",                  no_argument, NULL, 0 },
+    { "no-hme",               no_argument, NULL, 0 },
+    { "hme-search",     required_argument, NULL, 0 },
     { "rdpenalty",      required_argument, NULL, 0 },
     { "no-rect",              no_argument, NULL, 0 },
     { "rect",                 no_argument, NULL, 0 },
@@ -197,6 +200,7 @@ static const struct option long_options[
     { "no-deblock",           no_argument, NULL, 0 },
     { "deblock",        required_argument, NULL, 0 },
     { "no-sao",               no_argument, NULL, 0 },
+    { "selective-sao",  required_argument, NULL, 0 },
     { "sao",                  no_argument, NULL, 0 },
     { "no-sao-non-deblock",   no_argument, NULL, 0 },
     { "sao-non-deblock",      no_argument, NULL, 0 },
@@ -294,8 +298,7 @@ static const struct option long_options[
     { "dhdr10-opt",           no_argument, NULL, 0},
     { "no-dhdr10-opt",        no_argument, NULL, 0},
     { "dolby-vision-profile",  required_argument, NULL, 0 },
-    { "refine-mv",            no_argument, NULL, 0 },
-    { "no-refine-mv",         no_argument, NULL, 0 },
+    { "refine-mv",      required_argument, NULL, 0 },
     { "refine-ctu-distortion", required_argument, NULL, 0 },
     { "force-flush",    required_argument, NULL, 0 },
     { "splitrd-skip",         no_argument, NULL, 0 },
@@ -464,6 +467,8 @@ static void showHelp(x265_param *param)
     H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
     H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
     H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+    H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
+    H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);
     H0("\nSpatial / intra options:\n");
     H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
     H0("   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
@@ -544,16 +549,16 @@ static void showHelp(x265_param *param)
         "                                    - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
         "                                Default:%d\n", param->interRefine);
     H0("   --[no-]dynamic-refine         Dynamically changes refine-inter level for each CU. Default %s\n", OPT(param->bDynamicRefine));
-    H0("   --[no-]refine-mv              Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine));
+    H0("   --refine-mv <0..3>            Enable mv refinement for load mode. Default %d\n", param->mvRefine);
     H0("   --refine-ctu-distortion       Store/normalize ctu distortion in analysis-save/load.\n"
         "                                    - 0 : Disabled.\n"
         "                                    - 1 : Store/Load ctu distortion to/from the file specified in analysis-save/load.\n"
         "                                Default 0 - Disabled\n");
-    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
+    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes 4:auto variance with edge information. Default %d\n", param->rc.aqMode);
     H0("   --[no-]hevc-aq                Mode for HEVC Adaptive Quantization. Default %s\n", OPT(param->rc.hevcAq));
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
     H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
-    H0("   --[no-]aq-motion              Adaptive Quantization based on the relative motion of each CU w.r.t., frame. Default %s\n", OPT(param->bOptCUDeltaQP));
+    H0("   --[no-]aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
     H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
     H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
     H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
@@ -585,6 +590,7 @@ static void showHelp(x265_param *param)
     H0("   --[no-]sao                    Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
     H1("   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
     H0("   --[no-]limit-sao              Limit Sample Adaptive Offset types. Default %s\n", OPT(param->bLimitSAO));
+    H0("   --selective-sao <int>         Enable slice-level SAO filter. Default %d\n", param->selectiveSAO);
     H0("\nVUI options:\n");
     H0("   --sar <width:height|int>      Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
     H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");