changeset 12653:50fde7e6037b stable

Merge with default
author Aruna Matheswaran <aruna@multicorewareinc.com>
date Wed, 06 May 2020 14:59:56 +0530
parents dd2464df2f40 (current diff) df0886d58b86 (diff)
children e088c8787232 18610b4f88bc
files
diffstat 45 files changed, 6082 insertions(+-), 1730 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/aarch64-linux/crosscompile.cmake	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,15 @@
+# Toolchain description used to cross compile x265 for aarch64 (64-bit ARM) Linux.
+# Cross compilation is experimental; use with caution and
+# report bugs on bitbucket.
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
+
+# Target platform; CROSS_COMPILE_ARM is tested by source/CMakeLists.txt.
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
+set(CROSS_COMPILE_ARM 1)
+# Cross compilers for C and C++.
+set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc")
+set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++")
+
+# Root directory of the target environment.
+set(CMAKE_FIND_ROOT_PATH "/usr/aarch64-linux-gnu")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/aarch64-linux/make-Makefiles.bash	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Run this from within a bash shell
+
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
--- a/doc/reST/cli.rst	Mon Feb 17 20:46:36 2020 +0530
+++ b/doc/reST/cli.rst	Wed May 06 14:59:56 2020 +0530
@@ -107,6 +107,9 @@ Logging/Statistic Options
 	
 	**BufferFillFinal** Buffer bits available after removing the frame out of CPB.
 	
+	**UnclippedBufferFillFinal** Unclipped buffer bits available after removing the frame
+	out of CPB. Used only for csv logging purposes.
+	
 	**Latency** Latency in terms of number of frames between when the frame 
 	was given in and when the frame is given out.
 	
@@ -842,15 +845,31 @@ the prediction quad-tree.
 	Measure 2Nx2N merge candidates first; if no residual is found, 
 	additional modes at that depth are not analysed. Default disabled
 
-.. option:: --rskip, --no-rskip
-
-	This option determines early exit from CU depth recursion. When a skip CU is
-	found, additional heuristics (depending on rd-level) are used to decide whether
-	to terminate recursion. In rdlevels 5 and 6, comparison with inter2Nx2N is used, 
-	while at rdlevels 4 and neighbour costs are used to skip recursion.
-	Provides minimal quality degradation at good performance gains when enabled. 
-
-	Default: enabled, disabled for :option:`--tune grain`
+.. option:: --rskip <0|1|2>
+
+	This option determines early exit from CU depth recursion in modes 1 and 2. When a skip CU is
+	found, additional heuristics (depending on the RD level and rskip mode) are used to decide whether
+	to terminate recursion. The following table summarizes the behavior.
+	
+	+----------+------------+----------------------------------------------------------------+
+	| RD Level | Rskip Mode |   Skip Recursion Heuristic                                     |
+	+==========+============+================================================================+
+	|   0 - 4  |      1     |   Neighbour costs and CU homogeneity.                          |
+	+----------+------------+----------------------------------------------------------------+
+	|   5 - 6  |      1     |   Comparison with inter2Nx2N.                                  |
+	+----------+------------+----------------------------------------------------------------+
+	|   0 - 6  |      2     |   CU edge density.                                             |
+	+----------+------------+----------------------------------------------------------------+
+
+	Provides minimal quality degradation at good performance gains for non-zero modes.
+	:option:`--rskip mode 0` means disabled. Default: 1, disabled when :option:`--tune grain` is used.
+	The edge-density threshold (see :option:`--rskip-edge-threshold`) is an integer value representing the edge-density percentage within the CU. It is internally normalized to a number between 0.0 and 1.0 in x265.
+	Low thresholds are recommended for slow encodes and high thresholds for fast encodes.
+
+.. option:: --rskip-edge-threshold <0..100>
+
+	Denotes the minimum expected edge-density percentage within the CU, below which the recursion is skipped.
+	Default: 5, requires :option:`--rskip mode 2` to be enabled.
 
 .. option:: --splitrd-skip, --no-splitrd-skip
 
@@ -2501,6 +2520,28 @@ Debugging options
 	--recon-y4m-exec "ffplay -i pipe:0 -autoexit"
 
 	**CLI ONLY**
+	
+ABR-ladder Options
+==================
+
+.. option:: --abr-ladder <filename>
+
+	File containing the encoder configurations to generate ABR ladder.
+	The format of each line is:
+
+	**<encID:reuse-level:refID> <CLI>**
+	
+	where encID indicates the unique name given to the encode, refID indicates
+	the name of the encode from which analysis info has to be re-used (set to 'nil'
+	if analysis reuse isn't preferred), and reuse-level indicates the level (:option:`--analysis-load-reuse-level`)
+	at which analysis info has to be reused.
+	
+	A sample config file is available on `the downloads page <https://bitbucket.org/multicoreware/x265/downloads/Sample_ABR_ladder_config>`_
+	
+	Default: Disabled ( Conventional single encode generation ). Experimental feature.
+
+	**CLI ONLY**
+
 
 SVT-HEVC Encoder Options
 ========================
--- a/source/CMakeLists.txt	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/CMakeLists.txt	Wed May 06 14:59:56 2020 +0530
@@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CP
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 188)
+set(X265_BUILD 192)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -40,7 +40,7 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_
 # System architecture detection
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
-set(ARM_ALIASES armv6l armv7l)
+set(ARM_ALIASES armv6l armv7l aarch64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
 set(POWER_ALIASES ppc64 ppc64le)
@@ -70,9 +70,15 @@ elseif(ARMMATCH GREATER "-1")
     else()
         set(CROSS_COMPILE_ARM 0)
     endif()
-    message(STATUS "Detected ARM target processor")
     set(ARM 1)
-    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
+    if("${CMAKE_SIZEOF_VOID_P}" EQUAL 8) # numeric compare: 8-byte pointers select the ARM64 path
+        message(STATUS "Detected ARM64 target processor")
+        set(ARM64 1)
+        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
+    else() # any other pointer size keeps the 32-bit ARM (ARMv6) definitions
+        message(STATUS "Detected ARM target processor")
+        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
+    endif()
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
@@ -95,6 +101,8 @@ if(UNIX)
         if(NUMA_FOUND)
             link_directories(${NUMA_LIBRARY_DIR})
             list(APPEND CMAKE_REQUIRED_LIBRARIES numa)
+            list(APPEND CMAKE_REQUIRED_INCLUDES ${NUMA_INCLUDE_DIR})
+            list(APPEND CMAKE_REQUIRED_LINK_OPTIONS "-L${NUMA_LIBRARY_DIR}")
             check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2)
             if(NUMA_V2)
                 add_definitions(-DHAVE_LIBNUMA)
@@ -231,14 +239,24 @@ if(GCC)
         endif()
     endif()
     if(ARM AND CROSS_COMPILE_ARM)
-        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+        if(ARM64)
+            set(ARM_ARGS -fPIC)
+        else()
+            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
+        endif()
+        message(STATUS "cross compile arm")
     elseif(ARM)
-        find_package(Neon)
-        if(CPU_HAS_NEON)
-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+        if(ARM64)
+            set(ARM_ARGS -fPIC)
             add_definitions(-DHAVE_NEON)
         else()
-            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+            find_package(Neon)
+            if(CPU_HAS_NEON)
+                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
+                add_definitions(-DHAVE_NEON)
+            else()
+                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
+            endif()
         endif()
     endif()
     add_definitions(${ARM_ARGS})
@@ -518,7 +536,11 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE
     # compile ARM arch asm files here
         enable_language(ASM)
         foreach(ASM ${ARM_ASMS})
-            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            if(ARM64)
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+            else()
+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
+            endif()
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
@@ -725,16 +747,16 @@ if(ENABLE_CLI)
         # Xcode seems unable to link the CLI with libs, so link as one targget
         if(ENABLE_HDR10_PLUS)
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
-                        x265.cpp x265.h x265cli.h
+                        x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h
                         $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> $<TARGET_OBJECTS:dynamicHDR10> ${ASM_OBJS})
         else()
             add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
-                        x265.cpp x265.h x265cli.h
+                        x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h
                         $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${ASM_OBJS})
         endif()
     else()
         add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
-                       ${ExportDefs} x265.cpp x265.h x265cli.h)
+                       ${ExportDefs} x265.cpp x265.h x265cli.cpp x265cli.h abrEncApp.cpp abrEncApp.h)
         if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
             # The CLI cannot link to the shared library on Windows, it
             # requires internal APIs not exported from the DLL
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/abrEncApp.cpp	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,1108 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "abrEncApp.h"
+#include "mv.h"
+#include "slice.h"
+#include "param.h"
+
+#include <signal.h>
+#include <errno.h>
+
+#include <queue>
+
+using namespace X265_NS;
+
+/* Ctrl-C handling: b_ctrl_c is the flag written by the SIGINT handler below */
+static volatile sig_atomic_t b_ctrl_c /* = 0 */;
+static void sigint_handler(int) /* the signal number is not used */
+{
+    b_ctrl_c = 1; /* only records the request; nothing else is done in the handler */
+}
+
+namespace X265_NS {
+    // private namespace
+#define X265_INPUT_QUEUE_SIZE 250
+
+    AbrEncoder::AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int &ret) // builds one PassEncoder per cliopt[] entry; ret is set to 4 on allocation errors
+    {
+        m_numEncodes = numEncodes;
+        m_numActiveEncodes.set(numEncodes);
+        m_queueSize = X265_INPUT_QUEUE_SIZE;
+        m_passEnc = X265_MALLOC(PassEncoder*, m_numEncodes);
+        /* create and initialise every pass encoder before any queue is allocated */
+        for (uint8_t i = 0; i < m_numEncodes; i++)
+        {
+            m_passEnc[i] = new PassEncoder(i, cliopt[i], this);
+            if (!m_passEnc[i])
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for passEncoder\n");
+                ret = 4;
+            }
+            m_passEnc[i]->init(ret); /* NOTE(review): still dereferenced when the NULL check above fired -- confirm */
+        }
+        /* allocBuffers() reads m_passEnc[]->m_param, so it has to run after the loop above */
+        if (!allocBuffers())
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate memory for buffers\n");
+            ret = 4;
+        }
+        /* NOTE(review): the threads below are started whatever value is left in ret -- confirm */
+        /* start passEncoder worker threads */
+        for (uint8_t pass = 0; pass < m_numEncodes; pass++)
+            m_passEnc[pass]->startThreads();
+    }
+
+    bool AbrEncoder::allocBuffers() // allocates the per-encode picture/analysis queues and their counters; always returns true
+    {
+        /* one read counter and one write counter of each kind per encode */
+        m_picWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_picReadCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisWriteCnt = new ThreadSafeInteger[m_numEncodes];
+        m_analysisReadCnt = new ThreadSafeInteger[m_numEncodes];
+        /* per-encode tables; every entry receives an m_queueSize-long array below */
+        m_inputPicBuffer = X265_MALLOC(x265_picture**, m_numEncodes);
+        m_analysisBuffer = X265_MALLOC(x265_analysis_data*, m_numEncodes);
+        m_picIdxReadCnt = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisWrite = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_analysisRead = X265_MALLOC(ThreadSafeInteger*, m_numEncodes);
+        m_readFlag = X265_MALLOC(int*, m_numEncodes);
+
+        for (uint8_t enc = 0; enc < m_numEncodes; enc++)
+        {
+            x265_picture** picQueue = X265_MALLOC(x265_picture*, m_queueSize);
+            for (uint32_t slot = 0; slot < m_queueSize; slot++)
+            {
+                picQueue[slot] = x265_picture_alloc();
+                x265_picture_init(m_passEnc[enc]->m_param, picQueue[slot]);
+            }
+            m_inputPicBuffer[enc] = picQueue;
+            m_analysisBuffer[enc] = X265_MALLOC(x265_analysis_data, m_queueSize);
+            m_picIdxReadCnt[enc] = new ThreadSafeInteger[m_queueSize];
+            m_analysisWrite[enc] = new ThreadSafeInteger[m_queueSize];
+            m_analysisRead[enc] = new ThreadSafeInteger[m_queueSize];
+            m_readFlag[enc] = X265_MALLOC(int, m_queueSize);
+        }
+        return true;
+    }
+
+    void AbrEncoder::destroy() // frees the queues, counters and pass encoders held by this object
+    {
+        x265_cleanup(); /* Free library singletons */
+        for (uint8_t enc = 0; enc < m_numEncodes; enc++)
+        {
+            x265_picture** picQueue = m_inputPicBuffer[enc];
+            for (uint32_t slot = 0; slot < m_queueSize; slot++)
+            {
+                X265_FREE(picQueue[slot]->planes[0]); /* plane 0 of each queued picture is released here too */
+                x265_picture_free(picQueue[slot]);
+            }
+            X265_FREE(picQueue);
+            X265_FREE(m_analysisBuffer[enc]);
+            X265_FREE(m_readFlag[enc]);
+            delete[] m_picIdxReadCnt[enc];
+            delete[] m_analysisWrite[enc];
+            delete[] m_analysisRead[enc];
+            PassEncoder* passEnc = m_passEnc[enc];
+            passEnc->destroy();
+            delete passEnc;
+        }
+        /* top-level tables, released only after the loop above has used them */
+        X265_FREE(m_inputPicBuffer);
+        X265_FREE(m_analysisBuffer);
+        X265_FREE(m_readFlag);
+        /* per-encode counters */
+        delete[] m_picWriteCnt;
+        delete[] m_picReadCnt;
+        delete[] m_analysisWriteCnt;
+        delete[] m_analysisReadCnt;
+        X265_FREE(m_picIdxReadCnt);
+        X265_FREE(m_analysisWrite);
+        X265_FREE(m_analysisRead);
+        X265_FREE(m_passEnc);
+    }
+
+    PassEncoder::PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent) // stores the options of encode 'id' and clears the handles filled in by init()
+    {
+        m_parent = parent;
+        m_id = id;
+        m_cliopt = cliopt;
+        m_param = cliopt.param;
+        if (!m_cliopt.enableScaler || m_id == 0) /* same condition under which init() creates a Reader instead of a Scaler */
+            m_input = m_cliopt.input;
+        m_encoder = NULL;
+        m_reader = NULL;
+        m_scaler = NULL;
+        m_lastIdx = -1;
+        m_ret = 0;
+        m_inputOver = false;
+    }
+
+    int PassEncoder::init(int &result) // creates the Reader or Scaler and opens the encoder; returns 1, or -1 when the open fails
+    {
+        if (m_parent->m_numEncodes > 1)
+            setReuseLevel();
+
+        const bool scaledInput = m_cliopt.enableScaler && m_id;
+        if (scaledInput)
+        {
+            /* dst describes this encode, src the encode just before it (m_id - 1) */
+            VideoDesc *dst = new VideoDesc(m_param->sourceWidth, m_param->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
+            x265_param *prevParam = m_parent->m_passEnc[m_id - 1]->m_param;
+            VideoDesc *src = new VideoDesc(prevParam->sourceWidth, prevParam->sourceHeight, m_param->internalCsp, m_param->internalBitDepth);
+            if (src != NULL && dst != NULL)
+            {
+                m_scaler = new Scaler(0, 1, m_id, src, dst, this);
+                if (!m_scaler)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "\n MALLOC failure in Scaler");
+                    result = 4;
+                }
+            }
+        }
+        else
+            m_reader = new Reader(m_id, this);
+
+        /* A different libx265 API could be acquired here, based on the profile
+        * found during option parsing; that has to happen before the encoder
+        * is opened below */
+
+        if (m_param)
+            m_encoder = m_cliopt.api->encoder_open(m_param);
+        if (m_encoder == NULL)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "x265_encoder_open() failed for Enc, \n");
+            m_ret = 2;
+            return -1;
+        }
+
+        /* fetch the encoder parameters as they stand after initialization */
+        m_cliopt.api->encoder_parameters(m_encoder, m_param);
+
+        return 1;
+    }
+
+    void PassEncoder::setReuseLevel() // sets the analysis save/load levels and the conformance-window padding of one ABR-ladder encode
+    {
+        uint32_t r, padh = 0, padw = 0;
+        /* the conformance window is rebuilt from scratch below */
+        m_param->confWinBottomOffset = m_param->confWinRightOffset = 0;
+        /* levels come from the CLI options; a non-zero level also sets a non-NULL save/load name */
+        m_param->analysisLoadReuseLevel = m_cliopt.loadLevel;
+        m_param->analysisSaveReuseLevel = m_cliopt.saveLevel;
+        m_param->analysisSave = m_cliopt.saveLevel ? "save.dat" : NULL;
+        m_param->analysisLoad = m_cliopt.loadLevel ? "load.dat" : NULL;
+        m_param->bUseAnalysisFile = 0;
+        /* when loading, take over (or double) the window of the reference encode m_cliopt.refId */
+        if (m_cliopt.loadLevel)
+        {
+            x265_param *refParam = m_parent->m_passEnc[m_cliopt.refId]->m_param;
+
+            if (m_param->sourceHeight == (refParam->sourceHeight - refParam->confWinBottomOffset) &&
+                m_param->sourceWidth == (refParam->sourceWidth - refParam->confWinRightOffset))
+            {
+                m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset;
+                m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset;
+            }
+            else
+            {
+                int srcH = refParam->sourceHeight - refParam->confWinBottomOffset;
+                int srcW = refParam->sourceWidth - refParam->confWinRightOffset;
+                /* divide in floating point (an integer quotient truncates, so 2.5x would pass as 2x); a non-positive size yields 0 */
+                double scaleFactorH = srcH > 0 ? double(m_param->sourceHeight) / srcH : 0.0;
+                double scaleFactorW = srcW > 0 ? double(m_param->sourceWidth) / srcW : 0.0;
+
+                int absScaleFactorH = (int)(10 * scaleFactorH + 0.5);
+                int absScaleFactorW = (int)(10 * scaleFactorW + 0.5);
+                /* 20 means a ratio of 2.0 after rounding to one decimal place */
+                if (absScaleFactorH == 20 && absScaleFactorW == 20)
+                {
+                    m_param->scaleFactor = 2;
+
+                    m_parent->m_passEnc[m_id]->m_param->confWinBottomOffset = refParam->confWinBottomOffset * 2;
+                    m_parent->m_passEnc[m_id]->m_param->confWinRightOffset = refParam->confWinRightOffset * 2;
+
+                }
+            }
+        }
+        /* pad height and width up to the next multiple of minCUSize (the mask assumes a power-of-two size) */
+        int h = m_param->sourceHeight + m_param->confWinBottomOffset;
+        int w = m_param->sourceWidth + m_param->confWinRightOffset;
+        if (h & (m_param->minCUSize - 1))
+        {
+            r = h & (m_param->minCUSize - 1);
+            padh = m_param->minCUSize - r;
+            m_param->confWinBottomOffset += padh;
+
+        }
+
+        if (w & (m_param->minCUSize - 1))
+        {
+            r = w & (m_param->minCUSize - 1);
+            padw = m_param->minCUSize - r;
+            m_param->confWinRightOffset += padw;
+        }
+    }
+
+    void PassEncoder::startThreads() // marks this pass and its reader/scaler (when present) active and calls start() on each
+    {
+        /* the pass encoder itself goes first */
+        m_threadActive = true;
+        start();
+        /* then the reader, if init() created one */
+        if (m_reader)
+        {
+            m_reader->m_threadActive = true;
+            m_reader->start();
+        }
+        /* and finally the scaler, if init() created one */
+        if (m_scaler)
+        {
+            m_scaler->m_threadActive = true;
+            m_scaler->start();
+        }
+    }
+
+    void PassEncoder::copyInfo(x265_analysis_data * src) // copies the analysis data in 'src' into a slot of this encode's analysis queue held by m_parent
+    {
+        /* number of analysis entries this encode has written so far */
+        uint32_t written = m_parent->m_analysisWriteCnt[m_id].get();
+
+        int index = written % m_parent->m_queueSize;
+        //If all streams have read analysis data, reuse that position in Queue
+
+        int read = m_parent->m_analysisRead[m_id][index].get();
+        int write = m_parent->m_analysisWrite[m_id][index].get();
+
+        int overwrite = written / m_parent->m_queueSize; /* non-zero once 'written' has reached m_queueSize */
+        bool emptyIdxFound = 0;
+        while (!emptyIdxFound && overwrite) /* NOTE(review): rescans without waiting until a slot qualifies -- confirm this is intended */
+        {
+            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
+            {
+                read = m_parent->m_analysisRead[m_id][i].get();
+                write = m_parent->m_analysisWrite[m_id][i].get();
+                write *= m_cliopt.numRefs;
+
+                if (read == write) /* presumably: all numRefs consumers have read every write to this slot -- confirm */
+                {
+                    index = i; /* no break here, so the last matching slot is the one used */
+                    emptyIdxFound = 1;
+                }
+            }
+        }
+
+        x265_analysis_data *m_analysisInfo = &m_parent->m_analysisBuffer[m_id][index];
+        /* whole-struct copy first; x265_alloc_analysis_data() then runs on the copy (presumably giving it its own buffers -- confirm) */
+        memcpy(m_analysisInfo, src, sizeof(x265_analysis_data));
+        x265_alloc_analysis_data(m_param, m_analysisInfo);
+
+        bool isVbv = m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate;
+        if (m_param->bDisableLookahead && isVbv) /* lookahead VBV arrays are copied only in this configuration */
+        {
+            memcpy(m_analysisInfo->lookahead.intraSatdForVbv, src->lookahead.intraSatdForVbv, src->numCuInHeight * sizeof(uint32_t));
+            memcpy(m_analysisInfo->lookahead.satdForVbv, src->lookahead.satdForVbv, src->numCuInHeight * sizeof(uint32_t));
+            memcpy(m_analysisInfo->lookahead.intraVbvCost, src->lookahead.intraVbvCost, src->numCUsInFrame * sizeof(uint32_t));
+            memcpy(m_analysisInfo->lookahead.vbvCost, src->lookahead.vbvCost, src->numCUsInFrame * sizeof(uint32_t));
+        }
+
+        if (src->sliceType == X265_TYPE_IDR || src->sliceType == X265_TYPE_I) /* IDR/I slices: intra data only */
+        {
+            if (m_param->analysisSaveReuseLevel < 2) /* below save level 2 no per-CU data is copied */
+                goto ret;
+            x265_analysis_intra_data *intraDst, *intraSrc;
+            intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
+            intraSrc = (x265_analysis_intra_data*)src->intraData;
+            memcpy(intraDst->depth, intraSrc->depth, sizeof(uint8_t) * src->depthBytes);
+            memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numCUsInFrame * src->numPartitions);
+            memcpy(intraDst->partSizes, intraSrc->partSizes, sizeof(char) * src->depthBytes);
+            memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
+            if (m_param->rc.cuTree)
+                memcpy(intraDst->cuQPOff, intraSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
+        }
+        else /* every other slice type: weights first, then inter data according to the save level */
+        {
+            bool bIntraInInter = (src->sliceType == X265_TYPE_P || m_param->bIntraInBFrames);
+            int numDir = src->sliceType == X265_TYPE_P ? 1 : 2; /* one direction for P slices, two otherwise */
+            memcpy(m_analysisInfo->wt, src->wt, sizeof(WeightParam) * 3 * numDir);
+            if (m_param->analysisSaveReuseLevel < 2)
+                goto ret;
+            x265_analysis_inter_data *interDst, *interSrc;
+            interDst = (x265_analysis_inter_data*)m_analysisInfo->interData;
+            interSrc = (x265_analysis_inter_data*)src->interData;
+            memcpy(interDst->depth, interSrc->depth, sizeof(uint8_t) * src->depthBytes);
+            memcpy(interDst->modes, interSrc->modes, sizeof(uint8_t) * src->depthBytes);
+            if (m_param->rc.cuTree)
+                memcpy(interDst->cuQPOff, interSrc->cuQPOff, sizeof(int8_t) * src->depthBytes);
+            if (m_param->analysisSaveReuseLevel > 4) /* levels above 4 add partition sizes and merge flags */
+            {
+                memcpy(interDst->partSize, interSrc->partSize, sizeof(uint8_t) * src->depthBytes);
+                memcpy(interDst->mergeFlag, interSrc->mergeFlag, sizeof(uint8_t) * src->depthBytes);
+                if (m_param->analysisSaveReuseLevel == 10) /* level 10 adds directions, MVP/ref indices and MVs */
+                {
+                    memcpy(interDst->interDir, interSrc->interDir, sizeof(uint8_t) * src->depthBytes);
+                    for (int dir = 0; dir < numDir; dir++)
+                    {
+                        memcpy(interDst->mvpIdx[dir], interSrc->mvpIdx[dir], sizeof(uint8_t) * src->depthBytes);
+                        memcpy(interDst->refIdx[dir], interSrc->refIdx[dir], sizeof(int8_t) * src->depthBytes);
+                        memcpy(interDst->mv[dir], interSrc->mv[dir], sizeof(MV) * src->depthBytes);
+                    }
+                    if (bIntraInInter)
+                    {
+                        x265_analysis_intra_data *intraDst = (x265_analysis_intra_data*)m_analysisInfo->intraData;
+                        x265_analysis_intra_data *intraSrc = (x265_analysis_intra_data*)src->intraData;
+                        memcpy(intraDst->modes, intraSrc->modes, sizeof(uint8_t) * src->numPartitions * src->numCUsInFrame);
+                        memcpy(intraDst->chromaModes, intraSrc->chromaModes, sizeof(uint8_t) * src->depthBytes);
+                    }
+               }
+            }
+            if (m_param->analysisSaveReuseLevel != 10) /* the ref array is copied at every level except 10 */
+                memcpy(interDst->ref, interSrc->ref, sizeof(int32_t) * src->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir);
+        }
+
+ret:
+        // increment the per-encode write counter and the write counter of the slot just filled
+        m_parent->m_analysisWriteCnt[m_id].incr();
+        m_parent->m_analysisWrite[m_id][index].incr();
+        return;
+    }
+
+
+    bool PassEncoder::readPicture(x265_picture *dstPic) // fills dstPic from this encode's input queue (plus reference analysis data when loading); returns false when nothing is read
+    {
+        /*Check and wait if there any input frames to read*/
+        int ipread = m_parent->m_picReadCnt[m_id].get();
+        int ipwrite = m_parent->m_picWriteCnt[m_id].get();
+
+        bool isAbrLoad = m_cliopt.loadLevel && (m_parent->m_numEncodes > 1); /* analysis is loaded only when several encodes run */
+        while (!m_inputOver && (ipread == ipwrite)) /* wait for the write counter to move, unless input is over */
+        {
+            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
+        }
+
+        if (m_threadActive && ipread < ipwrite)
+        {
+            /*Get input index to read from inputQueue. If doesn't need analysis info, it need not wait to fetch poc from analysisQueue*/
+            int readPos = ipread % m_parent->m_queueSize;
+            x265_analysis_data* analysisData = 0;
+
+            if (isAbrLoad)
+            {
+                /*If stream is master of each slave pass, then fetch analysis data from prev pass*/
+                int analysisQId = m_cliopt.refId;
+                /*Check and wait if there any analysis Data to read*/
+                int analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].get();
+                int written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs; /* writes scaled by the reference encode's numRefs */
+                int analysisRead = m_parent->m_analysisReadCnt[analysisQId].get();
+
+                while (m_threadActive && written == analysisRead) /* nothing unread yet: wait for the reference encode to write */
+                {
+                    analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
+                    written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                }
+
+                if (analysisRead < written)
+                {
+                    int analysisIdx = 0;
+                    if (!m_param->bDisableLookahead) /* with lookahead: search the queue for the entry whose poc equals ipread */
+                    {
+                        bool analysisdRead = false;
+                        while ((analysisRead < written) && !analysisdRead)
+                        {
+                            while (analysisWrite < ipread) /* wait until the reference write count reaches ipread */
+                            {
+                                analysisWrite = m_parent->m_analysisWriteCnt[analysisQId].waitForChange(analysisWrite);
+                                written = analysisWrite * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                            }
+                            for (uint32_t i = 0; i < m_parent->m_queueSize; i++)
+                            {
+                                analysisData = &m_parent->m_analysisBuffer[analysisQId][i];
+                                int read = m_parent->m_analysisRead[analysisQId][i].get();
+                                int write = m_parent->m_analysisWrite[analysisQId][i].get() * m_parent->m_passEnc[analysisQId]->m_cliopt.numRefs;
+                                if ((analysisData->poc == (uint32_t)(ipread)) && (read < write)) /* matching poc that still has unread data */
+                                {
+                                    analysisIdx = i;
+                                    analysisdRead = true;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    else /* without lookahead: take the next analysis entry and read the picture its poc points at */
+                    {
+                        analysisIdx = analysisRead % m_parent->m_queueSize;
+                        analysisData = &m_parent->m_analysisBuffer[analysisQId][analysisIdx];
+                        readPos = analysisData->poc % m_parent->m_queueSize;
+                        while ((ipwrite < readPos) || ((ipwrite - 1) < (int)analysisData->poc)) /* wait until the picture write count has passed that poc */
+                        {
+                            ipwrite = m_parent->m_picWriteCnt[m_id].waitForChange(ipwrite);
+                        }
+                    }
+
+                    m_lastIdx = analysisIdx; /* remembers which analysis slot was used */
+                }
+                else
+                    return false;
+            }
+
+            /* field-by-field copy; the plane pointers are copied, not the pixel data */
+            x265_picture *srcPic = (x265_picture*)(m_parent->m_inputPicBuffer[m_id][readPos]);
+
+            x265_picture *pic = (x265_picture*)(dstPic);
+            pic->colorSpace = srcPic->colorSpace;
+            pic->bitDepth = srcPic->bitDepth;
+            pic->framesize = srcPic->framesize;
+            pic->height = srcPic->height;
+            pic->pts = srcPic->pts;
+            pic->dts = srcPic->dts;
+            pic->reorderedPts = srcPic->reorderedPts;
+            pic->width = srcPic->width;
+            pic->analysisData = srcPic->analysisData;
+            pic->userSEI = srcPic->userSEI;
+            pic->stride[0] = srcPic->stride[0];
+            pic->stride[1] = srcPic->stride[1];
+            pic->stride[2] = srcPic->stride[2];
+            pic->planes[0] = srcPic->planes[0];
+            pic->planes[1] = srcPic->planes[1];
+            pic->planes[2] = srcPic->planes[2];
+            if (isAbrLoad) /* replaces the analysisData copied above with the reference encode's entry */
+                pic->analysisData = *analysisData;
+            return true;
+        }
+        else
+            return false;
+    }
+
+    /* Thread body for one encode pass.
+     *
+     * In order: configures the output, the optional recon player and the zone
+     * file, optionally writes the stream headers, runs the main loop that pulls
+     * pictures through readPicture() and hands them to the encoder (splitting
+     * each frame into two field pictures when bField and interlaceMode are set),
+     * flushes the encoder, then logs, closes the encoder and releases every
+     * buffer allocated here.
+     *
+     * Failures are reported through m_ret: 3 when header generation fails,
+     * 4 when encoder_encode() returns an error. */
+    void PassEncoder::threadMain()
+    {
+        THREAD_NAME("PassEncoder", m_id);
+
+        while (m_threadActive)
+        {
+
+#if ENABLE_LIBVMAF
+            x265_vmaf_data* vmafdata = m_cliopt.vmafData;
+#endif
+            /* This allows muxers to modify bitstream format */
+            m_cliopt.output->setParam(m_param);
+            const x265_api* api = m_cliopt.api;
+            ReconPlay* reconPlay = NULL;
+            if (m_cliopt.reconPlayCmd)
+                reconPlay = new ReconPlay(m_cliopt.reconPlayCmd, *m_param);
+            char* profileName = m_cliopt.encName ? m_cliopt.encName : (char *)"x265";
+
+            if (m_cliopt.zoneFile)
+            {
+                if (!m_cliopt.parseZoneFile())
+                {
+                    x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile in %s\n", profileName);
+                    fclose(m_cliopt.zoneFile);
+                    m_cliopt.zoneFile = NULL;
+                }
+            }
+
+            if (signal(SIGINT, sigint_handler) == SIG_ERR)
+                x265_log(m_param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s in %s\n",
+                    strerror(errno), profileName);
+
+            /* NOTE: every local that is still in scope at the 'fail' label must be
+             * declared before the first 'goto fail' below. */
+            x265_picture pic_orig, pic_out;
+            x265_picture *pic_in = &pic_orig;
+            /* Allocate recon picture if analysis save/load is enabled */
+            std::priority_queue<int64_t>* pts_queue = m_cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
+            x265_picture *pic_recon = (m_cliopt.recon || m_param->analysisSave || m_param->analysisLoad || pts_queue || reconPlay || m_param->csvLogLevel) ? &pic_out : NULL;
+            uint32_t inFrameCount = 0;
+            uint32_t outFrameCount = 0;
+            x265_nal *p_nal;
+            x265_stats stats;
+            uint32_t nal;
+            int16_t *errorBuf = NULL;
+            bool bDolbyVisionRPU = false;
+            uint8_t *rpuPayload = NULL;
+            int inputPicNum = 1;
+            x265_picture picField1, picField2;
+            /* Field buffers owned by this encode; allocated on the first field-coded
+             * frame and released at the end of the thread. */
+            char* field1Buf = NULL;
+            char* field2Buf = NULL;
+            bool bFieldBufCreated = false;
+            x265_analysis_data* analysisInfo = (x265_analysis_data*)(&pic_out.analysisData);
+            bool isAbrSave = m_cliopt.saveLevel && (m_parent->m_numEncodes > 1);
+
+            if (!m_param->bRepeatHeaders && !m_param->bEnableSvtHevc)
+            {
+                if (api->encoder_headers(m_encoder, &p_nal, &nal) < 0)
+                {
+                    x265_log(m_param, X265_LOG_ERROR, "Failure generating stream headers in %s\n", profileName);
+                    m_ret = 3;
+                    goto fail;
+                }
+                else
+                    m_cliopt.totalbytes += m_cliopt.output->writeHeaders(p_nal, nal);
+            }
+
+            if (m_param->bField && m_param->interlaceMode)
+            {
+                api->picture_init(m_param, &picField1);
+                api->picture_init(m_param, &picField2);
+                // return back the original height of input
+                m_param->sourceHeight *= 2;
+                api->picture_init(m_param, &pic_orig);
+            }
+            else
+                api->picture_init(m_param, &pic_orig);
+
+            if (m_param->dolbyProfile && m_cliopt.dolbyVisionRpu)
+            {
+                rpuPayload = X265_MALLOC(uint8_t, 1024);
+                pic_in->rpu.payload = rpuPayload;
+                if (pic_in->rpu.payload)
+                    bDolbyVisionRPU = true;
+            }
+
+            if (m_cliopt.bDither)
+            {
+                errorBuf = X265_MALLOC(int16_t, m_param->sourceWidth + 1);
+                if (errorBuf)
+                    memset(errorBuf, 0, (m_param->sourceWidth + 1) * sizeof(int16_t));
+                else
+                    m_cliopt.bDither = false;
+            }
+
+            // main encoder loop
+            while (pic_in && !b_ctrl_c)
+            {
+                pic_orig.poc = (m_param->bField && m_param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
+                if (m_cliopt.qpfile)
+                {
+                    if (!m_cliopt.parseQPFile(pic_orig))
+                    {
+                        x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d in %s\n",
+                            pic_in->poc, profileName);
+                        fclose(m_cliopt.qpfile);
+                        m_cliopt.qpfile = NULL;
+                    }
+                }
+
+                /* A NULL pic_in ends the loop after one last encoder_encode() call */
+                if (m_cliopt.framesToBeEncoded && inFrameCount >= m_cliopt.framesToBeEncoded)
+                    pic_in = NULL;
+                else if (readPicture(pic_in))
+                    inFrameCount++;
+                else
+                    pic_in = NULL;
+
+                if (pic_in)
+                {
+                    if (pic_in->bitDepth > m_param->internalBitDepth && m_cliopt.bDither)
+                    {
+                        x265_dither_image(pic_in, m_cliopt.input->getWidth(), m_cliopt.input->getHeight(), errorBuf, m_param->internalBitDepth);
+                        pic_in->bitDepth = m_param->internalBitDepth;
+                    }
+                    /* Overwrite PTS */
+                    pic_in->pts = pic_in->poc;
+
+                    // convert to field
+                    if (m_param->bField && m_param->interlaceMode)
+                    {
+                        int height = pic_in->height >> 1;
+
+                        // One-time setup of the two field pictures. The flag is a local,
+                        // not a function-level static, so that every PassEncoder thread
+                        // allocates and describes its own field buffers.
+                        if (!bFieldBufCreated)
+                        {
+                            bFieldBufCreated = true;
+                            inputPicNum = 2;
+                            picField1.fieldNum = 1;
+                            picField2.fieldNum = 2;
+
+                            picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
+                            picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
+                            picField1.height = picField2.height = pic_in->height >> 1;
+                            picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
+
+                            size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
+                            field1Buf = X265_MALLOC(char, fieldFrameSize);
+                            field2Buf = X265_MALLOC(char, fieldFrameSize);
+
+                            int stride = picField1.stride[0] = picField2.stride[0] = pic_in->stride[0];
+                            uint64_t framesize = stride * (height >> x265_cli_csps[pic_in->colorSpace].height[0]);
+                            picField1.planes[0] = field1Buf;
+                            picField2.planes[0] = field2Buf;
+                            for (int i = 1; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
+                            {
+                                picField1.planes[i] = field1Buf + framesize;
+                                picField2.planes[i] = field2Buf + framesize;
+
+                                stride = picField1.stride[i] = picField2.stride[i] = pic_in->stride[i];
+                                framesize += (stride * (height >> x265_cli_csps[pic_in->colorSpace].height[i]));
+                            }
+                            assert(framesize == picField1.framesize);
+                        }
+
+                        picField1.pts = picField1.poc = pic_in->poc;
+                        picField2.pts = picField2.poc = pic_in->poc + 1;
+
+                        picField1.userSEI = picField2.userSEI = pic_in->userSEI;
+
+                        //if (pic_in->userData)
+                        //{
+                        //    // Have to handle userData here
+                        //}
+
+                        if (pic_in->framesize)
+                        {
+                            /* De-interleave: even source rows go to field 1, odd rows to field 2 */
+                            for (int i = 0; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
+                            {
+                                char* srcP1 = (char*)pic_in->planes[i];
+                                char* srcP2 = (char*)pic_in->planes[i] + pic_in->stride[i];
+                                char* p1 = (char*)picField1.planes[i];
+                                char* p2 = (char*)picField2.planes[i];
+
+                                int stride = picField1.stride[i];
+
+                                for (int y = 0; y < (height >> x265_cli_csps[pic_in->colorSpace].height[i]); y++)
+                                {
+                                    memcpy(p1, srcP1, stride);
+                                    memcpy(p2, srcP2, stride);
+                                    srcP1 += 2 * stride;
+                                    srcP2 += 2 * stride;
+                                    p1 += stride;
+                                    p2 += stride;
+                                }
+                            }
+                        }
+                    }
+
+                    if (bDolbyVisionRPU)
+                    {
+                        if (m_param->bField && m_param->interlaceMode)
+                        {
+                            if (m_cliopt.rpuParser(&picField1) > 0)
+                                goto fail;
+                            if (m_cliopt.rpuParser(&picField2) > 0)
+                                goto fail;
+                        }
+                        else
+                        {
+                            if (m_cliopt.rpuParser(pic_in) > 0)
+                                goto fail;
+                        }
+                    }
+                }
+
+                /* One encoder call per input picture: two in field mode, otherwise one */
+                for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
+                {
+                    x265_picture *picInput = NULL;
+                    if (inputPicNum == 2)
+                        picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
+                    else
+                        picInput = pic_in;
+
+                    int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, picInput, pic_recon);
+
+                    /* Bump the read counters of the ring slot that was just consumed */
+                    int idx = (inFrameCount - 1) % m_parent->m_queueSize;
+                    m_parent->m_picIdxReadCnt[m_id][idx].incr();
+                    m_parent->m_picReadCnt[m_id].incr();
+                    if (m_cliopt.loadLevel && picInput)
+                    {
+                        m_parent->m_analysisReadCnt[m_cliopt.refId].incr();
+                        m_parent->m_analysisRead[m_cliopt.refId][m_lastIdx].incr();
+                    }
+
+                    if (numEncoded < 0)
+                    {
+                        b_ctrl_c = 1;
+                        m_ret = 4;
+                        break;
+                    }
+
+                    if (reconPlay && numEncoded)
+                        reconPlay->writePicture(*pic_recon);
+
+                    outFrameCount += numEncoded;
+
+                    if (isAbrSave && numEncoded)
+                    {
+                        copyInfo(analysisInfo);
+                    }
+
+                    if (numEncoded && pic_recon && m_cliopt.recon)
+                        m_cliopt.recon->writePicture(pic_out);
+                    if (nal)
+                    {
+                        m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
+                        if (pts_queue)
+                        {
+                            /* Negated PTS values in a max-heap: pop() discards the smallest
+                             * PTS, so the queue retains the two largest values seen so far */
+                            pts_queue->push(-pic_out.pts);
+                            if (pts_queue->size() > 2)
+                                pts_queue->pop();
+                        }
+                    }
+                    m_cliopt.printStatus(outFrameCount);
+                }
+            }
+
+            /* Flush the encoder */
+            while (!b_ctrl_c)
+            {
+                int numEncoded = api->encoder_encode(m_encoder, &p_nal, &nal, NULL, pic_recon);
+                if (numEncoded < 0)
+                {
+                    m_ret = 4;
+                    break;
+                }
+
+                if (reconPlay && numEncoded)
+                    reconPlay->writePicture(*pic_recon);
+
+                outFrameCount += numEncoded;
+                if (isAbrSave && numEncoded)
+                {
+                    copyInfo(analysisInfo);
+                }
+
+                if (numEncoded && pic_recon && m_cliopt.recon)
+                    m_cliopt.recon->writePicture(pic_out);
+                if (nal)
+                {
+                    m_cliopt.totalbytes += m_cliopt.output->writeFrame(p_nal, nal, pic_out);
+                    if (pts_queue)
+                    {
+                        pts_queue->push(-pic_out.pts);
+                        if (pts_queue->size() > 2)
+                            pts_queue->pop();
+                    }
+                }
+
+                m_cliopt.printStatus(outFrameCount);
+
+                if (!numEncoded)
+                    break;
+            }
+
+            if (bDolbyVisionRPU)
+            {
+                if (fgetc(m_cliopt.dolbyVisionRpu) != EOF)
+                    x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count in %s\n",
+                        profileName);
+                x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful in %s\n",
+                    profileName);
+            }
+
+            /* clear progress report */
+            if (m_cliopt.bProgress)
+                fprintf(stderr, "%*s\r", 80, " ");
+
+        fail:
+
+            delete reconPlay;
+
+            api->encoder_get_stats(m_encoder, &stats, sizeof(stats));
+            if (m_param->csvfn && !b_ctrl_c)
+#if ENABLE_LIBVMAF
+                /* Same argument-count member as the non-VMAF branch below */
+                api->vmaf_encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString, m_cliopt.param, vmafdata);
+#else
+                api->encoder_log(m_encoder, m_cliopt.argCnt, m_cliopt.argString);
+#endif
+            api->encoder_close(m_encoder);
+
+            int64_t second_largest_pts = 0;
+            int64_t largest_pts = 0;
+            if (pts_queue && pts_queue->size() >= 2)
+            {
+                second_largest_pts = -pts_queue->top();
+                pts_queue->pop();
+                largest_pts = -pts_queue->top();
+                pts_queue->pop();
+            }
+            /* Release the queue even when fewer than two PTS values were collected */
+            delete pts_queue;
+            pts_queue = NULL;
+            m_cliopt.output->closeFile(largest_pts, second_largest_pts);
+
+            if (b_ctrl_c)
+                general_log(m_param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d in %s\n",
+                    m_cliopt.seek + inFrameCount, stats.encodedPictureCount, profileName);
+
+            api->param_free(m_param);
+
+            X265_FREE(errorBuf);
+            X265_FREE(rpuPayload);
+            /* The encoder is closed, so the field buffers are no longer referenced */
+            X265_FREE(field1Buf);
+            X265_FREE(field2Buf);
+
+            m_threadActive = false;
+            m_parent->m_numActiveEncodes.decr();
+        }
+    }
+
+    /* Stops this encode's thread, then stops and deletes whichever input worker
+     * it owns: the Reader when one exists, otherwise the Scaler. */
+    void PassEncoder::destroy()
+    {
+        stop();
+        if (m_reader)
+        {
+            m_reader->stop();
+            delete m_reader;
+        }
+        else if (m_scaler)
+        {
+            // Guarded so that destroy() is safe when neither worker was created
+            m_scaler->stop();
+            m_scaler->destroy();
+            delete m_scaler;
+        }
+    }
+
+    /* Builds a scaler worker for encode `id`.
+     *
+     * threadId / threadNum : position of this worker among the scaler threads
+     * src / dst            : source and destination video descriptions
+     * parentEnc            : the PassEncoder this worker feeds
+     *
+     * The byte size of every destination plane is precomputed, and a filter
+     * manager is created only when the source and destination dimensions differ. */
+    Scaler::Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc *dst, PassEncoder *parentEnc)
+    {
+        m_threadId = threadId;
+        m_threadTotal = threadNum;
+        m_id = id;
+        m_parentEnc = parentEnc;
+        m_srcFormat = src;
+        m_dstFormat = dst;
+        m_filterManager = NULL;
+        m_threadActive = false;
+        m_scaleFrameSize = 0;
+
+        // Samples deeper than 8 bits are stored in two bytes
+        const uint32_t bytesPerSample = dst->m_inputDepth > 8 ? 2 : 1;
+        const int dstCsp = dst->m_csp;
+        const int planeCount = x265_cli_csps[dstCsp].planes;
+        for (int plane = 0; plane < planeCount; plane++)
+        {
+            int planeW = dst->m_width >> x265_cli_csps[dstCsp].width[plane];
+            int planeH = dst->m_height >> x265_cli_csps[dstCsp].height[plane];
+            m_scalePlanes[plane] = planeW * planeH * bytesPerSample;
+            m_scaleFrameSize += m_scalePlanes[plane];
+        }
+
+        const bool sameDimensions = (src->m_height == dst->m_height) && (src->m_width == dst->m_width);
+        if (!sameDimensions)
+        {
+            m_filterManager = new ScalerFilterManager;
+            // NOTE(review): the meaning of the first init() argument (4) is not visible here
+            m_filterManager->init(4, m_srcFormat, m_dstFormat);
+        }
+    }
+
+    /* Scales `source` into `destination`.
+     *
+     * Copies the picture metadata (bit depth, colour space, timestamps, POC,
+     * user SEI), programs the destination strides in bytes and runs the filter
+     * manager over the planes.
+     *
+     * Returns true only when a scale was performed. Returns false when either
+     * pointer is NULL, when source and destination dimensions are equal, or
+     * when the precomputed destination frame size is zero. */
+    bool Scaler::scalePic(x265_picture * destination, x265_picture * source)
+    {
+        if (!destination || !source)
+            return false;
+        x265_param* param = m_parentEnc->m_param;
+        int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
+        if (m_srcFormat->m_height != m_dstFormat->m_height || m_srcFormat->m_width != m_dstFormat->m_width)
+        {
+            void **srcPlane = NULL, **dstPlane = NULL;
+            /* Zero-initialised so the chroma entries are well defined for
+             * X265_CSP_I400, where only the luma strides are filled in below */
+            int srcStride[3] = { 0, 0, 0 };
+            int dstStride[3] = { 0, 0, 0 };
+            destination->bitDepth = source->bitDepth;
+            destination->colorSpace = source->colorSpace;
+            destination->pts = source->pts;
+            destination->dts = source->dts;
+            destination->reorderedPts = source->reorderedPts;
+            destination->poc = source->poc;
+            destination->userSEI = source->userSEI;
+            srcPlane = source->planes;
+            dstPlane = destination->planes;
+            srcStride[0] = source->stride[0];
+            destination->stride[0] = m_dstFormat->m_width * pixelBytes;
+            dstStride[0] = destination->stride[0];
+            if (param->internalCsp != X265_CSP_I400)
+            {
+                srcStride[1] = source->stride[1];
+                srcStride[2] = source->stride[2];
+                destination->stride[1] = destination->stride[0] >> x265_cli_csps[param->internalCsp].width[1];
+                destination->stride[2] = destination->stride[0] >> x265_cli_csps[param->internalCsp].width[2];
+                dstStride[1] = destination->stride[1];
+                dstStride[2] = destination->stride[2];
+            }
+            if (m_scaleFrameSize)
+            {
+                m_filterManager->scale_pic(srcPlane, dstPlane, srcStride, dstStride);
+                return true;
+            }
+            else
+                x265_log(param, X265_LOG_INFO, "Empty frame received\n");
+        }
+        return false;
+    }
+
+    /* Scaler worker thread.
+     *
+     * Until the parent encoder reports end of input, takes the next unscaled
+     * picture from the ring buffer of encode (m_id - 1), scales it into the ring
+     * buffer of encode m_id and advances that encode's write counter. With more
+     * than one scaler thread, frame N is handled by the thread whose m_threadId
+     * equals N % m_threadTotal. The filter manager is released on exit. */
+    void Scaler::threadMain()
+    {
+        THREAD_NAME("Scaler", m_id);
+
+        /* Unscaled pictures are read from the buffers of the encode with the previous id */
+        uint32_t srcId = m_id - 1;
+        int QDepth = m_parentEnc->m_parent->m_queueSize;
+        while (!m_parentEnc->m_inputOver)
+        {
+
+            uint32_t scaledWritten = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
+
+            if (m_parentEnc->m_cliopt.framesToBeEncoded && scaledWritten >= m_parentEnc->m_cliopt.framesToBeEncoded)
+                break;
+
+            /* Not this worker's frame: poll again until the write counter moves on */
+            if (m_threadTotal > 1 && (m_threadId != scaledWritten % m_threadTotal))
+            {
+                continue;
+            }
+            uint32_t written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
+
+            /*If all the input pictures are scaled by the current scale worker thread wait for input pictures*/
+            while (m_threadActive && (scaledWritten == written)) {
+                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].waitForChange(written);
+            }
+
+            if (m_threadActive && scaledWritten < written)
+            {
+
+                int scaledWriteIdx = scaledWritten % QDepth;
+                int overWritePicBuffer = scaledWritten / QDepth;
+                int read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][scaledWriteIdx].get();
+
+                /* Before reusing a ring slot, wait until it has been read once per earlier lap */
+                while (overWritePicBuffer && read < overWritePicBuffer)
+                {
+                    read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][scaledWriteIdx].waitForChange(read);
+                }
+
+                /* Destination pictures are allocated lazily, once per ring slot */
+                if (!m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx])
+                {
+                    int framesize = 0;
+                    int planesize[3];
+                    int csp = m_dstFormat->m_csp;
+                    int stride[3];
+                    /* Strides are in bytes, matching what scalePic() programs into the
+                     * destination picture: samples deeper than 8 bits take two bytes */
+                    int pixelBytes = m_dstFormat->m_inputDepth > 8 ? 2 : 1;
+                    stride[0] = m_dstFormat->m_width * pixelBytes;
+                    stride[1] = stride[0] >> x265_cli_csps[csp].width[1];
+                    stride[2] = stride[0] >> x265_cli_csps[csp].width[2];
+                    for (int i = 0; i < x265_cli_csps[csp].planes; i++)
+                    {
+                        uint32_t h = m_dstFormat->m_height >> x265_cli_csps[csp].height[i];
+                        planesize[i] = h * stride[i];
+                        framesize += planesize[i];
+                    }
+
+                    m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx] = x265_picture_alloc();
+                    x265_picture_init(m_parentEnc->m_param, m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx]);
+
+                    ((x265_picture*)m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx])->framesize = framesize;
+                    for (int32_t j = 0; j < x265_cli_csps[csp].planes; j++)
+                    {
+                        m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx]->planes[j] = X265_MALLOC(char, planesize[j]);
+                    }
+                }
+
+                x265_picture *srcPic = m_parentEnc->m_parent->m_inputPicBuffer[srcId][scaledWriteIdx];
+                x265_picture* destPic = m_parentEnc->m_parent->m_inputPicBuffer[m_id][scaledWriteIdx];
+
+                // Enqueue this picture up with the current encoder so that it will asynchronously encode
+                if (!scalePic(destPic, srcPic))
+                    x265_log(NULL, X265_LOG_ERROR, "Unable to copy scaled input picture to input queue \n");
+                else
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].incr();
+                m_scaledWriteCnt.incr();
+                /* Mark the source slot as consumed by this scaler */
+                m_parentEnc->m_parent->m_picIdxReadCnt[srcId][scaledWriteIdx].incr();
+            }
+            if (m_threadTotal > 1)
+            {
+                /* Exit once this worker has handled its whole share of the frames written so far */
+                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
+                int totalWrite = written / m_threadTotal;
+                if (written % m_threadTotal > m_threadId)
+                    totalWrite++;
+                if (totalWrite == m_scaledWriteCnt.get())
+                {
+                    m_parentEnc->m_parent->m_picWriteCnt[srcId].poke();
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
+                    break;
+                }
+            }
+            else
+            {
+                /* Once end of video is reached and all frames are scaled, release wait on picwritecount */
+                scaledWritten = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
+                written = m_parentEnc->m_parent->m_picWriteCnt[srcId].get();
+                if (written == scaledWritten)
+                {
+                    m_parentEnc->m_parent->m_picWriteCnt[srcId].poke();
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
+                    break;
+                }
+            }
+
+        }
+        m_threadActive = false;
+        destroy();
+    }
+
+    /* Builds the input reader for encode `id`; it reads from the parent
+     * encoder's input file. */
+    Reader::Reader(int id, PassEncoder *parentEnc)
+    {
+        m_parentEnc = parentEnc;
+        m_id = id;
+        m_input = parentEnc->m_input;
+        // Start inactive, as the Scaler constructor does, instead of leaving the flag uninitialized
+        m_threadActive = false;
+    }
+
+    /* Reader worker thread.
+     *
+     * Reads pictures from the input file into a scratch picture and copies each
+     * one into the next slot of this encode's ring buffer, waiting first until
+     * the slot's previous content has been read. Stops when framesToBeEncoded
+     * pictures have been written, when the input has no more pictures, or when
+     * a buffer cannot be allocated; the last two cases set m_inputOver on the
+     * parent encoder and wake any thread waiting on the write counter. */
+    void Reader::threadMain()
+    {
+        THREAD_NAME("Reader", m_id);
+
+        int QDepth = m_parentEnc->m_parent->m_queueSize;
+        x265_picture* src = x265_picture_alloc();
+        if (!src)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to allocate input picture\n");
+            m_threadActive = false;
+            m_parentEnc->m_inputOver = true;
+            m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
+            return;
+        }
+        x265_picture_init(m_parentEnc->m_param, src);
+
+        while (m_threadActive)
+        {
+            uint32_t written = m_parentEnc->m_parent->m_picWriteCnt[m_id].get();
+            uint32_t writeIdx = written % QDepth;
+            uint32_t read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][writeIdx].get();
+            uint32_t overWritePicBuffer = written / QDepth;
+
+            if (m_parentEnc->m_cliopt.framesToBeEncoded && written >= m_parentEnc->m_cliopt.framesToBeEncoded)
+                break;
+
+            /* Before reusing a ring slot, wait until it has been read once per earlier lap */
+            while (overWritePicBuffer && read < overWritePicBuffer)
+            {
+                read = m_parentEnc->m_parent->m_picIdxReadCnt[m_id][writeIdx].waitForChange(read);
+            }
+
+            x265_picture* dest = m_parentEnc->m_parent->m_inputPicBuffer[m_id][writeIdx];
+            bool bQueued = false;
+            if (m_input->readPicture(*src))
+            {
+                dest->poc = src->poc;
+                dest->pts = src->pts;
+                dest->userSEI = src->userSEI;
+                dest->bitDepth = src->bitDepth;
+                dest->framesize = src->framesize;
+                dest->height = src->height;
+                dest->width = src->width;
+                dest->colorSpace = src->colorSpace;
+                dest->rpu.payload = src->rpu.payload;
+                dest->picStruct = src->picStruct;
+                dest->stride[0] = src->stride[0];
+                dest->stride[1] = src->stride[1];
+                dest->stride[2] = src->stride[2];
+
+                /* The slot's pixel buffer is allocated on first use and then reused */
+                if (!dest->planes[0])
+                    dest->planes[0] = X265_MALLOC(char, dest->framesize);
+
+                if (dest->planes[0])
+                {
+                    /* All planes live in one contiguous allocation; the chroma
+                     * pointers are offsets into it */
+                    memcpy(dest->planes[0], src->planes[0], src->framesize * sizeof(char));
+                    dest->planes[1] = (char*)dest->planes[0] + src->stride[0] * src->height;
+                    dest->planes[2] = (char*)dest->planes[1] + src->stride[1] * (src->height >> x265_cli_csps[src->colorSpace].height[1]);
+                    m_parentEnc->m_parent->m_picWriteCnt[m_id].incr();
+                    bQueued = true;
+                }
+                else
+                    x265_log(NULL, X265_LOG_ERROR, "Unable to allocate input picture buffer\n");
+            }
+
+            if (!bQueued)
+            {
+                /* End of input (or allocation failure): stop and release any waiter */
+                m_threadActive = false;
+                m_parentEnc->m_inputOver = true;
+                m_parentEnc->m_parent->m_picWriteCnt[m_id].poke();
+            }
+        }
+        x265_picture_free(src);
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/abrEncApp.h	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,153 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*          Aruna Matheswaran <aruna@multicorewareinc.com>
+*           
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef ABR_ENCODE_H
+#define ABR_ENCODE_H
+
+#include "x265.h"
+#include "scaler.h"
+#include "threading.h"
+#include "x265cli.h"
+
+namespace X265_NS {
+    // private namespace
+
+    class PassEncoder;
+    class Scaler;
+    class Reader;
+
+    /* Top-level owner of a multi-encode run. Holds the per-encode picture and
+     * analysis buffers together with the counters that the reader, scaler and
+     * encoder threads use to coordinate access to them. */
+    class AbrEncoder
+    {
+    public:
+        uint8_t           m_numEncodes;
+        PassEncoder        **m_passEnc;
+        uint32_t           m_queueSize; // ring-buffer depth; frame counters are reduced modulo this value
+        ThreadSafeInteger  m_numActiveEncodes; // decremented by each PassEncoder thread when it finishes
+
+        x265_picture       ***m_inputPicBuffer; //[numEncodes][queueSize]
+        x265_analysis_data **m_analysisBuffer; //[numEncodes][queueSize]
+        int                **m_readFlag; // NOTE(review): usage is not visible in this part of the change
+
+        ThreadSafeInteger  *m_picWriteCnt; //[numEncodes] pictures written into each encode's ring buffer
+        ThreadSafeInteger  *m_picReadCnt; //[numEncodes] pictures consumed by each encode
+        ThreadSafeInteger  **m_picIdxReadCnt; //[numEncodes][queueSize] reads per ring-buffer slot
+        ThreadSafeInteger  *m_analysisWriteCnt; // single-level array, presumably [numEncodes] - TODO confirm
+        ThreadSafeInteger  *m_analysisReadCnt; //[numEncodes], indexed by the reference encode id
+        ThreadSafeInteger  **m_analysisWrite; //[numEncodes][queueSize]
+        ThreadSafeInteger  **m_analysisRead; //[numEncodes][queueSize]
+
+        AbrEncoder(CLIOptions cliopt[], uint8_t numEncodes, int& ret);
+        bool allocBuffers();
+        void destroy();
+
+    };
+
+    /* One encode of the run, executed on its own thread. Pulls pictures from the
+     * parent's ring buffer (filled by a Reader or a Scaler) and drives the x265
+     * encoder with them. */
+    class PassEncoder : public Thread
+    {
+    public:
+
+        uint32_t m_id; // index of this encode in the parent's per-encode arrays
+        x265_param *m_param;
+        AbrEncoder *m_parent;
+        x265_encoder *m_encoder;
+        Reader *m_reader; // input worker; destroy() uses the scaler only when this is NULL
+        Scaler *m_scaler;
+        bool m_inputOver; // set by the Reader at end of input; polled by the Scaler
+
+        int m_threadActive; // loop flag of threadMain(); cleared when the encode finishes
+        int m_lastIdx; // analysis slot last selected by readPicture()
+        uint32_t m_outputNalsCount;
+
+        x265_picture **m_inputPicBuffer;
+        x265_analysis_data **m_analysisBuffer;
+        x265_nal **m_outputNals;
+        x265_picture **m_outputRecon;
+
+        CLIOptions m_cliopt; // per-encode copy of the command-line options
+        InputFile* m_input;
+        const char* m_reconPlayCmd;
+        FILE*    m_qpfile;
+        FILE*    m_zoneFile;
+        FILE*    m_dolbyVisionRpu;/* File containing Dolby Vision BL RPU metadata */
+
+        int m_ret; // status of the encode; threadMain() sets 3 on header failure and 4 on encode failure
+
+        PassEncoder(uint32_t id, CLIOptions cliopt, AbrEncoder *parent);
+        int init(int &result);
+        void setReuseLevel();
+
+        void startThreads();
+        void copyInfo(x265_analysis_data *src);
+
+        /* Fills the given picture from the ring buffer; returns false when no picture is available */
+        bool readPicture(x265_picture*);
+        void destroy();
+
+    private:
+        void threadMain();
+    };
+
+    /* Worker thread that scales pictures from the previous encode's ring buffer
+     * into the ring buffer of its parent encode. */
+    class Scaler : public Thread
+    {
+    public:
+        PassEncoder *m_parentEnc;
+        int m_id; // id of the encode whose buffers receive the scaled pictures
+        int m_scalePlanes[3]; // byte size of each destination plane
+        int m_scaleFrameSize; // sum of m_scalePlanes over the planes of the destination colour space
+        uint32_t m_threadId; // this worker handles frames where frame % m_threadTotal == m_threadId
+        uint32_t m_threadTotal; // number of scaler workers sharing the frames
+        ThreadSafeInteger m_scaledWriteCnt; // frames processed by this worker
+        VideoDesc* m_srcFormat;
+        VideoDesc* m_dstFormat;
+        int m_threadActive;
+        ScalerFilterManager* m_filterManager; // NULL when source and destination dimensions are equal
+
+        Scaler(int threadId, int threadNum, int id, VideoDesc *src, VideoDesc * dst, PassEncoder *parentEnc);
+        /* Returns true only when the picture was actually scaled */
+        bool scalePic(x265_picture *destination, x265_picture *source);
+        void threadMain();
+        /* Releases the filter manager; safe to call more than once */
+        void destroy()
+        {
+            if (m_filterManager)
+            {
+                delete m_filterManager;
+                m_filterManager = NULL;
+            }
+        }
+    };
+
+    /* Worker thread that reads pictures from the input file into the ring
+     * buffer of its parent encode. */
+    class Reader : public Thread
+    {
+    public:
+        PassEncoder *m_parentEnc;
+        int m_id; // id of the encode whose buffers are filled
+        InputFile* m_input; // taken from the parent encoder in the constructor
+        int m_threadActive; // loop flag of threadMain(); cleared at end of input
+
+        Reader(int id, PassEncoder *parentEnc);
+        void threadMain();
+    };
+}
+
+#endif // ifndef ABR_ENCODE_H
+#pragma once
--- a/source/common/CMakeLists.txt	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/CMakeLists.txt	Wed May 06 14:59:56 2020 +0530
@@ -14,7 +14,7 @@ if(EXTRA_LIB)
 endif(EXTRA_LIB)
 
 if(ENABLE_ASSEMBLY)
-    set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
+    set_source_files_properties(threading.cpp primitives.cpp pixel.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
 endif(ENABLE_ASSEMBLY)
 
@@ -84,16 +84,33 @@ if(ENABLE_ASSEMBLY AND X86)
 endif(ENABLE_ASSEMBLY AND X86)
 
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+    # ARM64 builds take their primitives from aarch64/, all other ARM builds from arm/.
+    if(ARM64)
+        # Define AUTO_VECTORIZE=1 when GCC is used and the release flags contain -O3.
+        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
+            message(STATUS "Detected CXX compiler using -O3 optimization level")
+            add_definitions(-DAUTO_VECTORIZE=1)
+        endif()
+        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
 
-    # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
-    set(VEC_PRIMITIVES)
+        # add ARM assembly/intrinsic files here
+        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
+        set(VEC_PRIMITIVES)
 
-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
-    foreach(SRC ${C_SRCS})
-        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
-    endforeach()
+        # The assembly list is published through an internal cache entry.
+        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+        foreach(SRC ${C_SRCS})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        endforeach()
+    else()
+        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
+
+        # add ARM assembly/intrinsic files here
+        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
+        set(VEC_PRIMITIVES)
+
+        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
+        foreach(SRC ${C_SRCS})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
+        endforeach()
+    endif()
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
 
@@ -151,4 +168,5 @@ add_library(common OBJECT
     predict.cpp  predict.h
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
-    deblock.cpp deblock.h)
+    deblock.cpp deblock.h
+    scaler.cpp scaler.h)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/asm-primitives.cpp	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,219 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+
+#if defined(__GNUC__)
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#endif
+
+#define GCC_4_9_0 40900
+#define GCC_5_1_0 50100
+
+extern "C" {
+#include "pixel.h"
+#include "pixel-util.h"
+#include "ipfilter8.h"
+}
+
+namespace X265_NS {
+// private x265 namespace
+
+
+// Separable 8-tap luma interpolation: luma_hps fills a 16-bit scratch buffer, luma_vsp then filters it into dst.
+template<int size>
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
+{
+    const int tmpStride = MAX_CU_SIZE;
+    ALIGN_VAR_32(int16_t, tmp[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
+    // last argument is isRowExt = 1; the vertical pass starts (NTAPS_LUMA / 2 - 1) rows into the scratch buffer
+    primitives.pu[size].luma_hps(src, srcStride, tmp, tmpStride, idxX, 1);
+    primitives.pu[size].luma_vsp(tmp + ((NTAPS_LUMA >> 1) - 1) * tmpStride, tmpStride, dst, dstStride, idxY);
+}
+
+
+/* Temporary workaround: copies the C luma_vsp entries from cp into asmp because the luma_vsp
+ * assembly primitive has not been completed, while interp_8tap_hv_pp_cpu mixes the assembly
+ * luma_hps with luma_vsp. Otherwise, a segmentation fault occurs. */
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)
+{
+    if (cpuMask & X265_CPU_NEON) // nothing is copied unless the NEON bit is set
+    {
+        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp;
+        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;
+        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;
+        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;
+        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;
+        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;
+        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
+        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
+        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
+        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
+        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
+        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
+        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
+        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
+        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
+        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
+        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
+        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
+        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;    
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
+        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;
+        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;
+        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;
+        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
+        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;
+#endif // !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 (nested: the outer condition must hold as well)
+#endif // !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0
+    }
+}
+
+// Installs the AArch64 NEON kernels (and the luma_hvpp C++ wrapper) into p; p is left untouched unless cpuMask has X265_CPU_NEON set.
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) 
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon); // satd, luma partitions
+        p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
+        p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
+        p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
+        p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
+        p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+        
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon); // 4:2:0 chroma slots reuse the same satd kernels
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);
+        
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon); // 4:2:2 chroma slots, adding the 4x32 and 12x32 kernels
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon); // pixelavg_pp: one kernel per size serves both the NONALIGNED and ALIGNED slots
+        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon); // sad_x3 / sad_x4: only the 8-wide partitions have kernels here
+        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
+        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
+        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
+
+        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
+        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
+        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
+        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
+
+        // quant
+        p.quant = PFX(quant_neon);
+        // luma_hps: horizontal 8-tap luma filter, pixel in and int16_t out (prototypes in aarch64/ipfilter8.h)
+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
+#endif // !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0
+
+        p.pu[LUMA_8x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>; // luma_hvpp: C++ wrapper that chains luma_hps and luma_vsp; guards mirror those in setupAliasCPrimitives
+        p.pu[LUMA_8x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
+        p.pu[LUMA_8x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
+        p.pu[LUMA_8x32].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
+        p.pu[LUMA_12x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
+        p.pu[LUMA_16x4].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
+        p.pu[LUMA_16x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
+        p.pu[LUMA_16x12].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
+        p.pu[LUMA_16x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
+        p.pu[LUMA_16x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
+        p.pu[LUMA_16x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
+        p.pu[LUMA_32x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
+        p.pu[LUMA_32x24].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
+        p.pu[LUMA_32x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
+        p.pu[LUMA_32x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
+        p.pu[LUMA_48x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;
+        p.pu[LUMA_64x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x16>;
+        p.pu[LUMA_64x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x32>;
+        p.pu[LUMA_64x48].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x48>;
+        p.pu[LUMA_64x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x64>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
+        p.pu[LUMA_4x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x4>;
+        p.pu[LUMA_4x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x8>;
+        p.pu[LUMA_4x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_4x16>;
+        p.pu[LUMA_24x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_24x32>;
+        p.pu[LUMA_32x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_32x8>;
+#endif // !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 (nested: the outer condition must hold as well)
+#endif // !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0
+
+#if !HIGH_BIT_DEPTH
+        p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon); // installed only when HIGH_BIT_DEPTH is off
+#endif // !HIGH_BIT_DEPTH
+
+    }
+}
+} // namespace X265_NS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/asm.S	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+.arch           armv8-a
+/* EXTERN_ASM: prefix pasted in front of exported symbol names; "_" when PREFIX is defined, empty otherwise. */
+#ifdef PREFIX
+#define EXTERN_ASM _
+#else
+#define EXTERN_ASM
+#endif
+/* ELF: empty on ELF targets, so ELF-prefixed directives are assembled; "@" otherwise. NOTE(review): "@" is the 32-bit ARM gas comment character - confirm the non-ELF AArch64 assemblers in use accept it. */
+#ifdef __ELF__
+#define ELF
+#else
+#define ELF @
+#endif
+/* FUNC: same scheme for the .func/.endfunc directives; HAVE_AS_FUNC is hard-wired to 1 below, so they are always emitted. */
+#define HAVE_AS_FUNC 1
+
+#if HAVE_AS_FUNC
+#define FUNC
+#else
+#define FUNC @
+#endif
+/* function name, export=1: opens a function label after .align 2 and defines a one-shot endfunc macro that emits .size and .endfunc, then purges itself. With export == 1 the label is global, hidden and carries the EXTERN_ASM prefix; otherwise it is an unprefixed hidden label that is not made global. */
+.macro function name, export=1
+    .macro endfunc
+ELF     .size   \name, . - \name
+FUNC    .endfunc
+        .purgem endfunc
+    .endm
+        .align  2
+.if \export == 1
+        .global EXTERN_ASM\name
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+FUNC    .func   EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF     .hidden \name
+ELF     .type   \name, %function
+FUNC    .func   \name
+\name:
+.endif
+.endm
+
+
+#define FENC_STRIDE 64
+#define FDEC_STRIDE 32
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/ipfilter8.S	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,414 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+
+// qpel_filter_0_32b (coeffIdx 0): v17/v18 = 64 * v5 as 32-bit lanes (low four / high four pixels). Clobbers v19 and v24.
+.macro qpel_filter_0_32b
+    movi            v24.8h, #64             // the single non-zero tap
+    uxtl            v19.8h, v5.8b           // widen the eight bytes of v5 to 16 bit
+    smull           v17.4s, v19.4h, v24.4h  // low four products
+    smull2          v18.4s, v19.8h, v24.8h  // high four products
+.endm
+// qpel_filter_1_32b (coeffIdx 1): v17/v18 = -v0 + 4*v4 - 10*v1 + 58*v5 + 17*v2 - 5*v6 + v3 as 32-bit lanes. Inputs are the byte windows v0-v6; v1 and v2 are overwritten, v16 and v19-v24 are scratch.
+.macro qpel_filter_1_32b
+    movi            v16.8h, #58
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h  // v17/v18 = 58 * v5
+    smull2          v18.4s, v19.8h, v16.8h
+
+    movi            v24.8h, #10
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h  // v19/v20 = 10 * v1
+    smull2          v20.4s, v21.8h, v24.8h
+
+    movi            v16.8h, #17
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h  // v21/v22 = 17 * v2
+    smull2          v22.4s, v23.8h, v16.8h
+
+    movi            v24.8h, #5
+    uxtl            v1.8h, v6.8b            // input v1 is no longer needed and becomes scratch
+    smull           v23.4s, v1.4h, v24.4h   // v23/v16 = 5 * v6
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s  // 58*v5 - 10*v1
+    sub             v18.4s, v18.4s, v20.4s
+
+    uxtl            v1.8h, v4.8b
+    sshll           v19.4s, v1.4h, #2       // v19/v20 = 4 * v4
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s  // + 17*v2
+    add             v18.4s, v18.4s, v22.4s
+
+    uxtl            v1.8h, v0.8b
+    uxtl            v2.8h, v3.8b            // input v2 was consumed above and becomes scratch
+    ssubl           v21.4s, v2.4h, v1.4h    // v21/v22 = v3 - v0
+    ssubl2          v22.4s, v2.8h, v1.8h
+
+    add             v17.4s, v17.4s, v19.4s  // + 4*v4
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s  // v3 - v0 - 5*v6
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s  // final sums
+    add             v18.4s, v18.4s, v22.4s
+.endm
+// qpel_filter_2_32b (coeffIdx 2): v17/v18 = 40*(v5 + v2) + 4*(v4 + v3) - 11*(v1 + v6) - (v0 + v7) as 32-bit lanes. Inputs are the byte windows v0-v7; v1 and v2 are overwritten, v16 and v19-v23 are scratch.
+.macro qpel_filter_2_32b
+    movi            v16.4s, #11
+    uxtl            v19.8h, v5.8b
+    uxtl            v20.8h, v2.8b
+    saddl           v17.4s, v19.4h, v20.4h  // v17/v18 = v5 + v2
+    saddl2          v18.4s, v19.8h, v20.8h
+
+    uxtl            v21.8h, v1.8b
+    uxtl            v22.8h, v6.8b
+    saddl           v19.4s, v21.4h, v22.4h  // v19/v20 = v1 + v6
+    saddl2          v20.4s, v21.8h, v22.8h
+
+    mul             v19.4s, v19.4s, v16.4s  // 11 * (v1 + v6)
+    mul             v20.4s, v20.4s, v16.4s
+
+    movi            v16.4s, #40
+    mul             v17.4s, v17.4s, v16.4s  // 40 * (v5 + v2)
+    mul             v18.4s, v18.4s, v16.4s
+
+    uxtl            v21.8h, v4.8b
+    uxtl            v22.8h, v3.8b
+    saddl           v23.4s, v21.4h, v22.4h  // v23/v16 = v4 + v3
+    saddl2          v16.4s, v21.8h, v22.8h
+
+    uxtl            v1.8h, v0.8b            // inputs v1 and v2 were consumed above and become scratch
+    uxtl            v2.8h, v7.8b
+    saddl           v21.4s, v1.4h, v2.4h    // v21/v22 = v0 + v7
+    saddl2          v22.4s, v1.8h, v2.8h
+
+    shl             v23.4s, v23.4s, #2      // 4 * (v4 + v3)
+    shl             v16.4s, v16.4s, #2
+
+    add             v19.4s, v19.4s, v21.4s  // negative part: 11*(v1 + v6) + (v0 + v7)
+    add             v20.4s, v20.4s, v22.4s
+    add             v17.4s, v17.4s, v23.4s  // positive part: 40*(v5 + v2) + 4*(v4 + v3)
+    add             v18.4s, v18.4s, v16.4s
+    sub             v17.4s, v17.4s, v19.4s  // final sums
+    sub             v18.4s, v18.4s, v20.4s
+.endm
+// qpel_filter_3_32b (coeffIdx 3): v17/v18 = v4 - 5*v1 + 17*v5 + 58*v2 - 10*v6 + 4*v3 - v7 as 32-bit lanes. Inputs are the byte windows v1-v7; v1 and v2 are overwritten, v16 and v19-v24 are scratch.
+.macro qpel_filter_3_32b
+    movi            v16.8h, #17
+    movi            v24.8h, #5
+
+    uxtl            v19.8h, v5.8b
+    smull           v17.4s, v19.4h, v16.4h  // v17/v18 = 17 * v5
+    smull2          v18.4s, v19.8h, v16.8h
+
+    uxtl            v21.8h, v1.8b
+    smull           v19.4s, v21.4h, v24.4h  // v19/v20 = 5 * v1
+    smull2          v20.4s, v21.8h, v24.8h
+
+    movi            v16.8h, #58
+    uxtl            v23.8h, v2.8b
+    smull           v21.4s, v23.4h, v16.4h  // v21/v22 = 58 * v2
+    smull2          v22.4s, v23.8h, v16.8h
+
+    movi            v24.8h, #10
+    uxtl            v1.8h, v6.8b            // input v1 is no longer needed and becomes scratch
+    smull           v23.4s, v1.4h, v24.4h   // v23/v16 = 10 * v6
+    smull2          v16.4s, v1.8h, v24.8h
+
+    sub             v17.4s, v17.4s, v19.4s  // 17*v5 - 5*v1
+    sub             v18.4s, v18.4s, v20.4s
+
+    uxtl            v1.8h, v3.8b
+    sshll           v19.4s, v1.4h, #2       // v19/v20 = 4 * v3
+    sshll2          v20.4s, v1.8h, #2
+
+    add             v17.4s, v17.4s, v21.4s  // + 58*v2
+    add             v18.4s, v18.4s, v22.4s
+
+    uxtl            v1.8h, v4.8b
+    uxtl            v2.8h, v7.8b            // input v2 was consumed above and becomes scratch
+    ssubl           v21.4s, v1.4h, v2.4h    // v21/v22 = v4 - v7
+    ssubl2          v22.4s, v1.8h, v2.8h
+
+    add             v17.4s, v17.4s, v19.4s  // + 4*v3
+    add             v18.4s, v18.4s, v20.4s
+    sub             v21.4s, v21.4s, v23.4s  // v4 - v7 - 10*v6
+    sub             v22.4s, v22.4s, v16.4s
+    add             v17.4s, v17.4s, v21.4s  // final sums
+    add             v18.4s, v18.4s, v22.4s
+.endm
+
+
+
+// vextin8: loads 16 bytes from x11 (post-incremented by 16) and leaves eight 8-byte windows at byte offsets 1..8 of that load in v0, v4, v1, v5, v2, v6, v3, v7 (in that order).
+.macro vextin8
+    ld1             {v3.16b}, [x11], #16
+    mov             v7.d[0], v3.d[1]            // v7 = bytes 8..15 (offset 8 window)
+    ext             v0.8b, v3.8b, v7.8b, #1     // offset 1
+    ext             v4.8b, v3.8b, v7.8b, #2     // offset 2
+    ext             v1.8b, v3.8b, v7.8b, #3     // offset 3
+    ext             v5.8b, v3.8b, v7.8b, #4     // offset 4
+    ext             v2.8b, v3.8b, v7.8b, #5     // offset 5
+    ext             v6.8b, v3.8b, v7.8b, #6     // offset 6
+    ext             v3.8b, v3.8b, v7.8b, #7     // offset 7; done last because it overwrites the loaded data
+.endm
+
+
+// HPS_FILTER a b filterhps: row loop for one block size (a = width, b = height, the latter used only in label names) with filterhps as the kernel for 8 pixels. In: x0 src, x1 srcStride, x2 dst, x3 dstStride in int16 units, w5 isRowExt, w10 row count. Changes x0, x2, x3, w6, w7, w9, x11, w12 and the vector registers of the kernel.
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro HPS_FILTER a b filterhps
+    mov             w12, #8192              // offset subtracted from every filtered sum
+    mov             w6, w10                 // w6 = rows left
+    sub             x3, x3, #\a
+    lsl             x3, x3, #1              // x3 = byte gap from the end of one output row to the start of the next
+    mov             w9, #\a
+    cmp             w9, #4                  // widths 4 and 12 are not multiples of 8 and get dedicated loops
+    b.eq            14f
+    cmp             w9, #12
+    b.eq            15f
+    b               7f
+14:
+    HPS_FILTER_4 \a \b \filterhps
+    b               10f
+15:
+    HPS_FILTER_12 \a \b \filterhps
+    b               10f
+7:
+    cmp             w5, #0
+    b.eq            8f
+    cmp             w5, #1
+    b.eq            9f
+8:                                          // isRowExt == 0 (also reached for any value other than 1)
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    mov             w7, #\a
+    lsr             w7, w7, #3              // w7 = groups of 8 pixels per row
+    mov             x11, x0
+    sub             x11, x11, #4            // window base is 4 bytes before the current pixel
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s  // apply the offset
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s           // narrow the eight sums to int16
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    subs            w7, w7, #1
+    sub             x11, x11, #8            // the load advanced x11 by 16; net step is 8 pixels
+    b.ne            loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
+    subs            w6, w6, #1
+    add             x0, x0, x1              // next source row
+    add             x2, x2, x3              // next destination row
+    b.ne            loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
+    b               10f
+9:                                          // isRowExt == 1: same instruction sequence as the loop at 8
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    mov             w7, #\a
+    lsr             w7, w7, #3
+    mov             x11, x0
+    sub             x11, x11, #4
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    subs            w7, w7, #1
+    sub             x11, x11, #8
+    b.ne            loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    b.ne            loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
+10:
+.endm
+// HPS_FILTER_4 w h filterhps: width 4 row loop expanded inside HPS_FILTER (same registers; w6 = rows, w12 = offset, x3 = byte gap between output rows). Each row runs the kernel once and stores only the low four results.
+.macro HPS_FILTER_4 w h filterhps
+    cmp             w5, #0
+    b.eq            11f
+    cmp             w5, #1
+    b.eq            12f
+11:                                         // isRowExt == 0 (also reached for any value other than 1)
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             x11, x0
+    sub             x11, x11, #4            // window base is 4 bytes before the current pixel
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s  // apply the offset to the low four sums only
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8       // four int16 results
+    sub             x11, x11, #8            // x11 is reloaded from x0 before its next use
+    subs            w6, w6, #1
+    add             x0, x0, x1              // next source row
+    add             x2, x2, x3              // next destination row
+    b.ne            loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               13f
+12:                                         // isRowExt == 1: same instruction sequence as the loop at 11
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8
+    sub             x11, x11, #8
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    add             x2, x2, x3
+    b.ne            loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
+13:
+.endm
+// HPS_FILTER_12 w h filterhps: width 12 row loop expanded inside HPS_FILTER (same registers; w6 = rows, w12 = offset, x3 = byte gap between output rows). Each row runs the kernel twice: eight results, then the low four of the next group.
+.macro HPS_FILTER_12 w h filterhps
+    cmp             w5, #0
+    b.eq            14f
+    cmp             w5, #1
+    b.eq            15f
+14:                                         // isRowExt == 0 (also reached for any value other than 1)
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
+    mov             x11, x0
+    sub             x11, x11, #4            // window base is 4 bytes before the current pixel
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16      // columns 0..7
+    sub             x11, x11, #8            // the load advanced x11 by 16; net step is 8 pixels
+
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s  // only the low four sums are kept
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8       // columns 8..11
+    add             x2, x2, x3              // next destination row
+    subs            w6, w6, #1
+    add             x0, x0, x1              // next source row
+    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               16f
+15:                                         // isRowExt == 1: same instruction sequence as the loop at 14
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
+    mov             x11, x0
+    sub             x11, x11, #4
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    sub             v18.4s, v18.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    xtn2            v0.8h, v18.4s
+    st1             {v0.8h}, [x2], #16
+    sub             x11, x11, #8
+
+    vextin8
+    \filterhps
+    dup             v16.4s, w12
+    sub             v17.4s, v17.4s, v16.4s
+    xtn             v0.4h, v17.4s
+    st1             {v0.4h}, [x2], #8
+    add             x2, x2, x3
+    subs            w6, w6, #1
+    add             x0, x0, x1
+    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
+16:
+.endm
+// LUMA_HPS w h: emits x265_interp_8tap_horiz_ps_WxH_neon. w4 (coeffIdx) picks one of four HPS_FILTER expansions; any value other than 1, 2 or 3 takes the filter 0 path. A non-zero w5 (isRowExt) moves src up 3 rows and adds 7 rows to the row count.
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro LUMA_HPS w h
+function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
+    mov             w10, #\h
+    cmp             w5, #0
+    b.eq            6f
+    sub             x0, x0, x1, lsl #2      // src -= 4 * srcStride ...
+
+    add             x0, x0, x1              // ... then += srcStride: net 3 rows up
+    add             w10, w10, #7            // row count = h + 7
+6:
+    cmp             w4, #0
+    b.eq            0f
+    cmp             w4, #1
+    b.eq            1f
+    cmp             w4, #2
+    b.eq            2f
+    cmp             w4, #3
+    b.eq            3f
+0:
+    HPS_FILTER  \w \h qpel_filter_0_32b
+    b               5f
+1:
+    HPS_FILTER  \w \h qpel_filter_1_32b
+    b               5f
+2:
+    HPS_FILTER  \w \h qpel_filter_2_32b
+    b               5f
+3:
+    HPS_FILTER  \w \h qpel_filter_3_32b
+    b               5f
+5:
+    ret
+endfunc
+.endm
+
+LUMA_HPS    4 4
+LUMA_HPS    4 8
+LUMA_HPS    4 16
+LUMA_HPS    8 4
+LUMA_HPS    8 8
+LUMA_HPS    8 16
+LUMA_HPS    8 32
+LUMA_HPS    12 16
+LUMA_HPS    16 4
+LUMA_HPS    16 8
+LUMA_HPS    16 12
+LUMA_HPS    16 16
+LUMA_HPS    16 32
+LUMA_HPS    16 64
+LUMA_HPS    24 32
+LUMA_HPS    32 8
+LUMA_HPS    32 16
+LUMA_HPS    32 24
+LUMA_HPS    32 32
+LUMA_HPS    32 64
+LUMA_HPS    48 64
+LUMA_HPS    64 16
+LUMA_HPS    64 32
+LUMA_HPS    64 48
+LUMA_HPS    64 64
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/ipfilter8.h	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,55 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_IPFILTER8_AARCH64_H
+#define X265_IPFILTER8_AARCH64_H
+/* NEON 8-tap horizontal luma filters (pixel in, int16_t out), one per block size WxH; implemented in ipfilter8.S.
+ * coeffIdx selects the filter (0..3); a non-zero isRowExt starts 3 rows above the block and filters H + 7 rows. */
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+
+#endif // ifndef X265_IPFILTER8_AARCH64_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/mc-a.S	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,63 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"                                // project assembler helpers; the function/endfunc macros used below are not defined in this file
+
+.section .rodata                                // read-only data section; this file places nothing in it
+
+.align 4
+
+.text                                           // everything below is code
+
+.macro pixel_avg_pp_4xN_neon h                  // emits x265_pixel_avg_pp_4x<h>_neon: rounded average of two 4-byte-wide blocks, h rows
+function x265_pixel_avg_pp_4x\h\()_neon         // per the pixel.h prototype: x0=dst, x1=dstride, x2=src0, x3=sstride0, x4=src1, x5=sstride1 (7th int arg is not read)
+.rept \h                                        // fully unrolled: one copy of the four instructions per row
+    ld1             {v0.s}[0], [x2], x3         // 4 bytes of src0 into lane 0, then x2 += sstride0
+    ld1             {v1.s}[0], [x4], x5         // 4 bytes of src1 into lane 0, then x4 += sstride1
+    urhadd          v2.8b, v0.8b, v1.8b         // per byte (a + b + 1) >> 1; only the low 4 bytes are stored
+    st1             {v2.s}[0], [x0], x1         // write lane 0 (4 bytes) to dst, then x0 += dstride
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_4xN_neon 4                         // instantiate the 4x4, 4x8 and 4x16 variants
+pixel_avg_pp_4xN_neon 8
+pixel_avg_pp_4xN_neon 16
+
+.macro pixel_avg_pp_8xN_neon h                  // emits x265_pixel_avg_pp_8x<h>_neon: rounded average of two 8-byte-wide blocks, h rows
+function x265_pixel_avg_pp_8x\h\()_neon         // per the pixel.h prototype: x0=dst, x1=dstride, x2=src0, x3=sstride0, x4=src1, x5=sstride1 (7th int arg is not read)
+.rept \h                                        // fully unrolled: one copy of the four instructions per row
+    ld1             {v0.8b}, [x2], x3           // 8 bytes of src0, then x2 += sstride0
+    ld1             {v1.8b}, [x4], x5           // 8 bytes of src1, then x4 += sstride1
+    urhadd          v2.8b, v0.8b, v1.8b         // per byte (a + b + 1) >> 1
+    st1             {v2.8b}, [x0], x1           // write the 8 averaged bytes to dst, then x0 += dstride
+.endr
+    ret
+endfunc
+.endm
+
+pixel_avg_pp_8xN_neon 4                         // instantiate the 8x4, 8x8, 8x16 and 8x32 variants
+pixel_avg_pp_8xN_neon 8
+pixel_avg_pp_8xN_neon 16
+pixel_avg_pp_8xN_neon 32
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/pixel-util.S	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,419 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"                                // project assembler helpers; the function/endfunc macros used below are not defined in this file
+
+.section .rodata                                // read-only data section; this file places nothing in it
+
+.align 4
+
+.text                                           // everything below is code
+
+.macro x265_satd_4x8_8x4_end_neon               // in: v4-v7 (8 x int16 each); out: s0 = 32-bit total; overwrites v0-v7 and v16-v19
+    add             v0.8h, v4.8h, v6.8h         // butterfly across the four input vectors:
+    add             v1.8h, v5.8h, v7.8h         //   v0 = v4 + v6, v1 = v5 + v7,
+    sub             v2.8h, v4.8h, v6.8h         //   v2 = v4 - v6, v3 = v5 - v7
+    sub             v3.8h, v5.8h, v7.8h
+
+    trn1            v16.8h, v0.8h, v1.8h        // even 16-bit lanes of v0/v1 interleaved
+    trn2            v17.8h, v0.8h, v1.8h        // odd 16-bit lanes of v0/v1 interleaved
+    add             v4.8h, v16.8h, v17.8h       // sums of neighbouring lanes
+    trn1            v18.8h, v2.8h, v3.8h        // same even/odd split for v2/v3
+    trn2            v19.8h, v2.8h, v3.8h
+    sub             v5.8h, v16.8h, v17.8h       // differences of neighbouring lanes
+    add             v6.8h, v18.8h, v19.8h
+    sub             v7.8h, v18.8h, v19.8h
+    trn1            v0.4s, v4.4s, v6.4s         // regroup at 32-bit granularity so the two operands of the last stage line up lane by lane
+    trn2            v2.4s, v4.4s, v6.4s
+    abs             v0.8h, v0.8h
+    trn1            v1.4s, v5.4s, v7.4s
+    trn2            v3.4s, v5.4s, v7.4s
+    abs             v2.8h, v2.8h
+    abs             v1.8h, v1.8h
+    abs             v3.8h, v3.8h
+    umax            v0.8h, v0.8h, v2.8h         // max(|a|, |b|) per lane; presumably stands in for a final butterfly, since |a+b| + |a-b| = 2*max(|a|, |b|)
+    umax            v1.8h, v1.8h, v3.8h
+    add             v0.8h, v0.8h, v1.8h
+    uaddlv          s0, v0.8h                   // widen and add all eight lanes into the 32-bit scalar s0
+.endm
+
+.macro pixel_satd_4x8_neon                      // in: x0=pix1, x1=stride_pix1, x2=pix2, x3=stride_pix2; out: s0 = total; x0 and x2 end up 8 rows further on
+    ld1r             {v1.2s}, [x2], x3          // rows 0-3: each 4-byte row is loaded and replicated into both 32-bit lanes
+    ld1r            {v0.2s}, [x0], x1           //   pix1 rows go to v0/v2/v4/v6, pix2 rows to v1/v3/v5/v7
+    ld1r            {v3.2s}, [x2], x3
+    ld1r            {v2.2s}, [x0], x1
+    ld1r            {v5.2s}, [x2], x3
+    ld1r            {v4.2s}, [x0], x1
+    ld1r            {v7.2s}, [x2], x3
+    ld1r            {v6.2s}, [x0], x1
+
+    ld1             {v1.s}[1], [x2], x3         // rows 4-7 overwrite lane 1, so each register holds [row i | row i+4]
+    ld1             {v0.s}[1], [x0], x1
+    usubl           v0.8h, v0.8b, v1.8b         // v0 = pix1 - pix2 for rows 0 and 4, widened to 16 bits
+    ld1             {v3.s}[1], [x2], x3
+    ld1             {v2.s}[1], [x0], x1
+    usubl           v1.8h, v2.8b, v3.8b         // v1 = differences for rows 1 and 5
+    ld1             {v5.s}[1], [x2], x3
+    ld1             {v4.s}[1], [x0], x1
+    usubl           v2.8h, v4.8b, v5.8b         // v2 = differences for rows 2 and 6
+    ld1             {v7.s}[1], [x2], x3
+    add             v4.8h, v0.8h, v1.8h         // v4/v5 are free again here: first butterfly on the row pairs already available
+    sub             v5.8h, v0.8h, v1.8h
+    ld1             {v6.s}[1], [x0], x1
+    usubl           v3.8h, v6.8b, v7.8b         // v3 = differences for rows 3 and 7
+    add         v6.8h, v2.8h, v3.8h
+    sub         v7.8h, v2.8h, v3.8h
+    x265_satd_4x8_8x4_end_neon                  // consumes v4-v7 and leaves the 32-bit total in s0
+.endm
+
+// SATD of one 4x8 block; all of the arithmetic lives in the pixel_satd_4x8_neon macro.
+// int satd_4x8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x8_neon
+    pixel_satd_4x8_neon                         // leaves the 32-bit total in s0
+    umov            w0, v0.s[0]                 // hand it back in w0
+    ret
+endfunc
+
+// SATD of a 4x16 block: two vertically stacked 4x8 blocks added together.
+// int satd_4x16(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x16_neon
+    mov             w4, wzr                     // w4 = running total
+.rept 2
+    pixel_satd_4x8_neon                         // one 4x8 block; x0/x2 move on to the next 8 rows
+    umov            w5, v0.s[0]
+    add             w4, w4, w5
+.endr
+    mov             w0, w4
+    ret
+endfunc
+
+// SATD of a 4x32 block: four vertically stacked 4x8 blocks added together.
+// int satd_4x32(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x32_neon
+    mov             w4, wzr                     // w4 = running total
+.rept 4
+    pixel_satd_4x8_neon                         // one 4x8 block; x0/x2 move on to the next 8 rows
+    umov            w5, v0.s[0]
+    add             w4, w5, w4
+.endr
+    mov             w0, w4                      // return the accumulated total
+    ret
+endfunc
+
+// SATD of a 12x16 block: three 4-byte-wide columns, each made of two stacked 4x8 blocks.
+// int satd_12x16(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// x4/x5 remember the block origins because pixel_satd_4x8_neon advances x0/x2 as it reads.
+function x265_pixel_satd_12x16_neon
+    mov             w7, wzr                     // w7 = running total
+    mov             x4, x0
+    mov             x5, x2
+.rept 2                                         // column at byte offset 0
+    pixel_satd_4x8_neon
+    umov            w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    add             x0, x4, #4                  // column at byte offset 4, back at the top row
+    add             x2, x5, #4
+.rept 2
+    pixel_satd_4x8_neon
+    umov            w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    add             x0, x4, #8                  // column at byte offset 8, back at the top row
+    add             x2, x5, #8
+.rept 2
+    pixel_satd_4x8_neon
+    umov            w6, v0.s[0]
+    add             w7, w7, w6
+.endr
+
+    mov             w0, w7
+    ret
+endfunc
+
+// SATD of a 12x32 block: three 4-byte-wide columns, each made of four stacked 4x8 blocks.
+// int satd_12x32(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_12x32_neon
+    mov             w7, wzr                     // w7 = running total
+    mov             x4, x0                      // x4/x5 keep the block origins; the macro advances x0/x2
+    mov             x5, x2
+.rept 4                                         // column at byte offset 0
+    pixel_satd_4x8_neon
+    umov            w6, v0.s[0]
+    add             w7, w6, w7
+.endr
+
+    add             x2, x5, #4                  // column at byte offset 4, back at the top row
+    add             x0, x4, #4
+.rept 4
+    pixel_satd_4x8_neon
+    umov            w6, v0.s[0]
+    add             w7, w6, w7
+.endr
+
+    add             x2, x5, #8                  // column at byte offset 8, back at the top row
+    add             x0, x4, #8
+.rept 4
+    pixel_satd_4x8_neon
+    umov            w6, v0.s[0]
+    add             w7, w6, w7
+.endr
+
+    mov             w0, w7                      // return the accumulated total
+    ret
+endfunc
+
+// SATD of an 8x8 block: the left and right 4x8 halves are measured separately and added.
+// int satd_8x8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// x6/x7 remember the block origins because pixel_satd_4x8_neon advances x0/x2 as it reads.
+// w4 holds the left-half total, w5 the right-half total.
+function x265_pixel_satd_8x8_neon
+    mov             x6, x0
+    mov             x7, x2
+    pixel_satd_4x8_neon                         // left half
+    umov            w4, v0.s[0]
+    add             x0, x6, #4                  // step 4 bytes right, back at the top row
+    add             x2, x7, #4
+    pixel_satd_4x8_neon                         // right half
+    umov            w5, v0.s[0]
+    add             w0, w4, w5
+    ret
+endfunc
+
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+function x265_psyCost_4x4_neon                  // x0=source, x1=sstride, x2=recon, x3=rstride; returns |E(source) - E(recon)|, E = transform abs-sum - (pixel sum >> 2)
+    ld1r            {v4.2s}, [x0], x1           // source: v4 = [row0 | row2], v5 = [row1 | row3]
+    ld1r            {v5.2s}, [x0], x1           //   (ld1r fills both lanes, the lane-1 loads below overwrite the upper half)
+    ld1             {v4.s}[1], [x0], x1
+    ld1             {v5.s}[1], [x0], x1
+
+    ld1r            {v6.2s}, [x2], x3           // recon: v6 = [row0 | row2], v7 = [row1 | row3]
+    ld1r            {v7.2s}, [x2], x3
+    ld1             {v6.s}[1], [x2], x3
+    ld1             {v7.s}[1], [x2], x3
+
+    uaddl           v2.8h, v4.8b, v5.8b         // source: [r0+r1 | r2+r3], widened to 16 bits
+    usubl           v3.8h, v4.8b, v5.8b         // source: [r0-r1 | r2-r3]
+    uaddl           v18.8h, v6.8b, v7.8b        // recon: same sums
+    usubl           v19.8h, v6.8b, v7.8b        // recon: same differences
+
+    mov             v20.d[0], v2.d[1]           // bring the upper half down so the two halves can be combined
+    add             v0.4h, v2.4h, v20.4h        // source: (r0+r1) + (r2+r3)
+    sub             v1.4h, v2.4h, v20.4h        // source: (r0+r1) - (r2+r3)
+    mov             v21.d[0], v3.d[1]
+    add             v22.4h, v3.4h, v21.4h       // source: (r0-r1) + (r2-r3)
+    sub             v23.4h, v3.4h, v21.4h       // source: (r0-r1) - (r2-r3)
+
+    mov             v24.d[0], v18.d[1]          // recon: the same four row combinations
+    add             v16.4h, v18.4h, v24.4h
+    sub             v17.4h, v18.4h, v24.4h
+    mov             v25.d[0], v19.d[1]
+    add             v26.4h, v19.4h, v25.4h
+    sub             v27.4h, v19.4h, v25.4h
+
+    mov             v0.d[1], v22.d[0]           // pack two 4-lane results per register before the lane-wise stages
+    mov             v1.d[1], v23.d[0]
+    trn1            v22.8h, v0.8h, v1.8h        // source: even / odd 16-bit lanes interleaved
+    trn2            v23.8h, v0.8h, v1.8h
+    mov             v16.d[1], v26.d[0]
+    mov             v17.d[1], v27.d[0]
+    trn1            v26.8h, v16.8h, v17.8h      // recon: even / odd 16-bit lanes interleaved
+    trn2            v27.8h, v16.8h, v17.8h
+
+    add             v2.8h, v22.8h, v23.8h       // sums and differences of neighbouring lanes (source, then recon)
+    sub             v3.8h, v22.8h, v23.8h
+    add             v18.8h, v26.8h, v27.8h
+    sub             v19.8h, v26.8h, v27.8h
+
+    uaddl           v20.8h, v4.8b, v5.8b        // raw pixel pair sums (v4-v7 still hold the pixels) for the pixel-sum term
+    uaddl           v21.8h, v6.8b, v7.8b
+
+    trn1            v0.4s, v2.4s, v3.4s         // regroup at 32-bit granularity so the last-stage operands line up lane by lane
+    trn2            v1.4s, v2.4s, v3.4s
+    trn1            v16.4s, v18.4s, v19.4s
+    trn2            v17.4s, v18.4s, v19.4s
+    abs             v0.8h, v0.8h
+    abs             v16.8h, v16.8h
+    abs             v1.8h, v1.8h
+    abs             v17.8h, v17.8h
+
+    uaddlv          s20, v20.8h                 // s20 = sum of all 16 source pixels
+    uaddlv          s21, v21.8h                 // s21 = sum of all 16 recon pixels
+    mov             v20.s[1], v21.s[0]          // v20.2s = [source pixel sum, recon pixel sum]
+
+    smax            v0.8h, v0.8h, v1.8h         // max(|a|, |b|) per lane (inputs are already non-negative)
+    smax            v16.8h, v16.8h, v17.8h
+
+    trn1            v4.2d, v0.2d, v16.2d        // v4 = [source low half | recon low half]
+    trn2            v5.2d, v0.2d, v16.2d        // v5 = [source high half | recon high half]
+    add             v0.8h, v4.8h, v5.8h         // low 4 lanes: source partial sums, high 4 lanes: recon partial sums
+    mov             v4.d[0], v0.d[1]
+    uaddlv          s0, v0.4h                   // s0 = source transform abs-sum
+    uaddlv          s4, v4.4h                   // s4 = recon transform abs-sum
+
+    ushr            v20.2s, v20.2s, #2          // pixel sums >> 2
+    mov             v0.s[1], v4.s[0]            // v0.2s = [source abs-sum, recon abs-sum]
+    sub             v0.2s, v0.2s, v20.2s        // per block: abs-sum - (pixel sum >> 2)
+    mov             w0, v0.s[0]
+    mov             w1, v0.s[1]                 // w1 (stride argument) is no longer needed and is reused as scratch
+    subs            w0, w0, w1
+    cneg            w0, w0, mi                  // negate when the difference is negative: absolute value
+
+    ret
+endfunc
+
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+function x265_quant_neon                        // x0=coef, x1=quantCoeff, x2=deltaU, x3=qCoef, w4=qBits, w5=add, w6=numCoeff; returns the number of non-zero levels written
+    mov             w9, #1
+    lsl             w9, w9, w4
+    dup             v0.2s, w9                   // v0 = 1 << qBits
+    neg             w9, w4
+    dup             v1.4s, w9                   // v1 = -qBits: sshl by a negative count shifts right
+    add             w9, w9, #8
+    dup             v2.4s, w9                   // v2 = 8 - qBits, i.e. a right shift by (qBits - 8)
+    dup             v3.4s, w5                   // v3 = add, the rounding offset
+
+    lsr             w6, w6, #2                  // iteration count: 4 coefficients per pass (a count below 4 is not handled)
+    eor             v4.16b, v4.16b, v4.16b      // v4 = 0, accumulates the compare masks (-1 per zero level)
+    eor             w10, w10, w10               // w10 = 0, counts coefficients processed
+    eor             v17.16b, v17.16b, v17.16b   // v17 = 0, constant for the zero compare
+
+.loop_quant:
+
+    ld1             {v18.4h}, [x0], #8          // 4 x int16 coefficients
+    ld1             {v7.4s}, [x1], #16          // 4 x int32 quantisation factors
+    sxtl            v6.4s, v18.4h               // sign-extend the coefficients to 32 bits
+
+    cmlt            v5.4s, v6.4s, #0            // v5 = all-ones where the coefficient is negative (sign mask)
+
+    abs             v6.4s, v6.4s
+
+
+    mul             v6.4s, v6.4s, v7.4s         // v6 = |coef| * quantCoeff
+
+    add             v7.4s, v6.4s, v3.4s
+    sshl            v7.4s, v7.4s, v1.4s         // v7 = level = (|coef| * quantCoeff + add) >> qBits
+
+    mls             v6.4s, v7.4s, v0.s[0]       // v6 -= level << qBits: the part of the product the level does not cover
+    sshl            v16.4s, v6.4s, v2.4s        // scale that remainder down by (qBits - 8)
+    st1             {v16.4s}, [x2], #16         // store as deltaU
+
+    // numsig
+    cmeq            v16.4s, v7.4s, v17.4s       // -1 in every lane whose level is zero
+    add             v4.4s, v4.4s, v16.4s
+    add             w10, w10, #4
+
+    // level *= sign
+    eor             v16.16b, v7.16b, v5.16b     // (level ^ mask) - mask negates exactly the lanes whose mask is all-ones
+    sub             v16.4s, v16.4s, v5.4s
+    sqxtn           v5.4h, v16.4s               // narrow to int16 with signed saturation
+    st1             {v5.4h}, [x3], #8           // store as qCoef
+
+    subs            w6, w6, #1
+    b.ne             .loop_quant
+
+    addv            s4, v4.4s                   // total of the masks = -(number of zero levels)
+    mov             w9, v4.s[0]
+    add             w0, w10, w9                 // processed - zero = number of non-zero levels
+    ret
+endfunc
+
+.macro satd_4x4_neon                            // in: x0=pix1, x1=stride_pix1, x2=pix2, x3=stride_pix2; out: v0.d[0] = total; x0 and x2 end up 4 rows further on
+    ld1             {v1.s}[0], [x2], x3         // rows 0 and 1 go to lane 0: pix1 into v0/v2, pix2 into v1/v3
+    ld1             {v0.s}[0], [x0], x1
+    ld1             {v3.s}[0], [x2], x3
+    ld1             {v2.s}[0], [x0], x1
+
+    ld1             {v1.s}[1], [x2], x3         // rows 2 and 3 go to lane 1, so v0/v1 = [row0 | row2], v2/v3 = [row1 | row3]
+    ld1             {v0.s}[1], [x0], x1
+    ld1             {v3.s}[1], [x2], x3
+    ld1             {v2.s}[1], [x0], x1
+
+    usubl           v4.8h, v0.8b, v1.8b         // differences for rows 0 and 2, widened to 16 bits
+    usubl           v5.8h, v2.8b, v3.8b         // differences for rows 1 and 3
+
+    add             v6.8h, v4.8h, v5.8h         // [d0+d1 | d2+d3]
+    sub             v7.8h, v4.8h, v5.8h         // [d0-d1 | d2-d3]
+
+    mov             v4.d[0], v6.d[1]            // bring the upper half down; only the low 4 lanes of the results are used from here on
+    add             v0.8h, v6.8h, v4.8h         // (d0+d1) + (d2+d3)
+    sub             v2.8h, v6.8h, v4.8h         // (d0+d1) - (d2+d3)
+
+    mov             v5.d[0], v7.d[1]
+    add             v1.8h, v7.8h, v5.8h         // (d0-d1) + (d2-d3)
+    sub             v3.8h, v7.8h, v5.8h         // (d0-d1) - (d2-d3)
+
+    trn1            v4.4h, v0.4h, v1.4h         // even / odd 16-bit lanes interleaved
+    trn2            v5.4h, v0.4h, v1.4h
+
+    trn1            v6.4h, v2.4h, v3.4h
+    trn2            v7.4h, v2.4h, v3.4h
+
+    add             v0.4h, v4.4h, v5.4h         // sums and differences of neighbouring lanes
+    sub             v1.4h, v4.4h, v5.4h
+
+    add             v2.4h, v6.4h, v7.4h
+    sub             v3.4h, v6.4h, v7.4h
+
+    trn1            v4.2s, v0.2s, v1.2s         // regroup at 32-bit granularity so the last-stage operands line up lane by lane
+    trn2            v5.2s, v0.2s, v1.2s
+
+    trn1            v6.2s, v2.2s, v3.2s
+    trn2            v7.2s, v2.2s, v3.2s
+
+    abs             v4.4h, v4.4h
+    abs             v5.4h, v5.4h
+    abs             v6.4h, v6.4h
+    abs             v7.4h, v7.4h
+
+    smax            v1.4h, v4.4h, v5.4h         // max(|a|, |b|) per lane (inputs are already non-negative)
+    smax            v2.4h, v6.4h, v7.4h
+
+    add             v0.4h, v1.4h, v2.4h
+    uaddlp          v0.2s, v0.4h                // pairwise widening adds: 4 x 16-bit -> 2 x 32-bit
+    uaddlp          v0.1d, v0.2s                //   -> a single 64-bit total in v0.d[0]
+.endm
+
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_4x4_neon
+    satd_4x4_neon                               // leaves the 64-bit total in d0
+    fmov            x0, d0                      // hand it back in x0
+    ret
+endfunc
+
+// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+function x265_pixel_satd_8x4_neon
+    mov             x4, x0                      // keep the block origins; satd_4x4_neon advances x0/x2
+    mov             x5, x2
+    satd_4x4_neon                               // left 4x4 half
+    fmov            x6, d0                      // park its total in x6
+    add             x0, x4, #4                  // step 4 bytes right, back at the top row
+    add             x2, x5, #4
+    satd_4x4_neon                               // right 4x4 half
+    fmov            x0, d0
+    add             x0, x0, x6                  // return left + right
+    ret
+endfunc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/pixel-util.h	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su@huawei.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PIXEL_UTIL_AARCH64_H
+#define X265_PIXEL_UTIL_AARCH64_H
+// SATD kernels implemented in pixel-util.S; each returns the SATD of pix1 against pix2 for the block size in its name.
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+// Quantisation (returns the count of non-zero levels) and 4x4 psy-cost kernels, also implemented in pixel-util.S.
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); // NOTE(review): only this prototype goes through PFX(); the assembly defines x265_psyCost_4x4_neon, so confirm PFX() yields that name in every build configuration
+
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/pixel.h	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_I386_PIXEL_AARCH64_H // NOTE(review): "I386" in this guard looks like a leftover from an x86 header name; harmless, but misleading in an AArch64 header
+#define X265_I386_PIXEL_AARCH64_H
+// pixel_avg_pp_WxH: dst = rounded average of src0 and src1. NOTE(review): mc-a.S in this change defines only the 4xN and 8xN sizes; confirm the wider sizes are defined elsewhere before they are referenced.
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+// sad_x3_WxH: one fenc block against three reference blocks sharing frefstride; presumably writes three SADs to res (implementation not visible here - confirm against sad-a.S).
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+// sad_x4_WxH: same as sad_x3 but with a fourth reference block (fref3); presumably writes four SADs to res - confirm against sad-a.S.
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+
+#endif // ifndef X265_I386_PIXEL_AARCH64_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/aarch64/sad-a.S	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.macro SAD_X_START_8 x                  // first row of an 8-byte-wide SAD against 3 or 4 reference blocks (count given by x)
+    ld1             {v0.8b}, [x0], x9   // v0 = 8 bytes at [x0]; x0 is post-incremented by x9
+.if \x == 3
+    ld1             {v1.8b}, [x1], x4   // 3-reference case: x1-x3 are each post-incremented by x4
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x4
+.elseif \x == 4
+    ld1             {v1.8b}, [x1], x5   // 4-reference case: x1-x4 are each post-incremented by x5
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
+    ld1             {v4.8b}, [x4], x5
+.endif
+    uabdl           v16.8h, v0.8b, v1.8b    // v16 = abs(v0 - v1) widened to 16-bit lanes; overwrites v16, so no zeroing is needed first
+    uabdl           v17.8h, v0.8b, v2.8b    // same against the second reference
+    uabdl           v18.8h, v0.8b, v3.8b    // same against the third reference
+.if \x == 4
+    uabdl           v19.8h, v0.8b, v4.8b    // fourth reference, 4-reference case only
+.endif
+.endm
+
+.macro SAD_X_8 x                        // one further row: same loads as SAD_X_START_8, but accumulates into v16-v19
+    ld1             {v0.8b}, [x0], x9   // v0 = next 8 bytes at [x0]; x0 is post-incremented by x9
+.if \x == 3
+    ld1             {v1.8b}, [x1], x4   // 3-reference case: x1-x3 are each post-incremented by x4
+    ld1             {v2.8b}, [x2], x4
+    ld1             {v3.8b}, [x3], x4
+.elseif \x == 4
+    ld1             {v1.8b}, [x1], x5   // 4-reference case: x1-x4 are each post-incremented by x5
+    ld1             {v2.8b}, [x2], x5
+    ld1             {v3.8b}, [x3], x5
+    ld1             {v4.8b}, [x4], x5
+.endif
+    uabal           v16.8h, v0.8b, v1.8b    // v16 += abs(v0 - v1), widened to 16-bit lanes
+    uabal           v17.8h, v0.8b, v2.8b    // same against the second reference
+    uabal           v18.8h, v0.8b, v3.8b    // same against the third reference
+.if \x == 4
+    uabal           v19.8h, v0.8b, v4.8b    // fourth reference, 4-reference case only
+.endif
+.endm
+
+.macro SAD_X_8xN x, h                   // emits the function x265_sad_x<x>_8x<h>_neon (x = reference count, h = row count)
+function x265_sad_x\x\()_8x\h\()_neon
+    mov             x9, #FENC_STRIDE    // per-row step applied to x0; FENC_STRIDE is not defined in this file
+    SAD_X_START_8 \x                    // row 0 initialises the accumulators
+.rept \h - 1                            // remaining h-1 rows, unrolled at assembly time
+    SAD_X_8 \x
+.endr
+    uaddlv          s0, v16.8h          // add the 8 halfword lanes of v16 into one 32-bit sum
+    uaddlv          s1, v17.8h
+    uaddlv          s2, v18.8h
+.if \x == 4
+    uaddlv          s3, v19.8h          // fourth sum, 4-reference case only
+.endif
+
+.if \x == 3
+    stp             s0, s1, [x5]        // 3-reference case: three 32-bit sums written at [x5]
+    str             s2, [x5, #8]
+.elseif \x == 4
+    stp             s0, s1, [x6]        // 4-reference case: four 32-bit sums written at [x6]
+    stp             s2, s3, [x6, #8]
+.endif
+    ret
+endfunc
+.endm
+
+SAD_X_8xN 3 4                           // 3-reference variants for 8x4, 8x8, 8x16 and 8x32
+SAD_X_8xN 3 8
+SAD_X_8xN 3 16
+SAD_X_8xN 3 32                          // h is at most 32 here, so a 16-bit lane holds at most 32 * 255 = 8160 and cannot overflow
+
+SAD_X_8xN 4 4                           // 4-reference variants for the same four heights
+SAD_X_8xN 4 8
+SAD_X_8xN 4 16
+SAD_X_8xN 4 32
--- a/source/common/arm/asm-primitives.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/arm/asm-primitives.cpp	Wed May 06 14:59:56 2020 +0530
@@ -5,6 +5,7 @@
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
  *          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -48,77 +49,77 @@ void setupAssemblyPrimitives(EncoderPrim
         p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
 
         // addAvg
-         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
-         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
-         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
-         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
-         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
-         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
-         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
-         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
-         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
-         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
-         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
-         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
-         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
-         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
-         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
-         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
-         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
-         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
-         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
-         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
-         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
-         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
-         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
-         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
-         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
+         p.pu[LUMA_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
+         p.pu[LUMA_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+         p.pu[LUMA_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+         p.pu[LUMA_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+         p.pu[LUMA_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+         p.pu[LUMA_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+         p.pu[LUMA_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+         p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
+         p.pu[LUMA_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
+         p.pu[LUMA_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+         p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
+         p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+         p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+         p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
+         p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
+         p.pu[LUMA_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
+         p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+         p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
+         p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+         p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
+         p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon);
+         p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon);
+         p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon);
+         p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon);
+         p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon);
 
         // chroma addAvg
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   = PFX(addAvg_4x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   = PFX(addAvg_4x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   = PFX(addAvg_6x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = PFX(addAvg_8x2_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = PFX(addAvg_8x6_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = PFX(addAvg_16x4_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = PFX(addAvg_32x8_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED]   = PFX(addAvg_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED]   = PFX(addAvg_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED]   = PFX(addAvg_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED]   = PFX(addAvg_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
 
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   = PFX(addAvg_4x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  = PFX(addAvg_4x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  = PFX(addAvg_4x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  = PFX(addAvg_6x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   = PFX(addAvg_8x4_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   = PFX(addAvg_8x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  = PFX(addAvg_8x12_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  = PFX(addAvg_8x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  = PFX(addAvg_8x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  = PFX(addAvg_8x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  = PFX(addAvg_16x8_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED]  = PFX(addAvg_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED]  = PFX(addAvg_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED]  = PFX(addAvg_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED]  = PFX(addAvg_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] = PFX(addAvg_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] = PFX(addAvg_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
 
         // quant
          p.quant = PFX(quant_neon);
@@ -402,7 +403,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.scale2D_64to32  = PFX(scale2D_64to32_neon);
 
         // scale1D_128to64
-        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
+        p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
 
         // copy_count
         p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
@@ -411,37 +412,37 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
 
         // filterPixelToShort
-        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
-        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
-        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
-        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
-        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
-        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
-        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
-        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
-        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
-        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
-        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
-        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
-        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
-        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
-        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
-        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
-        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
-        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
-        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
-        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
-        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
-        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
-        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
-        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
-        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
+        p.pu[LUMA_4x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x4_neon);
+        p.pu[LUMA_4x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_4x8_neon);
+        p.pu[LUMA_4x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_4x16_neon);
+        p.pu[LUMA_8x4].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x4_neon);
+        p.pu[LUMA_8x8].convert_p2s[NONALIGNED]   = PFX(filterPixelToShort_8x8_neon);
+        p.pu[LUMA_8x16].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x16_neon);
+        p.pu[LUMA_8x32].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_8x32_neon);
+        p.pu[LUMA_12x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_12x16_neon);
+        p.pu[LUMA_16x4].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_16x4_neon);
+        p.pu[LUMA_16x8].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_16x8_neon);
+        p.pu[LUMA_16x12].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x12_neon);
+        p.pu[LUMA_16x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x16_neon);
+        p.pu[LUMA_16x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x32_neon);
+        p.pu[LUMA_16x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x64_neon);
+        p.pu[LUMA_24x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_24x32_neon);
+        p.pu[LUMA_32x8].convert_p2s[NONALIGNED]  = PFX(filterPixelToShort_32x8_neon);
+        p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_neon);
+        p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_neon);
+        p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_neon);
+        p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_neon);
+        p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_neon);
+        p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_neon);
+        p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_neon);
+        p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_neon);
+        p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_neon);
 
         // Block_fill
-        p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
-        p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
-        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
-        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
+        p.cu[BLOCK_4x4].blockfill_s[NONALIGNED]   = PFX(blockfill_s_4x4_neon);
+        p.cu[BLOCK_8x8].blockfill_s[NONALIGNED]   = PFX(blockfill_s_8x8_neon);
+        p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = PFX(blockfill_s_16x16_neon);
+        p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_neon);
 
         // Blockcopy_ss
         p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
@@ -495,21 +496,21 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
 
         // pixel_add_ps
-        p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
-        p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
-        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
-        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
-        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);
+        p.cu[BLOCK_4x4].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x4_neon);
+        p.cu[BLOCK_8x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_8x8_neon);
+        p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
+        p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
+        p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
 
         // chroma add_ps
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps   = PFX(pixel_add_ps_4x8_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps  = PFX(pixel_add_ps_8x16_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_neon);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x4_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_8x8_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED]   = PFX(pixel_add_ps_4x8_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED]  = PFX(pixel_add_ps_8x16_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
 
         // cpy2Dto1D_shr
         p.cu[BLOCK_4x4].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
@@ -518,10 +519,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
 
         // ssd_s
-        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
-        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
-        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
-        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
+        p.cu[BLOCK_4x4].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_4x4_neon);
+        p.cu[BLOCK_8x8].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_8x8_neon);
+        p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
 
         // sse_ss
         p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
@@ -548,10 +549,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
 
         // calc_Residual
-        p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
-        p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
-        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_neon);
-        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_neon);
+        p.cu[BLOCK_4x4].calcresidual[NONALIGNED]   = PFX(getResidual4_neon);
+        p.cu[BLOCK_8x8].calcresidual[NONALIGNED]   = PFX(getResidual8_neon);
+        p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_neon);
+        p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_neon);
 
         // sse_pp
         p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
@@ -722,31 +723,31 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);
 
         // pixel_avg_pp
-        p.pu[LUMA_4x4].pixelavg_pp   = PFX(pixel_avg_pp_4x4_neon);
-        p.pu[LUMA_4x8].pixelavg_pp   = PFX(pixel_avg_pp_4x8_neon);
-        p.pu[LUMA_4x16].pixelavg_pp  = PFX(pixel_avg_pp_4x16_neon);
-        p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_pp_8x4_neon);
-        p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_pp_8x8_neon);
-        p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_pp_8x16_neon);
-        p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_pp_8x32_neon);
-        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon);
-        p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_pp_16x4_neon);
-        p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_pp_16x8_neon);
-        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon);
-        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon);
-        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon);
-        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon);
-        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon);
-        p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_pp_32x8_neon);
-        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon);
-        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon);
-        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon);
-        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon);
-        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon);
-        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon);
-        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon);
-        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon);
-        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon);
+        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon);
+        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);
+        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);
+        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);
+        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);
+        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);
+        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);
+        p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_12x16_neon);
+        p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_16x4_neon);
+        p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_16x8_neon);
+        p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x12_neon);
+        p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x16_neon);
+        p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x32_neon);
+        p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x64_neon);
+        p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_24x32_neon);
+        p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_32x8_neon);
+        p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x16_neon);
+        p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x24_neon);
+        p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x32_neon);
+        p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x64_neon);
+        p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_48x64_neon);
+        p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x16_neon);
+        p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x32_neon);
+        p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x48_neon);
+        p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x64_neon);
 
         // planecopy
         p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
--- a/source/common/common.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/common.h	Wed May 06 14:59:56 2020 +0530
@@ -129,6 +129,7 @@ typedef uint32_t sum_t;
 typedef uint64_t sum2_t;
 typedef uint64_t pixel4;
 typedef int64_t  ssum2_t;
+#define SHIFT_TO_BITPLANE 9
 #define HISTOGRAM_BINS 1024
 #else
 typedef uint8_t  pixel;
@@ -136,6 +137,7 @@ typedef uint16_t sum_t;
 typedef uint32_t sum2_t;
 typedef uint32_t pixel4;
 typedef int32_t  ssum2_t; // Signed sum
+#define SHIFT_TO_BITPLANE 7
 #define HISTOGRAM_BINS 256
 #endif // if HIGH_BIT_DEPTH
 
@@ -270,6 +272,9 @@ typedef int16_t  coeff_t;      // transf
 #define MAX_TR_SIZE (1 << MAX_LOG2_TR_SIZE)
 #define MAX_TS_SIZE (1 << MAX_LOG2_TS_SIZE)
 
+#define RDCOST_BASED_RSKIP 1
+#define EDGE_BASED_RSKIP 2
+
 #define COEF_REMAIN_BIN_REDUCTION   3 // indicates the level at which the VLC
                                       // transitions from Golomb-Rice to TU+EG(k)
 
--- a/source/common/cpu.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/cpu.cpp	Wed May 06 14:59:56 2020 +0530
@@ -5,6 +5,8 @@
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Fiona Glaser <fiona@x264.com>
  *          Steve Borho <steve@borho.org>
+ *          Hongbin Liu <liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -367,6 +369,8 @@ uint32_t cpu_detect(bool benableavx512)
     flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
 #endif
     // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
+#elif X265_ARCH_ARM64
+    flags |= X265_CPU_NEON;
 #endif // if HAVE_ARMV6
     return flags;
 }
--- a/source/common/frame.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/frame.cpp	Wed May 06 14:59:56 2020 +0530
@@ -61,6 +61,8 @@ Frame::Frame()
     m_edgePic = NULL;
     m_gaussianPic = NULL;
     m_thetaPic = NULL;
+    m_edgeBitPlane = NULL;
+    m_edgeBitPic = NULL;
 }
 
 bool Frame::create(x265_param *param, float* quantOffsets)
@@ -115,6 +117,19 @@ bool Frame::create(x265_param *param, fl
         m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
     }
 
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize;
+        uint32_t lumaMarginX = param->maxCUSize + 32;
+        uint32_t lumaMarginY = param->maxCUSize + 16;
+        uint32_t stride = (numCuInWidth * param->maxCUSize) + (lumaMarginX << 1);
+        uint32_t maxHeight = numCuInHeight * param->maxCUSize;
+        uint32_t bitPlaneSize = stride * (maxHeight + (lumaMarginY * 2));
+        CHECKED_MALLOC_ZERO(m_edgeBitPlane, pixel, bitPlaneSize);
+        m_edgeBitPic = m_edgeBitPlane + lumaMarginY * stride + lumaMarginX;
+    }
+
     if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
@@ -267,4 +282,10 @@ void Frame::destroy()
         X265_FREE(m_gaussianPic);
         X265_FREE(m_thetaPic);
     }
+
+    if (m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        X265_FREE_ZERO(m_edgeBitPlane);
+        m_edgeBitPic = NULL;
+    }
 }
--- a/source/common/frame.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/frame.h	Wed May 06 14:59:56 2020 +0530
@@ -99,7 +99,7 @@ public:
     float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
     x265_sei               m_userSEI;
     uint32_t               m_picStruct;          // picture structure SEI message
-    x265_dolby_vision_rpu            m_rpu;
+    x265_dolby_vision_rpu  m_rpu;
 
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger*     m_reconRowFlag;       // flag of CTU rows completely reconstructed and extended for motion reference
@@ -137,6 +137,10 @@ public:
     pixel*                 m_gaussianPic;
     pixel*                 m_thetaPic;
 
+    /* edge bit plane and its margin-offset pointer, allocated only when recursionSkipMode == EDGE_BASED_RSKIP */
+    pixel*                 m_edgeBitPlane;
+    pixel*                 m_edgeBitPic;
+
     Frame();
 
     bool create(x265_param *param, float* quantOffsets);
--- a/source/common/param.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/param.cpp	Wed May 06 14:59:56 2020 +0530
@@ -198,7 +198,8 @@ void x265_param_default(x265_param* para
     param->bEnableWeightedPred = 1;
     param->bEnableWeightedBiPred = 0;
     param->bEnableEarlySkip = 1;
-    param->bEnableRecursionSkip = 1;
+    param->recursionSkipMode = 1;
+    param->edgeVarThreshold = 0.05f;
     param->bEnableAMP = 0;
     param->bEnableRectInter = 0;
     param->rdLevel = 3;
@@ -285,6 +286,7 @@ void x265_param_default(x265_param* para
     param->rc.bEnableConstVbv = 0;
     param->bResetZoneConfig = 1;
     param->reconfigWindowSize = 0;
+    param->decoderVbvMaxRate = 0;
 
     /* Video Usability Information (VUI) */
     param->vui.aspectRatioIdc = 0;
@@ -546,7 +548,7 @@ int x265_param_default_preset(x265_param
             param->maxNumMergeCand = 5;
             param->searchMethod = X265_STAR_SEARCH;
             param->bEnableTransformSkip = 1;
-            param->bEnableRecursionSkip = 0;
+            param->recursionSkipMode = 0;
             param->maxNumReferences = 5;
             param->limitReferences = 0;
             param->lookaheadSlices = 0; // disabled for best quality
@@ -598,7 +600,7 @@ int x265_param_default_preset(x265_param
             param->rc.hevcAq = 0;
             param->rc.qpStep = 1;
             param->rc.bEnableGrain = 1;
-            param->bEnableRecursionSkip = 0;
+            param->recursionSkipMode = 0;
             param->psyRd = 4.0;
             param->psyRdoq = 10.0;
             param->bEnableSAO = 0;
@@ -702,8 +704,9 @@ int x265_zone_param_parse(x265_param* p,
     OPT("ref") p->maxNumReferences = atoi(value);
     OPT("fast-intra") p->bEnableFastIntra = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
-    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
-    OPT("me")p->searchMethod = parseName(value, x265_motion_est_names, bError);
+    OPT("rskip") p->recursionSkipMode = atoi(value);
+    OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
+    OPT("me") p->searchMethod = parseName(value, x265_motion_est_names, bError);
     OPT("subme") p->subpelRefine = atoi(value);
     OPT("merange") p->searchRange = atoi(value);
     OPT("rect") p->bEnableRectInter = atobool(value);
@@ -919,7 +922,7 @@ int x265_param_parse(x265_param* p, cons
     OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value);
     OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value);
     OPT("early-skip") p->bEnableEarlySkip = atobool(value);
-    OPT("rskip") p->bEnableRecursionSkip = atobool(value);
+    OPT("rskip") p->recursionSkipMode = atoi(value);
     OPT("rdpenalty") p->rdPenalty = atoi(value);
     OPT("tskip") p->bEnableTransformSkip = atobool(value);
     OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
@@ -1221,6 +1224,7 @@ int x265_param_parse(x265_param* p, cons
             }
         }
         OPT("hist-threshold") p->edgeTransitionThreshold = atof(value);
+        OPT("rskip-edge-threshold") p->edgeVarThreshold = atoi(value)/100.0f;
         OPT("lookahead-threads") p->lookaheadThreads = atoi(value);
         OPT("opt-cu-delta-qp") p->bOptCUDeltaQP = atobool(value);
         OPT("multi-pass-opt-analysis") p->analysisMultiPassRefine = atobool(value);
@@ -1596,9 +1600,16 @@ int x265_check_params(x265_param* param)
     CHECK(param->rdLevel < 1 || param->rdLevel > 6,
           "RD Level is out of range");
     CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2,
-        "RDOQ Level is out of range");
+          "RDOQ Level is out of range");
     CHECK(param->dynamicRd < 0 || param->dynamicRd > x265_ADAPT_RD_STRENGTH,
-        "Dynamic RD strength must be between 0 and 4");
+          "Dynamic RD strength must be between 0 and 4");
+    CHECK(param->recursionSkipMode > 2 || param->recursionSkipMode < 0,
+          "Invalid Recursion skip mode. Valid modes 0,1,2");
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        CHECK(param->edgeVarThreshold < 0.0f || param->edgeVarThreshold > 1.0f,
+              "Minimum edge density percentage for a CU should be an integer between 0 to 100");
+    }
     CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead,
           "Lookahead depth must be greater than the max consecutive bframe count");
     CHECK(param->bframes < 0,
@@ -1789,6 +1800,7 @@ int x265_check_params(x265_param* param)
     }
     CHECK(param->confWinRightOffset < 0, "Conformance Window Right Offset must be 0 or greater");
     CHECK(param->confWinBottomOffset < 0, "Conformance Window Bottom Offset must be 0 or greater");
+    CHECK(param->decoderVbvMaxRate < 0, "Invalid Decoder Vbv Maxrate. Value can not be less than zero");
     return check_failed;
 }
 
@@ -1908,7 +1920,9 @@ void x265_print_params(x265_param* param
     TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf");
     TOOLOPT(param->bEnableRdRefine, "rd-refine");
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
-    TOOLOPT(param->bEnableRecursionSkip, "rskip");
+    TOOLVAL(param->recursionSkipMode, "rskip mode=%d");
+    if (param->recursionSkipMode == EDGE_BASED_RSKIP)
+        TOOLVAL(param->edgeVarThreshold, "rskip-edge-threshold=%.2f");
     TOOLOPT(param->bEnableSplitRdSkip, "splitrd-skip");
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
@@ -2066,7 +2080,10 @@ char *x265_param2string(x265_param* p, i
     s += sprintf(s, " rd=%d", p->rdLevel);
     s += sprintf(s, " selective-sao=%d", p->selectiveSAO);
     BOOL(p->bEnableEarlySkip, "early-skip");
-    BOOL(p->bEnableRecursionSkip, "rskip");
+    s += sprintf(s, " rskip=%d", p->recursionSkipMode); // numeric mode (0/1/2), not a boolean token
+    if (p->recursionSkipMode == EDGE_BASED_RSKIP)
+        s += sprintf(s, " rskip-edge-threshold=%f", p->edgeVarThreshold);
+
     BOOL(p->bEnableFastIntra, "fast-intra");
     BOOL(p->bEnableTSkipFast, "tskip-fast");
     BOOL(p->bCULossless, "cu-lossless");
@@ -2204,6 +2221,7 @@ char *x265_param2string(x265_param* p, i
     if (p->bEnableSceneCutAwareQp)
         s += sprintf(s, " scenecut-window=%d max-qp-delta=%d", p->scenecutWindow, p->maxQpDelta);
     s += sprintf(s, "conformance-window-offsets right=%d bottom=%d", p->confWinRightOffset, p->confWinBottomOffset);
+    s += sprintf(s, " decoder-max-rate=%d", p->decoderVbvMaxRate);
 #undef BOOL
     return buf;
 }
@@ -2373,7 +2391,8 @@ void x265_copy_params(x265_param* dst, x
     dst->bSaoNonDeblocked = src->bSaoNonDeblocked;
     dst->rdLevel = src->rdLevel;
     dst->bEnableEarlySkip = src->bEnableEarlySkip;
-    dst->bEnableRecursionSkip = src->bEnableRecursionSkip;
+    dst->recursionSkipMode = src->recursionSkipMode;
+    dst->edgeVarThreshold = src->edgeVarThreshold;
     dst->bEnableFastIntra = src->bEnableFastIntra;
     dst->bEnableTSkipFast = src->bEnableTSkipFast;
     dst->bCULossless = src->bCULossless;
@@ -2419,8 +2438,9 @@ void x265_copy_params(x265_param* dst, x
     dst->rc.zonefileCount = src->rc.zonefileCount;
     dst->reconfigWindowSize = src->reconfigWindowSize;
     dst->bResetZoneConfig = src->bResetZoneConfig;
+    dst->decoderVbvMaxRate = src->decoderVbvMaxRate;
 
-    if (src->rc.zonefileCount && src->rc.zones)
+    if (src->rc.zonefileCount && src->rc.zones && src->bResetZoneConfig)
     {
         for (int i = 0; i < src->rc.zonefileCount; i++)
         {
--- a/source/common/pixel.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/pixel.cpp	Wed May 06 14:59:56 2020 +0530
@@ -5,6 +5,7 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <min.chen@multicorewareinc.com>
+ *          Hongbin Liu<liuhongbin1@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -265,6 +266,10 @@ int satd4(const pixel* pix1, intptr_t st
 {
     int satd = 0;
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
+#endif
+
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 4)
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -279,6 +284,10 @@ int satd8(const pixel* pix1, intptr_t st
 {
     int satd = 0;
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
+#endif
+
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 8)
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -876,6 +885,18 @@ static void planecopy_sp_c(const uint16_
     }
 }
 
+/* Copy a width x height plane of pixels row by row, right-shifting every sample by 'shift' bits. */
+static void planecopy_pp_shr_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
+{
+    const pixel* srcRow = src;
+    pixel* dstRow = dst;
+    for (int y = 0; y < height; y++, srcRow += srcStride, dstRow += dstStride)
+    {
+        for (int x = 0; x < width; x++)
+            dstRow[x] = (pixel)(srcRow[x] >> shift);
+    }
+}
+
 static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
 {
     for (int r = 0; r < height; r++)
@@ -1316,6 +1337,7 @@ void setupPixelPrimitives_c(EncoderPrimi
     p.planecopy_cp = planecopy_cp_c;
     p.planecopy_sp = planecopy_sp_c;
     p.planecopy_sp_shl = planecopy_sp_shl_c;
+    p.planecopy_pp_shr = planecopy_pp_shr_c;
 #if HIGH_BIT_DEPTH
     p.planeClipAndMax = planeClipAndMax_c;
 #endif
--- a/source/common/primitives.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/primitives.h	Wed May 06 14:59:56 2020 +0530
@@ -8,6 +8,8 @@
  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
+ *          Hongbin Liu<liuhongbin1@huawei.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -204,6 +206,7 @@ typedef void (*saoCuStatsE3_t)(const int
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
+typedef void (*planecopy_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
@@ -358,6 +361,7 @@ struct EncoderPrimitives
     planecopy_cp_t        planecopy_cp;
     planecopy_sp_t        planecopy_sp;
     planecopy_sp_t        planecopy_sp_shl;
+    planecopy_pp_t        planecopy_pp_shr;
     planeClipAndMax_t     planeClipAndMax;
 
     weightp_sp_t          weight_sp;
@@ -465,6 +469,9 @@ void setupCPrimitives(EncoderPrimitives 
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
+#if X265_ARCH_ARM64
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
+#endif
 #if HAVE_ALTIVEC
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
@@ -479,4 +486,10 @@ extern const char* PFX(version_str);
 extern const char* PFX(build_info_str);
 #endif
 
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
+extern "C" {
+#include "aarch64/pixel-util.h"
+}
+#endif
+
 #endif // ifndef X265_PRIMITIVES_H
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.cpp	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,1110 @@
+/*****************************************************************************
+* Copyright (C) 2013-2020 MulticoreWare, Inc
+*
+* Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "scaler.h"
+
+#if _MSC_VER
+#pragma warning(disable: 4706) // assignment within conditional
+#pragma warning(disable: 4244) // '=' : possible loss of data
+#endif
+
+#define SHORT_MIN (-(1 << 15))
+#define SHORT_MAX ((1 << 15) - 1)
+#define SHORT_MAX_10 ((1 << 10) - 1)
+
+namespace X265_NS{
+
+/*
+ * Default-construct an unconfigured manager: the listed scalar members are
+ * set to 0 and every entry of m_slices and m_ScalerFilters is set to NULL.
+ */
+ScalerFilterManager::ScalerFilterManager()
+    : m_bitDepth(0)
+    , m_algorithmFlags(0)
+    , m_srcW(0), m_srcH(0)
+    , m_dstW(0), m_dstH(0)
+    , m_crSrcW(0), m_crSrcH(0)
+    , m_crDstW(0), m_crDstH(0)
+    , m_crSrcHSubSample(0), m_crSrcVSubSample(0)
+    , m_crDstHSubSample(0), m_crDstVSubSample(0)
+{
+    // both pointer tables start out empty
+    for (int sliceIdx = 0; sliceIdx < m_numSlice; ++sliceIdx)
+        m_slices[sliceIdx] = NULL;
+
+    for (int filterIdx = 0; filterIdx < m_numFilter; ++filterIdx)
+        m_ScalerFilters[filterIdx] = NULL;
+}
+
+inline static void filter_copy_c(int64_t* filter, int64_t* filter2, int size)
+{   // copy the first 'size' coefficients of filter into filter2, front to back
+    for (int left = size; left > 0; --left)
+        *filter2++ = *filter++;
+}
+
+#if X265_DEPTH == 8
+static void doScaling_c(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    // Horizontal FIR pass over 8-bit samples: dst[x] = clip16((sum of tap * sample) >> 7)
+    for (int x = 0; x < dstW; x++)
+    {
+        const int16_t *taps = filter + filterSize * x; // coefficients of output sample x
+        int acc = 0;
+        for (int t = 0; t < filterSize; t++)
+            acc += ((int)src[filterPos[x] + t]) * taps[t];
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 7); // the cubic equation does overflow, so clip
+    }
+}
+/* Clamp a to the 8-bit range [0, 255]; plain comparisons avoid negating or right-shifting negative ints. */
+static uint8_t clipUint8(int a)
+{
+    if (a < 0)
+        return 0;
+    return (uint8_t)(a > 0xFF ? 0xFF : a);
+}
+
+static void yuv2PlaneX_c(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    for (int x = 0; x < dstW; ++x)
+    {
+        int acc = 64 << 12; // 1 << 18: rounding bias for the >> 19 below
+        for (int row = 0; row < filterSize; ++row)
+            acc += src[row][x] * filter[row]; // weight column x of every source row
+        dest[x] = clipUint8(acc >> 19);
+    }
+}
+#else
+/* Vertical FIR pass compiled when X265_DEPTH != 8: each output is clamped and stored as two bytes, low byte first. */
+static void yuv2PlaneX_c_h(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    for (int x = 0; x < dstW; ++x)
+    {
+        int acc = 1 << 16; // rounding bias for the >> 17 below
+        for (int row = 0; row < filterSize; ++row)
+            acc += src[row][x] * filter[row];
+        const uint16_t sample = x265_clip3(0, SHORT_MAX_10, acc >> 17); // NOTE(review): 10-bit clamp for every non-8-bit build - confirm for 12-bit
+        dest[2 * x] = (uint8_t)sample;            // low byte
+        dest[2 * x + 1] = (uint8_t)(sample >> 8); // high byte
+    }
+}
+/* Horizontal FIR pass compiled when X265_DEPTH != 8: src is reinterpreted as 16-bit samples. */
+static void doScaling_c_h(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    const uint16_t *samples = (const uint16_t *)src;
+    for (int x = 0; x < dstW; x++)
+    {
+        const int16_t *taps = filter + filterSize * x; // coefficients of output sample x
+        int acc = 0;
+        for (int t = 0; t < filterSize; t++)
+            acc += ((int)samples[filterPos[x] + t]) * taps[t];
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 9); // the cubic equation does overflow, so clip
+    }
+}
+#endif
+
+/* Start with an empty filter: zero length, no coefficient or position tables, no slices attached. */
+ScalerFilter::ScalerFilter()
+    : m_filtLen(0)
+    , m_filtPos(NULL)
+    , m_filt(NULL)
+    , m_sourceSlice(NULL)
+    , m_destSlice(NULL)
+{}
+
+/* Release the position and coefficient tables; delete[] on a null pointer is a no-op, so no guards are needed. */
+ScalerFilter::~ScalerFilter()
+{
+    delete[] m_filtPos;
+    m_filtPos = NULL;
+
+    delete[] m_filt;
+    m_filt = NULL;
+}
+
+/* Horizontally scale sliceHor lines of plane 0, incrementing the destination plane's sliceHor once per line. */
+void ScalerHLumFilter::process(int sliceVer, int sliceHor)
+{
+    // rows are addressed relative to each plane's own sliceVer
+    uint8_t **inRows = m_sourceSlice->m_plane[0].lineBuf + (sliceVer - m_sourceSlice->m_plane[0].sliceVer);
+    uint8_t **outRows = m_destSlice->m_plane[0].lineBuf + (sliceVer - m_destSlice->m_plane[0].sliceVer);
+    const int outWidth = m_destSlice->m_width;
+    for (int line = 0; line < sliceHor; ++line)
+    {
+        m_hFilterScaler->doScaling((int16_t*)outRows[line], outWidth, (const uint8_t *)inRows[line], m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[0].sliceHor++;
+    }
+}
+
+/* Horizontally scale sliceHor lines of planes 1 and 2.
+ * Each destination plane's sliceHor counter is incremented once per processed line.
+ */
+void ScalerHCrFilter::process(int sliceVer, int sliceHor)
+{
+    // rows are addressed relative to each plane's own sliceVer
+    uint8_t **inRows1 = m_sourceSlice->m_plane[1].lineBuf + (sliceVer - m_sourceSlice->m_plane[1].sliceVer);
+    uint8_t **outRows1 = m_destSlice->m_plane[1].lineBuf + (sliceVer - m_destSlice->m_plane[1].sliceVer);
+    uint8_t **inRows2 = m_sourceSlice->m_plane[2].lineBuf + (sliceVer - m_sourceSlice->m_plane[2].sliceVer);
+    uint8_t **outRows2 = m_destSlice->m_plane[2].lineBuf + (sliceVer - m_destSlice->m_plane[2].sliceVer);
+
+    // output width is the slice width reduced by the horizontal sub-sampling shift
+    const int outWidth = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+
+    for (int line = 0; line < sliceHor; ++line)
+    {
+        m_hFilterScaler->doScaling((int16_t*)outRows1[line], outWidth, inRows1[line], m_filt, m_filtPos, m_filtLen);
+        m_hFilterScaler->doScaling((int16_t*)outRows2[line], outWidth, inRows2[line], m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[1].sliceHor++;
+        m_destSlice->m_plane[2].sliceHor++;
+    }
+}
+
+/*
+ * Vertical filter entry point: forwards to the C kernel chosen at compile time,
+ * yuv2PlaneX_c when X265_DEPTH == 8 and yuv2PlaneX_c_h otherwise.  Only one
+ * kernel exists per build, so nothing is selected from dstW or filterSize and
+ * all arguments are passed through unchanged.
+ */
+void VFilterScaler8Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+/*
+ * Vertical filter entry point: forwards to the C kernel chosen at compile time,
+ * yuv2PlaneX_c when X265_DEPTH == 8 and yuv2PlaneX_c_h otherwise.  Only one
+ * kernel exists per build, so nothing is selected from dstW or filterSize and
+ * all arguments are passed through unchanged.
+ */
+void VFilterScaler10Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+/* Vertically filter one output line (sliceVer) of plane 0; sliceHor is unused. */
+void ScalerVLumFilter::process(int sliceVer, int sliceHor)
+{
+    (void)sliceHor;
+    // first source line of the filter window, never lower than 1 - m_filtLen
+    const int firstSrcLine = X265_MAX(1 - m_filtLen, m_filtPos[sliceVer]);
+    uint8_t **inRows = m_sourceSlice->m_plane[0].lineBuf + (firstSrcLine - m_sourceSlice->m_plane[0].sliceVer);
+    uint8_t **outRow = m_destSlice->m_plane[0].lineBuf + (sliceVer - m_destSlice->m_plane[0].sliceVer);
+    int16_t *taps = m_filt + (sliceVer * m_filtLen); // coefficient row of this output line
+
+    m_vFilterScaler->yuv2PlaneX(taps, m_filtLen, (const int16_t**)inRows, outRow[0], m_destSlice->m_width);
+}
+
+/*
+ * Vertically filter one output line of planes 1 and 2.  Calls whose sliceVer has
+ * any of the low m_vCrSubSample bits set return immediately; sliceHor is unused.
+ */
+void ScalerVCrFilter::process(int sliceVer, int sliceHor)
+{
+    (void)sliceHor;
+
+    const int vShift = m_destSlice->m_vCrSubSample;
+    if (sliceVer & ((1 << vShift) - 1))
+        return;
+
+    const int outWidth = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+    const int crLine = sliceVer >> vShift;
+    const int firstSrcLine = X265_MAX(1 - m_filtLen, m_filtPos[crLine]);
+    int16_t *taps = m_filt + (crLine * m_filtLen);
+
+    // rows are addressed relative to each plane's own sliceVer
+    uint8_t **inRows1 = m_sourceSlice->m_plane[1].lineBuf + (firstSrcLine - m_sourceSlice->m_plane[1].sliceVer);
+    uint8_t **inRows2 = m_sourceSlice->m_plane[2].lineBuf + (firstSrcLine - m_sourceSlice->m_plane[2].sliceVer);
+    uint8_t **outRow1 = m_destSlice->m_plane[1].lineBuf + (crLine - m_destSlice->m_plane[1].sliceVer);
+    uint8_t **outRow2 = m_destSlice->m_plane[2].lineBuf + (crLine - m_destSlice->m_plane[2].sliceVer);
+
+    m_vFilterScaler->yuv2PlaneX((int16_t*)taps, m_filtLen, (const int16_t**)inRows1, outRow1[0], outWidth);
+    m_vFilterScaler->yuv2PlaneX((int16_t*)taps, m_filtLen, (const int16_t**)inRows2, outRow2[0], outWidth);
+}
+
+/*
+ * Build the coefficient table (m_filt), the per-output source start positions (m_filtPos)
+ * and the tap count (m_filtLen) used to resample srcW samples to dstW samples.
+ *   flag        kernel selector, also used as size factor: 4 = bicubic, 1 = bilinear
+ *   inc         source step per output sample; values <= 1 << 16 are treated as upscaling
+ *   filtAlign   tap count is rounded up to a multiple of this (the mask math assumes a power of two)
+ *   one         value each output sample's coefficients are normalised to sum to
+ *   sourcePos, destPos   offsets folded into the initial position xDstInSrc; always returns 0
+ */
+int ScalerFilter::initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos)
+{
+    int filterSize;
+    int filter2Size;
+    int minFilterSize;
+    int64_t *filter = NULL;
+    int64_t *filter2 = NULL;
+    const int64_t fone = 1LL << (54 - x265_min((int)X265_LOG2(x265_max(srcW / dstW, 1)), 8)); // ratio clamped to >= 1: it is 0 when upscaling
+    int *outFilterSize = &m_filtLen;
+    int64_t xDstInSrc;
+    int sizeFactor = flag;
+
+    // Init filter pos, the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end
+    m_filtPos = new int32_t[dstW + 3];
+    int32_t **filterPos = &m_filtPos;
+
+    if (inc <= 1 << 16)
+        filterSize = 1 + sizeFactor; // upscale
+    else
+        filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
+
+    filterSize = x265_min(filterSize, srcW - 2);
+    filterSize = x265_max(filterSize, 1);
+    filter = new int64_t[dstW * filterSize]; // element count: new[] already scales by sizeof(int64_t)
+
+    xDstInSrc = ((destPos*(int64_t)inc) >> 7) - ((sourcePos * 0x10000LL) >> 7);
+    for (int i = 0; i < dstW; i++)
+    {
+        int xx = (xDstInSrc - (filterSize - 2) * (1LL << 16)) / (1 << 17);
+        (*filterPos)[i] = xx;
+        for (int j = 0; j < filterSize; j++)
+        {
+            int64_t d = (X265_ABS(((int64_t)xx * (1 << 17)) - xDstInSrc)) << 13;
+            int64_t coeff = 0;
+
+            if (inc > 1 << 16)
+                d = d * dstW / srcW;
+
+            if (flag == 4) // BiCUBIC
+            {
+                int64_t B = (0) * (1 << 24);
+                int64_t C = (0.6) * (1 << 24);
+
+                if (d >= 1LL << 31)
+                    coeff = 0;
+                else
+                {
+                    int64_t dd = (d  * d) >> 30;
+                    int64_t ddd = (dd * d) >> 30;
+
+                    if (d < 1LL << 30)
+                        coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd + (-18 * (1 << 24) + 12 * B + 6 * C) * dd + (6 * (1 << 24) - 2 * B) * (1 << 30);
+                    else
+                        coeff = (-B - 6 * C) * ddd + (6 * B + 30 * C) * dd + (-12 * B - 48 * C) * d + (8 * B + 24 * C) * (1 << 30);
+                }
+                coeff /= (1LL << 54) / fone;
+            }
+            else if (flag == 1) // BILINEAR
+            {
+                coeff = (1 << 30) - d;
+                if (coeff < 0)
+                    coeff = 0;
+                coeff *= fone >> 30;
+            }
+            else
+                assert(0);
+
+            filter[i * filterSize + j] = coeff;
+            xx++;
+        }
+        xDstInSrc += 2 * inc;
+    }
+
+    //apply src & dst Filter to filter -> filter2
+    X265_CHECK(filterSize > 0, "invalid filterSize value.\n");
+    filter2Size = filterSize;
+    filter2 = new int64_t[dstW * filter2Size];
+
+    int size = dstW * filterSize;
+    filter_copy_c(filter, filter2, size);
+
+    delete[](filter);
+
+    // try to reduce the filter-size (step1 find size and shift left)
+    // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
+    minFilterSize = 0;
+    for (int i = dstW - 1; i >= 0; i--)
+    {
+        int min = filter2Size;
+        int64_t cutOff = 0;
+
+        // get rid of near zero elements on the left by shifting left
+        for (int j = 0; j < filter2Size; j++)
+        {
+            int k;
+            cutOff += X265_ABS(filter2[i * filter2Size]);
+
+            if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone)
+                break;
+            // preserve monotonicity because the core can't handle the filter otherwise
+            if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
+                break;
+
+            // move filter coefficients left
+            for (k = 1; k < filter2Size; k++)
+                filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
+            filter2[i * filter2Size + k - 1] = 0;
+            (*filterPos)[i]++;
+        }
+
+        cutOff = 0;
+        // count near zeros on the right
+        for (int j = filter2Size - 1; j > 0; j--)
+        {
+            cutOff += X265_ABS(filter2[i * filter2Size + j]);
+
+            if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone)
+                break;
+            min--;
+        }
+
+        if (min > minFilterSize)
+            minFilterSize = min;
+    }
+
+    X265_CHECK(minFilterSize > 0, "invalid minFilterSize value.\n");
+    filterSize = (minFilterSize + (filtAlign - 1)) & (~(filtAlign - 1));
+    X265_CHECK(filterSize > 0, "invalid filterSize value.\n");
+    filter = new int64_t[dstW * filterSize];
+
+    *outFilterSize = filterSize;
+
+    // try to reduce the filter-size (step2 reduce it)
+    for (int i = 0; i < dstW; i++)
+    {
+        for (int j = 0; j < filterSize; j++)
+        {
+            if (j >= filter2Size)
+                filter[i * filterSize + j] = 0;
+            else
+                filter[i * filterSize + j] = filter2[i * filter2Size + j];
+            if ((flag & SCALER_BITEXACT) && j >= minFilterSize)
+                filter[i * filterSize + j] = 0;
+        }
+    }
+
+    // fix borders
+    for (int i = 0; i < dstW; i++)
+    {
+        int j;
+        if ((*filterPos)[i] < 0)
+        {
+            // move filter coefficients left to compensate for filterPos
+            for (j = 1; j < filterSize; j++)
+            {
+                int left = x265_max(j + (*filterPos)[i], 0);
+                filter[i * filterSize + left] += filter[i * filterSize + j];
+                filter[i * filterSize + j] = 0;
+            }
+            (*filterPos)[i] = 0;
+        }
+
+        if ((*filterPos)[i] + filterSize > srcW)
+        {
+            int shift = (*filterPos)[i] + x265_min(filterSize - srcW, 0);
+            int64_t acc = 0;
+
+            for (j = filterSize - 1; j >= 0; j--)
+            {
+                if ((*filterPos)[i] + j >= srcW)
+                {
+                    acc += filter[i * filterSize + j];
+                    filter[i * filterSize + j] = 0;
+                }
+            }
+            for (j = filterSize - 1; j >= 0; j--)
+            {
+                if (j < shift)
+                    filter[i * filterSize + j] = 0;
+                else
+                    filter[i * filterSize + j] = filter[i * filterSize + j - shift];
+            }
+
+            (*filterPos)[i] -= shift;
+            filter[i * filterSize + srcW - 1 - (*filterPos)[i]] += acc;
+        }
+
+        X265_CHECK((*filterPos)[i] >= 0, "invalid: Value of (*filterPos)[%d] < 0.\n", i);
+        X265_CHECK((*filterPos)[i] < srcW, "invalid: Value of (*filterPos)[%d] > %d .\n", i, srcW);
+        if ((*filterPos)[i] + filterSize > srcW)
+        {
+            for (j = 0; j < filterSize; j++)
+            {
+                X265_CHECK(!filter[i * filterSize + j], "invalid: Value of filter[%d * filterSize + %d] != 0.\n", i, j);
+                X265_CHECK((*filterPos)[i] + j < srcW, "invalid: (*filterPos)[%d] + %d > %d .\n", i, j, srcW);
+            }
+        }
+    }
+
+    // init filter
+    m_filt = new int16_t[(dstW + 3)*(*outFilterSize)];
+    int16_t **outFilter = &m_filt;
+
+    // normalize & store in outFilter
+    for (int i = 0; i < dstW; i++)
+    {
+        int64_t error = 0;
+        int64_t sum = 0;
+
+        for (int j = 0; j < filterSize; j++)
+            sum += filter[i * filterSize + j];
+        sum = (sum + one / 2) / one;
+        if (!sum)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "Scaler: zero vector in scaling\n");
+            sum = 1;
+        }
+        for (int j = 0; j < *outFilterSize; j++)
+        {
+            int64_t v = filter[i * filterSize + j] + error;
+            int intV = ROUNDED_DIVISION(v, sum);
+            (*outFilter)[i * (*outFilterSize) + j] = intV;
+            error = v - intV * sum;
+        }
+    }
+
+    (*filterPos)[dstW + 0] =
+        (*filterPos)[dstW + 1] =
+        (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1];
+    for (int i = 0; i < *outFilterSize; i++)
+    {
+        int k = (dstW - 1) * (*outFilterSize) + i;
+        (*outFilter)[k + 1 * (*outFilterSize)] =
+            (*outFilter)[k + 2 * (*outFilterSize)] =
+            (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
+    }
+
+    delete[](filter);
+    delete[](filter2);
+    return 0;
+}
+
+/* Set up the scaler for one source/destination pair: record the plane sizes,
+ * build the horizontal and vertical luma/chroma filters and allocate the slices
+ * that connect them. algorithmFlags is forwarded to every initCoeff() call and
+ * the bit depth is taken from dstVideoDesc. Returns 0 on success, -1 on failure. */
+int ScalerFilterManager::init(int algorithmFlags, VideoDesc *srcVideoDesc, VideoDesc *dstVideoDesc)
+{
+    const int srcW = m_srcW = srcVideoDesc->m_width;
+    const int srcH = m_srcH = srcVideoDesc->m_height;
+    const int dstW = m_dstW = dstVideoDesc->m_width;
+    const int dstH = m_dstH = dstVideoDesc->m_height;
+
+    m_bitDepth = dstVideoDesc->m_inputDepth;
+    m_algorithmFlags = algorithmFlags;
+
+    // -513 is replaced by a default position inside getLocalPos()
+    int srcHCrPos = -513;
+    int dstHCrPos = -513;
+    int srcVCrPos = -513;
+    int dstVCrPos = -513;
+
+    int srcCsp = srcVideoDesc->m_csp;
+    if (x265_cli_csps[srcCsp].planes > 1)
+    {
+        m_crSrcHSubSample = x265_cli_csps[srcCsp].width[1];
+        m_crSrcVSubSample = x265_cli_csps[srcCsp].height[1];
+        m_crSrcW = srcVideoDesc->m_width >> m_crSrcHSubSample;
+        m_crSrcH = srcVideoDesc->m_height >> m_crSrcVSubSample;
+        if (srcCsp == 1)// i420
+            srcVCrPos = 128;
+    }
+    else
+    {
+        m_crSrcW = 0;
+        m_crSrcH = 0;
+        m_crSrcHSubSample = 0;
+        m_crSrcVSubSample = 0;
+    }
+    int dstCsp = dstVideoDesc->m_csp;
+    if (x265_cli_csps[dstCsp].planes > 1)
+    {
+        m_crDstHSubSample = x265_cli_csps[dstCsp].width[1];
+        m_crDstVSubSample = x265_cli_csps[dstCsp].height[1];
+        m_crDstW = dstVideoDesc->m_width >> m_crDstHSubSample;
+        m_crDstH = dstVideoDesc->m_height >> m_crDstVSubSample;
+        if (dstCsp == 1)// i420
+            dstVCrPos = 128;
+    }
+    else
+    {
+        m_crDstW = 0;
+        m_crDstH = 0;
+        m_crDstHSubSample = 0;
+        m_crDstVSubSample = 0;
+    }
+    // Only srcCsp == dstCsp is supported at present
+    if (srcCsp != dstCsp)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "wrong, source csp != destination csp \n");
+        return -1; // "false" is 0, which is also this function's success value
+    }
+
+    // Every increment below divides by a destination size; single-plane colour spaces leave the chroma sizes at 0.
+    if (dstW <= 0 || dstH <= 0 || m_crDstW <= 0 || m_crDstH <= 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "scaler: destination luma/chroma size must be positive\n");
+        return -1;
+    }
+
+    // source/destination ratio scaled by 1 << 16, rounded to nearest
+    const int lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
+    const int lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
+    const int crXInc = (((int64_t)m_crSrcW << 16) + (m_crDstW >> 1)) / m_crDstW;
+    const int crYInc = (((int64_t)m_crSrcH << 16) + (m_crDstH >> 1)) / m_crDstH;
+
+    const int filterAlign = 1;
+
+    // init horizontal Luma Scaler filter
+    m_ScalerFilters[0] = new ScalerHLumFilter(m_bitDepth);
+    m_ScalerFilters[0]->initCoeff(m_algorithmFlags, lumXInc, srcW, dstW, filterAlign, 1 << 14, getLocalPos(0, 0), getLocalPos(0, 0));
+
+    // init horizontal cr Scaler filter
+    m_ScalerFilters[1] = new ScalerHCrFilter(m_bitDepth);
+    m_ScalerFilters[1]->initCoeff(m_algorithmFlags, crXInc, m_crSrcW, m_crDstW, filterAlign, 1 << 14,
+        getLocalPos(m_crSrcHSubSample, srcHCrPos), getLocalPos(m_crDstHSubSample, dstHCrPos));
+
+    // init vertical Luma scaler filter
+    m_ScalerFilters[2] = new ScalerVLumFilter(m_bitDepth);
+    m_ScalerFilters[2]->initCoeff(m_algorithmFlags, lumYInc, srcH, dstH, filterAlign, 1 << 12, getLocalPos(0, 0), getLocalPos(0, 0));
+
+    // init vertical cr scaler filter
+    m_ScalerFilters[3] = new ScalerVCrFilter(m_bitDepth);
+    m_ScalerFilters[3]->initCoeff(m_algorithmFlags, crYInc, m_crSrcH, m_crDstH, filterAlign, 1 << 12,
+        getLocalPos(m_crSrcVSubSample, srcVCrPos), getLocalPos(m_crDstVSubSample, dstVCrPos));
+
+    // init slice, must after filter initialization; a failed allocation is reported to the caller
+    if (initScalerSlice() < 0)
+        return -1;
+
+    // set slice: horizontal filters read slice 0 and write slice 1, vertical filters read slice 1 and write slice 2
+    m_ScalerFilters[0]->setSlice(m_slices[0], m_slices[1]);
+    m_ScalerFilters[1]->setSlice(m_slices[0], m_slices[1]);
+    m_ScalerFilters[2]->setSlice(m_slices[1], m_slices[2]);
+    m_ScalerFilters[3]->setSlice(m_slices[1], m_slices[2]);
+
+    return 0;
+}
+
+/* Horizontal scaling entry point of the 8-bit helper. All arguments are passed
+ * straight through to the C kernel chosen at compile time by X265_DEPTH:
+ * doScaling_c for 8-bit builds, doScaling_c_h otherwise.
+ * No filter-size or width classification is done here, because the result of
+ * such a classification is not consumed by either kernel call. */
+void HFilterScaler8Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+#if X265_DEPTH == 8
+    doScaling_c(dst, dstW, src, filter, filterPos, filterSize);
+#else
+    doScaling_c_h(dst, dstW, src, filter, filterPos, filterSize);
+#endif
+}
+
+/* Horizontal scaling entry point of the 10-bit helper. All arguments are passed
+ * straight through to the C kernel chosen at compile time by X265_DEPTH:
+ * doScaling_c for 8-bit builds, doScaling_c_h otherwise.
+ * No filter-size or width classification is done here, because the result of
+ * such a classification is not consumed by either kernel call. */
+void HFilterScaler10Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+#if X265_DEPTH == 8
+    doScaling_c(dst, dstW, src, filter, filterPos, filterSize);
+#else
+    doScaling_c_h(dst, dstW, src, filter, filterPos, filterSize);
+#endif
+}
+
+int ScalerFilterManager::scale_pic(void ** src, void ** dst, int * srcStride, int * dstStride) // scales one picture from src into dst; returns -1 when src or dst is NULL, otherwise 0
+{
+    uint8_t** src_8bit, **dst_8bit;
+    src_8bit = (uint8_t**)src;
+    dst_8bit = (uint8_t**)dst;
+    if (!src_8bit || !dst_8bit)
+        return -1;
+
+    const int srcsliceHor = m_srcH; // the whole source picture is handed over as a single slice
+    const int dstW = m_dstW;
+    const int dstH = m_dstH;
+    int32_t *vLumFilterPos = m_ScalerFilters[2]->m_filtPos;
+    int32_t *vCrFilterPos = m_ScalerFilters[3]->m_filtPos;
+    const int vLumFilterSize = m_ScalerFilters[2]->m_filtLen;
+    const int vCrFilterSize = m_ScalerFilters[3]->m_filtLen;
+    const int crSrcsliceHor = UH_CEIL_SHIFTR(srcsliceHor, m_crSrcVSubSample); // chroma line count of the source, rounded up
+
+    // vars which will change and which we need to store back in the context
+    int lumBufIndex = -1; // updated below but never read inside this function
+    int crBufIndex = -1;  // updated below but never read inside this function
+    int lastInLumBuf = -1; // last source luma line accounted for in the intermediate slice
+    int lastInCrBuf = -1;  // last source chroma line accounted for in the intermediate slice
+
+    int hasLumHoles = 1;
+    int hasCrHoles = 1;
+
+    ScalerSlice *src_slice = m_slices[0];  // wraps the caller's source planes
+    ScalerSlice *hout_slice = m_slices[1]; // output of the horizontal filters, input of the vertical filters
+    ScalerSlice *vout_slice = m_slices[2]; // wraps the caller's destination planes
+    src_slice->initFromSrc((uint8_t**)src, srcStride, m_srcW, 0, srcsliceHor, 0, crSrcsliceHor, 1);
+    vout_slice->initFromSrc((uint8_t**)dst, dstStride, m_dstW, 0, dstH, 0, UH_CEIL_SHIFTR(dstH, m_crDstVSubSample), 0);
+
+    hout_slice->m_plane[0].sliceVer = 0; // restart the intermediate slice: first line 0, no lines held
+    hout_slice->m_plane[1].sliceVer = 0;
+    hout_slice->m_plane[2].sliceVer = 0;
+    hout_slice->m_plane[3].sliceVer = 0;
+    hout_slice->m_plane[0].sliceHor = 0;
+    hout_slice->m_plane[1].sliceHor = 0;
+    hout_slice->m_plane[2].sliceHor = 0;
+    hout_slice->m_plane[3].sliceHor = 0;
+    hout_slice->m_width = dstW;
+
+    for (int dstY = 0; dstY < dstH; dstY++) // one destination luma line per iteration
+    {
+        const int crDstY = dstY >> m_crDstVSubSample;
+        const int firstLumSrcY = x265_max(1 - vLumFilterSize, vLumFilterPos[dstY]); // first source line read by the vertical luma filter
+        const int firstLumSrcY2 = x265_max(1 - vLumFilterSize, vLumFilterPos[x265_min(dstY | ((1 << m_crDstVSubSample) - 1), dstH - 1)]); // same, for the last luma line sharing this chroma line
+        const int firstCrSrcY = x265_max(1 - vCrFilterSize, vCrFilterPos[crDstY]);
+
+        int lastLumSrcY = x265_min(m_srcH, firstLumSrcY + vLumFilterSize) - 1; // last source line needed, clamped to the picture
+        int lastLumSrcY2 = x265_min(m_srcH, firstLumSrcY2 + vLumFilterSize) - 1;
+        int lastCrSrcY = x265_min(m_crSrcH, firstCrSrcY + vCrFilterSize) - 1;
+
+        // handle holes
+        if (firstLumSrcY > lastInLumBuf)
+        {
+            hasLumHoles = lastInLumBuf != firstLumSrcY - 1; // a hole exists when the next needed line does not directly follow the last buffered one
+            if (hasLumHoles)
+            {
+                hout_slice->m_plane[0].sliceVer = firstLumSrcY;
+                hout_slice->m_plane[3].sliceVer = firstLumSrcY;
+                hout_slice->m_plane[0].sliceHor =
+                    hout_slice->m_plane[3].sliceHor = 0;
+            }
+
+            lastInLumBuf = firstLumSrcY - 1;
+        }
+        if (firstCrSrcY > lastInCrBuf)
+        {
+            hasCrHoles = lastInCrBuf != firstCrSrcY - 1;
+            if (hasCrHoles)
+            {
+                hout_slice->m_plane[1].sliceVer = firstCrSrcY;
+                hout_slice->m_plane[2].sliceVer = firstCrSrcY;
+                hout_slice->m_plane[1].sliceHor =
+                    hout_slice->m_plane[2].sliceHor = 0;
+            }
+
+            lastInCrBuf = firstCrSrcY - 1;
+        }
+
+        // Do we have enough lines in this slice to output the dstY line
+        int enoughLines = lastLumSrcY2 < 0 + srcsliceHor && lastCrSrcY < UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample);
+        if (!enoughLines)
+        {
+            lastLumSrcY = 0 + srcsliceHor - 1;
+            lastCrSrcY = 0 + crSrcsliceHor - 1;
+            x265_log(NULL, X265_LOG_INFO, "buffering slice: lastLumSrcY %d lastCrSrcY %d\n", lastLumSrcY, lastCrSrcY);
+        }
+
+        X265_CHECK(((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->m_plane[0].availLines), "invalid value %d", lastLumSrcY - firstLumSrcY + 1);
+        X265_CHECK((lastCrSrcY - firstCrSrcY + 1) <= hout_slice->m_plane[1].availLines, "invalid value %d", lastCrSrcY - firstCrSrcY + 1);
+
+        int firstPosY, lastPosY, firstCPosY, lastCPosY;
+        int posY = hout_slice->m_plane[0].sliceVer + hout_slice->m_plane[0].sliceHor; // first luma line not yet held by the intermediate slice
+        if (posY <= lastLumSrcY && !hasLumHoles)
+        {
+            firstPosY = x265_max(firstLumSrcY, posY);
+            lastPosY = x265_min(firstLumSrcY + hout_slice->m_plane[0].availLines - 1, 0 + srcsliceHor - 1);
+        }
+        else
+        {
+            firstPosY = posY;
+            lastPosY = lastLumSrcY;
+        }
+
+        int cPosY = hout_slice->m_plane[1].sliceVer + hout_slice->m_plane[1].sliceHor; // first chroma line not yet held by the intermediate slice
+        if (cPosY <= lastCrSrcY && !hasCrHoles)
+        {
+            firstCPosY = x265_max(firstCrSrcY, cPosY);
+            lastCPosY = x265_min(firstCrSrcY + hout_slice->m_plane[1].availLines - 1, UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample) - 1);
+        }
+        else
+        {
+            firstCPosY = cPosY;
+            lastCPosY = lastCrSrcY;
+        }
+
+        hout_slice->rotate(lastPosY, lastCPosY);
+        // horizontal luma scale
+        if (posY < lastLumSrcY + 1)
+            m_ScalerFilters[0]->process(firstPosY, lastPosY - firstPosY + 1);
+
+        lumBufIndex += lastLumSrcY - lastInLumBuf;
+        lastInLumBuf = lastLumSrcY;
+        // horizontal chroma Scale
+        if (cPosY < lastCrSrcY + 1)
+            m_ScalerFilters[1]->process(firstCPosY, lastCPosY - firstCPosY + 1);
+
+        crBufIndex += lastCrSrcY - lastInCrBuf;
+        lastInCrBuf = lastCrSrcY;
+
+        // wrap buf index around to stay inside the ring buffer
+        if (lumBufIndex >= vLumFilterSize)
+            lumBufIndex -= vLumFilterSize;
+        if (crBufIndex >= vCrFilterSize)
+            crBufIndex -= vCrFilterSize;
+        if (!enoughLines)
+            break;  // we can't output a dstY line so let's try with the next slice
+
+        // vertical scale(output converter)
+        for (int i = 2; i < m_numFilter; ++i) // filters 2 and 3 are the vertical luma and chroma filters
+            m_ScalerFilters[i]->process(dstY, 1);
+    }
+    return 0; // NOTE(review): also returned when the loop stopped early at the break above
+}
+
+/* Compute how many luma and chroma lines the intermediate slice must be able to
+ * hold. Both results start at the vertical filter length and grow to the largest
+ * span, over all destination lines, between a filter's first source line and the
+ * last line needed by either filter (aligned down to a whole chroma line). */
+void ScalerFilterManager::getMinBufferSize(int *out_lum_size, int *out_cr_size)
+{
+    const int32_t *lumPos = m_ScalerFilters[2]->m_filtPos;
+    const int32_t *crPos = m_ScalerFilters[3]->m_filtPos;
+    const int lumTaps = m_ScalerFilters[2]->m_filtLen;
+    const int crTaps = m_ScalerFilters[3]->m_filtLen;
+    const int shift = m_crSrcVSubSample;
+
+    int &lumNeed = *out_lum_size;
+    int &crNeed = *out_cr_size;
+    lumNeed = lumTaps;
+    crNeed = crTaps;
+
+    for (int y = 0; y < m_dstH; ++y)
+    {
+        // chroma destination line that corresponds to luma destination line y
+        const int cy = (int64_t)y * m_crDstH / m_dstH;
+        const int lumLast = lumPos[y] + lumTaps - 1;
+        const int crLastInLum = (crPos[cy] + crTaps - 1) << shift;
+
+        // last line needed by either filter, expressed in chroma lines and then
+        // back in luma lines so that it sits on a chroma line boundary
+        const int lastCr = x265_max(lumLast, crLastInLum) >> shift;
+        const int lastLum = lastCr << shift;
+
+        lumNeed = x265_max(lumNeed, lastLum - lumPos[y]);
+        crNeed = x265_max(crNeed, lastCr - crPos[cy]);
+    }
+}
+
+/* Allocate the three slices used by scale_pic(): slice 0 for the source
+ * picture, slice 1 as the ring buffer between the horizontal and vertical
+ * filters, slice 2 for the destination picture.
+ * Returns 0 on success and -1 as soon as one allocation step fails. */
+int ScalerFilterManager::initScalerSlice()
+{
+    const bool is16bit = (m_bitDepth == 16);
+
+    // bytes per intermediate line: 16-bit samples plus padding, aligned to 16
+    int lineBytes = SCALER_ALIGN(m_dstW * sizeof(int16_t) + 66, 16);
+    if (is16bit)
+        lineBytes <<= 1;
+
+    // the ring buffer holds at least (vertical filter length + MAX_NUM_LINES_AHEAD) lines
+    int lumLines;
+    int crLines;
+    getMinBufferSize(&lumLines, &crLines);
+    lumLines = X265_MAX(lumLines, m_ScalerFilters[2]->m_filtLen + MAX_NUM_LINES_AHEAD);
+    crLines = X265_MAX(crLines, m_ScalerFilters[3]->m_filtLen + MAX_NUM_LINES_AHEAD);
+
+    for (int s = 0; s < m_numSlice; s++)
+        m_slices[s] = new ScalerSlice;
+
+    // source picture
+    if (m_slices[0]->create(m_srcH, m_crSrcH, m_crSrcHSubSample, m_crSrcVSubSample, 0) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "alloc_slice m_slice[0] failed\n");
+        return -1;
+    }
+
+    // horizontal scaler output
+    ScalerSlice *ring = m_slices[1];
+    if (ring->create(lumLines, crLines, m_crDstHSubSample, m_crDstVSubSample, 1) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[1].create failed\n");
+        return -1;
+    }
+    if (ring->createLines(lineBytes, m_dstW) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[1].createLines failed\n");
+        return -1;
+    }
+    ring->fillOnes(lineBytes >> 1, is16bit);
+
+    // vertical scaler output
+    if (m_slices[2]->create(m_dstH, m_crDstH, m_crDstHSubSample, m_crDstVSubSample, 0) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[2].create failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Convert a sample position into the value handed to initCoeff(). The values -1
+ * and anything <= -513 mean "not specified" and are replaced by
+ * (128 << crSubSample) - 128; the result is (position + 128) >> crSubSample. */
+int ScalerFilterManager::getLocalPos(int crSubSample, int pos)
+{
+    const bool unspecified = (pos == -1) || (pos <= -513);
+    const int base = unspecified ? (128 << crSubSample) - 128 : pos;
+    // + 128 makes the position relative to the ideal left edge
+    return (base + 128) >> crSubSample;
+}
+
+/* Build an empty slice: zero width and subsampling, not a ring, no owned lines,
+ * and four planes without a line table. */
+ScalerSlice::ScalerSlice()
+    : m_width(0)
+    , m_hCrSubSample(0)
+    , m_vCrSubSample(0)
+    , m_isRing(0)
+    , m_destroyLines(0)
+{
+    // availLines, sliceVer, sliceHor, lineBuf
+    const ScalerPlane emptyPlane = { 0, 0, 0, NULL };
+    for (int p = 0; p < m_numSlicePlane; ++p)
+        m_plane[p] = emptyPlane;
+}
+
+/* Release everything the slice owns: the lines allocated by createLines() (when
+ * m_destroyLines is set) and the four line-pointer tables. Safe to call more
+ * than once; the destructor also calls it. */
+void ScalerSlice::destroy()
+{
+    if (m_destroyLines)
+        destroyLines();
+    for (int i = 0; i < m_numSlicePlane; i++)
+    {
+        if (m_plane[i].lineBuf)
+            X265_FREE(m_plane[i].lineBuf);
+        // forget the freed table so that a second destroy() cannot free it again
+        m_plane[i].lineBuf = NULL;
+    }
+}
+
+/* Allocate the line-pointer table of each of the four planes.
+ * lumLines sizes planes 0 and 3, crLines sizes planes 1 and 2; a ring slice
+ * gets three times as many table entries. The subsampling values and the ring
+ * flag are stored as given. Returns 0 on success, -1 when an allocation fails. */
+int ScalerSlice::create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring)
+{
+    const int size[4] = { lumLines, crLines, crLines, lumLines };
+
+    m_hCrSubSample = h_sub_sample;
+    m_vCrSubSample = v_sub_sample;
+    m_isRing = ring;
+    m_destroyLines = 0;
+
+    for (int i = 0; i < m_numSlicePlane; ++i)
+    {
+        const int n = size[i] * (ring == 0 ? 1 : 3);
+        m_plane[i].lineBuf = X265_MALLOC(uint8_t*, n);
+        if (!m_plane[i].lineBuf)
+            return -1;
+
+        // Start with an all-NULL table: not every entry is assigned later, and
+        // destroyLines() frees entries without knowing which ones were set.
+        memset(m_plane[i].lineBuf, 0, sizeof(uint8_t*) * n);
+
+        m_plane[i].availLines = size[i];
+        m_plane[i].sliceVer = 0;
+        m_plane[i].sliceHor = 0;
+    }
+    return 0;
+}
+
+/*
+slice lines contains extra bytes for vectorial code thus @size
+is the allocated memory size and @width is the number of pixels
+*/
+int ScalerSlice::createLines(int size, int width)
+{
+    // Planes 0 and 1 own the allocations; planes 3 and 2 point into the second
+    // half of the matching line of plane 0 and plane 1 respectively.
+    const int idx[2] = { 3, 2 };
+
+    m_destroyLines = 1;
+    m_width = width;
+
+    // Clear every entry destroyLines() will visit before allocating anything, so
+    // that the failure path below never hands it a pointer that was never set.
+    for (int i = 0; i < 2; ++i)
+    {
+        for (int j = 0; j < m_plane[i].availLines; ++j)
+            m_plane[i].lineBuf[j] = NULL;
+    }
+
+    for (int i = 0; i < 2; ++i)
+    {
+        const int n = m_plane[i].availLines;
+        const int ii = idx[i];
+        assert(n == m_plane[ii].availLines);
+        for (int j = 0; j < n; ++j)
+        {
+            // chroma plane line U and V are expected to be contiguous in memory
+            m_plane[i].lineBuf[j] = (uint8_t*)X265_MALLOC(uint8_t, size * 2 + 32);
+            if (!m_plane[i].lineBuf[j])
+            {
+                // frees what was allocated so far and resets m_destroyLines
+                destroyLines();
+                return -1;
+            }
+            m_plane[ii].lineBuf[j] = m_plane[i].lineBuf[j] + size + 16;
+            if (m_isRing)
+            {
+                // second copy of the table lets a window run past the end of the ring
+                m_plane[i].lineBuf[j + n] = m_plane[i].lineBuf[j];
+                m_plane[ii].lineBuf[j + n] = m_plane[ii].lineBuf[j];
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* Free the lines allocated by createLines() and clear all four line tables.
+ * Only planes 0 and 1 are freed; the other two planes hold pointers into the
+ * same allocations. */
+void ScalerSlice::destroyLines()
+{
+    for (int owner = 0; owner < 2; ++owner)
+    {
+        ScalerPlane &plane = m_plane[owner];
+        const int count = plane.availLines;
+        for (int line = 0; line < count; ++line)
+        {
+            X265_FREE(plane.lineBuf[line]);
+            plane.lineBuf[line] = NULL;
+            if (m_isRing)
+                plane.lineBuf[line + count] = NULL;
+        }
+    }
+
+    // a ring table has three times availLines entries
+    const int copies = m_isRing ? 3 : 1;
+    for (int p = 0; p < m_numSlicePlane; ++p)
+        memset(m_plane[p].lineBuf, 0, sizeof(uint8_t*) * m_plane[p].availLines * copies);
+    m_destroyLines = 0;
+}
+
+/* Fill the start of every line of every plane with a constant:
+ * 1 << 18 stored as int32_t when is16bit is set, 1 << 14 stored as int16_t
+ * otherwise. n counts int16_t elements; in the 16-bit case half as many int32_t
+ * elements are written. One element more than that count is always written. */
+void ScalerSlice::fillOnes(int n, int is16bit)
+{
+    // fill also one extra element
+    const int count = (is16bit ? n >> 1 : n) + 1;
+
+    for (int p = 0; p < m_numSlicePlane; ++p)
+    {
+        const int lines = m_plane[p].availLines;
+        for (int row = 0; row < lines; ++row)
+        {
+            if (is16bit)
+            {
+                int32_t *out = (int32_t*)(m_plane[p].lineBuf[row]);
+                for (int k = 0; k < count; ++k)
+                    out[k] = 1 << 18;
+            }
+            else
+            {
+                int16_t *out = (int16_t*)(m_plane[p].lineBuf[row]);
+                for (int k = 0; k < count; ++k)
+                    out[k] = 1 << 14;
+            }
+        }
+    }
+}
+
+// Move one plane's window forward by availLines once the requested line lies at
+// least two windows past the plane's first line.
+static inline void scalerRotatePlane(ScalerPlane &plane, int lastLine)
+{
+    const int n = plane.availLines;
+    if (lastLine - plane.sliceVer >= n * 2)
+    {
+        plane.sliceVer += n;
+        plane.sliceHor -= n;
+    }
+}
+
+/* Advance the windows of the luma planes (0 and 3) for line lum and of the
+ * chroma planes (1 and 2) for line cr. A value of 0 leaves the corresponding
+ * planes untouched. Always returns 0. */
+int ScalerSlice::rotate(int lum, int cr)
+{
+    if (lum)
+    {
+        scalerRotatePlane(m_plane[0], lum);
+        scalerRotatePlane(m_plane[3], lum);
+    }
+    if (cr)
+    {
+        scalerRotatePlane(m_plane[1], cr);
+        scalerRotatePlane(m_plane[2], cr);
+    }
+    return 0;
+}
+
+int ScalerSlice::initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative) // points the line tables at the caller's planes without copying pixels; always returns 0
+{
+    int i = 0;
+
+    const int start[m_numSlicePlane] = { lumY, crY, crY, lumY }; // first line per plane: planes 0 and 3 use the luma range, planes 1 and 2 the chroma range
+
+    const int end[m_numSlicePlane] = { lumY + lumH, crY + crH, crY + crH, lumY + lumH }; // one past the last line per plane
+
+    uint8_t *const src_[m_numSlicePlane] = { src[0] + (relative ? 0 : start[0]) * stride[0], // with relative set, src[] already points at line start[]
+        src[1] + (relative ? 0 : start[1]) * stride[1],
+        src[2] + (relative ? 0 : start[2]) * stride[2],
+        src[3] + (relative ? 0 : start[3]) * stride[3] }; // NOTE(review): src[3] and stride[3] are always read; confirm callers pass four entries
+
+    m_width = srcW;
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int j;
+        int first = m_plane[i].sliceVer;
+        int n = m_plane[i].availLines;
+        int lines = end[i] - start[i];
+        int tot_lines = end[i] - first;
+
+        if (start[i] >= first && n >= tot_lines) // the new lines fit behind the ones already held: append them
+        {
+            m_plane[i].sliceHor = x265_max(tot_lines, m_plane[i].sliceHor);
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[start[i] - first + j] = src_[i] + j * stride[i];
+        }
+        else // otherwise restart the plane at start[i], keeping at most availLines lines
+        {
+            m_plane[i].sliceVer = start[i];
+            lines = lines > n ? n : lines;
+            m_plane[i].sliceHor = lines;
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[j] = src_[i] + j * stride[i];
+        }
+    }
+    return 0;
+}
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.h	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,254 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2020 MulticoreWare, Inc
+ *
+ * Authors: Pooja Venkatesan <pooja@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SCALER_H
+#define X265_SCALER_H
+
+#include "common.h"
+
+namespace X265_NS {
+//x265 private namespace
+
+class ScalerSlice;
+class VideoDesc;
+
+#define MAX_NUM_LINES_AHEAD 4 // added to the vertical filter length when sizing the intermediate slice
+#define SCALER_ALIGN(x, j) (((x)+(j)-1)&~((j)-1)) // round x up to a multiple of j; the mask form requires j to be a power of two
+#define X265_ABS(j) ((j) >= 0 ? (j) : (-(j))) // absolute value; the argument is evaluated twice
+#define SCALER_MAX_REDUCE_CUTOFF 0.002
+#define SCALER_BITEXACT  0x80000
+#define ROUNDED_DIVISION(i,j) (((i)>0 ? (i) + ((j)>>1) : (i) - ((j)>>1))/(j)) // i / j with half of j added to (or subtracted from, when i <= 0) i before dividing
+#define UH_CEIL_SHIFTR(i,j) (!scale_builtin_constant_p(j) ? -((-(i)) >> (j)) \
+                                                          : ((i) + (1<<(j)) - 1) >> (j)) // i divided by 2^j, rounded up; the form used depends on whether j is a compile-time constant
+
+#if defined(__GNUC__) || defined(__clang__)
+#    define scale_builtin_constant_p __builtin_constant_p
+#else
+#    define scale_builtin_constant_p(x) 0 // without the builtin, UH_CEIL_SHIFTR always takes its first form
+#endif
+
+enum ResFactor // NOTE(review): names suggest a resolution-multiple class (64 down to 4, plus a default); confirm against its users
+{
+    RES_FACTOR_64, RES_FACTOR_32, RES_FACTOR_16, RES_FACTOR_8,
+    RES_FACTOR_4, RES_FACTOR_DEF, NUM_RES_FACTOR // NUM_RES_FACTOR is the number of entries before it
+};
+
+enum ScalerFactor // NOTE(review): presumably a width-multiple class (4 or 8); confirm against its users
+{
+    FACTOR_4, FACTOR_8, NUM_FACTOR // NUM_FACTOR is the number of entries before it
+};
+
+enum FilterSize // NOTE(review): FIL_<n> presumably names a filter length of n, FIL_DEF any other length; confirm against its users
+{
+    FIL_4, FIL_6, FIL_8, FIL_9, FIL_10, FIL_11, FIL_13, FIL_15,
+    FIL_16, FIL_17, FIL_19, FIL_22, FIL_24, FIL_DEF, NUM_FIL // NUM_FIL is the number of entries before it
+};
+
+class ScalerFilter { // abstract base of the four scaling stages; holds the coefficients and the slices a stage reads and writes
+public:
+    int             m_filtLen;      // Filter length; used as the vertical filter size when sizing the intermediate slice.
+    int32_t*        m_filtPos;      // Array of horizontal/vertical starting pos for each dst for luma / chroma planes.
+    int16_t*        m_filt;         // Array of horizontal/vertical filter coefficients for luma / chroma planes.
+    ScalerSlice*    m_sourceSlice;  // Source slice
+    ScalerSlice*    m_destSlice;    // Output slice
+    ScalerFilter();
+    virtual ~ScalerFilter();
+    virtual void process(int sliceVer, int sliceHor) = 0; // run the stage; callers pass a first line and a line count
+    int initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos); // builds the filter for the given increment and sizes; returns 0 on success
+    void setSlice(ScalerSlice* source, ScalerSlice* dest) { m_sourceSlice = source; m_destSlice = dest; } // stores both pointers; ownership is not taken here
+};
+
+class VideoDesc {
+public:
+    int         m_width;      // picture width as passed to the constructor
+    int         m_height;     // picture height as passed to the constructor
+    int         m_csp;        // colour space id as passed to the constructor
+    int         m_inputDepth; // bit depth as passed to the constructor
+
+    VideoDesc(int w, int h, int csp, int bitDepth)
+        : m_width(w)
+        , m_height(h)
+        , m_csp(csp)
+        , m_inputDepth(bitDepth)
+    {
+    }
+};
+
+typedef struct ScalerPlane
+{
+    int       availLines; // max number of lines that can be held by this plane
+    int       sliceVer;   // index of first line
+    int       sliceHor;   // number of lines
+    uint8_t** lineBuf;    // table of line pointers; it has 3 * availLines entries when the owning slice is a ring
+} ScalerPlane;
+
+// Assist horizontal filtering, base class
+class HFilterScaler {
+public:
+    int m_bitDepth; // 0 here; each derived class stores its own bit depth
+
+    HFilterScaler() : m_bitDepth(0) {}
+    virtual ~HFilterScaler() {}
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) = 0;
+};
+
+// Assist vertical filtering, base class
+class VFilterScaler {
+public:
+    int m_bitDepth; // 0 here; each derived class stores its own bit depth
+
+    VFilterScaler() : m_bitDepth(0) {}
+    virtual ~VFilterScaler() {}
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) = 0;
+};
+
+//  Assist horizontal filtering, process 8 bit case
+class HFilterScaler8Bit : public HFilterScaler {
+public:
+    HFilterScaler8Bit() { m_bitDepth = 8; } // overrides the base class default of 0
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); // forwards to a C kernel selected by X265_DEPTH
+};
+
+//  Assist horizontal filtering, process 10 bit case
+class HFilterScaler10Bit : public HFilterScaler {
+public:
+    HFilterScaler10Bit() { m_bitDepth = 10; } // overrides the base class default of 0
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); // forwards to a C kernel selected by X265_DEPTH
+};
+
+//  Assist vertical filtering, process 8 bit case
+class VFilterScaler8Bit : public VFilterScaler {
+public:
+    VFilterScaler8Bit() { m_bitDepth = 8; } // overrides the base class default of 0
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW); // defined in the implementation file
+};
+
+//  Assist vertical filtering, process 10 bit case
+class VFilterScaler10Bit : public VFilterScaler {
+public:
+    VFilterScaler10Bit() { m_bitDepth = 10; } // overrides the base class default of 0
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW); // defined in the implementation file
+};
+
+// Horizontal filter for luma
+class ScalerHLumFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // owned helper; stays NULL when bitDepth is neither 8 nor 10
+public:
+    ScalerHLumFilter(int bitDepth) : m_hFilterScaler(NULL) { if (bitDepth == 8) m_hFilterScaler = new HFilterScaler8Bit; else if (bitDepth == 10) m_hFilterScaler = new HFilterScaler10Bit; }
+    ~ScalerHLumFilter() { delete m_hFilterScaler; } // created with new, so released with delete (deleting NULL is a no-op)
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Horizontal filter for chroma
+class ScalerHCrFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // owned helper; stays NULL when bitDepth is neither 8 nor 10
+public:
+    ScalerHCrFilter(int bitDepth) : m_hFilterScaler(NULL) { if (bitDepth == 8) m_hFilterScaler = new HFilterScaler8Bit; else if (bitDepth == 10) m_hFilterScaler = new HFilterScaler10Bit; }
+    ~ScalerHCrFilter() { delete m_hFilterScaler; } // created with new, so released with delete (deleting NULL is a no-op)
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for luma
+class ScalerVLumFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler; // owned helper; stays NULL when bitDepth is neither 8 nor 10
+public:
+    ScalerVLumFilter(int bitDepth) : m_vFilterScaler(NULL) { if (bitDepth == 8) m_vFilterScaler = new VFilterScaler8Bit; else if (bitDepth == 10) m_vFilterScaler = new VFilterScaler10Bit; }
+    ~ScalerVLumFilter() { delete m_vFilterScaler; } // created with new, so released with delete (deleting NULL is a no-op)
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for chroma
+class ScalerVCrFilter : public ScalerFilter {
+private:
+    VFilterScaler*    m_vFilterScaler; // owned helper; stays NULL when bitDepth is neither 8 nor 10
+public:
+    ScalerVCrFilter(int bitDepth) : m_vFilterScaler(NULL) { if (bitDepth == 8) m_vFilterScaler = new VFilterScaler8Bit; else if (bitDepth == 10) m_vFilterScaler = new VFilterScaler10Bit; }
+    ~ScalerVCrFilter() { delete m_vFilterScaler; } // created with new, so released with delete (deleting NULL is a no-op)
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+class ScalerSlice // a set of four planes, each described by a table of line pointers
+{
+private:
+    enum ScalerSlicePlaneNum { m_numSlicePlane = 4 };
+public:
+    int m_width;        // Slice line width
+    int m_hCrSubSample; // horizontal Chroma subsampling factor
+    int m_vCrSubSample; // vertical chroma subsampling factor
+    int m_isRing;       // flag to identify if this ScalerSlice is a ring buffer
+    int m_destroyLines; // flag to identify if there are dynamic allocated lines
+    ScalerPlane m_plane[m_numSlicePlane];
+public:
+    ScalerSlice();
+    ~ScalerSlice() { destroy(); }
+    int rotate(int lum, int cr); // advances plane windows that have fallen two windows behind the given lines; returns 0
+    void fillOnes(int n, int is16bit); // writes a constant into the start of every line of every plane
+    int create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring); // allocates the line tables; returns 0 or -1
+    int createLines(int size, int width); // allocates the lines themselves; returns 0 or -1
+    void destroyLines(); // frees the lines allocated by createLines() and clears the tables
+    void destroy(); // frees the lines (if owned) and the line tables
+    int initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative); // points the tables at caller-owned planes; returns 0
+};
+
+class ScalerFilterManager { // owns the four scaling filters and the three slices that connect them
+private:
+    enum ScalerFilterNum { m_numSlice = 3, m_numFilter = 4 };
+
+private:
+    int                     m_bitDepth;
+    int                     m_algorithmFlags;  // 1, bilinear; 4 bicubic, default is bicubic
+    int                     m_srcW;            // Width  of source luma planes.
+    int                     m_srcH;            // Height of source luma planes.
+    int                     m_dstW;            // Width of dest luma planes.
+    int                     m_dstH;            // Height of dest luma planes.
+    int                     m_crSrcW;          // Width  of source chroma planes.
+    int                     m_crSrcH;          // Height of source chroma planes.
+    int                     m_crDstW;          // Width  of dest chroma planes.
+    int                     m_crDstH;          // Height of dest chroma planes.
+    int                     m_crSrcHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in src  image.
+    int                     m_crSrcVSubSample; // Binary log of vertical   subsampling factor between Y and Cr planes in src  image.
+    int                     m_crDstHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in dest image.
+    int                     m_crDstVSubSample; // Binary log of vertical   subsampling factor between Y and Cr planes in dest image.
+    ScalerSlice*            m_slices[m_numSlice];          // 0: source, 1: horizontal filter output, 2: destination
+    ScalerFilter*           m_ScalerFilters[m_numFilter];  // 0/1: horizontal luma/chroma, 2/3: vertical luma/chroma
+private:
+    int getLocalPos(int crSubSample, int pos);
+    void getMinBufferSize(int *out_lum_size, int *out_cr_size);
+    int initScalerSlice();
+public:
+    ScalerFilterManager();
+    ~ScalerFilterManager() {
+        for (int i = 0; i < m_numSlice; i++)
+            if (m_slices[i]) { delete m_slices[i]; m_slices[i] = NULL; } // ~ScalerSlice() calls destroy(); calling it here as well ran it twice
+        for (int i = 0; i < m_numFilter; i++)
+            if (m_ScalerFilters[i]) { delete m_ScalerFilters[i]; m_ScalerFilters[i] = NULL; }
+    }
+    int init(int algorithmFlags, VideoDesc* srcVideoDesc, VideoDesc* dstVideoDesc); // builds filters and slices; returns 0 on success
+    int scale_pic(void** src, void** dst, int* srcStride, int* dstStride); // scales one picture; returns -1 when src or dst is NULL
+};
+}
+
+#endif //ifndef X265_SCALER_H
--- a/source/common/threading.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/common/threading.h	Wed May 06 14:59:56 2020 +0530
@@ -238,6 +238,14 @@ public:
         LeaveCriticalSection(&m_cs);
     }
 
+    void decr() // decrements m_val by one while holding m_cs, then wakes every waiter on m_cv
+    {
+        EnterCriticalSection(&m_cs);
+        m_val--;
+        WakeAllConditionVariable(&m_cv);
+        LeaveCriticalSection(&m_cs);
+    }
+
 protected:
 
     CRITICAL_SECTION   m_cs;
@@ -436,6 +444,14 @@ public:
         pthread_mutex_unlock(&m_mutex);
     }
 
+    void decr() // decrements m_val by one while holding m_mutex, then wakes every waiter on m_cond
+    {
+        pthread_mutex_lock(&m_mutex);
+        m_val--;
+        pthread_cond_broadcast(&m_cond);
+        pthread_mutex_unlock(&m_mutex);
+    }
+
 protected:
 
     pthread_mutex_t m_mutex;
--- a/source/encoder/analysis.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/analysis.cpp	Wed May 06 14:59:56 2020 +0530
@@ -1272,7 +1272,7 @@ SplitData Analysis::compressInterCU_rd0_
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
                 }
@@ -1296,7 +1296,7 @@ SplitData Analysis::compressInterCU_rd0_
                     md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
                     checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
-                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    skipRecursion = !!m_param->recursionSkipMode && md.bestMode;
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode;
                 }
@@ -1314,15 +1314,23 @@ SplitData Analysis::compressInterCU_rd0_
                 skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)
                 && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
         }
-        if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
+        if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))
         {
             skipRecursion = md.bestMode->cu.isSkipped(0);
-            if (mightSplit && depth >= minDepth && !skipRecursion)
+            if (mightSplit && !skipRecursion)
             {
-                if (depth)
-                    skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
-                if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP)
+                {
+                    if (depth)
+                        skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
+                    if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                        skipRecursion = complexityCheckCU(*md.bestMode);
+                }
+                else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+                {
                     skipRecursion = complexityCheckCU(*md.bestMode);
+                }
+
             }
         }
         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
@@ -1972,7 +1980,7 @@ SplitData Analysis::compressInterCU_rd5_
                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
                 }
                 if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
@@ -1996,7 +2004,7 @@ SplitData Analysis::compressInterCU_rd5_
                     checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                     checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                    if (m_param->recursionSkipMode && depth && m_modeDepth[depth - 1].bestMode)
                         skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
                 }
             }
@@ -2015,8 +2023,10 @@ SplitData Analysis::compressInterCU_rd5_
             checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
             checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
-            if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+            if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP && depth && m_modeDepth[depth - 1].bestMode)
                 skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+            else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+                skipRecursion = md.bestMode && complexityCheckCU(*md.bestMode);
         }
         if (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)
             skipRecursion = true;
@@ -3525,27 +3535,47 @@ bool Analysis::recursionDepthCheck(const
 
 bool Analysis::complexityCheckCU(const Mode& bestMode)
 {
-    uint32_t mean = 0;
-    uint32_t homo = 0;
-    uint32_t cuSize = bestMode.fencYuv->m_size;
-    for (uint32_t y = 0; y < cuSize; y++) {
-        for (uint32_t x = 0; x < cuSize; x++) {
-            mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
+    if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP) // RD-cost mode: decide from the mean absolute deviation of the samples in fencYuv->m_buf[0]
+    {
+        uint32_t mean = 0;
+        uint32_t homo = 0;
+        uint32_t cuSize = bestMode.fencYuv->m_size;
+        for (uint32_t y = 0; y < cuSize; y++) { // first pass: accumulate the sample sum
+            for (uint32_t x = 0; x < cuSize; x++) {
+                mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);
+            }
         }
+        mean = mean / (cuSize * cuSize); // integer average over the cuSize x cuSize block
+        for (uint32_t y = 0; y < cuSize; y++) { // second pass: accumulate |sample - mean|
+            for (uint32_t x = 0; x < cuSize; x++) {
+                homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
+            }
+        }
+        homo = homo / (cuSize * cuSize); // average absolute deviation (integer division)
+
+        if (homo < (.1 * mean)) // deviation below 10% of the mean: report true
+            return true;
+
+        return false;
     }
-    mean = mean / (cuSize * cuSize);
-    for (uint32_t y = 0 ; y < cuSize; y++){
-        for (uint32_t x = 0 ; x < cuSize; x++){
-            homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));
-        }
+    else // every other recursionSkipMode value: decide from the variance of this CU's region in m_edgeBitPic
+    {
+        int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE; // index used to select the primitives.cu[] entry
+        int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE; // NOTE(review): pixelCount = 1 << shift is the CU area only if LOG2_UNIT_SIZE == 2 - confirm; 2 * log2CUSize would state the intent directly
+        intptr_t stride = m_frame->m_fencPic->m_stride;
+        intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride; // offset of the CU's top-left sample, using the fenc picture stride
+        uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride); // one 64-bit result, unpacked into two 32-bit halves below
+        uint32_t sum = (uint32_t)sum_ss; // low 32 bits
+        uint32_t ss = (uint32_t)(sum_ss >> 32); // high 32 bits; presumably the sum of squares - TODO confirm against the var() primitive
+        uint32_t pixelCount = 1 << shift;
+        double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount; // (ss - sum^2 / N) / N
+
+        if (cuEdgeVariance > (double)m_param->edgeVarThreshold) // variance above the configured threshold: report false
+            return false;
+        else
+            return true;
     }
-    homo = homo / (cuSize * cuSize);
-
-    if (homo < (.1 * mean))
-        return true;
-
-    return false;
-}
+ }
 
 uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom)
 {
@@ -3570,7 +3600,6 @@ uint32_t Analysis::calculateCUVariance(c
             cnt++;
         }
     }
-    
     return cuVariance / cnt;
 }
 
--- a/source/encoder/analysis.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/analysis.h	Wed May 06 14:59:56 2020 +0530
@@ -52,7 +52,7 @@ struct SplitData
         splitRefs = 0;
         mvCost[0] = 0; // L0
         mvCost[1] = 0; // L1
-        sa8dCost    = 0;
+        sa8dCost  = 0;
     }
 };
 
@@ -120,7 +120,6 @@ public:
 
     Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
     int32_t loadTUDepth(CUGeom cuGeom, CUData parentCTU);
-
 protected:
     /* Analysis data for save/load mode, writes/reads data based on absPartIdx */
     x265_analysis_inter_data*  m_reuseInterDataCTU;
--- a/source/encoder/api.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/api.cpp	Wed May 06 14:59:56 2020 +0530
@@ -1016,12 +1016,12 @@ x265_zone *x265_zone_alloc(int zoneCount
 
 void x265_zone_free(x265_param *param)
 {
-    if (param && param->rc.zonefileCount) {
+    if (param && param->rc.zones && (param->rc.zoneCount || param->rc.zonefileCount)) // do nothing unless a zones array exists and at least one of the two counts is non-zero
+    {
         for (int i = 0; i < param->rc.zonefileCount; i++)
             x265_free(param->rc.zones[i].zoneParam);
+        x265_free(param->rc.zones); // free the zones array itself after the per-zone zoneParam entries
     }
-    if (param && (param->rc.zoneCount || param->rc.zonefileCount))
-        x265_free(param->rc.zones);
 }
 
 static const x265_api libapi =
@@ -1294,6 +1294,8 @@ FILE* x265_csvlog_open(const x265_param*
                     fprintf(csvfp, "RateFactor, ");
                 if (param->rc.vbvBufferSize)
                     fprintf(csvfp, "BufferFill, BufferFillFinal, ");
+                if (param->rc.vbvBufferSize && param->csvLogLevel >= 2)
+                    fprintf(csvfp, "UnclippedBufferFillFinal, ");
                 if (param->bEnablePsnr)
                     fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
                 if (param->bEnableSsim)
@@ -1405,6 +1407,8 @@ void x265_csvlog_frame(const x265_param*
         fprintf(param->csvfpt, "%.3lf,", frameStats->rateFactor);
     if (param->rc.vbvBufferSize)
         fprintf(param->csvfpt, "%.3lf, %.3lf,", frameStats->bufferFill, frameStats->bufferFillFinal);
+    if (param->rc.vbvBufferSize && param->csvLogLevel >= 2)
+        fprintf(param->csvfpt, "%.3lf,", frameStats->unclippedBufferFillFinal);
     if (param->bEnablePsnr)
         fprintf(param->csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
     if (param->bEnableSsim)
--- a/source/encoder/encoder.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/encoder.cpp	Wed May 06 14:59:56 2020 +0530
@@ -218,10 +218,7 @@ void Encoder::create()
 
     if (m_param->bHistBasedSceneCut)
     {
-        for (int i = 0; i < x265_cli_csps[m_param->internalCsp].planes; i++)
-        {
-            m_planeSizes[i] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[i]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[i]);
-        }
+        m_planeSizes[0] = (m_param->sourceWidth >> x265_cli_csps[p->internalCsp].width[0]) * (m_param->sourceHeight >> x265_cli_csps[m_param->internalCsp].height[0]);
         uint32_t pixelbytes = m_param->internalBitDepth > 8 ? 2 : 1;
         m_edgePic = X265_MALLOC(pixel, m_planeSizes[0] * pixelbytes);
         m_edgeHistThreshold = m_param->edgeTransitionThreshold;
@@ -1443,9 +1440,9 @@ bool Encoder::computeHistograms(x265_pic
     int32_t planeCount = x265_cli_csps[m_param->internalCsp].planes;
     memset(m_edgePic, 0, bufSize);
 
-    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false))
-    {
-        x265_log(m_param, X265_LOG_ERROR, "Failed edge computation!");
+    if (!computeEdge(m_edgePic, src, NULL, pic->width, pic->height, pic->width, false, 1))
+    {
+        x265_log(m_param, X265_LOG_ERROR, "Failed to compute edge!");
         return false;
     }
 
@@ -1605,6 +1602,14 @@ int Encoder::encode(const x265_picture* 
         if (m_param->bHistBasedSceneCut && pic_in)
         {
             x265_picture *pic = (x265_picture *) pic_in;
+
+            if (pic->poc == 0)
+            {
+                /* for entire encode compute the chroma plane sizes only once */
+                for (int i = 1; i < x265_cli_csps[m_param->internalCsp].planes; i++)
+                    m_planeSizes[i] = (pic->width >> x265_cli_csps[m_param->internalCsp].width[i]) * (pic->height >> x265_cli_csps[m_param->internalCsp].height[i]);
+            }
+
             if (computeHistograms(pic))
             {
                 double maxUVSad = 0.0, edgeSad = 0.0;
@@ -1752,6 +1757,12 @@ int Encoder::encode(const x265_picture* 
                         }
                     }
                 }
+                if (m_param->recursionSkipMode == EDGE_BASED_RSKIP && m_param->bHistBasedSceneCut)
+                {
+                    pixel* src = m_edgePic;
+                    primitives.planecopy_pp_shr(src, inFrame->m_fencPic->m_picWidth, inFrame->m_edgeBitPic, inFrame->m_fencPic->m_stride,
+                        inFrame->m_fencPic->m_picWidth, inFrame->m_fencPic->m_picHeight, 0);
+                }
             }
             else
             {
@@ -2414,7 +2425,7 @@ int Encoder::reconfigureParam(x265_param
         encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
         encParam->bEnableFastIntra = param->bEnableFastIntra;
         encParam->bEnableEarlySkip = param->bEnableEarlySkip;
-        encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
+        encParam->recursionSkipMode = param->recursionSkipMode;
         encParam->searchMethod = param->searchMethod;
         /* Scratch buffer prevents me_range from being increased for esa/tesa */
         if (param->searchRange < encParam->searchRange)
@@ -3006,6 +3017,8 @@ void Encoder::finishFrameStats(Frame* cu
             frameStats->ipCostRatio = curFrame->m_lowres.ipCostRatio;
         frameStats->bufferFill = m_rateControl->m_bufferFillActual;
         frameStats->bufferFillFinal = m_rateControl->m_bufferFillFinal;
+        if (m_param->csvLogLevel >= 2)
+            frameStats->unclippedBufferFillFinal = m_rateControl->m_unclippedBufferFillFinal;
         frameStats->frameLatency = inPoc - poc;
         if (m_param->rc.rateControlMode == X265_RC_CRF)
             frameStats->rateFactor = curEncData.m_rateFactor;
@@ -3400,7 +3413,7 @@ void Encoder::configureZone(x265_param *
         p->maxNumReferences = zone->maxNumReferences;
         p->bEnableFastIntra = zone->bEnableFastIntra;
         p->bEnableEarlySkip = zone->bEnableEarlySkip;
-        p->bEnableRecursionSkip = zone->bEnableRecursionSkip;
+        p->recursionSkipMode = zone->recursionSkipMode;
         p->searchMethod = zone->searchMethod;
         p->searchRange = zone->searchRange;
         p->subpelRefine = zone->subpelRefine;
@@ -3681,20 +3694,6 @@ void Encoder::configure(x265_param *p)
     if (p->analysisLoad && !p->analysisLoadReuseLevel)
         p->analysisLoadReuseLevel = 5;
 
-    if ((p->bAnalysisType == DEFAULT) && p->rc.cuTree)
-    {
-        if (p->analysisSaveReuseLevel && p->analysisSaveReuseLevel < 10)
-        {
-            x265_log(p, X265_LOG_WARNING, "cu-tree works only with analysis-save-reuse-level 10, Disabling cu-tree\n");
-            p->rc.cuTree = 0;
-        }
-        if (p->analysisLoadReuseLevel && p->analysisLoadReuseLevel < 10)
-        {
-            x265_log(p, X265_LOG_WARNING, "cu-tree works only with analysis-load-reuse-level 10, Disabling cu-tree\n");
-            p->rc.cuTree = 0;
-        }
-    }
-
     if ((p->analysisLoad || p->analysisSave) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation))
     {
         x265_log(p, X265_LOG_WARNING, "Analysis load/save options incompatible with pmode/pme, Disabling pmode/pme\n");
@@ -3867,29 +3866,30 @@ void Encoder::configure(x265_param *p)
         }
         else
         {
-            if (fread(&m_conformanceWindow.rightOffset, sizeof(int), 1, m_analysisFileIn) != 1)
+            int rightOffset, bottomOffset;
+            if (fread(&rightOffset, sizeof(int), 1, m_analysisFileIn) != 1)
             {
                 x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Conformance window right offset missing\n");
                 m_aborted = true;
             }
-            else if (m_conformanceWindow.rightOffset && p->analysisLoadReuseLevel > 1)
+            else if (rightOffset && p->analysisLoadReuseLevel > 1)
             {
                 int scaleFactor = p->scaleFactor < 2 ? 1 : p->scaleFactor;
-                padsize = m_conformanceWindow.rightOffset * scaleFactor;
+                padsize = rightOffset * scaleFactor;
                 p->sourceWidth += padsize;
                 m_conformanceWindow.bEnabled = true;
                 m_conformanceWindow.rightOffset = padsize;
             }
 
-            if (fread(&m_conformanceWindow.bottomOffset, sizeof(int), 1, m_analysisFileIn) != 1)
+            if (fread(&bottomOffset, sizeof(int), 1, m_analysisFileIn) != 1)
             {
                 x265_log(NULL, X265_LOG_ERROR, "Error reading analysis data. Conformance window bottom offset missing\n");
                 m_aborted = true;
             }
-            else if (m_conformanceWindow.bottomOffset && p->analysisLoadReuseLevel > 1)
+            else if (bottomOffset && p->analysisLoadReuseLevel > 1)
             {
                 int scaleFactor = p->scaleFactor < 2 ? 1 : p->scaleFactor;
-                padsize = m_conformanceWindow.bottomOffset * scaleFactor;
+                padsize = bottomOffset * scaleFactor;
                 p->sourceHeight += padsize;
                 m_conformanceWindow.bEnabled = true;
                 m_conformanceWindow.bottomOffset = padsize;
@@ -4196,7 +4196,7 @@ void Encoder::configure(x265_param *p)
         x265_log(p, X265_LOG_WARNING, "Radl requires fixed gop-length (keyint == min-keyint). Disabling radl.\n");
     }
 
-    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP)
+    if ((p->chunkStart || p->chunkEnd) && p->bOpenGOP && m_param->bResetZoneConfig)
     {
         p->chunkStart = p->chunkEnd = 0;
         x265_log(p, X265_LOG_WARNING, "Chunking requires closed gop structure. Disabling chunking.\n");
@@ -4229,12 +4229,6 @@ void Encoder::configure(x265_param *p)
         x265_log(p, X265_LOG_WARNING, "Turning on repeat - headers for zone encoding\n");
     }
 
-    if (!m_param->bResetZoneConfig && (p->keyframeMax != p->keyframeMin))
-        x265_log(p, X265_LOG_WARNING, "External zone reconfiguration requires a fixed GOP size to enable appropriate signaling of HRD info\n");
-
-    if (!m_param->bResetZoneConfig && (p->reconfigWindowSize != (uint64_t)p->keyframeMax))
-        x265_log(p, X265_LOG_WARNING, "Zone size must be multiple of GOP size to enable appropriate signaling of HRD info\n");
-
     if (m_param->bEnableHME)
     {
         if (m_param->sourceHeight < 540)
@@ -4311,18 +4305,27 @@ void Encoder::readAnalysisFile(x265_anal
         }
     }
 
+    uint32_t numCUsLoad, numCUsInHeightLoad;
+
     /* Now arrived at the right frame, read the record */
     analysis->poc = poc;
     analysis->frameRecordSize = frameRecordSize;
     X265_FREAD(&analysis->sliceType, sizeof(int), 1, m_analysisFileIn, &(picData->sliceType));
     X265_FREAD(&analysis->bScenecut, sizeof(int), 1, m_analysisFileIn, &(picData->bScenecut));
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFileIn, &(picData->satdCost));
-    X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
+    X265_FREAD(&numCUsLoad, sizeof(int), 1, m_analysisFileIn, &(picData->numCUsInFrame));
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFileIn, &(picData->numPartitions));
 
+    /* Update analysis info to save current settings */
+    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t numCUsInFrame = widthInCU * heightInCU;
+    analysis->numCUsInFrame = numCUsInFrame;
+    analysis->numCuInHeight = heightInCU;
+
     if (m_param->bDisableLookahead)
     {
-        X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
+        X265_FREAD(&numCUsInHeightLoad, sizeof(uint32_t), 1, m_analysisFileIn, &(picData->numCuInHeight));
         X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFileIn, &(picData->lookahead));
     }
     int scaledNumPartition = analysis->numPartitions;
@@ -4335,16 +4338,16 @@ void Encoder::readAnalysisFile(x265_anal
 
     if (m_param->ctuDistortionRefine == CTU_DISTORTION_INTERNAL)
     {
-        X265_FREAD((analysis->distortionData)->ctuDistortion, sizeof(sse_t), analysis->numCUsInFrame, m_analysisFileIn, picDistortion);
+        X265_FREAD((analysis->distortionData)->ctuDistortion, sizeof(sse_t), numCUsLoad, m_analysisFileIn, picDistortion);
         computeDistortionOffset(analysis);
     }
     if (m_param->bDisableLookahead && m_rateControl->m_isVbv)
     {
         size_t vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
-        X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.intraVbvCost);
-        X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFileIn, picData->lookahead.vbvCost);
-        X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.satdForVbv);
-        X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFileIn, picData->lookahead.intraSatdForVbv);
+        X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), numCUsLoad, m_analysisFileIn, picData->lookahead.intraVbvCost);
+        X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), numCUsLoad, m_analysisFileIn, picData->lookahead.vbvCost);
+        X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), numCUsInHeightLoad, m_analysisFileIn, picData->lookahead.satdForVbv);
+        X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), numCUsInHeightLoad, m_analysisFileIn, picData->lookahead.intraSatdForVbv);
         X265_FREAD(analysis->lookahead.plannedSatd, sizeof(int64_t), vbvCount, m_analysisFileIn, picData->lookahead.plannedSatd);
 
         if (m_param->scaleFactor)
@@ -4352,12 +4355,12 @@ void Encoder::readAnalysisFile(x265_anal
             for (uint64_t index = 0; index < vbvCount; index++)
                 analysis->lookahead.plannedSatd[index] *= factor;
 
-            for (uint32_t i = 0; i < analysis->numCuInHeight; i++)
+            for (uint32_t i = 0; i < numCUsInHeightLoad; i++)
             {
                 analysis->lookahead.satdForVbv[i] *= factor;
                 analysis->lookahead.intraSatdForVbv[i] *= factor;
             }
-            for (uint32_t i = 0; i < analysis->numCUsInFrame; i++)
+            for (uint32_t i = 0; i < numCUsLoad; i++)
             {
                 analysis->lookahead.vbvCost[i] *= factor;
                 analysis->lookahead.intraVbvCost[i] *= factor;
@@ -4407,13 +4410,13 @@ void Encoder::readAnalysisFile(x265_anal
 
         if (!m_param->scaleFactor)
         {
-            X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
+            X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), numCUsLoad * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
         }
         else
         {
-            uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
-            X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
-            for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor)
+            uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, numCUsLoad * scaledNumPartition);
+            X265_FREAD(tempLumaBuf, sizeof(uint8_t), numCUsLoad * scaledNumPartition, m_analysisFileIn, intraPic->modes);
+            for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < numCUsLoad * scaledNumPartition; ctu32Idx++, cnt += factor)
                 memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
             X265_FREE(tempLumaBuf);
         }
@@ -4447,7 +4450,7 @@ void Encoder::readAnalysisFile(x265_anal
         }
         if (m_param->bAnalysisType == HEVC_INFO)
         {
-            depthBytes = analysis->numCUsInFrame * analysis->numPartitions;
+            depthBytes = numCUsLoad * analysis->numPartitions;
             memcpy(((x265_analysis_inter_data *)analysis->interData)->depth, interPic->depth, depthBytes);
         }
         else
@@ -4551,25 +4554,26 @@ void Encoder::readAnalysisFile(x265_anal
             {
                 if (!m_param->scaleFactor)
                 {
-                    X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
+                    X265_FREAD((analysis->intraData)->modes, sizeof(uint8_t), numCUsLoad * analysis->numPartitions, m_analysisFileIn, intraPic->modes);
                 }
                 else
                 {
-                    uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, analysis->numCUsInFrame * scaledNumPartition);
-                    X265_FREAD(tempLumaBuf, sizeof(uint8_t), analysis->numCUsInFrame * scaledNumPartition, m_analysisFileIn, intraPic->modes);
-                    for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < analysis->numCUsInFrame * scaledNumPartition; ctu32Idx++, cnt += factor)
+                    uint8_t *tempLumaBuf = X265_MALLOC(uint8_t, numCUsLoad * scaledNumPartition);
+                    X265_FREAD(tempLumaBuf, sizeof(uint8_t), numCUsLoad * scaledNumPartition, m_analysisFileIn, intraPic->modes);
+                    for (uint32_t ctu32Idx = 0, cnt = 0; ctu32Idx < numCUsLoad * scaledNumPartition; ctu32Idx++, cnt += factor)
                         memset(&(analysis->intraData)->modes[cnt], tempLumaBuf[ctu32Idx], factor);
                     X265_FREE(tempLumaBuf);
                 }
             }
         }
         else
-            X265_FREAD((analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
+            X265_FREAD((analysis->interData)->ref, sizeof(int32_t), numCUsLoad * X265_MAX_PRED_MODE_PER_CTU * numDir, m_analysisFileIn, interPic->ref);
 
         consumedBytes += frameRecordSize;
         if (numDir == 1)
             totalConsumedBytes = consumedBytes;
     }
+
 #undef X265_FREAD
 }
 
@@ -5032,13 +5036,14 @@ int Encoder::validateAnalysisData(x265_a
     X265_PARAM_VALIDATE(saveParam->lookaheadDepth, sizeof(int), 1, &m_param->lookaheadDepth, rc - lookahead);
     X265_PARAM_VALIDATE(saveParam->chunkStart, sizeof(int), 1, &m_param->chunkStart, chunk-start);
     X265_PARAM_VALIDATE(saveParam->chunkEnd, sizeof(int), 1, &m_param->chunkEnd, chunk-end);
-    X265_PARAM_VALIDATE(saveParam->cuTree,sizeof(int),1,&m_param->rc.cuTree, cutree - offset);
     X265_PARAM_VALIDATE(saveParam->ctuDistortionRefine, sizeof(int), 1, &m_param->ctuDistortionRefine, ctu - distortion);
+    X265_PARAM_VALIDATE(saveParam->frameDuplication, sizeof(int), 1, &m_param->bEnableFrameDuplication, frame - dup);
 
     int sourceHeight, sourceWidth;
     if (writeFlag)
     {
         X265_PARAM_VALIDATE(saveParam->analysisReuseLevel, sizeof(int), 1, &m_param->analysisSaveReuseLevel, analysis - save - reuse - level);
+        X265_PARAM_VALIDATE(saveParam->cuTree, sizeof(int), 1, &m_param->rc.cuTree, cutree-offset);
         sourceHeight = m_param->sourceHeight - m_conformanceWindow.bottomOffset;
         sourceWidth = m_param->sourceWidth - m_conformanceWindow.rightOffset;
         X265_PARAM_VALIDATE(saveParam->sourceWidth, sizeof(int), 1, &sourceWidth, res-width);
@@ -5073,6 +5078,15 @@ int Encoder::validateAnalysisData(x265_a
             return -1;
         }
 
+        int bcutree;
+        X265_FREAD(&bcutree, sizeof(int), 1, m_analysisFileIn, &(saveParam->cuTree));
+        if (loadLevel == 10 && m_param->rc.cuTree && (!bcutree || saveLevel < 2))
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Error reading cu-tree info. Disabling cutree offsets. \n");
+            m_param->rc.cuTree = 0;
+            return -1;
+        }
+
         bool error = false;
         int curSourceHeight = m_param->sourceHeight - m_conformanceWindow.bottomOffset;
         int curSourceWidth = m_param->sourceWidth - m_conformanceWindow.rightOffset;
@@ -5701,7 +5715,7 @@ void Encoder::printReconfigureParams()
     TOOLCMP(oldParam->maxNumReferences, newParam->maxNumReferences, "ref=%d to %d\n");
     TOOLCMP(oldParam->bEnableFastIntra, newParam->bEnableFastIntra, "fast-intra=%d to %d\n");
     TOOLCMP(oldParam->bEnableEarlySkip, newParam->bEnableEarlySkip, "early-skip=%d to %d\n");
-    TOOLCMP(oldParam->bEnableRecursionSkip, newParam->bEnableRecursionSkip, "rskip=%d to %d\n");
+    TOOLCMP(oldParam->recursionSkipMode, newParam->recursionSkipMode, "rskip=%d to %d\n");
     TOOLCMP(oldParam->searchMethod, newParam->searchMethod, "me=%d to %d\n");
     TOOLCMP(oldParam->searchRange, newParam->searchRange, "merange=%d to %d\n");
     TOOLCMP(oldParam->subpelRefine, newParam->subpelRefine, "subme= %d to %d\n");
--- a/source/encoder/frameencoder.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/frameencoder.cpp	Wed May 06 14:59:56 2020 +0530
@@ -130,7 +130,7 @@ bool FrameEncoder::init(Encoder *top, in
         {
             rowSum += sliceGroupSizeAccu;
             m_sliceBaseRow[++sidx] = i;
-        }        
+        }
     }
     X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
     m_sliceBaseRow[0] = 0;
@@ -448,6 +448,18 @@ void FrameEncoder::compressFrame()
     m_ssimCnt = 0;
     memset(&(m_frame->m_encData->m_frameStats), 0, sizeof(m_frame->m_encData->m_frameStats));
 
+    if (!m_param->bHistBasedSceneCut && m_param->rc.aqMode != X265_AQ_EDGE && m_param->recursionSkipMode == EDGE_BASED_RSKIP)
+    {
+        int height = m_frame->m_fencPic->m_picHeight;
+        int width = m_frame->m_fencPic->m_picWidth;
+        intptr_t stride = m_frame->m_fencPic->m_stride;
+
+        if (!computeEdge(m_frame->m_edgeBitPic, m_frame->m_fencPic->m_picOrg[0], NULL, stride, height, width, false, 1))
+        {
+            x265_log(m_param, X265_LOG_ERROR, " Failed to compute edge !");
+        }
+    }
+
     /* Emit access unit delimiter unless this is the first frame and the user is
      * not repeating headers (since AUD is supposed to be the first NAL in the access
      * unit) */
--- a/source/encoder/ratecontrol.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/ratecontrol.cpp	Wed May 06 14:59:56 2020 +0530
@@ -269,7 +269,7 @@ RateControl::RateControl(x265_param& p, 
         x265_log(m_param, X265_LOG_WARNING, "NAL HRD parameters require VBV parameters, ignored\n");
         m_param->bEmitHRDSEI = 0;
     }
-    m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && !m_2pass && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
+    m_isCbr = m_param->rc.rateControlMode == X265_RC_ABR && m_isVbv && m_param->rc.vbvMaxBitrate <= m_param->rc.bitrate;
     if (m_param->rc.bStrictCbr && !m_isCbr)
     {
         x265_log(m_param, X265_LOG_WARNING, "strict CBR set without CBR mode, ignored\n");
@@ -335,7 +335,7 @@ bool RateControl::init(const SPS& sps)
         int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
         int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000;
 
-        if (m_param->bEmitHRDSEI)
+        if (m_param->bEmitHRDSEI && !m_param->decoderVbvMaxRate)
         {
             const HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
             vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT);
@@ -509,6 +509,7 @@ bool RateControl::init(const SPS& sps)
                 CMP_OPT_FIRST_PASS(" keyint", m_param->keyframeMax);
                 CMP_OPT_FIRST_PASS("scenecut", m_param->scenecutThreshold);
                 CMP_OPT_FIRST_PASS("intra-refresh", m_param->bIntraRefresh);
+                CMP_OPT_FIRST_PASS("frame-dup", m_param->bEnableFrameDuplication);
                 if (m_param->bMultiPassOptRPS)
                 {
                     CMP_OPT_FIRST_PASS("multi-pass-opt-rps", m_param->bMultiPassOptRPS);
@@ -546,7 +547,7 @@ bool RateControl::init(const SPS& sps)
                 x265_log(m_param, X265_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n",
                          m_param->totalFrames, m_numEntries);
             }
-            if (m_param->totalFrames > m_numEntries)
+            if (m_param->totalFrames > m_numEntries && !m_param->bEnableFrameDuplication)
             {
                 x265_log(m_param, X265_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n",
                          m_param->totalFrames, m_numEntries);
@@ -781,6 +782,10 @@ void RateControl::initHRD(SPS& sps)
     // Init HRD
     HRDInfo* hrd = &sps.vuiParameters.hrdParameters;
     hrd->cbrFlag = m_isCbr;
+    if (m_param->reconfigWindowSize) {
+        hrd->cbrFlag = 0;
+        vbvMaxBitrate = m_param->decoderVbvMaxRate * 1000;
+    }
 
     // normalize HRD size and rate to the value / scale notation
     hrd->bitRateScale = x265_clip3(0, 15, calcScale(vbvMaxBitrate) - BR_SHIFT);
@@ -829,7 +834,7 @@ bool RateControl::analyseABR2Pass(uint64
         /* weighted average of cplx of future frames */
         for (int j = 1; j < cplxBlur * 2 && j < m_numEntries - i; j++)
         {
-            int index = m_encOrder[i + j];
+            int index = i+j;
             RateControlEntry *rcj = &m_rce2Pass[index];
             weight *= 1 - pow(rcj->iCuCount / m_ncu, 2);
             if (weight < 0.0001)
@@ -842,7 +847,7 @@ bool RateControl::analyseABR2Pass(uint64
         weight = 1.0;
         for (int j = 0; j <= cplxBlur * 2 && j <= i; j++)
         {
-            int index = m_encOrder[i - j];
+            int index = i-j;
             RateControlEntry *rcj = &m_rce2Pass[index];
             gaussianWeight = weight * exp(-j * j / 200.0);
             weightSum += gaussianWeight;
@@ -851,7 +856,7 @@ bool RateControl::analyseABR2Pass(uint64
             if (weight < .0001)
                 break;
         }
-        m_rce2Pass[m_encOrder[i]].blurredComplexity = cplxSum / weightSum;
+        m_rce2Pass[i].blurredComplexity= cplxSum / weightSum;
     }
     CHECKED_MALLOC(qScale, double, m_numEntries);
     if (filterSize > 1)
@@ -870,7 +875,7 @@ bool RateControl::analyseABR2Pass(uint64
     expectedBits = 1;
     for (int i = 0; i < m_numEntries; i++)
     {
-        RateControlEntry* rce = &m_rce2Pass[m_encOrder[i]];
+        RateControlEntry* rce = &m_rce2Pass[i];
         double q = getQScale(rce, 1.0);
         expectedBits += qScale2bits(rce, q);
         m_lastQScaleFor[rce->sliceType] = q;
@@ -893,15 +898,15 @@ bool RateControl::analyseABR2Pass(uint64
         /* find qscale */
         for (int i = 0; i < m_numEntries; i++)
         {
-            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
+            RateControlEntry *rce = &m_rce2Pass[i];
             qScale[i] = getQScale(rce, rateFactor);
             m_lastQScaleFor[rce->sliceType] = qScale[i];
         }
 
         /* fixed I/B qscale relative to P */
-        for (int i = m_numEntries - 1; i >= 0; i--)
+        for (int i = 0; i < m_numEntries; i++)
         {
-            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[m_encOrder[i]], qScale[i]);
+            qScale[i] = getDiffLimitedQScale(&m_rce2Pass[i], qScale[i]);
             X265_CHECK(qScale[i] >= 0, "qScale became negative\n");
         }
 
@@ -912,7 +917,6 @@ bool RateControl::analyseABR2Pass(uint64
             for (int i = 0; i < m_numEntries; i++)
             {
                 double q = 0.0, sum = 0.0;
-
                 for (int j = 0; j < filterSize; j++)
                 {
                     int idx = i + j - filterSize / 2;
@@ -920,7 +924,7 @@ bool RateControl::analyseABR2Pass(uint64
                     double coeff = qBlur == 0 ? 1.0 : exp(-d * d / (qBlur * qBlur));
                     if (idx < 0 || idx >= m_numEntries)
                         continue;
-                    if (m_rce2Pass[m_encOrder[i]].sliceType != m_rce2Pass[m_encOrder[idx]].sliceType)
+                    if (m_rce2Pass[i].sliceType != m_rce2Pass[idx].sliceType)
                         continue;
                     q += qScale[idx] * coeff;
                     sum += coeff;
@@ -932,7 +936,7 @@ bool RateControl::analyseABR2Pass(uint64
         /* find expected bits */
         for (int i = 0; i < m_numEntries; i++)
         {
-            RateControlEntry *rce = &m_rce2Pass[m_encOrder[i]];
+            RateControlEntry *rce = &m_rce2Pass[i];
             rce->newQScale = clipQscale(NULL, rce, blurredQscale[i]); // check if needed
             X265_CHECK(rce->newQScale >= 0, "new Qscale is negative\n");
             expectedBits += qScale2bits(rce, rce->newQScale);
@@ -1279,6 +1283,7 @@ int RateControl::rateControlStart(Frame*
                 m_param->rc.vbvMaxBitrate = m_param->rc.zones[i].zoneParam->rc.vbvMaxBitrate;
                 memcpy(m_relativeComplexity, m_param->rc.zones[i].relativeComplexity, sizeof(double) * m_param->reconfigWindowSize);
                 reconfigureRC();
+                m_isCbr = 1; /* Always vbvmaxrate == bitrate here*/
                 m_top->zoneReadCount[i].incr();
             }
         }
@@ -1951,7 +1956,7 @@ double RateControl::rateEstimateQscale(F
                 /* Adjust quant based on the difference between
                  * achieved and expected bitrate so far */
                 double curTime = (double)rce->encodeOrder / m_numEntries;
-                double w = x265_clip3(0.0, 1.0, curTime * 100);
+                double w = x265_clip3(0.0, 1.0, curTime);
                 q *= pow((double)m_totalBits / m_expectedBitsSum, w);
             }
             if (m_framesDone == 0 && m_param->rc.rateControlMode == X265_RC_ABR && m_isGrainEnabled)
@@ -2742,7 +2747,9 @@ int RateControl::updateVbv(int64_t bits,
         x265_log(m_param, X265_LOG_WARNING, "poc:%d, VBV underflow (%.0f bits)\n", rce->poc, m_bufferFillFinal);
 
     m_bufferFillFinal = X265_MAX(m_bufferFillFinal, 0);
-    m_bufferFillFinal += m_bufferRate;
+    m_bufferFillFinal += rce->bufferRate;
+    if (m_param->csvLogLevel >= 2)
+        m_unclippedBufferFillFinal = m_bufferFillFinal;
 
     if (m_param->rc.bStrictCbr)
     {
@@ -2752,14 +2759,14 @@ int RateControl::updateVbv(int64_t bits,
             filler += FILLER_OVERHEAD * 8;
         }
         m_bufferFillFinal -= filler;
-        bufferBits = X265_MIN(bits + filler + m_bufferExcess, m_bufferRate);
+        bufferBits = X265_MIN(bits + filler + m_bufferExcess, rce->bufferRate);
         m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits + filler, 0);
         m_bufferFillActual += bufferBits - bits - filler;
     }
     else
     {
         m_bufferFillFinal = X265_MIN(m_bufferFillFinal, m_bufferSize);
-        bufferBits = X265_MIN(bits + m_bufferExcess, m_bufferRate);
+        bufferBits = X265_MIN(bits + m_bufferExcess, rce->bufferRate);
         m_bufferExcess = X265_MAX(m_bufferExcess - bufferBits + bits, 0);
         m_bufferFillActual += bufferBits - bits;
         m_bufferFillActual = X265_MIN(m_bufferFillActual, m_bufferSize);
--- a/source/encoder/ratecontrol.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/ratecontrol.h	Wed May 06 14:59:56 2020 +0530
@@ -157,6 +157,7 @@ public:
     double m_rateFactorConstant;
     double m_bufferSize;
     double m_bufferFillFinal;  /* real buffer as of the last finished frame */
+    double m_unclippedBufferFillFinal; /* real unclipped buffer as of the last finished frame used to log in CSV*/
     double m_bufferFill;       /* planned buffer, if all in-progress frames hit their bit budget */
     double m_bufferRate;       /* # of bits added to buffer_fill after each frame */
     double m_vbvMaxRate;       /* in kbps */
--- a/source/encoder/slicetype.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/slicetype.cpp	Wed May 06 14:59:56 2020 +0530
@@ -87,7 +87,7 @@ inline uint32_t acEnergyPlane(Frame *cur
 
 namespace X265_NS {
 
-bool computeEdge(pixel *edgePic, pixel *refPic, pixel *edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta)
+bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel)
 {
     intptr_t rowOne = 0, rowTwo = 0, rowThree = 0, colOne = 0, colTwo = 0, colThree = 0;
     intptr_t middle = 0, topLeft = 0, topRight = 0, bottomLeft = 0, bottomRight = 0;
@@ -141,7 +141,7 @@ bool computeEdge(pixel *edgePic, pixel *
                        theta = 180 + theta;
                     edgeTheta[middle] = (pixel)theta;
                 }
-                edgePic[middle] = (pixel)(gradientMagnitude >= edgeThreshold ? edgeThreshold : blackPixel);
+                edgePic[middle] = (pixel)(gradientMagnitude >= EDGE_THRESHOLD ? whitePixel : blackPixel);
             }
         }
         return true;
@@ -519,6 +519,13 @@ void LookaheadTLD::calcAdaptiveQuantFram
                 if (param->rc.aqMode == X265_AQ_EDGE)
                     edgeFilter(curFrame, param);
 
+                if (param->rc.aqMode == X265_AQ_EDGE && !param->bHistBasedSceneCut && param->recursionSkipMode == EDGE_BASED_RSKIP)
+                {
+                    pixel* src = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX;
+                    primitives.planecopy_pp_shr(src, curFrame->m_fencPic->m_stride, curFrame->m_edgeBitPic,
+                        curFrame->m_fencPic->m_stride, curFrame->m_fencPic->m_picWidth, curFrame->m_fencPic->m_picHeight, SHIFT_TO_BITPLANE);
+                }
+
                 if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE)
                 {
                     double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8)));
--- a/source/encoder/slicetype.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/encoder/slicetype.h	Wed May 06 14:59:56 2020 +0530
@@ -44,9 +44,9 @@ class Lookahead;
 #define EDGE_INCLINATION 45
 
 #if HIGH_BIT_DEPTH
-#define edgeThreshold 1023.0
+#define EDGE_THRESHOLD 1023.0
 #else
-#define edgeThreshold 255.0
+#define EDGE_THRESHOLD 255.0
 #endif
 #define PI 3.14159265
 
@@ -101,7 +101,7 @@ struct LookaheadTLD
 protected:
 
     uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize);
-    uint32_t edgeDensityCu(Frame*curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
+    uint32_t edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize);
     uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
     bool     allocWeightedRef(Lowres& fenc);
@@ -265,7 +265,6 @@ protected:
     CostEstimateGroup& operator=(const CostEstimateGroup&);
 };
 
-bool computeEdge(pixel *edgePic, pixel *refPic, pixel *edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta);
-
+bool computeEdge(pixel* edgePic, pixel* refPic, pixel* edgeTheta, intptr_t stride, int height, int width, bool bcalcTheta, pixel whitePixel = EDGE_THRESHOLD);
 }
 #endif // ifndef X265_SLICETYPE_H
--- a/source/test/CMakeLists.txt	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/test/CMakeLists.txt	Wed May 06 14:59:56 2020 +0530
@@ -23,13 +23,15 @@ endif(X86)
 
 # add ARM assembly files
 if(ARM OR CROSS_COMPILE_ARM)
-    enable_language(ASM)
-    set(NASM_SRC checkasm-arm.S)
-    add_custom_command(
-        OUTPUT checkasm-arm.obj
-        COMMAND ${CMAKE_CXX_COMPILER}
-        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
-        DEPENDS checkasm-arm.S)
+    if(NOT ARM64)
+        enable_language(ASM)
+        set(NASM_SRC checkasm-arm.S)
+        add_custom_command(
+            OUTPUT checkasm-arm.obj
+            COMMAND ${CMAKE_CXX_COMPILER}
+            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
+            DEPENDS checkasm-arm.S)
+    endif()
 endif(ARM OR CROSS_COMPILE_ARM)
 
 # add PowerPC assembly files
--- a/source/test/regression-tests.txt	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/test/regression-tests.txt	Wed May 06 14:59:56 2020 +0530
@@ -75,7 +75,7 @@ News-4k.y4m,--preset ultrafast --no-cutr
 News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0
 News-4k.y4m,--preset superfast --slices 4 --aq-mode 0 
 News-4k.y4m,--preset medium --tune ssim --no-sao --qg-size 16
-News-4k.y4m,--preset veryslow --no-rskip
+News-4k.y4m,--preset veryslow --rskip 0
 News-4k.y4m,--preset veryslow --pme --crf 40
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp
 OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp
@@ -162,7 +162,11 @@ Island_960x540_24.yuv,--no-cutree --aq-m
 sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02 --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
 sintel_trailer_2k_1920x1080_24.yuv, --preset medium --hist-scenecut --hist-threshold 0.02
 sintel_trailer_2k_1920x1080_24.yuv, --preset ultrafast --hist-scenecut --hist-threshold 0.02
-
+crowd_run_1920x1080_50.yuv, --preset faster --ctu 32 --rskip 2 --rskip-edge-threshold 5
+crowd_run_1920x1080_50.yuv, --preset fast --ctu 64 --rskip 2 --rskip-edge-threshold 5 --aq-mode 4
+crowd_run_1920x1080_50.yuv, --preset slow --ctu 32 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1
+crowd_run_1920x1080_50.yuv, --preset slower --ctu 16 --rskip 2 --rskip-edge-threshold 5 --hist-scenecut --hist-threshold 0.1 --aq-mode 4
+ 
 # Main12 intraCost overflow bug test
 720p50_parkrun_ter.y4m,--preset medium
 
--- a/source/test/save-load-tests.txt	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/test/save-load-tests.txt	Wed May 06 14:59:56 2020 +0530
@@ -18,3 +18,4 @@ crowd_run_1080p50.y4m,   --preset medium
 RaceHorses_416x240_30.y4m,   --preset slow --no-cutree --ctu 16 --analysis-save x265_analysis.dat --analysis-save-reuse-level 10 --scale-factor 2 --crf 22  --vbv-maxrate 1000 --vbv-bufsize 1000::RaceHorses_832x480_30.y4m,    --preset slow --no-cutree --ctu 32 --analysis-load x265_analysis.dat  --analysis-save x265_analysis_2.dat --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --crf 16 --vbv-maxrate 4000 --vbv-bufsize 4000 --refine-intra 0 --refine-inter 1::RaceHorses_1664x960_30.y4m,   --preset slow --no-cutree --ctu 64 --analysis-load x265_analysis_2.dat  --analysis-load-reuse-level 10 --scale-factor 2 --crf 12 --vbv-maxrate 7000 --vbv-bufsize 7000 --refine-intra 2 --refine-inter 2
 crowd_run_540p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,   --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset veryslow --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
 crowd_run_540p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_540.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 5000 --vbv-bufsize 15000 --vbv-maxrate 9000::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat  --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_1080p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_1080.dat --analysis-load x265_analysis_540.dat --refine-intra 4 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 10000 --vbv-bufsize 30000 --vbv-maxrate 17500::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-save x265_analysis_2160.dat --analysis-load x265_analysis_1080.dat --refine-intra 3 --dynamic-refine --analysis-load-reuse-level 10 --analysis-save-reuse-level 10 --scale-factor 2 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000::crowd_run_2160p50.y4m,  --preset medium --no-cutree --analysis-load x265_analysis_2160.dat --refine-intra 2 --dynamic-refine --analysis-load-reuse-level 10 --scale-factor 1 --bitrate 24000 --vbv-bufsize 84000 --vbv-maxrate 49000
+News-4k.y4m,  --preset medium --analysis-save x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000::News-4k.y4m, --analysis-load x265_analysis_fdup.dat --frame-dup --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000
--- a/source/test/testbench.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/test/testbench.cpp	Wed May 06 14:59:56 2020 +0530
@@ -5,6 +5,7 @@
  *          Mandar Gurav <mandar@multicorewareinc.com>
  *          Mahesh Pittala <mahesh@multicorewareinc.com>
  *          Min Chen <chenm003@163.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -208,6 +209,14 @@ int main(int argc, char *argv[])
         EncoderPrimitives asmprim;
         memset(&asmprim, 0, sizeof(asmprim));
         setupAssemblyPrimitives(asmprim, test_arch[i].flag);
+
+#if X265_ARCH_ARM64
+        /* Temporary workaround: the luma_vsp assembly primitive has not been completed,
+         * but interp_8tap_hv_pp_cpu uses a mix of C and assembly primitives.
+         * Without this call, a segmentation fault occurs. */
+        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
+#endif
+
         setupAliasPrimitives(asmprim);
         memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
@@ -232,6 +241,13 @@ int main(int argc, char *argv[])
 #endif
     setupAssemblyPrimitives(optprim, cpuid);
 
+#if X265_ARCH_ARM64
+    /* Temporary workaround: the luma_vsp assembly primitive has not been completed,
+     * but interp_8tap_hv_pp_cpu uses a mix of C and assembly primitives.
+     * Without this call, a segmentation fault occurs. */
+    setupAliasCPrimitives(cprim, optprim, cpuid);
+#endif
+
     /* Note that we do not setup aliases for performance tests, that would be
      * redundant. The testbench only verifies they are correctly aliased */
 
--- a/source/test/testharness.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/test/testharness.h	Wed May 06 14:59:56 2020 +0530
@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve@borho.org>
  *          Min Chen <chenm003@163.com>
+ *          Yimeng Su <yimeng.su@huawei.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -81,12 +82,16 @@ static inline uint32_t __rdtsc(void)
 #if X265_ARCH_X86
     asm volatile("rdtsc" : "=a" (a) ::"edx");
 #elif X265_ARCH_ARM
+#if X265_ARCH_ARM64
+    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
+#else
     // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
     // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
 
     // TO-DO: replace clock() function with appropriate ARM cpu instructions
     a = clock();
 #endif
+#endif
     return a;
 }
 #endif // ifdef _MSC_VER
--- a/source/x265.cpp	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/x265.cpp	Wed May 06 14:59:56 2020 +0530
@@ -27,11 +27,7 @@
 
 #include "x265.h"
 #include "x265cli.h"
-
-#include "input/input.h"
-#include "output/output.h"
-#include "output/reconplay.h"
-#include "svt.h"
+#include "abrEncApp.h"
 
 #if HAVE_VLD
 /* Visual Leak Detector */
@@ -47,707 +43,13 @@
 #include <fstream>
 #include <queue>
 
-#define CONSOLE_TITLE_SIZE 200
-#ifdef _WIN32
-#include <windows.h>
-#define SetThreadExecutionState(es)
-static char orgConsoleTitle[CONSOLE_TITLE_SIZE] = "";
-#else
-#define GetConsoleTitle(t, n)
-#define SetConsoleTitle(t)
-#define SetThreadExecutionState(es)
-#endif
-
 using namespace X265_NS;
 
-/* Ctrl-C handler */
-static volatile sig_atomic_t b_ctrl_c /* = 0 */;
-static void sigint_handler(int)
-{
-    b_ctrl_c = 1;
-}
-#define START_CODE 0x00000001
-#define START_CODE_BYTES 4
-
-struct CLIOptions
-{
-    InputFile* input;
-    ReconFile* recon;
-    OutputFile* output;
-    FILE*       qpfile;
-    FILE*       zoneFile;
-    FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
-    const char* reconPlayCmd;
-    const x265_api* api;
-    x265_param* param;
-    x265_vmaf_data* vmafData;
-    bool bProgress;
-    bool bForceY4m;
-    bool bDither;
-    uint32_t seek;              // number of frames to skip from the beginning
-    uint32_t framesToBeEncoded; // number of frames to encode
-    uint64_t totalbytes;
-    int64_t startTime;
-    int64_t prevUpdateTime;
-
-    /* in microseconds */
-    static const int UPDATE_INTERVAL = 250000;
-
-    CLIOptions()
-    {
-        input = NULL;
-        recon = NULL;
-        output = NULL;
-        qpfile = NULL;
-        zoneFile = NULL;
-        dolbyVisionRpu = NULL;
-        reconPlayCmd = NULL;
-        api = NULL;
-        param = NULL;
-        vmafData = NULL;
-        framesToBeEncoded = seek = 0;
-        totalbytes = 0;
-        bProgress = true;
-        bForceY4m = false;
-        startTime = x265_mdate();
-        prevUpdateTime = 0;
-        bDither = false;
-    }
-
-    void destroy();
-    void printStatus(uint32_t frameNum);
-    bool parse(int argc, char **argv);
-    bool parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount);
-    bool parseQPFile(x265_picture &pic_org);
-    bool parseZoneFile();
-};
-
-void CLIOptions::destroy()
-{
-    if (input)
-        input->release();
-    input = NULL;
-    if (recon)
-        recon->release();
-    recon = NULL;
-    if (qpfile)
-        fclose(qpfile);
-    qpfile = NULL;
-    if (zoneFile)
-        fclose(zoneFile);
-    zoneFile = NULL;
-    if (dolbyVisionRpu)
-        fclose(dolbyVisionRpu);
-    dolbyVisionRpu = NULL;
-    if (output)
-        output->release();
-    output = NULL;
-}
-
-void CLIOptions::printStatus(uint32_t frameNum)
-{
-    char buf[200];
-    int64_t time = x265_mdate();
-
-    if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL))
-        return;
-
-    int64_t elapsed = time - startTime;
-    double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0;
-    float bitrate = 0.008f * totalbytes * (param->fpsNum / param->fpsDenom) / ((float)frameNum);
-    if (framesToBeEncoded)
-    {
-        int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
-        sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
-            100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
-                eta / 3600, (eta / 60) % 60, eta % 60);
-    }
-    else
-        sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate);
-
-    fprintf(stderr, "%s  \r", buf + 5);
-    SetConsoleTitle(buf);
-    fflush(stderr); // needed in windows
-    prevUpdateTime = time;
-}
-
-bool CLIOptions::parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount)
-{
-    bool bError = false;
-    int bShowHelp = false;
-    int outputBitDepth = 0;
-    const char *profile = NULL;
-
-    /* Presets are applied before all other options. */
-    for (optind = 0;;)
-    {
-        int c = getopt_long(argc, argv, short_options, long_options, NULL);
-        if (c == -1)
-            break;
-        else if (c == 'D')
-            outputBitDepth = atoi(optarg);
-        else if (c == 'P')
-            profile = optarg;
-        else if (c == '?')
-            bShowHelp = true;
-    }
-
-    if (!outputBitDepth && profile)
-    {
-        /* try to derive the output bit depth from the requested profile */
-        if (strstr(profile, "10"))
-            outputBitDepth = 10;
-        else if (strstr(profile, "12"))
-            outputBitDepth = 12;
-        else
-            outputBitDepth = 8;
-    }
-
-    api = x265_api_get(outputBitDepth);
-    if (!api)
-    {
-        x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
-        api = x265_api_get(0);
-    }
-
-    if (bShowHelp)
-    {
-        printVersion(globalParam, api);
-        showHelp(globalParam);
-    }
-
-    globalParam->rc.zones[zonefileCount].zoneParam = api->param_alloc();
-    if (!globalParam->rc.zones[zonefileCount].zoneParam)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
-        return true;
-    }
-
-    memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
-
-    for (optind = 0;;)
-    {
-        int long_options_index = -1;
-        int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
-        if (c == -1)
-            break;
-
-        if (long_options_index < 0 && c > 0)
-        {
-            for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
-            {
-                if (long_options[i].val == c)
-                {
-                    long_options_index = (int)i;
-                    break;
-                }
-            }
-
-            if (long_options_index < 0)
-            {
-                /* getopt_long might have already printed an error message */
-                if (c != 63)
-                    x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
-                return true;
-            }
-        }
-        if (long_options_index < 0)
-        {
-            x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
-            return true;
-        }
-
-        bError |= !!api->zone_param_parse(globalParam->rc.zones[zonefileCount].zoneParam, long_options[long_options_index].name, optarg);
-
-        if (bError)
-        {
-            const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2];
-            x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
-            return true;
-        }
-    }
-
-    if (optind < argc)
-    {
-        x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]);
-        return true;
-    }
-    return false;
-}
-
-bool CLIOptions::parse(int argc, char **argv)
-{
-    bool bError = false;
-    int bShowHelp = false;
-    int inputBitDepth = 8;
-    int outputBitDepth = 0;
-    int reconFileBitDepth = 0;
-    const char *inputfn = NULL;
-    const char *reconfn = NULL;
-    const char *outputfn = NULL;
-    const char *preset = NULL;
-    const char *tune = NULL;
-    const char *profile = NULL;
-    int svtEnabled = 0;
-
-    if (argc <= 1)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "No input file. Run x265 --help for a list of options.\n");
-        return true;
-    }
-
-    /* Presets are applied before all other options. */
-    for (optind = 0;; )
-    {
-        int optionsIndex = -1;
-        int c = getopt_long(argc, argv, short_options, long_options, &optionsIndex);
-        if (c == -1)
-            break;
-        else if (c == 'p')
-            preset = optarg;
-        else if (c == 't')
-            tune = optarg;
-        else if (c == 'D')
-            outputBitDepth = atoi(optarg);
-        else if (c == 'P')
-            profile = optarg;
-        else if (c == '?')
-            bShowHelp = true;
-        else if (!c && !strcmp(long_options[optionsIndex].name, "svt"))
-            svtEnabled = 1;
-    }
-
-    if (!outputBitDepth && profile)
-    {
-        /* try to derive the output bit depth from the requested profile */
-        if (strstr(profile, "10"))
-            outputBitDepth = 10;
-        else if (strstr(profile, "12"))
-            outputBitDepth = 12;
-        else
-            outputBitDepth = 8;
-    }
-
-    api = x265_api_get(outputBitDepth);
-    if (!api)
-    {
-        x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
-        api = x265_api_get(0);
-    }
-
-    param = api->param_alloc();
-    if (!param)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
-        return true;
-    }
-#if ENABLE_LIBVMAF
-    vmafData = (x265_vmaf_data*)x265_malloc(sizeof(x265_vmaf_data));
-    if(!vmafData)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "vmaf data alloc failed\n");
-        return true;
-    }
-#endif
-
-    if (api->param_default_preset(param, preset, tune) < 0)
-    {
-        x265_log(NULL, X265_LOG_ERROR, "preset or tune unrecognized\n");
-        return true;
-    }
-
-    if (bShowHelp)
-    {
-        printVersion(param, api);
-        showHelp(param);
-    }
-
-    //Set enable SVT-HEVC encoder first if found in the command line
-    if (svtEnabled) api->param_parse(param, "svt", NULL);
-
-    for (optind = 0;; )
-    {
-        int long_options_index = -1;
-        int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
-        if (c == -1)
-            break;
-
-        switch (c)
-        {
-        case 'h':
-            printVersion(param, api);
-            showHelp(param);
-            break;
-
-        case 'V':
-            printVersion(param, api);
-            x265_report_simd(param);
-            exit(0);
-
-        default:
-            if (long_options_index < 0 && c > 0)
-            {
-                for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
-                {
-                    if (long_options[i].val == c)
-                    {
-                        long_options_index = (int)i;
-                        break;
-                    }
-                }
+#define X265_HEAD_ENTRIES 3
 
-                if (long_options_index < 0)
-                {
-                    /* getopt_long might have already printed an error message */
-                    if (c != 63)
-                        x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
-                    return true;
-                }
-            }
-            if (long_options_index < 0)
-            {
-                x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
-                return true;
-            }
-#define OPT(longname) \
-    else if (!strcmp(long_options[long_options_index].name, longname))
-#define OPT2(name1, name2) \
-    else if (!strcmp(long_options[long_options_index].name, name1) || \
-             !strcmp(long_options[long_options_index].name, name2))
-
-            if (0) ;
-            OPT2("frame-skip", "seek") this->seek = (uint32_t)x265_atoi(optarg, bError);
-            OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
-            OPT("no-progress") this->bProgress = false;
-            OPT("output") outputfn = optarg;
-            OPT("input") inputfn = optarg;
-            OPT("recon") reconfn = optarg;
-            OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError);
-            OPT("dither") this->bDither = true;
-            OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
-            OPT("y4m") this->bForceY4m = true;
-            OPT("profile") /* handled above */;
-            OPT("preset")  /* handled above */;
-            OPT("tune")    /* handled above */;
-            OPT("output-depth")   /* handled above */;
-            OPT("recon-y4m-exec") reconPlayCmd = optarg;
-            OPT("svt")    /* handled above */;
-            OPT("qpfile")
-            {
-                this->qpfile = x265_fopen(optarg, "rb");
-                if (!this->qpfile)
-                    x265_log_file(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
-            }
-            OPT("dolby-vision-rpu")
-            {
-                this->dolbyVisionRpu = x265_fopen(optarg, "rb");
-                if (!this->dolbyVisionRpu)
-                {
-                    x265_log_file(param, X265_LOG_ERROR, "Dolby Vision RPU metadata file %s not found or error in opening file\n", optarg);
-                    return true;
-                }
-            }
-            OPT("zonefile")
-            {
-                this->zoneFile = x265_fopen(optarg, "rb");
-                if (!this->zoneFile)
-                    x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
-            }
-            OPT("fullhelp")
-            {
-                param->logLevel = X265_LOG_FULL;
-                printVersion(param, api);
-                showHelp(param);
-                break;
-            }
-            else
-                bError |= !!api->param_parse(param, long_options[long_options_index].name, optarg);
-            if (bError)
-            {
-                const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2];
-                x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
-                return true;
-            }
-#undef OPT
-        }
-    }
-
-    if (optind < argc && !inputfn)
-        inputfn = argv[optind++];
-    if (optind < argc && !outputfn)
-        outputfn = argv[optind++];
-    if (optind < argc)
-    {
-        x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]);
-        return true;
-    }
-
-    if (argc <= 1)
-    {
-        api->param_default(param);
-        printVersion(param, api);
-        showHelp(param);
-    }
-
-    if (!inputfn || !outputfn)
-    {
-        x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
-        return true;
-    }
-
-    if (param->internalBitDepth != api->bit_depth)
-    {
-        x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", api->bit_depth);
-        return true;
-    }
-
-#ifdef SVT_HEVC
-    if (svtEnabled)
-    {
-        EB_H265_ENC_CONFIGURATION* svtParam = (EB_H265_ENC_CONFIGURATION*)param->svtHevcParam;
-        param->sourceWidth = svtParam->sourceWidth;
-        param->sourceHeight = svtParam->sourceHeight;
-        param->fpsNum = svtParam->frameRateNumerator;
-        param->fpsDenom = svtParam->frameRateDenominator;
-        svtParam->encoderBitDepth = inputBitDepth;
-    }
-#endif
-
-    InputFileInfo info;
-    info.filename = inputfn;
-    info.depth = inputBitDepth;
-    info.csp = param->internalCsp;
-    info.width = param->sourceWidth;
-    info.height = param->sourceHeight;
-    info.fpsNum = param->fpsNum;
-    info.fpsDenom = param->fpsDenom;
-    info.sarWidth = param->vui.sarWidth;
-    info.sarHeight = param->vui.sarHeight;
-    info.skipFrames = seek;
-    info.frameCount = 0;
-    getParamAspectRatio(param, info.sarWidth, info.sarHeight);
-
-
-    this->input = InputFile::open(info, this->bForceY4m);
-    if (!this->input || this->input->isFail())
-    {
-        x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
-        return true;
-    }
-
-    if (info.depth < 8 || info.depth > 16)
-    {
-        x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", inputBitDepth);
-        return true;
-    }
-
-    /* Unconditionally accept height/width/csp/bitDepth from file info */
-    param->sourceWidth = info.width;
-    param->sourceHeight = info.height;
-    param->internalCsp = info.csp;
-    param->sourceBitDepth = info.depth;
-
-    /* Accept fps and sar from file info if not specified by user */
-    if (param->fpsDenom == 0 || param->fpsNum == 0)
-    {
-        param->fpsDenom = info.fpsDenom;
-        param->fpsNum = info.fpsNum;
-    }
-    if (!param->vui.aspectRatioIdc && info.sarWidth && info.sarHeight)
-        setParamAspectRatio(param, info.sarWidth, info.sarHeight);
-    if (this->framesToBeEncoded == 0 && info.frameCount > (int)seek)
-        this->framesToBeEncoded = info.frameCount - seek;
-    param->totalFrames = this->framesToBeEncoded;
-
-#ifdef SVT_HEVC
-    if (svtEnabled)
-    {
-        EB_H265_ENC_CONFIGURATION* svtParam = (EB_H265_ENC_CONFIGURATION*)param->svtHevcParam;
-        svtParam->sourceWidth = param->sourceWidth;
-        svtParam->sourceHeight = param->sourceHeight;
-        svtParam->frameRateNumerator = param->fpsNum;
-        svtParam->frameRateDenominator = param->fpsDenom;
-        svtParam->framesToBeEncoded = param->totalFrames;
-		svtParam->encoderColorFormat = (EB_COLOR_FORMAT)param->internalCsp;
-    }
+#ifdef _WIN32
+#define strdup _strdup
 #endif
-    
-    /* Force CFR until we have support for VFR */
-    info.timebaseNum = param->fpsDenom;
-    info.timebaseDenom = param->fpsNum;
-
-    if (param->bField && param->interlaceMode)
-    {   // Field FPS
-        param->fpsNum *= 2;
-        // Field height
-        param->sourceHeight = param->sourceHeight >> 1;
-        // Number of fields to encode
-        param->totalFrames *= 2;
-    }
-
-    if (api->param_apply_profile(param, profile))
-        return true;
-
-    if (param->logLevel >= X265_LOG_INFO)
-    {
-        char buf[128];
-        int p = sprintf(buf, "%dx%d fps %d/%d %sp%d", param->sourceWidth, param->sourceHeight,
-                        param->fpsNum, param->fpsDenom, x265_source_csp_names[param->internalCsp], info.depth);
-
-        int width, height;
-        getParamAspectRatio(param, width, height);
-        if (width && height)
-            p += sprintf(buf + p, " sar %d:%d", width, height);
-
-        if (framesToBeEncoded <= 0 || info.frameCount <= 0)
-            strcpy(buf + p, " unknown frame count");
-        else
-            sprintf(buf + p, " frames %u - %d of %d", this->seek, this->seek + this->framesToBeEncoded - 1, info.frameCount);
-
-        general_log(param, input->getName(), X265_LOG_INFO, "%s\n", buf);
-    }
-
-    this->input->startReader();
-
-    if (reconfn)
-    {
-        if (reconFileBitDepth == 0)
-            reconFileBitDepth = param->internalBitDepth;
-        this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
-                                      param->fpsNum, param->fpsDenom, param->internalCsp);
-        if (this->recon->isFail())
-        {
-            x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
-            this->recon->release();
-            this->recon = 0;
-        }
-        else
-            general_log(param, this->recon->getName(), X265_LOG_INFO,
-                    "reconstructed images %dx%d fps %d/%d %s\n",
-                    param->sourceWidth, param->sourceHeight, param->fpsNum, param->fpsDenom,
-                    x265_source_csp_names[param->internalCsp]);
-    }
-#if ENABLE_LIBVMAF
-    if (!reconfn)
-    {
-        x265_log(param, X265_LOG_ERROR, "recon file must be specified to get VMAF score, try --help for help\n");
-        return true;
-    }
-    const char *str = strrchr(info.filename, '.');
-
-    if (!strcmp(str, ".y4m"))
-    {
-        x265_log(param, X265_LOG_ERROR, "VMAF supports YUV file format only.\n");
-        return true; 
-    }
-    if(param->internalCsp == X265_CSP_I420 || param->internalCsp == X265_CSP_I422 || param->internalCsp == X265_CSP_I444)
-    {
-        vmafData->reference_file = x265_fopen(inputfn, "rb");
-        vmafData->distorted_file = x265_fopen(reconfn, "rb");
-    }
-    else
-    {
-        x265_log(param, X265_LOG_ERROR, "VMAF will support only yuv420p, yu422p, yu444p, yuv420p10le, yuv422p10le, yuv444p10le formats.\n");
-        return true;
-    }
-#endif
-    this->output = OutputFile::open(outputfn, info);
-    if (this->output->isFail())
-    {
-        x265_log_file(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
-        return true;
-    }
-    general_log_file(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
-    return false;
-}
-
-bool CLIOptions::parseQPFile(x265_picture &pic_org)
-{
-    int32_t num = -1, qp, ret;
-    char type;
-    uint32_t filePos;
-    pic_org.forceqp = 0;
-    pic_org.sliceType = X265_TYPE_AUTO;
-    while (num < pic_org.poc)
-    {
-        filePos = ftell(qpfile);
-        qp = -1;
-        ret = fscanf(qpfile, "%d %c%*[ \t]%d\n", &num, &type, &qp);
-
-        if (num > pic_org.poc || ret == EOF)
-        {
-            fseek(qpfile, filePos, SEEK_SET);
-            break;
-        }
-        if (num < pic_org.poc && ret >= 2)
-            continue;
-        if (ret == 3 && qp >= 0)
-            pic_org.forceqp = qp + 1;
-        if (type == 'I') pic_org.sliceType = X265_TYPE_IDR;
-        else if (type == 'i') pic_org.sliceType = X265_TYPE_I;
-        else if (type == 'K') pic_org.sliceType = param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
-        else if (type == 'P') pic_org.sliceType = X265_TYPE_P;
-        else if (type == 'B') pic_org.sliceType = X265_TYPE_BREF;
-        else if (type == 'b') pic_org.sliceType = X265_TYPE_B;
-        else ret = 0;
-        if (ret < 2 || qp < -1 || qp > 51)
-            return 0;
-    }
-    return 1;
-}
-
-bool CLIOptions::parseZoneFile()
-{
-    char line[256];
-    char* argLine;
-    param->rc.zonefileCount = 0;
-
-    while (fgets(line, sizeof(line), zoneFile))
-    {
-        if (!((*line == '#') || (strcmp(line, "\r\n") == 0)))
-            param->rc.zonefileCount++;
-    }
-
-    rewind(zoneFile);
-    param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
-    for (int i = 0; i < param->rc.zonefileCount; i++)
-    {
-        while (fgets(line, sizeof(line), zoneFile))
-        {
-            if (*line == '#' || (strcmp(line, "\r\n") == 0))
-                continue;
-            param->rc.zones[i].zoneParam = X265_MALLOC(x265_param, 1);
-            int index = (int)strcspn(line, "\r\n");
-            line[index] = '\0';
-            argLine = line;
-            while (isspace((unsigned char)*argLine)) argLine++;
-            char* start = strchr(argLine, ' ');
-            start++;
-            param->rc.zones[i].startFrame = atoi(argLine);
-            int argCount = 0;
-            char **args = (char**)malloc(256 * sizeof(char *));
-            // Adding a dummy string to avoid file parsing error
-            args[argCount++] = (char *)"x265";
-            char* token = strtok(start, " ");
-            while (token) 
-            {
-                args[argCount++] = token;
-                token = strtok(NULL, " ");
-            }
-            args[argCount] = NULL;
-            CLIOptions cliopt;
-            if (cliopt.parseZoneParam(argCount, args,param, i))
-            {
-                cliopt.destroy();
-                if (cliopt.api)
-                    cliopt.api->param_free(cliopt.param);
-                exit(1);
-            }
-            break;
-        }
-    }
-    return 1;
-}
 
 #ifdef _WIN32
 /* Copy of x264 code, which allows for Unicode characters in the command line.
@@ -782,59 +84,165 @@ static int get_argv_utf8(int *argc_ptr, 
 }
 #endif
 
-/* Parse the RPU file and extract the RPU corresponding to the current picture 
- * and fill the rpu field of the input picture */
-static int rpuParser(x265_picture * pic, FILE * ptr)
-{
-    uint8_t byteVal;
-    uint32_t code = 0;
-    int bytesRead = 0;
-    pic->rpu.payloadSize = 0;
+/* Checks for abr-ladder config file in the command line.
+ * Returns true if abr-config file is present. Returns 
+ * false otherwise */
 
-    if (!pic->pts)
-    {
-        while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, ptr))
-            code = (code << 8) | byteVal;
-      
-        if (code != START_CODE)
-        {
-            x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
-            return 1;
-        }
-    } 
-
-    bytesRead = 0;
-    while (fread(&byteVal, sizeof(uint8_t), 1, ptr))
+static bool checkAbrLadder(int argc, char **argv, FILE **abrConfig)
+{
+    for (optind = 0;;) // restart getopt scanning from the first argument
     {
-        code = (code << 8) | byteVal;
-        if (bytesRead++ < 3)
-            continue;
-        if (bytesRead >= 1024)
+        int long_options_index = -1;
+        int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
+        if (c == -1)
+            break;
+        if (long_options_index < 0 && c > 0) // short option: look up its long_options entry by value
         {
-            x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
-            return 1;
+            for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
+            {
+                if (long_options[i].val == c)
+                {
+                    long_options_index = (int)i;
+                    break;
+                }
+            }
+
+            if (long_options_index < 0)
+            {
+                /* getopt_long might have already printed an error message */
+                if (c != 63) // 63 is ASCII '?', the value getopt_long returns for an option it rejected
+                    x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
+                return false;
+            }
         }
-        
-        if (code != START_CODE)
-            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
-        else
-            return 0;       
+        if (long_options_index < 0)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
+            return false;
+        }
+        if (!strcmp(long_options[long_options_index].name, "abr-ladder"))
+        {
+            *abrConfig = x265_fopen(optarg, "rb");
+            if (!*abrConfig) // test the opened stream; the out-parameter itself is never NULL
+                x265_log_file(NULL, X265_LOG_ERROR, "%s abr-ladder config file not found or error in opening config file\n", optarg);
+            return *abrConfig != NULL; // report "present" only when the caller gets a usable stream
+        }
     }
-
-    int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
-    int bytesLeft = bytesRead - pic->rpu.payloadSize;
-    code = (code << ShiftBytes * 8);
-    for (int i = 0; i < bytesLeft; i++)
-    {
-        pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
-        code = (code << 8);
-    }
-    if (!pic->rpu.payloadSize)
-        x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
-    return 0;
+    return false;
 }
 
+static uint8_t getNumAbrEncodes(FILE* abrConfig)
+{
+    /* Counts the encode entries in the ABR config file, then rewinds it for the parsing pass */
+    char line[1024];
 
+    uint8_t numEncodes = 0;
+    while (fgets(line, sizeof(line), abrConfig))
+    {
+        /* '#' comments and blank lines (a lone LF or CRLF terminator) are not entries */
+        if (*line != '#' && strcspn(line, "\r\n") != 0)
+            numEncodes++;
+    }
+    rewind(abrConfig);
+    return numEncodes;
+}
+
+static bool parseAbrConfig(FILE* abrConfig, CLIOptions cliopt[], uint8_t numEncodes)
+{
+    /* Fills cliopt[i] from the i-th "[encName:loadLevel:reuseName] <options>" entry; false if an entry is malformed */
+    char line[1024];
+    char* entry;
+
+    for (uint32_t i = 0; i < numEncodes; i++)
+    {
+        do // '#' comments and blank (LF or CRLF) lines must not consume an encode slot
+            entry = fgets(line, sizeof(line), abrConfig);
+        while (entry && (*line == '#' || strcspn(line, "\r\n") == 0));
+        char* start = entry ? strchr(line, ' ') : NULL; // options follow the first blank; find it before strtok() splits the header
+        if (!start)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Missing ABR CLI entry or options for encode %d\n", i);
+            return false;
+        }
+        line[strcspn(line, "\r\n")] = '\0';
+        while (isspace((unsigned char)*start)) start++;
+        int argc = 0;
+        char **argv = (char**)malloc(256 * sizeof(char *));
+        // Adding a dummy string to avoid file parsing error
+        argv[argc++] = (char *)"x265";
+
+        /* Parse CLI header to identify the ID of the load encode and the reuse level */
+        char *header = strtok(line, "[]");
+        uint32_t idCount = 0;
+        char *id = strtok(header, ":");
+        char *head[X265_HEAD_ENTRIES];
+        cliopt[i].encId = i;
+        for (; id; id = strtok(NULL, ":"))
+        {
+            if (idCount < X265_HEAD_ENTRIES) // count surplus fields, but never store past head[]
+                head[idCount] = id;
+            idCount++;
+        }
+        if (idCount != X265_HEAD_ENTRIES)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Incorrect number of arguments in ABR CLI header at line %d\n", i);
+            return false;
+        }
+        cliopt[i].encName = strdup(head[0]);
+        cliopt[i].loadLevel = atoi(head[1]);
+        cliopt[i].reuseName = strdup(head[2]);
+
+        char* token = strtok(start, " ");
+        while (token && argc < 255) // argv has 256 slots; the last one is kept for the NULL terminator
+        {
+            argv[argc++] = token;
+            token = strtok(NULL, " ");
+        }
+        argv[argc] = NULL;
+        if (cliopt[i].parse(argc++, argv))
+        {
+            cliopt[i].destroy();
+            if (cliopt[i].api)
+                cliopt[i].api->param_free(cliopt[i].param);
+            exit(1);
+        }
+    }
+    return true;
+}
+
+static bool setRefContext(CLIOptions cliopt[], uint32_t numEncodes)
+{
+    /* For every encode that names a reference ("nil" means none), find the encode whose
+     * encName equals that reuseName, record its index as refId, bump the reference's
+     * numRefs and raise its saveLevel to at least this encode's loadLevel.
+     * Returns false after logging if a named reference matches no encode. */
+
+    for (uint32_t cur = 0; cur < numEncodes; cur++)
+    {
+        CLIOptions& load = cliopt[cur];
+        if (strcmp(load.reuseName, "nil") == 0)
+            continue;
+
+        /* Index of the first encode carrying the requested name, or numEncodes if none */
+        uint32_t ref = 0;
+        while (ref < numEncodes && strcmp(load.reuseName, cliopt[ref].encName) != 0)
+            ref++;
+
+        if (ref == numEncodes)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Reference encode (%s) not found for %s\n", load.reuseName,
+                load.encName);
+            return false;
+        }
+
+        /* Link the loading encode to the encode that saves the data it reuses */
+        CLIOptions& save = cliopt[ref];
+        load.refId = ref;
+        save.numRefs++;
+        save.saveLevel = X265_MAX(save.saveLevel, load.loadLevel);
+    }
+    return true;
+}
 /* CLI return codes:
  *
  * 0 - encode successful
@@ -859,354 +267,57 @@ int main(int argc, char **argv)
     get_argv_utf8(&argc, &argv);
 #endif
 
-    ReconPlay* reconPlay = NULL;
-    CLIOptions cliopt;
+    uint8_t numEncodes = 1;
+    FILE *abrConfig = NULL;
+    bool isAbrLadder = checkAbrLadder(argc, argv, &abrConfig);
 
-    if (cliopt.parse(argc, argv))
+    if (isAbrLadder)
+        numEncodes = getNumAbrEncodes(abrConfig);
+
+    CLIOptions* cliopt = new CLIOptions[numEncodes];
+
+    if (isAbrLadder)
     {
-        cliopt.destroy();
-        if (cliopt.api)
-            cliopt.api->param_free(cliopt.param);
+        if (!parseAbrConfig(abrConfig, cliopt, numEncodes))
+            exit(1);
+        if (!setRefContext(cliopt, numEncodes))
+            exit(1);
+    }
+    else if (cliopt[0].parse(argc, argv))
+    {
+        cliopt[0].destroy();
+        if (cliopt[0].api)
+            cliopt[0].api->param_free(cliopt[0].param);
         exit(1);
     }
 
-    x265_param* param = cliopt.param;
-    const x265_api* api = cliopt.api;
-#if ENABLE_LIBVMAF
-    x265_vmaf_data* vmafdata = cliopt.vmafData;
-#endif
-    /* This allows muxers to modify bitstream format */
-    cliopt.output->setParam(param);
+    int ret = 0;
 
-    if (cliopt.reconPlayCmd)
-        reconPlay = new ReconPlay(cliopt.reconPlayCmd, *param);
-
-    if (cliopt.zoneFile)
+    AbrEncoder* abrEnc = new AbrEncoder(cliopt, numEncodes, ret);
+    int threadsActive = abrEnc->m_numActiveEncodes.get();
+    while (threadsActive)
     {
-        if (!cliopt.parseZoneFile())
+        threadsActive = abrEnc->m_numActiveEncodes.waitForChange(threadsActive);
+        for (uint8_t idx = 0; idx < numEncodes; idx++)
         {
-            x265_log(NULL, X265_LOG_ERROR, "Unable to parse zonefile\n");
-            fclose(cliopt.zoneFile);
-            cliopt.zoneFile = NULL;
+            if (abrEnc->m_passEnc[idx]->m_ret)
+            {
+                if (isAbrLadder)
+                    x265_log(NULL, X265_LOG_INFO, "Error generating ABR-ladder \n");
+                ret = abrEnc->m_passEnc[idx]->m_ret;
+                threadsActive = 0;
+                break;
+            }
         }
     }
 
-    /* note: we could try to acquire a different libx265 API here based on
-     * the profile found during option parsing, but it must be done before
-     * opening an encoder */
-
-    x265_encoder *encoder = api->encoder_open(param);
-    if (!encoder)
-    {
-        x265_log(param, X265_LOG_ERROR, "failed to open encoder\n");
-        cliopt.destroy();
-        api->param_free(param);
-        api->cleanup();
-        exit(2);
-    }
-
-    /* get the encoder parameters post-initialization */
-    api->encoder_parameters(encoder, param);
-
-     /* Control-C handler */
-    if (signal(SIGINT, sigint_handler) == SIG_ERR)
-        x265_log(param, X265_LOG_ERROR, "Unable to register CTRL+C handler: %s\n", strerror(errno));
-
-    x265_picture pic_orig, pic_out;
-    x265_picture *pic_in = &pic_orig;
-    /* Allocate recon picture if analysis save/load is enabled */
-    std::priority_queue<int64_t>* pts_queue = cliopt.output->needPTS() ? new std::priority_queue<int64_t>() : NULL;
-    x265_picture *pic_recon = (cliopt.recon || param->analysisSave || param->analysisLoad || pts_queue || reconPlay || param->csvLogLevel) ? &pic_out : NULL;
-    uint32_t inFrameCount = 0;
-    uint32_t outFrameCount = 0;
-    x265_nal *p_nal;
-    x265_stats stats;
-    uint32_t nal;
-    int16_t *errorBuf = NULL;
-    bool bDolbyVisionRPU = false;
-    uint8_t *rpuPayload = NULL;
-    int ret = 0;
-    int inputPicNum = 1;
-    x265_picture picField1, picField2;
-
-    if (!param->bRepeatHeaders && !param->bEnableSvtHevc)
-    {
-        if (api->encoder_headers(encoder, &p_nal, &nal) < 0)
-        {
-            x265_log(param, X265_LOG_ERROR, "Failure generating stream headers\n");
-            ret = 3;
-            goto fail;
-        }
-        else
-            cliopt.totalbytes += cliopt.output->writeHeaders(p_nal, nal);
-    }
-
-    if (param->bField && param->interlaceMode)
-    {
-        api->picture_init(param, &picField1);
-        api->picture_init(param, &picField2);
-        // return back the original height of input
-        param->sourceHeight *= 2;
-        api->picture_init(param, pic_in);
-    }
-    else
-        api->picture_init(param, pic_in);
-
-    if (param->dolbyProfile && cliopt.dolbyVisionRpu)
-    {
-        rpuPayload = X265_MALLOC(uint8_t, 1024);
-        pic_in->rpu.payload = rpuPayload;
-        if (pic_in->rpu.payload)
-            bDolbyVisionRPU = true;
-    }
-    
-    if (cliopt.bDither)
-    {
-        errorBuf = X265_MALLOC(int16_t, param->sourceWidth + 1);
-        if (errorBuf)
-            memset(errorBuf, 0, (param->sourceWidth + 1) * sizeof(int16_t));
-        else
-            cliopt.bDither = false;
-    }
-
-    // main encoder loop
-    while (pic_in && !b_ctrl_c)
-    {
-        pic_orig.poc = (param->bField && param->interlaceMode) ? inFrameCount * 2 : inFrameCount;
-        if (cliopt.qpfile)
-        {
-            if (!cliopt.parseQPFile(pic_orig))
-            {
-                x265_log(NULL, X265_LOG_ERROR, "can't parse qpfile for frame %d\n", pic_in->poc);
-                fclose(cliopt.qpfile);
-                cliopt.qpfile = NULL;
-            }
-        }
-
-        if (cliopt.framesToBeEncoded && inFrameCount >= cliopt.framesToBeEncoded)
-            pic_in = NULL;
-        else if (cliopt.input->readPicture(pic_orig))
-            inFrameCount++;
-        else
-            pic_in = NULL;
-
-        if (pic_in)
-        {
-            if (pic_in->bitDepth > param->internalBitDepth && cliopt.bDither)
-            {
-                x265_dither_image(pic_in, cliopt.input->getWidth(), cliopt.input->getHeight(), errorBuf, param->internalBitDepth);
-                pic_in->bitDepth = param->internalBitDepth;
-            }
-            /* Overwrite PTS */
-            pic_in->pts = pic_in->poc;
-
-            // convert to field
-            if (param->bField && param->interlaceMode)
-            {
-                int height = pic_in->height >> 1;
-                
-                int static bCreated = 0;
-                if (bCreated == 0)
-                {
-                    bCreated = 1;
-                    inputPicNum = 2;
-                    picField1.fieldNum = 1;
-                    picField2.fieldNum = 2;
-
-                    picField1.bitDepth = picField2.bitDepth = pic_in->bitDepth;
-                    picField1.colorSpace = picField2.colorSpace = pic_in->colorSpace;
-                    picField1.height = picField2.height = pic_in->height >> 1;
-                    picField1.framesize = picField2.framesize = pic_in->framesize >> 1;
-
-                    size_t fieldFrameSize = (size_t)pic_in->framesize >> 1;
-                    char* field1Buf = X265_MALLOC(char, fieldFrameSize);
-                    char* field2Buf = X265_MALLOC(char, fieldFrameSize);
-  
-                    int stride = picField1.stride[0] = picField2.stride[0] = pic_in->stride[0];
-                    uint64_t framesize = stride * (height >> x265_cli_csps[pic_in->colorSpace].height[0]);
-                    picField1.planes[0] = field1Buf;
-                    picField2.planes[0] = field2Buf;
-                    for (int i = 1; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
-                    {
-                        picField1.planes[i] = field1Buf + framesize;
-                        picField2.planes[i] = field2Buf + framesize;
-
-                        stride = picField1.stride[i] = picField2.stride[i] = pic_in->stride[i];
-                        framesize += (stride * (height >> x265_cli_csps[pic_in->colorSpace].height[i]));
-                    }
-                    assert(framesize  == picField1.framesize);
-                }
-
-                picField1.pts = picField1.poc = pic_in->poc;
-                picField2.pts = picField2.poc = pic_in->poc + 1;
-
-                picField1.userSEI = picField2.userSEI = pic_in->userSEI;
-
-                //if (pic_in->userData)
-                //{
-                //    // Have to handle userData here
-                //}
+    abrEnc->destroy();
+    delete abrEnc;
 
-                if (pic_in->framesize)
-                {
-                    for (int i = 0; i < x265_cli_csps[pic_in->colorSpace].planes; i++)
-                    {
-                        char* srcP1 = (char*)pic_in->planes[i];
-                        char* srcP2 = (char*)pic_in->planes[i] + pic_in->stride[i];
-                        char* p1 = (char*)picField1.planes[i];
-                        char* p2 = (char*)picField2.planes[i];
-
-                        int stride = picField1.stride[i];
-
-                        for (int y = 0; y < (height >> x265_cli_csps[pic_in->colorSpace].height[i]); y++)
-                        {
-                            memcpy(p1, srcP1, stride);
-                            memcpy(p2, srcP2, stride);
-                            srcP1 += 2*stride;
-                            srcP2 += 2*stride;
-                            p1 += stride;
-                            p2 += stride;
-                        }
-                    }
-                }
-            }
-
-            if (bDolbyVisionRPU)
-            {
-                if (param->bField && param->interlaceMode)
-                {
-                    if (rpuParser(&picField1, cliopt.dolbyVisionRpu) > 0)
-                        goto fail;
-                    if (rpuParser(&picField2, cliopt.dolbyVisionRpu) > 0)
-                        goto fail;
-                }
-                else
-                {
-                    if (rpuParser(pic_in, cliopt.dolbyVisionRpu) > 0)
-                        goto fail;
-                }
-            }
-        }
-                
-        for (int inputNum = 0; inputNum < inputPicNum; inputNum++)
-        {  
-            x265_picture *picInput = NULL;
-            if (inputPicNum == 2)
-                picInput = pic_in ? (inputNum ? &picField2 : &picField1) : NULL;
-            else
-                picInput = pic_in;
-
-            int numEncoded = api->encoder_encode( encoder, &p_nal, &nal, picInput, pic_recon );
-            if( numEncoded < 0 )
-            {
-                b_ctrl_c = 1;
-                ret = 4;
-                break;
-            }
-
-            if (reconPlay && numEncoded)
-                reconPlay->writePicture(*pic_recon);
-
-            outFrameCount += numEncoded;
-
-            if (numEncoded && pic_recon && cliopt.recon)
-                cliopt.recon->writePicture(pic_out);
-            if (nal)
-            {
-                cliopt.totalbytes += cliopt.output->writeFrame(p_nal, nal, pic_out);
-                if (pts_queue)
-                {
-                    pts_queue->push(-pic_out.pts);
-                    if (pts_queue->size() > 2)
-                        pts_queue->pop();
-                }
-            }
-            cliopt.printStatus( outFrameCount );
-        }
-    }
+    for (uint8_t idx = 0; idx < numEncodes; idx++)
+        cliopt[idx].destroy();
 
-    /* Flush the encoder */
-    while (!b_ctrl_c)
-    {
-        int numEncoded = api->encoder_encode(encoder, &p_nal, &nal, NULL, pic_recon);
-        if (numEncoded < 0)
-        {
-            ret = 4;
-            break;
-        }
-
-        if (reconPlay && numEncoded)
-            reconPlay->writePicture(*pic_recon);
-
-        outFrameCount += numEncoded;
-        if (numEncoded && pic_recon && cliopt.recon)
-            cliopt.recon->writePicture(pic_out);
-        if (nal)
-        {
-            cliopt.totalbytes += cliopt.output->writeFrame(p_nal, nal, pic_out);
-            if (pts_queue)
-            {
-                pts_queue->push(-pic_out.pts);
-                if (pts_queue->size() > 2)
-                    pts_queue->pop();
-            }
-        }
-
-        cliopt.printStatus(outFrameCount);
-
-        if (!numEncoded)
-            break;
-    }
-  
-    if (bDolbyVisionRPU)
-    {
-        if(fgetc(cliopt.dolbyVisionRpu) != EOF)
-            x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU count is greater than frame count\n");
-        x265_log(NULL, X265_LOG_INFO, "VES muxing with Dolby Vision RPU file successful\n");
-    }
-
-    /* clear progress report */
-    if (cliopt.bProgress)
-        fprintf(stderr, "%*s\r", 80, " ");
-
-fail:
-
-    delete reconPlay;
-
-    api->encoder_get_stats(encoder, &stats, sizeof(stats));
-    if (param->csvfn && !b_ctrl_c)
-#if ENABLE_LIBVMAF
-        api->vmaf_encoder_log(encoder, argc, argv, param, vmafdata);
-#else
-        api->encoder_log(encoder, argc, argv);
-#endif
-    api->encoder_close(encoder);
-
-    int64_t second_largest_pts = 0;
-    int64_t largest_pts = 0;
-    if (pts_queue && pts_queue->size() >= 2)
-    {
-        second_largest_pts = -pts_queue->top();
-        pts_queue->pop();
-        largest_pts = -pts_queue->top();
-        pts_queue->pop();
-        delete pts_queue;
-        pts_queue = NULL;
-    }
-    cliopt.output->closeFile(largest_pts, second_largest_pts);
-
-    if (b_ctrl_c)
-        general_log(param, NULL, X265_LOG_INFO, "aborted at input frame %d, output frame %d\n",
-                    cliopt.seek + inFrameCount, stats.encodedPictureCount);
-
-    api->cleanup(); /* Free library singletons */
-
-    cliopt.destroy();
-
-    api->param_free(param);
-
-    X265_FREE(errorBuf);
-    X265_FREE(rpuPayload);
+    delete[] cliopt;
 
     SetConsoleTitle(orgConsoleTitle);
     SetThreadExecutionState(ES_CONTINUOUS);
--- a/source/x265.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/x265.h	Wed May 06 14:59:56 2020 +0530
@@ -134,6 +134,7 @@ typedef struct x265_analysis_validate
     int     ctuDistortionRefine;
     int     rightOffset;
     int     bottomOffset;
+    int     frameDuplication;
 }x265_analysis_validate;
 
 /* Stores intra analysis data for a single frame. This struct needs better packing */
@@ -304,6 +305,7 @@ typedef struct x265_frame_stats
     double           totalFrameTime;
     double           vmafFrameScore;
     double           bufferFillFinal;
+    double           unclippedBufferFillFinal;
 } x265_frame_stats;
 
 typedef struct x265_ctu_info_t
@@ -1255,9 +1257,9 @@ typedef struct x265_param
      * skip blocks. Default is disabled */
     int       bEnableEarlySkip;
 
-    /* Enable early CU size decisions to avoid recursing to higher depths. 
+    /* Enable early CU size decisions to avoid recursing to higher depths.
      * Default is enabled */
-    int bEnableRecursionSkip;
+    int       recursionSkipMode;
 
     /* Use a faster search method to find the best intra mode. Default is 0 */
     int       bEnableFastIntra;
@@ -1857,7 +1859,7 @@ typedef struct x265_param
     double    edgeTransitionThreshold;
 
     /* Enables histogram based scenecut detection algorithm to detect scenecuts. Default disabled */
-    int      bHistBasedSceneCut;
+    int       bHistBasedSceneCut;
 
     /* Enable HME search ranges for L0, L1 and L2 respectively. */
     int       hmeRange[3];
@@ -1874,7 +1876,7 @@ typedef struct x265_param
     * analysis information stored in analysis-save. Higher the refine level higher
     * the information stored. Default is 5 */
     int       analysisSaveReuseLevel;
-    
+
     /* A value between 1 and 10 (both inclusive) determines the level of
     * analysis information reused in analysis-load. Higher the refine level higher
     * the information reused. Default is 5 */
@@ -1901,6 +1903,12 @@ typedef struct x265_param
     * info is available from the corresponding analysis-save. */
 
     int      confWinBottomOffset;
+
+    /* Edge variance threshold for quad tree establishment. */
+    float    edgeVarThreshold;
+
+    /* Maxrate that could be signaled to the decoder. Default 0. API only. */
+    int      decoderVbvMaxRate;
 } x265_param;
 
 /* x265_param_alloc:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/x265cli.cpp	Wed May 06 14:59:56 2020 +0530
@@ -0,0 +1,1054 @@
+/*****************************************************************************
+ * Copyright (C) 2013-2020 MulticoreWare, Inc
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
+#include "x265cli.h"
+
+#define START_CODE 0x00000001
+#define START_CODE_BYTES 4
+
+#ifdef __cplusplus
+namespace X265_NS {
+#endif
+
+    static void printVersion(x265_param *param, const x265_api* api) /* logs api->version_str, then api->build_info_str, one x265_log() call each */
+    {
+        x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str); /* both lines are emitted at X265_LOG_INFO */
+        x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
+    }
+
+    static void showHelp(x265_param *param)
+    {
+        int level = param->logLevel;
+
+#define OPT(value) (value ? "enabled" : "disabled")
+#define H0 printf
+#define H1 if (level >= X265_LOG_DEBUG) printf
+
+        H0("\nSyntax: x265 [options] infile [-o] outfile\n");
+        H0("    infile can be YUV or Y4M\n");
+        H0("    outfile is raw HEVC bitstream\n");
+        H0("\nExecutable Options:\n");
+        H0("-h/--help                        Show this help text and exit\n");
+        H0("   --fullhelp                    Show all options and exit\n");
+        H0("-V/--version                     Show version info and exit\n");
+        H0("\nOutput Options:\n");
+        H0("-o/--output <filename>           Bitstream output file name\n");
+        H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
+        H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
+        H0("   --no-progress                 Disable CLI progress reports\n");
+        H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
+        H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
+        H0("\nInput Options:\n");
+        H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
+        H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
+        H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
+        H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
+        H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
+        H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
+        H1("                                 0 - i400 (4:0:0 monochrome)\n");
+        H1("                                 1 - i420 (4:2:0 default)\n");
+        H1("                                 2 - i422 (4:2:2)\n");
+        H1("                                 3 - i444 (4:4:4)\n");
+#if ENABLE_HDR10_PLUS
+        H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
+        H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
+#endif
+        H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
+        H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
+            "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
+        H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
+        H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
+        H0("   --seek <integer>              First frame to encode\n");
+        H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
+        H0("   --[no-]field                  Enable or disable field coding. Default %s\n", OPT(param->bField));
+        H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+        H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
+        H0("\nQuality reporting metrics:\n");
+        H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
+        H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
+        H0("\nProfile, Level, Tier:\n");
+        H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
+        H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
+        H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
+        H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
+        H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
+        H0("\nThreading, performance:\n");
+        H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
+        H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
+        H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
+        H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
+        H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
+        H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
+        H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
+        H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
+        H0("\nPresets:\n");
+        H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
+        H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
+        H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
+        H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
+        H0("\nQuad-Tree size and depth:\n");
+        H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
+        H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
+        H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
+        H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
+        H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
+        H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
+        H0("\nAnalysis:\n");
+        H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
+        H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
+        H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
+        H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
+        H0("   --dynamic-rd <0..4.0>         Strength of dynamic RD, 0 to disable. Default %.2f\n", param->dynamicRd);
+        H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
+        H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
+        H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
+        H0("   --rskip <mode>                Set mode for early exit from recursion. Mode 1: exit using rdcost & CU homogenity. Mode 2: exit using CU edge density.\n"
+            "                                 Mode 0: disabled. Default %d\n", param->recursionSkipMode);
+        H1("   --rskip-edge-threshold        Threshold in terms of percentage (integer of range [0,100]) for minimum edge density in CUs used to prun the recursion depth. Applicable only for rskip mode 2. Value is preset dependent. Default: %.f\n", param->edgeVarThreshold*100.0f);
+        H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+        H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
+        H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
+        H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
+        H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
+            "                                    - 1: force the partitions if CTU information is present\n"
+            "                                    - 2: functionality of (1) and reduce qp if CTU information has changed\n"
+            "                                    - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n"
+            "                                    Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n");
+        H0("\nCoding tools:\n");
+        H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
+        H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
+        H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
+        H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
+        H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
+        H0("\nTemporal / motion search options:\n");
+        H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
+        H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
+        H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
+        H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
+        H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
+        H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
+        H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
+        H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
+        H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
+        H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
+        H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
+        H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);
+        H1("   --hme-range <int>,<int>,<int> Motion search-range for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeRange[0], param->hmeRange[1], param->hmeRange[2]);
+        H0("\nSpatial / intra options:\n");
+        H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
+        H0("   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
+        H0("   --[no-]b-intra                Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames));
+        H0("   --[no-]fast-intra             Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra));
+        H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
+        H0("\nSlice decision options:\n");
+        H0("   --[no-]open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
+        H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
+        H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
+        H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
+        H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
+        H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
+        H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias);
+        H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
+        H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
+        H1("   --hist-threshold <0.0..2.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
+        H0("   --[no-]fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
+        H1("   --[no-]scenecut-aware-qp      Enable increasing QP for frames inside the scenecut window after scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
+        H1("   --scenecut-window <0..1000>   QP incremental duration(in milliseconds) when scenecut-aware-qp is enabled. Default %d\n", param->scenecutWindow);
+        H1("   --max-qp-delta <0..10>        QP offset to increment with base QP for inter-frames. Default %d\n", param->maxQpDelta);
+        H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
+        H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
+        H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
+        H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
+        H0("   --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
+        H0("-b/--bframes <0..16>             Maximum number of consecutive b-frames. Default %d\n", param->bframes);
+        H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
+        H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
+        H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
+        H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
+        H1("                                 Format of each line: framenumber frametype QP\n");
+        H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
+        H1("                                 QPs are restricted by qpmin/qpmax.\n");
+        H1("   --force-flush <integer>       Force the encoder to flush frames. Default %d\n", param->forceFlush);
+        H1("                                 0 - flush the encoder only when all the input pictures are over.\n");
+        H1("                                 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.\n");
+        H1("                                 2 - flush the slicetype decided frames only.\n");
+        H0("   --[no-]-hrd-concat            Set HRD concatenation flag for the first keyframe in the buffering period SEI. Default %s\n", OPT(param->bEnableHRDConcatFlag));
+        H0("\nRate control, Adaptive Quantization:\n");
+        H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
+        H1("-q/--qp <integer>                QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
+        H0("   --crf <float>                 Quality-based VBR (0-51). Default %.1f\n", param->rc.rfConstant);
+        H1("   --[no-]lossless               Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
+        H1("   --crf-max <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
+        H1("                                 May cause VBV underflows!\n");
+        H1("   --crf-min <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
+        H1("                                 this specifies a minimum rate factor value for encode!\n");
+        H0("   --vbv-maxrate <integer>       Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
+        H0("   --vbv-bufsize <integer>       Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
+        H0("   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
+        H0("   --vbv-end <float>             Final VBV buffer emptiness (fraction of bufsize or in kbits). Default 0 (disabled)\n");
+        H0("   --vbv-end-fr-adj <float>      Frame from which qp has to be adjusted to achieve final decode buffer emptiness. Default 0\n");
+        H0("   --chunk-start <integer>       First frame of the chunk. Default 0 (disabled)\n");
+        H0("   --chunk-end <integer>         Last frame of the chunk. Default 0 (disabled)\n");
+        H0("   --pass                        Multi pass rate control.\n"
+            "                                   - 1 : First pass, creates stats file\n"
+            "                                   - 2 : Last pass, does not overwrite stats file\n"
+            "                                   - 3 : Nth pass, overwrites stats file\n");
+        H0("   --[no-]multi-pass-opt-analysis   Refine analysis in 2 pass based on analysis information from pass 1\n");
+        H0("   --[no-]multi-pass-opt-distortion Use distortion of CTU from pass 1 to refine qp in 2 pass\n");
+        H0("   --stats                       Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
+        H0("   --[no-]analyze-src-pics       Motion estimation uses source frame planes. Default disable\n");
+        H0("   --[no-]slow-firstpass         Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
+        H0("   --[no-]strict-cbr             Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr));
+        H0("   --analysis-save <filename>    Dump analysis info into the specified file. Default Disabled\n");
+        H0("   --analysis-load <filename>    Load analysis buffers from the file specified. Default Disabled\n");
+        H0("   --analysis-reuse-file <filename>    Specify file name used for either dumping or reading analysis data. Deault x265_analysis.dat\n");
+        H0("   --analysis-reuse-level <1..10>      Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Now deprecated. Default %d\n", param->analysisReuseLevel);
+        H0("   --analysis-save-reuse-level <1..10> Indicates the amount of analysis info stored in save mode, 1:least..10:most. Default %d\n", param->analysisSaveReuseLevel);
+        H0("   --analysis-load-reuse-level <1..10> Indicates the amount of analysis info reused in load mode, 1:least..10:most. Default %d\n", param->analysisLoadReuseLevel);
+        H0("   --refine-analysis-type <string>     Reuse anlaysis information received through API call. Supported options are avc and hevc. Default disabled - %d\n", param->bAnalysisType);
+        H0("   --scale-factor <int>          Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
+        H0("   --refine-intra <0..4>         Enable intra refinement for encode that uses analysis-load.\n"
+            "                                    - 0 : Forces both mode and depth from the save encode.\n"
+            "                                    - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n"
+            "                                    - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n"
+            "                                    - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n"
+            "                                    - 4 : Re-evaluate all intra blocks, does not reuse data from save encode.\n"
+            "                                Default:%d\n", param->intraRefine);
+        H0("   --refine-inter <0..3>         Enable inter refinement for encode that uses analysis-load.\n"
+            "                                    - 0 : Forces both mode and depth from the save encode.\n"
+            "                                    - 1 : Functionality of (0) + evaluate all inter modes at min-cu-size's depth when current depth is one smaller than\n"
+            "                                          min-cu-size's depth. When save encode decides the current block as skip(for all sizes) evaluate skip/merge.\n"
+            "                                    - 2 : Functionality of (1) + irrespective of size restrict the modes evaluated when specific modes are decided as the best mode by the save encode.\n"
+            "                                    - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
+            "                                Default:%d\n", param->interRefine);
+        H0("   --[no-]dynamic-refine         Dynamically changes refine-inter level for each CU. Default %s\n", OPT(param->bDynamicRefine));
+        H0("   --refine-mv <1..3>            Enable mv refinement for load mode. Default %d\n", param->mvRefine);
+        H0("   --refine-ctu-distortion       Store/normalize ctu distortion in analysis-save/load.\n"
+            "                                    - 0 : Disabled.\n"
+            "                                    - 1 : Store/Load ctu distortion to/from the file specified in analysis-save/load.\n"
+            "                                Default 0 - Disabled\n");
+        H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes 4:auto variance with edge information. Default %d\n", param->rc.aqMode);
+        H0("   --[no-]hevc-aq                Mode for HEVC Adaptive Quantization. Default %s\n", OPT(param->rc.hevcAq));
+        H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
+        H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
+        H0("   --[no-]aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
+        H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
+        H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
+        H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
+        H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
+        H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
+        H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
+        H1("   --qpstep <integer>            The maximum single adjustment in QP allowed to rate control. Default %d\n", param->rc.qpStep);
+        H1("   --qpmin <integer>             sets a hard lower limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMin);
+        H1("   --qpmax <integer>             sets a hard upper limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMax);
+        H0("   --[no-]const-vbv              Enable consistent vbv. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableConstVbv));
+        H1("   --cbqpoffs <integer>          Chroma Cb QP Offset [-12..12]. Default %d\n", param->cbQpOffset);
+        H1("   --crqpoffs <integer>          Chroma Cr QP Offset [-12..12]. Default %d\n", param->crQpOffset);
+        H1("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
+        H1("   --zones <zone0>/<zone1>/...   Tweak the bitrate of regions of the video\n");
+        H1("                                 Each zone is of the form\n");
+        H1("                                   <start frame>,<end frame>,<option>\n");
+        H1("                                   where <option> is either\n");
+        H1("                                       q=<integer> (force QP)\n");
+        H1("                                   or  b=<float> (bitrate multiplier)\n");
+        H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
+        H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
+        H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
+        H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
+        H1("                                 Comma is considered to be white-space\n");
+        H0("   --max-ausize-factor <float>   This value controls the maximum AU size defined in specification.\n");
+        H0("                                 It represents the percentage of maximum AU size used. Default %.1f\n", param->maxAUSizeFactor);
+        H0("\nLoop filters (deblock and SAO):\n");
+        H0("   --[no-]deblock                Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
+        H0("   --[no-]sao                    Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
+        H1("   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
+        H0("   --[no-]limit-sao              Limit Sample Adaptive Offset types. Default %s\n", OPT(param->bLimitSAO));
+        H0("   --selective-sao <int>         Enable slice-level SAO filter. Default %d\n", param->selectiveSAO);
+        H0("\nVUI options:\n");
+        H0("   --sar <width:height|int>      Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
+        H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
+        H0("                                 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
+        H0("                                 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
+        H1("   --display-window <string>     Describe overscan cropping region as 'left,top,right,bottom' in pixels\n");
+        H1("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
+        H0("   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
+        H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited\n");
+        H0("   --colorprim <string>          Specify color primaries from  bt709, unknown, reserved, bt470m, bt470bg, smpte170m,\n");
+        H0("                                 smpte240m, film, bt2020, smpte428, smpte431, smpte432. Default undef\n");
+        H0("   --transfer <string>           Specify transfer characteristics from bt709, unknown, reserved, bt470m, bt470bg, smpte170m,\n");
+        H0("                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
+        H0("                                 bt2020-10, bt2020-12, smpte2084, smpte428, arib-std-b67. Default undef\n");
+        H1("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
+        H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c, smpte2085, chroma-derived-nc, chroma-derived-c, ictcp. Default undef\n");
+        H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
+        H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
+        H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
+        H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
+        H0("   --[no-]cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
+        H0("   --[no-]hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
+        H0("   --[no-]hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
+        H0("   --[no-]hdr10-opt              Block-level QP optimization for HDR10 content. Default %s.\n", OPT(param->bHDR10Opt));
+        H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
+        H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
+        H0("\nBitstream options:\n");
+        H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
+        H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
+        H0("   --[no-]hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
+        H0("   --[no-]idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
+        H0("   --[no-]temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
+        H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
+        H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
+        H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
+        H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
+        H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
+        H0("   --[no-]vui-timing-info        Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
+        H0("   --[no-]vui-hrd-info           Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
+        H0("   --[no-]opt-qp-pps             Dynamically optimize QP in PPS (instead of default 26) based on QPs in previous GOP. Default %s\n", OPT(param->bOptQpPPS));
+        H0("   --[no-]opt-ref-list-length-pps  Dynamically set L0 and L1 ref list length in PPS (instead of default 0) based on values in last GOP. Default %s\n", OPT(param->bOptRefListLengthPPS));
+        H0("   --[no-]multi-pass-opt-rps     Enable storing commonly used RPS in SPS in multi pass mode. Default %s\n", OPT(param->bMultiPassOptRPS));
+        H0("   --[no-]opt-cu-delta-qp        Optimize to signal consistent CU level delta QPs in frame. Default %s\n", OPT(param->bOptCUDeltaQP));
+        H1("\nReconstructed video options (debugging):\n");
+        H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
+        H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
+        H1("   --recon-y4m-exec <string>     pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
+        H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
+        H0("   --[no-]frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
+        H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
+#ifdef SVT_HEVC
+        H0("   --[no]svt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
+        H0("   --[no-]svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
+        H0("   --svt-search-width            Motion estimation search area width for SVT HEVC encoder \n");
+        H0("   --svt-search-height           Motion estimation search area height for SVT HEVC encoder \n");
+        H0("   --[no-]svt-compressed-ten-bit-format  Enable 8+2 encoding mode for 10bit input in SVT HEVC encoder \n");
+        H0("   --[no-]svt-speed-control      Enable speed control functionality to achieve real time encoding speed for  SVT HEVC encoder \n");
+        H0("   --svt-preset-tuner            Enable additional faster presets of SVT; This only has to be used on top of x265's ultrafast preset. Accepts values in the range of 0-2 \n");
+        H0("   --svt-hierarchical-level      Hierarchical layer for SVT-HEVC encoder; Accepts inputs in the range 0-3 \n");
+        H0("   --svt-base-layer-switch-mode  Select whether B/P slice should be used in base layer for SVT-HEVC encoder. 0-Use B-frames; 1-Use P frames in the base layer \n");
+        H0("   --svt-pred-struct             Select pred structure for SVT HEVC encoder;  Accepts inputs in the range 0-2 \n");
+        H0("   --[no-]svt-fps-in-vps         Enable VPS timing info for SVT HEVC encoder  \n");
+#endif
+        H0(" ABR-ladder settings\n");
+        H0("   --abr-ladder <file>           File containing config settings required for the generation of ABR-ladder\n");
+        H1("\nExecutable return codes:\n");
+        H1("    0 - encode successful\n");
+        H1("    1 - unable to parse command line\n");
+        H1("    2 - unable to open encoder\n");
+        H1("    3 - unable to generate stream headers\n");
+        H1("    4 - encoder abort\n");
+#undef OPT
+#undef H0
+#undef H1
+        if (level < X265_LOG_DEBUG)
+            printf("\nUse --fullhelp for a full listing (or --log-level full --help)\n");
+        printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
+        exit(1);
+    }
+
+    /* Release every file and reader/writer handle held by the CLI and reset
+     * each pointer to NULL, so a repeated call finds nothing left to release. */
+    void CLIOptions::destroy()
+    {
+        /* input reader */
+        if (input != NULL)
+        {
+            input->release();
+            input = NULL;
+        }
+
+        /* reconstructed-picture writer */
+        if (recon != NULL)
+        {
+            recon->release();
+            recon = NULL;
+        }
+
+        /* plain stdio handles opened while parsing the command line */
+        if (qpfile != NULL)
+        {
+            fclose(qpfile);
+            qpfile = NULL;
+        }
+        if (zoneFile != NULL)
+        {
+            fclose(zoneFile);
+            zoneFile = NULL;
+        }
+        if (dolbyVisionRpu != NULL)
+        {
+            fclose(dolbyVisionRpu);
+            dolbyVisionRpu = NULL;
+        }
+
+        /* bitstream writer goes last, as before */
+        if (output != NULL)
+        {
+            output->release();
+            output = NULL;
+        }
+    }
+
+    /* Print a one-line progress report for the encode.
+     *
+     * frameNum - number of frames processed so far; nothing is printed when 0.
+     *
+     * The report is suppressed when progress output is disabled or when the
+     * previous report was emitted less than UPDATE_INTERVAL ago.  The line is
+     * written to stderr (terminated by '\r' so the next report overwrites it)
+     * and is also passed to SetConsoleTitle(). */
+    void CLIOptions::printStatus(uint32_t frameNum)
+    {
+        char buf[200];
+        int64_t time = x265_mdate();
+
+        if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL))
+            return;
+
+        /* elapsed is treated as microseconds (note the 1000000 factors below) */
+        int64_t elapsed = time - startTime;
+        double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0;
+        /* Divide the frame rate in floating point: fpsNum / fpsDenom as an
+         * integer division truncates fractional rates such as 30000/1001 and
+         * under-reports the bitrate. */
+        float bitrate = 0.008f * totalbytes * ((float)param->fpsNum / (float)param->fpsDenom) / ((float)frameNum);
+        if (framesToBeEncoded)
+        {
+            /* remaining frames * average time per frame, in seconds */
+            int eta = (int)(elapsed * (framesToBeEncoded - frameNum) / ((int64_t)frameNum * 1000000));
+            sprintf(buf, "x265 [%.1f%%] %d/%d frames, %.2f fps, %.2f kb/s, eta %d:%02d:%02d",
+                100. * frameNum / (param->chunkEnd ? param->chunkEnd : param->totalFrames), frameNum, (param->chunkEnd ? param->chunkEnd : param->totalFrames), fps, bitrate,
+                eta / 3600, (eta / 60) % 60, eta % 60);
+        }
+        else
+            sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate);
+
+        /* buf + 5 skips the leading "x265 " on stderr; the console title keeps it */
+        fprintf(stderr, "%s  \r", buf + 5);
+        SetConsoleTitle(buf);
+        fflush(stderr); // needed in windows
+        prevUpdateTime = time;
+    }
+
+    /* Parse the option list of one zone into its own parameter set.
+     *
+     * argc/argv     - argument vector for this zone (argv[0] is a placeholder
+     *                 program name, as getopt_long expects)
+     * globalParam   - encoder parameters; copied as the starting point for the
+     *                 zone and owner of the rc.zones[] array that is filled in
+     * zonefileCount - index of the zone inside globalParam->rc.zones[]
+     *
+     * Returns true on error and false on success.  An unrecognised option
+     * ('?') prints the help text via showHelp(). */
+    bool CLIOptions::parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount)
+    {
+        bool bError = false;
+        int bShowHelp = false;
+        int outputBitDepth = 0;
+        const char *profile = NULL;
+
+        /* First pass: only pick up the output bit depth ('D'), the profile ('P')
+         * and option errors ('?'); they decide which API instance is used. */
+        for (optind = 0;;)
+        {
+            int c = getopt_long(argc, argv, short_options, long_options, NULL);
+            if (c == -1)
+                break;
+            else if (c == 'D')
+                outputBitDepth = atoi(optarg);
+            else if (c == 'P')
+                profile = optarg;
+            else if (c == '?')
+                bShowHelp = true;
+        }
+
+        if (!outputBitDepth && profile)
+        {
+            /* try to derive the output bit depth from the requested profile */
+            if (strstr(profile, "10"))
+                outputBitDepth = 10;
+            else if (strstr(profile, "12"))
+                outputBitDepth = 12;
+            else
+                outputBitDepth = 8;
+        }
+
+        api = x265_api_get(outputBitDepth);
+        if (!api)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
+            api = x265_api_get(0);
+        }
+
+        if (bShowHelp)
+        {
+            printVersion(globalParam, api);
+            showHelp(globalParam);
+        }
+
+        /* the zone gets its own parameter structure ... */
+        globalParam->rc.zones[zonefileCount].zoneParam = api->param_alloc();
+        if (!globalParam->rc.zones[zonefileCount].zoneParam)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
+            return true;
+        }
+
+        /* ... initialised as a byte-wise copy of the global parameters */
+        memcpy(globalParam->rc.zones[zonefileCount].zoneParam, globalParam, sizeof(x265_param));
+
+        /* Second pass: hand every option to zone_param_parse() by its long name */
+        for (optind = 0;;)
+        {
+            int long_options_index = -1;
+            int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
+            if (c == -1)
+                break;
+
+            /* a short option was given: look up the matching long option entry */
+            if (long_options_index < 0 && c > 0)
+            {
+                for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
+                {
+                    if (long_options[i].val == c)
+                    {
+                        long_options_index = (int)i;
+                        break;
+                    }
+                }
+
+                if (long_options_index < 0)
+                {
+                    /* getopt_long might have already printed an error message */
+                    /* 63 is the character code of '?' */
+                    if (c != 63)
+                        x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
+                    return true;
+                }
+            }
+            if (long_options_index < 0)
+            {
+                x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
+                return true;
+            }
+
+            bError |= !!api->zone_param_parse(globalParam->rc.zones[zonefileCount].zoneParam, long_options[long_options_index].name, optarg);
+
+            if (bError)
+            {
+                const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2];
+                x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
+                return true;
+            }
+        }
+
+        /* anything left after the options is an error: a zone has no positional arguments */
+        if (optind < argc)
+        {
+            x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]);
+            return true;
+        }
+        return false;
+    }
+
+    /* Parse the command line, allocate and populate the encoder parameters and
+     * open the input, reconstruction and output files.
+     *
+     * argc/argv - the process command line; both are also stored in
+     *             argCnt/argString.
+     *
+     * Returns true on any error (after logging it) and false on success.
+     * --help, --fullhelp and option errors print the help text through
+     * showHelp(); --version exits the process directly. */
+    bool CLIOptions::parse(int argc, char **argv)
+    {
+        bool bError = false;
+        int bShowHelp = false;
+        int inputBitDepth = 8;
+        int outputBitDepth = 0;
+        int reconFileBitDepth = 0;
+        const char *inputfn = NULL;
+        const char *reconfn = NULL;
+        const char *outputfn = NULL;
+        const char *preset = NULL;
+        const char *tune = NULL;
+        const char *profile = NULL;
+        int svtEnabled = 0;
+        argCnt = argc;
+        argString = argv;
+
+        if (argc <= 1)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "No input file. Run x265 --help for a list of options.\n");
+            return true;
+        }
+
+        /* Presets are applied before all other options. */
+        for (optind = 0;;)
+        {
+            int optionsIndex = -1;
+            int c = getopt_long(argc, argv, short_options, long_options, &optionsIndex);
+            if (c == -1)
+                break;
+            else if (c == 'p')
+                preset = optarg;
+            else if (c == 't')
+                tune = optarg;
+            else if (c == 'D')
+                outputBitDepth = atoi(optarg);
+            else if (c == 'P')
+                profile = optarg;
+            else if (c == '?')
+                bShowHelp = true;
+            else if (!c && !strcmp(long_options[optionsIndex].name, "svt"))
+                svtEnabled = 1;
+        }
+
+        if (!outputBitDepth && profile)
+        {
+            /* try to derive the output bit depth from the requested profile */
+            if (strstr(profile, "10"))
+                outputBitDepth = 10;
+            else if (strstr(profile, "12"))
+                outputBitDepth = 12;
+            else
+                outputBitDepth = 8;
+        }
+
+        api = x265_api_get(outputBitDepth);
+        if (!api)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "falling back to default bit-depth\n");
+            api = x265_api_get(0);
+        }
+
+        param = api->param_alloc();
+        if (!param)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "param alloc failed\n");
+            return true;
+        }
+#if ENABLE_LIBVMAF
+        vmafData = (x265_vmaf_data*)x265_malloc(sizeof(x265_vmaf_data));
+        if (!vmafData)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "vmaf data alloc failed\n");
+            return true;
+        }
+#endif
+
+        if (api->param_default_preset(param, preset, tune) < 0)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "preset or tune unrecognized\n");
+            return true;
+        }
+
+        if (bShowHelp)
+        {
+            printVersion(param, api);
+            showHelp(param);
+        }
+
+        //Set enable SVT-HEVC encoder first if found in the command line
+        if (svtEnabled) api->param_parse(param, "svt", NULL);
+
+        /* Main pass: CLI-only options are handled here, everything else is
+         * forwarded to param_parse() by its long option name. */
+        for (optind = 0;;)
+        {
+            int long_options_index = -1;
+            int c = getopt_long(argc, argv, short_options, long_options, &long_options_index);
+            if (c == -1)
+                break;
+
+            switch (c)
+            {
+            case 'h':
+                printVersion(param, api);
+                showHelp(param);
+                break;
+
+            case 'V':
+                printVersion(param, api);
+                x265_report_simd(param);
+                exit(0);
+
+            default:
+                /* a short option was given: look up the matching long option entry */
+                if (long_options_index < 0 && c > 0)
+                {
+                    for (size_t i = 0; i < sizeof(long_options) / sizeof(long_options[0]); i++)
+                    {
+                        if (long_options[i].val == c)
+                        {
+                            long_options_index = (int)i;
+                            break;
+                        }
+                    }
+
+                    if (long_options_index < 0)
+                    {
+                        /* getopt_long might have already printed an error message */
+                        /* 63 is the character code of '?' */
+                        if (c != 63)
+                            x265_log(NULL, X265_LOG_WARNING, "internal error: short option '%c' has no long option\n", c);
+                        return true;
+                    }
+                }
+                if (long_options_index < 0)
+                {
+                    x265_log(NULL, X265_LOG_WARNING, "short option '%c' unrecognized\n", c);
+                    return true;
+                }
+#define OPT(longname) \
+                                            else if (!strcmp(long_options[long_options_index].name, longname))
+#define OPT2(name1, name2) \
+                                            else if (!strcmp(long_options[long_options_index].name, name1) || \
+             !strcmp(long_options[long_options_index].name, name2))
+
+                /* "if (0);" opens the else-if chain that OPT/OPT2 expand into */
+                if (0);
+                OPT2("frame-skip", "seek") this->seek = (uint32_t)x265_atoi(optarg, bError);
+                OPT("frames") this->framesToBeEncoded = (uint32_t)x265_atoi(optarg, bError);
+                OPT("no-progress") this->bProgress = false;
+                OPT("output") outputfn = optarg;
+                OPT("input") inputfn = optarg;
+                OPT("recon") reconfn = optarg;
+                OPT("input-depth") inputBitDepth = (uint32_t)x265_atoi(optarg, bError);
+                OPT("dither") this->bDither = true;
+                OPT("recon-depth") reconFileBitDepth = (uint32_t)x265_atoi(optarg, bError);
+                OPT("y4m") this->bForceY4m = true;
+                OPT("profile") /* handled above */;
+                OPT("preset")  /* handled above */;
+                OPT("tune")    /* handled above */;
+                OPT("output-depth")   /* handled above */;
+                OPT("recon-y4m-exec") reconPlayCmd = optarg;
+                OPT("svt")    /* handled above */;
+                OPT("qpfile")
+                {
+                    this->qpfile = x265_fopen(optarg, "rb");
+                    if (!this->qpfile)
+                        x265_log_file(param, X265_LOG_ERROR, "%s qpfile not found or error in opening qp file\n", optarg);
+                }
+                OPT("dolby-vision-rpu")
+                {
+                    this->dolbyVisionRpu = x265_fopen(optarg, "rb");
+                    if (!this->dolbyVisionRpu)
+                    {
+                        x265_log_file(param, X265_LOG_ERROR, "Dolby Vision RPU metadata file %s not found or error in opening file\n", optarg);
+                        return true;
+                    }
+                }
+                OPT("zonefile")
+                {
+                    this->zoneFile = x265_fopen(optarg, "rb");
+                    if (!this->zoneFile)
+                        x265_log_file(param, X265_LOG_ERROR, "%s zone file not found or error in opening zone file\n", optarg);
+                }
+                OPT("fullhelp")
+                {
+                    param->logLevel = X265_LOG_FULL;
+                    printVersion(param, api);
+                    showHelp(param);
+                    break;
+                }
+                else
+                    bError |= !!api->param_parse(param, long_options[long_options_index].name, optarg);
+                if (bError)
+                {
+                    const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind - 2];
+                    x265_log(NULL, X265_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg);
+                    return true;
+                }
+#undef OPT
+            }
+        }
+
+        /* remaining positional arguments: input file first, then output file */
+        if (optind < argc && !inputfn)
+            inputfn = argv[optind++];
+        if (optind < argc && !outputfn)
+            outputfn = argv[optind++];
+        if (optind < argc)
+        {
+            x265_log(param, X265_LOG_WARNING, "extra unused command arguments given <%s>\n", argv[optind]);
+            return true;
+        }
+
+        if (!inputfn || !outputfn)
+        {
+            x265_log(param, X265_LOG_ERROR, "input or output file not specified, try --help for help\n");
+            return true;
+        }
+
+        if (param->internalBitDepth != api->bit_depth)
+        {
+            x265_log(param, X265_LOG_ERROR, "Only bit depths of %d are supported in this build\n", api->bit_depth);
+            return true;
+        }
+
+#ifdef SVT_HEVC
+        if (svtEnabled)
+        {
+            EB_H265_ENC_CONFIGURATION* svtParam = (EB_H265_ENC_CONFIGURATION*)param->svtHevcParam;
+            param->sourceWidth = svtParam->sourceWidth;
+            param->sourceHeight = svtParam->sourceHeight;
+            param->fpsNum = svtParam->frameRateNumerator;
+            param->fpsDenom = svtParam->frameRateDenominator;
+            svtParam->encoderBitDepth = inputBitDepth;
+        }
+#endif
+
+        InputFileInfo info;
+        info.filename = inputfn;
+        info.depth = inputBitDepth;
+        info.csp = param->internalCsp;
+        info.width = param->sourceWidth;
+        info.height = param->sourceHeight;
+        info.fpsNum = param->fpsNum;
+        info.fpsDenom = param->fpsDenom;
+        info.sarWidth = param->vui.sarWidth;
+        info.sarHeight = param->vui.sarHeight;
+        info.skipFrames = seek;
+        info.frameCount = 0;
+        getParamAspectRatio(param, info.sarWidth, info.sarHeight);
+
+
+        this->input = InputFile::open(info, this->bForceY4m);
+        if (!this->input || this->input->isFail())
+        {
+            x265_log_file(param, X265_LOG_ERROR, "unable to open input file <%s>\n", inputfn);
+            return true;
+        }
+
+        if (info.depth < 8 || info.depth > 16)
+        {
+            /* report the value that was actually checked */
+            x265_log(param, X265_LOG_ERROR, "Input bit depth (%d) must be between 8 and 16\n", info.depth);
+            return true;
+        }
+
+        /* Unconditionally accept height/width/csp/bitDepth from file info */
+        param->sourceWidth = info.width;
+        param->sourceHeight = info.height;
+        param->internalCsp = info.csp;
+        param->sourceBitDepth = info.depth;
+
+        /* Accept fps and sar from file info if not specified by user */
+        if (param->fpsDenom == 0 || param->fpsNum == 0)
+        {
+            param->fpsDenom = info.fpsDenom;
+            param->fpsNum = info.fpsNum;
+        }
+        if (!param->vui.aspectRatioIdc && info.sarWidth && info.sarHeight)
+            setParamAspectRatio(param, info.sarWidth, info.sarHeight);
+        if (this->framesToBeEncoded == 0 && info.frameCount > (int)seek)
+            this->framesToBeEncoded = info.frameCount - seek;
+        param->totalFrames = this->framesToBeEncoded;
+
+#ifdef SVT_HEVC
+        if (svtEnabled)
+        {
+            EB_H265_ENC_CONFIGURATION* svtParam = (EB_H265_ENC_CONFIGURATION*)param->svtHevcParam;
+            svtParam->sourceWidth = param->sourceWidth;
+            svtParam->sourceHeight = param->sourceHeight;
+            svtParam->frameRateNumerator = param->fpsNum;
+            svtParam->frameRateDenominator = param->fpsDenom;
+            svtParam->framesToBeEncoded = param->totalFrames;
+            svtParam->encoderColorFormat = (EB_COLOR_FORMAT)param->internalCsp;
+        }
+#endif
+
+        /* Force CFR until we have support for VFR */
+        info.timebaseNum = param->fpsDenom;
+        info.timebaseDenom = param->fpsNum;
+
+        if (param->bField && param->interlaceMode)
+        {   // Field FPS
+            param->fpsNum *= 2;
+            // Field height
+            param->sourceHeight = param->sourceHeight >> 1;
+            // Number of fields to encode
+            param->totalFrames *= 2;
+        }
+
+        if (api->param_apply_profile(param, profile))
+            return true;
+
+        if (param->logLevel >= X265_LOG_INFO)
+        {
+            /* one-line summary of the input: size, rate, format, SAR, frame range */
+            char buf[128];
+            int p = sprintf(buf, "%dx%d fps %d/%d %sp%d", param->sourceWidth, param->sourceHeight,
+                param->fpsNum, param->fpsDenom, x265_source_csp_names[param->internalCsp], info.depth);
+
+            int width, height;
+            getParamAspectRatio(param, width, height);
+            if (width && height)
+                p += sprintf(buf + p, " sar %d:%d", width, height);
+
+            if (framesToBeEncoded <= 0 || info.frameCount <= 0)
+                strcpy(buf + p, " unknown frame count");
+            else
+                sprintf(buf + p, " frames %u - %d of %d", this->seek, this->seek + this->framesToBeEncoded - 1, info.frameCount);
+
+            general_log(param, input->getName(), X265_LOG_INFO, "%s\n", buf);
+        }
+
+        this->input->startReader();
+
+        if (reconfn)
+        {
+            if (reconFileBitDepth == 0)
+                reconFileBitDepth = param->internalBitDepth;
+            this->recon = ReconFile::open(reconfn, param->sourceWidth, param->sourceHeight, reconFileBitDepth,
+                param->fpsNum, param->fpsDenom, param->internalCsp);
+            if (this->recon->isFail())
+            {
+                x265_log(param, X265_LOG_WARNING, "unable to write reconstructed outputs file\n");
+                this->recon->release();
+                this->recon = 0;
+            }
+            else
+                general_log(param, this->recon->getName(), X265_LOG_INFO,
+                "reconstructed images %dx%d fps %d/%d %s\n",
+                param->sourceWidth, param->sourceHeight, param->fpsNum, param->fpsDenom,
+                x265_source_csp_names[param->internalCsp]);
+        }
+#if ENABLE_LIBVMAF
+        if (!reconfn)
+        {
+            x265_log(param, X265_LOG_ERROR, "recon file must be specified to get VMAF score, try --help for help\n");
+            return true;
+        }
+        const char *str = strrchr(info.filename, '.');
+
+        /* strrchr() returns NULL when the input name contains no '.', so the
+         * extension must be checked for NULL before it is compared */
+        if (str && !strcmp(str, ".y4m"))
+        {
+            x265_log(param, X265_LOG_ERROR, "VMAF supports YUV file format only.\n");
+            return true;
+        }
+        if (param->internalCsp == X265_CSP_I420 || param->internalCsp == X265_CSP_I422 || param->internalCsp == X265_CSP_I444)
+        {
+            vmafData->reference_file = x265_fopen(inputfn, "rb");
+            vmafData->distorted_file = x265_fopen(reconfn, "rb");
+        }
+        else
+        {
+            x265_log(param, X265_LOG_ERROR, "VMAF will support only yuv420p, yu422p, yu444p, yuv420p10le, yuv422p10le, yuv444p10le formats.\n");
+            return true;
+        }
+#endif
+        this->output = OutputFile::open(outputfn, info);
+        if (this->output->isFail())
+        {
+            x265_log_file(param, X265_LOG_ERROR, "failed to open output file <%s> for writing\n", outputfn);
+            return true;
+        }
+        general_log_file(param, this->output->getName(), X265_LOG_INFO, "output file: %s\n", outputfn);
+        return false;
+    }
+
+    /* Look up the qpfile entry for pic_org.poc and apply it to the picture.
+     *
+     * Each entry is "<frame number> <type> [<qp>]".  Entries for earlier
+     * frames are skipped.  When the next entry belongs to a later frame (or
+     * the file is exhausted) the read position is restored and the picture
+     * keeps forceqp = 0 and sliceType = X265_TYPE_AUTO.
+     *
+     * On a match, sliceType is set from the type letter and, when a QP was
+     * given, forceqp is set to qp + 1.
+     *
+     * Returns false when the entry is malformed (fewer than two fields, an
+     * unknown type letter, or a QP outside [-1, 51]); true otherwise. */
+    bool CLIOptions::parseQPFile(x265_picture &pic_org)
+    {
+        int32_t num = -1, qp, ret;
+        char type;
+        long filePos; /* ftell() returns long, and -1 on failure */
+        pic_org.forceqp = 0;
+        pic_org.sliceType = X265_TYPE_AUTO;
+        while (num < pic_org.poc)
+        {
+            filePos = ftell(qpfile);
+            qp = -1;
+            /* reset for every entry: when fscanf() matches fewer than two
+             * fields, type would otherwise be indeterminate or stale */
+            type = 0;
+            ret = fscanf(qpfile, "%d %c%*[ \t]%d\n", &num, &type, &qp);
+
+            if (num > pic_org.poc || ret == EOF)
+            {
+                /* entry is for a later frame: un-read it for the next call */
+                fseek(qpfile, filePos, SEEK_SET);
+                break;
+            }
+            if (num < pic_org.poc && ret >= 2)
+                continue;
+            if (ret == 3 && qp >= 0)
+                pic_org.forceqp = qp + 1;
+            if (type == 'I') pic_org.sliceType = X265_TYPE_IDR;
+            else if (type == 'i') pic_org.sliceType = X265_TYPE_I;
+            else if (type == 'K') pic_org.sliceType = param->bOpenGOP ? X265_TYPE_I : X265_TYPE_IDR;
+            else if (type == 'P') pic_org.sliceType = X265_TYPE_P;
+            else if (type == 'B') pic_org.sliceType = X265_TYPE_BREF;
+            else if (type == 'b') pic_org.sliceType = X265_TYPE_B;
+            else ret = 0;
+            if (ret < 2 || qp < -1 || qp > 51)
+                return false;
+        }
+        return true;
+    }
+
+    /* True for zone-file lines that do not describe a zone: comment lines
+     * starting with '#' and lines that hold only whitespace or line
+     * terminators ("\r\n" as well as a bare "\n"). */
+    static bool isIgnoredZoneLine(const char* line)
+    {
+        return *line == '#' || line[strspn(line, " \t\r\n")] == '\0';
+    }
+
+    /* Read the zone file and build param->rc.zones[].
+     *
+     * Every zone line has the form "<start frame> <option> <option> ...".  The
+     * options are tokenised on spaces and parsed with parseZoneParam(), which
+     * allocates the zone's parameter set.  A line that fails to parse
+     * terminates the process with exit(1).
+     *
+     * Returns true on success, false when the zone array cannot be allocated. */
+    bool CLIOptions::parseZoneFile()
+    {
+        char line[256];
+        param->rc.zonefileCount = 0;
+
+        /* first pass: count the zone lines so the array can be sized */
+        while (fgets(line, sizeof(line), zoneFile))
+        {
+            if (!isIgnoredZoneLine(line))
+                param->rc.zonefileCount++;
+        }
+
+        rewind(zoneFile);
+        param->rc.zones = X265_MALLOC(x265_zone, param->rc.zonefileCount);
+        if (param->rc.zonefileCount && !param->rc.zones)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "zone alloc failed\n");
+            param->rc.zonefileCount = 0;
+            return false;
+        }
+
+        /* second pass: one zone per non-ignored line */
+        for (int i = 0; i < param->rc.zonefileCount; i++)
+        {
+            while (fgets(line, sizeof(line), zoneFile))
+            {
+                if (isIgnoredZoneLine(line))
+                    continue;
+
+                /* strip the line terminator and leading whitespace */
+                line[strcspn(line, "\r\n")] = '\0';
+                char* argLine = line;
+                while (isspace((unsigned char)*argLine)) argLine++;
+                param->rc.zones[i].startFrame = atoi(argLine);
+
+                /* options start after the first space; a line without one
+                 * carries only the start frame, so point at the terminator
+                 * instead of stepping past a NULL strchr() result */
+                char* start = strchr(argLine, ' ');
+                if (start)
+                    start++;
+                else
+                    start = argLine + strlen(argLine);
+
+                /* line holds at most 255 characters, so 256 slots always fit
+                 * the placeholder name, every token and the NULL terminator */
+                char* args[256];
+                int argCount = 0;
+                // Adding a dummy string to avoid file parsing error
+                args[argCount++] = (char *)"x265";
+                char* token = strtok(start, " ");
+                while (token && argCount < 255)
+                {
+                    args[argCount++] = token;
+                    token = strtok(NULL, " ");
+                }
+                args[argCount] = NULL;
+
+                /* parseZoneParam() allocates zones[i].zoneParam itself, so it
+                 * must not be allocated here as well */
+                CLIOptions cliopt;
+                if (cliopt.parseZoneParam(argCount, args, param, i))
+                {
+                    cliopt.destroy();
+                    if (cliopt.api)
+                        cliopt.api->param_free(cliopt.param);
+                    exit(1);
+                }
+                break;
+            }
+        }
+        return true;
+    }
+
+    /* Parse the RPU file and extract the RPU corresponding to the current picture
+    * and fill the rpu field of the input picture.
+    *
+    * The file is read one byte at a time through a 32-bit sliding window
+    * (code).  Bytes are copied into pic->rpu.payload until the window equals
+    * START_CODE, which ends the current RPU, or until the file ends.
+    *
+    * Returns 0 on success (including when no payload was found, which only
+    * logs a warning) and 1 on a missing start code or an oversized RPU. */
+    int CLIOptions::rpuParser(x265_picture * pic)
+    {
+        uint8_t byteVal;
+        uint32_t code = 0;
+        int bytesRead = 0;
+        pic->rpu.payloadSize = 0;
+
+        /* only when pts is zero: the file must begin with a start code */
+        if (!pic->pts)
+        {
+            while (bytesRead++ < 4 && fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
+                code = (code << 8) | byteVal;
+
+            if (code != START_CODE)
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU startcode in POC %d\n", pic->pts);
+                return 1;
+            }
+        }
+
+        bytesRead = 0;
+        while (fread(&byteVal, sizeof(uint8_t), 1, dolbyVisionRpu))
+        {
+            code = (code << 8) | byteVal;
+            /* the first three bytes only fill the window */
+            if (bytesRead++ < 3)
+                continue;
+            /* give up once 1024 bytes were read without reaching a start code */
+            if (bytesRead >= 1024)
+            {
+                x265_log(NULL, X265_LOG_ERROR, "Invalid Dolby Vision RPU size in POC %d\n", pic->pts);
+                return 1;
+            }
+
+            /* emit the oldest byte of the window unless the window is the
+             * start code, which ends the current RPU */
+            if (code != START_CODE)
+                pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
+            else
+                return 0;
+        }
+
+        /* End of file: bytesLeft bytes are still held in the window.  Move the
+         * oldest of them into the top byte and flush them to the payload.
+         * NOTE(review): when bytesLeft is 0 the shift count is
+         * START_CODE_BYTES * 8; if START_CODE_BYTES is 4 that is a 32-bit
+         * shift of a uint32_t - confirm this case is safe. */
+        int ShiftBytes = START_CODE_BYTES - (bytesRead - pic->rpu.payloadSize);
+        int bytesLeft = bytesRead - pic->rpu.payloadSize;
+        code = (code << ShiftBytes * 8);
+        for (int i = 0; i < bytesLeft; i++)
+        {
+            pic->rpu.payload[pic->rpu.payloadSize++] = (code >> (3 * 8)) & 0xFF;
+            code = (code << 8);
+        }
+        if (!pic->rpu.payloadSize)
+            x265_log(NULL, X265_LOG_WARNING, "Dolby Vision RPU not found for POC %d\n", pic->pts);
+        return 0;
+    }
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
--- a/source/x265cli.h	Mon Feb 17 20:46:36 2020 +0530
+++ b/source/x265cli.h	Wed May 06 14:59:56 2020 +0530
@@ -27,9 +27,23 @@
 
 #include "common.h"
 #include "param.h"
+#include "input/input.h"
+#include "output/output.h"
+#include "output/reconplay.h"
 
 #include <getopt.h>
 
+#define CONSOLE_TITLE_SIZE 200 /* element count of the orgConsoleTitle buffer declared below */
+#ifdef _WIN32
+#include <windows.h>
+#define SetThreadExecutionState(es) /* defined after <windows.h>: calls written as SetThreadExecutionState(...) expand to nothing */
+static char orgConsoleTitle[CONSOLE_TITLE_SIZE] = ""; /* NOTE(review): static in a header, so each including source file gets its own copy; presumably holds the original console title - confirm at the call sites */
+#else
+#define GetConsoleTitle(t, n) /* non-Windows: the console-title and execution-state calls expand to nothing */
+#define SetConsoleTitle(t)
+#define SetThreadExecutionState(es)
+#endif
+
 #ifdef __cplusplus
 namespace X265_NS {
 #endif
@@ -105,8 +119,8 @@ static const struct option long_options[
     { "amp",                  no_argument, NULL, 0 },
     { "no-early-skip",        no_argument, NULL, 0 },
     { "early-skip",           no_argument, NULL, 0 },
-    { "no-rskip",             no_argument, NULL, 0 },
-    { "rskip",                no_argument, NULL, 0 },
+    { "rskip",                required_argument, NULL, 0 },
+    { "rskip-edge-threshold", required_argument, NULL, 0 },
     { "no-fast-cbf",          no_argument, NULL, 0 },
     { "fast-cbf",             no_argument, NULL, 0 },
     { "no-tskip",             no_argument, NULL, 0 },
@@ -358,6 +372,7 @@ static const struct option long_options[
     { "cll", no_argument, NULL, 0 },
     { "no-cll", no_argument, NULL, 0 },
     { "hme-range", required_argument, NULL, 0 },
+    { "abr-ladder", required_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -365,336 +380,80 @@ static const struct option long_options[
     { 0, 0, 0, 0 }
 };
 
-static void printVersion(x265_param *param, const x265_api* api)
-{
-    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", api->version_str);
-    x265_log(param, X265_LOG_INFO, "build info %s\n", api->build_info_str);
-}
-
-static void showHelp(x265_param *param)
-{
-    int level = param->logLevel;
-
-#define OPT(value) (value ? "enabled" : "disabled")
-#define H0 printf
-#define H1 if (level >= X265_LOG_DEBUG) printf
+    struct CLIOptions
+    {
+        InputFile* input;
+        ReconFile* recon;
+        OutputFile* output;
+        FILE*       qpfile;
+        FILE*       zoneFile;
+        FILE*    dolbyVisionRpu;    /* File containing Dolby Vision BL RPU metadata */
+        const char* reconPlayCmd;
+        const x265_api* api;
+        x265_param* param;
+        x265_vmaf_data* vmafData;
+        bool bProgress;
+        bool bForceY4m;
+        bool bDither;
+        uint32_t seek;              // number of frames to skip from the beginning
+        uint32_t framesToBeEncoded; // number of frames to encode
+        uint64_t totalbytes;
+        int64_t startTime;
+        int64_t prevUpdateTime;
 
-    H0("\nSyntax: x265 [options] infile [-o] outfile\n");
-    H0("    infile can be YUV or Y4M\n");
-    H0("    outfile is raw HEVC bitstream\n");
-    H0("\nExecutable Options:\n");
-    H0("-h/--help                        Show this help text and exit\n");
-    H0("   --fullhelp                    Show all options and exit\n");
-    H0("-V/--version                     Show version info and exit\n");
-    H0("\nOutput Options:\n");
-    H0("-o/--output <filename>           Bitstream output file name\n");
-    H0("-D/--output-depth 8|10|12        Output bit depth (also internal bit depth). Default %d\n", param->internalBitDepth);
-    H0("   --log-level <string>          Logging level: none error warning info debug full. Default %s\n", X265_NS::logLevelNames[param->logLevel + 1]);
-    H0("   --no-progress                 Disable CLI progress reports\n");
-    H0("   --csv <filename>              Comma separated log file, if csv-log-level > 0 frame level statistics, else one line per run\n");
-    H0("   --csv-log-level <integer>     Level of csv logging, if csv-log-level > 0 frame level statistics, else one line per run: 0-2\n");
-    H0("\nInput Options:\n");
-    H0("   --input <filename>            Raw YUV or Y4M input file name. `-` for stdin\n");
-    H1("   --y4m                         Force parsing of input stream as YUV4MPEG2 regardless of file extension\n");
-    H0("   --fps <float|rational>        Source frame rate (float or num/denom), auto-detected if Y4M\n");
-    H0("   --input-res WxH               Source picture size [w x h], auto-detected if Y4M\n");
-    H1("   --input-depth <integer>       Bit-depth of input file. Default 8\n");
-    H1("   --input-csp <string>          Chroma subsampling, auto-detected if Y4M\n");
-    H1("                                 0 - i400 (4:0:0 monochrome)\n");
-    H1("                                 1 - i420 (4:2:0 default)\n");
-    H1("                                 2 - i422 (4:2:2)\n");
-    H1("                                 3 - i444 (4:4:4)\n");
-#if ENABLE_HDR10_PLUS
-    H0("   --dhdr10-info <filename>      JSON file containing the Creative Intent Metadata to be encoded as Dynamic Tone Mapping\n");
-    H0("   --[no-]dhdr10-opt             Insert tone mapping SEI only for IDR frames and when the tone mapping information changes. Default disabled\n");
-#endif
-    H0("   --dolby-vision-profile <float|integer> Specifies Dolby Vision profile ID. Currently only profile 5, profile 8.1 and profile 8.2 enabled. Specified as '5' or '50'. Default 0 (disabled).\n");
-    H0("   --dolby-vision-rpu <filename> File containing Dolby Vision RPU metadata.\n"
-       "                                 If given, x265's Dolby Vision metadata parser will fill the RPU field of input pictures with the metadata read from the file. Default NULL(disabled).\n");
-    H0("   --nalu-file <filename>        Text file containing SEI messages in the following format : <POC><space><PREFIX><space><NAL UNIT TYPE>/<SEI TYPE><space><SEI Payload>\n");
-    H0("-f/--frames <integer>            Maximum number of frames to encode. Default all\n");
-    H0("   --seek <integer>              First frame to encode\n");
-    H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
-    H0("   --[no-]field                  Enable or disable field coding. Default %s\n", OPT( param->bField));
-    H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
-    H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
-    H0("\nQuality reporting metrics:\n");
-    H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
-    H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
-    H0("\nProfile, Level, Tier:\n");
-    H0("-P/--profile <string>            Enforce an encode profile: main, main10, mainstillpicture\n");
-    H0("   --level-idc <integer|float>   Force a minimum required decoder level (as '5.0' or '50')\n");
-    H0("   --[no-]high-tier              If a decoder level is specified, this modifier selects High tier of that level\n");
-    H0("   --uhd-bd                      Enable UHD Bluray compatibility support\n");
-    H0("   --[no-]allow-non-conformance  Allow the encoder to generate profile NONE bitstreams. Default %s\n", OPT(param->bAllowNonConformance));
-    H0("\nThreading, performance:\n");
-    H0("   --pools <integer,...>         Comma separated thread count per thread pool (pool per NUMA node)\n");
-    H0("                                 '-' implies no threads on node, '+' implies one thread per core on node\n");
-    H0("-F/--frame-threads <integer>     Number of concurrently encoded frames. 0: auto-determined by core count\n");
-    H0("   --[no-]wpp                    Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront));
-    H0("   --[no-]slices <integer>       Enable Multiple Slices feature. Default %d\n", param->maxSlices);
-    H0("   --[no-]pmode                  Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis));
-    H0("   --[no-]pme                    Parallel motion estimation. Default %s\n", OPT(param->bDistributeMotionEstimation));
-    H0("   --[no-]asm <bool|int|string>  Override CPU detection. Default: auto\n");
-    H0("\nPresets:\n");
-    H0("-p/--preset <string>             Trade off performance for compression efficiency. Default medium\n");
-    H0("                                 ultrafast, superfast, veryfast, faster, fast, medium, slow, slower, veryslow, or placebo\n");
-    H0("-t/--tune <string>               Tune the settings for a particular type of source or situation:\n");
-    H0("                                 psnr, ssim, grain, zerolatency, fastdecode\n");
-    H0("\nQuad-Tree size and depth:\n");
-    H0("-s/--ctu <64|32|16>              Maximum CU size (WxH). Default %d\n", param->maxCUSize);
-    H0("   --min-cu-size <64|32|16|8>    Minimum CU size (WxH). Default %d\n", param->minCUSize);
-    H0("   --max-tu-size <32|16|8|4>     Maximum TU size (WxH). Default %d\n", param->maxTUSize);
-    H0("   --tu-intra-depth <integer>    Max TU recursive depth for intra CUs. Default %d\n", param->tuQTMaxIntraDepth);
-    H0("   --tu-inter-depth <integer>    Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth);
-    H0("   --limit-tu <0..4>             Enable early exit from TU recursion for inter coded blocks. Default %d\n", param->limitTU);
-    H0("\nAnalysis:\n");
-    H0("   --rd <1..6>                   Level of RDO in mode decision 1:least....6:full RDO. Default %d\n", param->rdLevel);
-    H0("   --[no-]psy-rd <0..5.0>        Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd);
-    H0("   --[no-]rdoq-level <0|1|2>     Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel);
-    H0("   --[no-]psy-rdoq <0..50.0>     Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq);
-    H0("   --dynamic-rd <0..4.0>         Strength of dynamic RD, 0 to disable. Default %.2f\n", param->dynamicRd);
-    H0("   --[no-]ssim-rd                Enable ssim rate distortion optimization, 0 to disable. Default %s\n", OPT(param->bSsimRd));
-    H0("   --[no-]rd-refine              Enable QP based RD refinement for rd levels 5 and 6. Default %s\n", OPT(param->bEnableRdRefine));
-    H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
-    H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
-    H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
-    H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than one split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
-    H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
-    H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
-    H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
-       "                                    - 1: force the partitions if CTU information is present\n"
-       "                                    - 2: functionality of (1) and reduce qp if CTU information has changed\n"
-       "                                    - 4: functionality of (1) and force Inter modes when CTU Information has changed, merge/skip otherwise\n"
-       "                                    Enable this option only when planning to invoke the API function x265_encoder_ctu_info to copy ctu-info asynchronously\n");
-    H0("\nCoding tools:\n");
-    H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
-    H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
-    H0("   --[no-]cu-lossless            Consider lossless mode in CU RDO decisions. Default %s\n", OPT(param->bCULossless));
-    H0("   --[no-]signhide               Hide sign bit of one coeff per TU (rdo). Default %s\n", OPT(param->bEnableSignHiding));
-    H1("   --[no-]tskip                  Enable intra 4x4 transform skipping. Default %s\n", OPT(param->bEnableTransformSkip));
-    H0("\nTemporal / motion search options:\n");
-    H0("   --max-merge <1..5>            Maximum number of merge candidates. Default %d\n", param->maxNumMergeCand);
-    H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
-    H0("   --limit-refs <0|1|2|3>        Limit references per depth (1) or CU (2) or both (3). Default %d\n", param->limitReferences);
-    H0("   --me <string>                 Motion search method dia hex umh star full. Default %d\n", param->searchMethod);
-    H0("-m/--subme <integer>             Amount of subpel refinement to perform (0:least .. 7:most). Default %d \n", param->subpelRefine);
-    H0("   --merange <integer>           Motion search range. Default %d\n", param->searchRange);
-    H0("   --[no-]rect                   Enable rectangular motion partitions Nx2N and 2NxN. Default %s\n", OPT(param->bEnableRectInter));
-    H0("   --[no-]amp                    Enable asymmetric motion partitions, requires --rect. Default %s\n", OPT(param->bEnableAMP));
-    H0("   --[no-]limit-modes            Limit rectangular and asymmetric motion predictions. Default %d\n", param->limitModes);
-    H1("   --[no-]temporal-mvp           Enable temporal MV predictors. Default %s\n", OPT(param->bEnableTemporalMvp));
-    H1("   --[no-]hme                    Enable Hierarchical Motion Estimation. Default %s\n", OPT(param->bEnableHME));
-    H1("   --hme-search <string>         Motion search-method for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeSearchMethod[0], param->hmeSearchMethod[1], param->hmeSearchMethod[2]);
-    H1("   --hme-range <int>,<int>,<int> Motion search-range for HME L0,L1 and L2. Default(L0,L1,L2) is %d,%d,%d\n", param->hmeRange[0], param->hmeRange[1], param->hmeRange[2]);
-    H0("\nSpatial / intra options:\n");
-    H0("   --[no-]strong-intra-smoothing Enable strong intra smoothing for 32x32 blocks. Default %s\n", OPT(param->bEnableStrongIntraSmoothing));
-    H0("   --[no-]constrained-intra      Constrained intra prediction (use only intra coded reference pixels) Default %s\n", OPT(param->bEnableConstrainedIntra));
-    H0("   --[no-]b-intra                Enable intra in B frames in veryslow presets. Default %s\n", OPT(param->bIntraInBFrames));
-    H0("   --[no-]fast-intra             Enable faster search method for angular intra predictions. Default %s\n", OPT(param->bEnableFastIntra));
-    H0("   --rdpenalty <0..2>            penalty for 32x32 intra TU in non-I slices. 0:disabled 1:RD-penalty 2:maximum. Default %d\n", param->rdPenalty);
-    H0("\nSlice decision options:\n");
-    H0("   --[no-]open-gop               Enable open-GOP, allows I slices to be non-IDR. Default %s\n", OPT(param->bOpenGOP));
-    H0("-I/--keyint <integer>            Max IDR period in frames. -1 for infinite-gop. Default %d\n", param->keyframeMax);
-    H0("-i/--min-keyint <integer>        Scenecuts closer together than this are coded as I, not IDR. Default: auto\n");
-    H0("   --gop-lookahead <integer>     Extends gop boundary if a scenecut is found within this from keyint boundary. Default 0\n");
-    H0("   --no-scenecut                 Disable adaptive I-frame decision\n");
-    H0("   --scenecut <integer>          How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold);
-    H1("   --scenecut-bias <0..100.0>    Bias for scenecut detection. Default %.2f\n", param->scenecutBias); 
-    H0("   --hist-scenecut               Enables histogram based scene-cut detection using histogram based algorithm.\n");
-    H0("   --no-hist-scenecut            Disables histogram based scene-cut detection using histogram based algorithm.\n");
-    H1("   --hist-threshold <0.0..2.0>   Luma Edge histogram's Normalized SAD threshold for histogram based scenecut detection Default %.2f\n", param->edgeTransitionThreshold);
-    H0("   --[no-]fades                  Enable detection and handling of fade-in regions. Default %s\n", OPT(param->bEnableFades));
-    H1("   --[no-]scenecut-aware-qp      Enable increasing QP for frames inside the scenecut window after scenecut. Default %s\n", OPT(param->bEnableSceneCutAwareQp));
-    H1("   --scenecut-window <0..1000>   QP incremental duration(in milliseconds) when scenecut-aware-qp is enabled. Default %d\n", param->scenecutWindow);
-    H1("   --max-qp-delta <0..10>        QP offset to increment with base QP for inter-frames. Default %d\n", param->maxQpDelta);
-    H0("   --radl <integer>              Number of RADL pictures allowed in front of IDR. Default %d\n", param->radl);
-    H0("   --intra-refresh               Use Periodic Intra Refresh instead of IDR frames\n");
-    H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
-    H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
-    H0("   --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
-    H0("-b/--bframes <0..16>             Maximum number of consecutive b-frames. Default %d\n", param->bframes);
-    H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
-    H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
-    H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
-    H1("   --qpfile <string>             Force frametypes and QPs for some or all frames\n");
-    H1("                                 Format of each line: framenumber frametype QP\n");
-    H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
-    H1("                                 QPs are restricted by qpmin/qpmax.\n");
-    H1("   --force-flush <integer>       Force the encoder to flush frames. Default %d\n", param->forceFlush);
-    H1("                                 0 - flush the encoder only when all the input pictures are over.\n");
-    H1("                                 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.\n");
-    H1("                                 2 - flush the slicetype decided frames only.\n");
-    H0("   --[no-]-hrd-concat            Set HRD concatenation flag for the first keyframe in the buffering period SEI. Default %s\n", OPT(param->bEnableHRDConcatFlag));
-    H0("\nRate control, Adaptive Quantization:\n");
-    H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
-    H1("-q/--qp <integer>                QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
-    H0("   --crf <float>                 Quality-based VBR (0-51). Default %.1f\n", param->rc.rfConstant);
-    H1("   --[no-]lossless               Enable lossless: bypass transform, quant and loop filters globally. Default %s\n", OPT(param->bLossless));
-    H1("   --crf-max <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMax);
-    H1("                                 May cause VBV underflows!\n");
-    H1("   --crf-min <float>             With CRF+VBV, limit RF to this value. Default %f\n", param->rc.rfConstantMin);
-    H1("                                 this specifies a minimum rate factor value for encode!\n");
-    H0("   --vbv-maxrate <integer>       Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
-    H0("   --vbv-bufsize <integer>       Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
-    H0("   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
-    H0("   --vbv-end <float>             Final VBV buffer emptiness (fraction of bufsize or in kbits). Default 0 (disabled)\n");
-    H0("   --vbv-end-fr-adj <float>      Frame from which qp has to be adjusted to achieve final decode buffer emptiness. Default 0\n");
-    H0("   --chunk-start <integer>       First frame of the chunk. Default 0 (disabled)\n");
-    H0("   --chunk-end <integer>         Last frame of the chunk. Default 0 (disabled)\n");
-    H0("   --pass                        Multi pass rate control.\n"
-       "                                   - 1 : First pass, creates stats file\n"
-       "                                   - 2 : Last pass, does not overwrite stats file\n"
-       "                                   - 3 : Nth pass, overwrites stats file\n");
-    H0("   --[no-]multi-pass-opt-analysis   Refine analysis in 2 pass based on analysis information from pass 1\n");
-    H0("   --[no-]multi-pass-opt-distortion Use distortion of CTU from pass 1 to refine qp in 2 pass\n");
-    H0("   --stats                       Filename for stats file in multipass pass rate control. Default x265_2pass.log\n");
-    H0("   --[no-]analyze-src-pics       Motion estimation uses source frame planes. Default disable\n");
-    H0("   --[no-]slow-firstpass         Enable a slow first pass in a multipass rate control mode. Default %s\n", OPT(param->rc.bEnableSlowFirstPass));
-    H0("   --[no-]strict-cbr             Enable stricter conditions and tolerance for bitrate deviations in CBR mode. Default %s\n", OPT(param->rc.bStrictCbr));
-    H0("   --analysis-save <filename>    Dump analysis info into the specified file. Default Disabled\n");
-    H0("   --analysis-load <filename>    Load analysis buffers from the file specified. Default Disabled\n");
-    H0("   --analysis-reuse-file <filename>    Specify file name used for either dumping or reading analysis data. Deault x265_analysis.dat\n");
-    H0("   --analysis-reuse-level <1..10>      Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Now deprecated. Default %d\n", param->analysisReuseLevel);
-    H0("   --analysis-save-reuse-level <1..10> Indicates the amount of analysis info stored in save mode, 1:least..10:most. Default %d\n", param->analysisSaveReuseLevel);
-    H0("   --analysis-load-reuse-level <1..10> Indicates the amount of analysis info reused in load mode, 1:least..10:most. Default %d\n", param->analysisLoadReuseLevel);
-    H0("   --refine-analysis-type <string>     Reuse anlaysis information received through API call. Supported options are avc and hevc. Default disabled - %d\n", param->bAnalysisType);
-    H0("   --scale-factor <int>          Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
-    H0("   --refine-intra <0..4>         Enable intra refinement for encode that uses analysis-load.\n"
-        "                                    - 0 : Forces both mode and depth from the save encode.\n"
-        "                                    - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n"
-        "                                    - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n"
-        "                                    - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n"
-        "                                    - 4 : Re-evaluate all intra blocks, does not reuse data from save encode.\n"
-        "                                Default:%d\n", param->intraRefine);
-    H0("   --refine-inter <0..3>         Enable inter refinement for encode that uses analysis-load.\n"
-        "                                    - 0 : Forces both mode and depth from the save encode.\n"
-        "                                    - 1 : Functionality of (0) + evaluate all inter modes at min-cu-size's depth when current depth is one smaller than\n"
-        "                                          min-cu-size's depth. When save encode decides the current block as skip(for all sizes) evaluate skip/merge.\n"
-        "                                    - 2 : Functionality of (1) + irrespective of size restrict the modes evaluated when specific modes are decided as the best mode by the save encode.\n"
-        "                                    - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
-        "                                Default:%d\n", param->interRefine);
-    H0("   --[no-]dynamic-refine         Dynamically changes refine-inter level for each CU. Default %s\n", OPT(param->bDynamicRefine));
-    H0("   --refine-mv <1..3>            Enable mv refinement for load mode. Default %d\n", param->mvRefine);
-    H0("   --refine-ctu-distortion       Store/normalize ctu distortion in analysis-save/load.\n"
-        "                                    - 0 : Disabled.\n"
-        "                                    - 1 : Store/Load ctu distortion to/from the file specified in analysis-save/load.\n"
-        "                                Default 0 - Disabled\n");
-    H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes 4:auto variance with edge information. Default %d\n", param->rc.aqMode);
-    H0("   --[no-]hevc-aq                Mode for HEVC Adaptive Quantization. Default %s\n", OPT(param->rc.hevcAq));
-    H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
-    H0("   --qp-adaptation-range <float> Delta QP range by QP adaptation based on a psycho-visual model (1.0 to 6.0). Default %.2f\n", param->rc.qpAdaptationRange);
-    H0("   --[no-]aq-motion              Block level QP adaptation based on the relative motion between the block and the frame. Default %s\n", OPT(param->bAQMotion));
-    H0("   --qg-size <int>               Specifies the size of the quantization group (64, 32, 16, 8). Default %d\n", param->rc.qgSize);
-    H0("   --[no-]cutree                 Enable cutree for Adaptive Quantization. Default %s\n", OPT(param->rc.cuTree));
-    H0("   --[no-]rc-grain               Enable ratecontrol mode to handle grains specifically. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableGrain));
-    H1("   --ipratio <float>             QP factor between I and P. Default %.2f\n", param->rc.ipFactor);
-    H1("   --pbratio <float>             QP factor between P and B. Default %.2f\n", param->rc.pbFactor);
-    H1("   --qcomp <float>               Weight given to predicted complexity. Default %.2f\n", param->rc.qCompress);
-    H1("   --qpstep <integer>            The maximum single adjustment in QP allowed to rate control. Default %d\n", param->rc.qpStep);
-    H1("   --qpmin <integer>             sets a hard lower limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMin);
-    H1("   --qpmax <integer>             sets a hard upper limit on QP allowed to ratecontrol. Default %d\n", param->rc.qpMax);
-    H0("   --[no-]const-vbv              Enable consistent vbv. turned on with tune grain. Default %s\n", OPT(param->rc.bEnableConstVbv));
-    H1("   --cbqpoffs <integer>          Chroma Cb QP Offset [-12..12]. Default %d\n", param->cbQpOffset);
-    H1("   --crqpoffs <integer>          Chroma Cr QP Offset [-12..12]. Default %d\n", param->crQpOffset);
-    H1("   --scaling-list <string>       Specify a file containing HM style quant scaling lists or 'default' or 'off'. Default: off\n");
-    H1("   --zones <zone0>/<zone1>/...   Tweak the bitrate of regions of the video\n");
-    H1("                                 Each zone is of the form\n");
-    H1("                                   <start frame>,<end frame>,<option>\n");
-    H1("                                   where <option> is either\n");
-    H1("                                       q=<integer> (force QP)\n");
-    H1("                                   or  b=<float> (bitrate multiplier)\n");
-    H0("   --zonefile <filename>         Zone file containing the zone boundaries and the parameters to be reconfigured.\n");
-    H1("   --lambda-file <string>        Specify a file containing replacement values for the lambda tables\n");
-    H1("                                 MAX_MAX_QP+1 floats for lambda table, then again for lambda2 table\n");
-    H1("                                 Blank lines and lines starting with hash(#) are ignored\n");
-    H1("                                 Comma is considered to be white-space\n");
-    H0("   --max-ausize-factor <float>   This value controls the maximum AU size defined in specification.\n");
-    H0("                                 It represents the percentage of maximum AU size used. Default %.1f\n", param->maxAUSizeFactor);
-    H0("\nLoop filters (deblock and SAO):\n");
-    H0("   --[no-]deblock                Enable Deblocking Loop Filter, optionally specify tC:Beta offsets Default %s\n", OPT(param->bEnableLoopFilter));
-    H0("   --[no-]sao                    Enable Sample Adaptive Offset. Default %s\n", OPT(param->bEnableSAO));
-    H1("   --[no-]sao-non-deblock        Use non-deblocked pixels, else right/bottom boundary areas skipped. Default %s\n", OPT(param->bSaoNonDeblocked));
-    H0("   --[no-]limit-sao              Limit Sample Adaptive Offset types. Default %s\n", OPT(param->bLimitSAO));
-    H0("   --selective-sao <int>         Enable slice-level SAO filter. Default %d\n", param->selectiveSAO);
-    H0("\nVUI options:\n");
-    H0("   --sar <width:height|int>      Sample Aspect Ratio, the ratio of width to height of an individual pixel.\n");
-    H0("                                 Choose from 0=undef, 1=1:1(\"square\"), 2=12:11, 3=10:11, 4=16:11,\n");
-    H0("                                 5=40:33, 6=24:11, 7=20:11, 8=32:11, 9=80:33, 10=18:11, 11=15:11,\n");
-    H0("                                 12=64:33, 13=160:99, 14=4:3, 15=3:2, 16=2:1 or custom ratio of <int:int>. Default %d\n", param->vui.aspectRatioIdc);
-    H1("   --display-window <string>     Describe overscan cropping region as 'left,top,right,bottom' in pixels\n");
-    H1("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
-    H0("   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
-    H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited\n");
-    H0("   --colorprim <string>          Specify color primaries from  bt709, unknown, reserved, bt470m, bt470bg, smpte170m,\n");
-    H0("                                 smpte240m, film, bt2020, smpte428, smpte431, smpte432. Default undef\n");
-    H0("   --transfer <string>           Specify transfer characteristics from bt709, unknown, reserved, bt470m, bt470bg, smpte170m,\n");
-    H0("                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
-    H0("                                 bt2020-10, bt2020-12, smpte2084, smpte428, arib-std-b67. Default undef\n");
-    H1("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
-    H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c, smpte2085, chroma-derived-nc, chroma-derived-c, ictcp. Default undef\n");
-    H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
-    H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
-    H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
-    H0("   --max-cll <string>            Specify content light level info SEI as \"cll,fall\" (HDR).\n");
-    H0("   --[no-]cll                    Emit content light level info SEI. Default %s\n", OPT(param->bEmitCLL));
-    H0("   --[no-]hdr10                  Control dumping of HDR10 SEI packet. If max-cll or master-display has non-zero values, this is enabled. Default %s\n", OPT(param->bEmitHDR10SEI));
-    H0("   --[no-]hdr-opt                Add luma and chroma offsets for HDR/WCG content. Default %s. Now deprecated.\n", OPT(param->bHDROpt));
-    H0("   --[no-]hdr10-opt              Block-level QP optimization for HDR10 content. Default %s.\n", OPT(param->bHDR10Opt));
-    H0("   --min-luma <integer>          Minimum luma plane value of input source picture\n");
-    H0("   --max-luma <integer>          Maximum luma plane value of input source picture\n");
-    H0("\nBitstream options:\n");
-    H0("   --[no-]repeat-headers         Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders));
-    H0("   --[no-]info                   Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI));
-    H0("   --[no-]hrd                    Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI));
-    H0("   --[no-]idr-recovery-sei      Emit recovery point infor SEI at each IDR frame \n");
-    H0("   --[no-]temporal-layers        Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers));
-    H0("   --[no-]aud                    Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters));
-    H1("   --hash <integer>              Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. Default %d\n", param->decodedPictureHashSEI);
-    H0("   --atc-sei <integer>           Emit the alternative transfer characteristics SEI message where the integer is the preferred transfer characteristics. Default disabled\n");
-    H0("   --pic-struct <integer>        Set the picture structure and emits it in the picture timing SEI message. Values in the range 0..12. See D.3.3 of the HEVC spec. for a detailed explanation.\n");
-    H0("   --log2-max-poc-lsb <integer>  Maximum of the picture order count\n");
-    H0("   --[no-]vui-timing-info        Emit VUI timing information in the bistream. Default %s\n", OPT(param->bEmitVUITimingInfo));
-    H0("   --[no-]vui-hrd-info           Emit VUI HRD information in the bistream. Default %s\n", OPT(param->bEmitVUIHRDInfo));
-    H0("   --[no-]opt-qp-pps             Dynamically optimize QP in PPS (instead of default 26) based on QPs in previous GOP. Default %s\n", OPT(param->bOptQpPPS));
-    H0("   --[no-]opt-ref-list-length-pps  Dynamically set L0 and L1 ref list length in PPS (instead of default 0) based on values in last GOP. Default %s\n", OPT(param->bOptRefListLengthPPS));
-    H0("   --[no-]multi-pass-opt-rps     Enable storing commonly used RPS in SPS in multi pass mode. Default %s\n", OPT(param->bMultiPassOptRPS));
-    H0("   --[no-]opt-cu-delta-qp        Optimize to signal consistent CU level delta QPs in frame. Default %s\n", OPT(param->bOptCUDeltaQP));
-    H1("\nReconstructed video options (debugging):\n");
-    H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
-    H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
-    H1("   --recon-y4m-exec <string>     pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
-    H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
-    H0("   --[no-]frame-dup              Enable Frame duplication. Default %s\n", OPT(param->bEnableFrameDuplication));
-    H0("   --dup-threshold <integer>     PSNR threshold for Frame duplication. Default %d\n", param->dupThreshold);
-#ifdef SVT_HEVC
-    H0("   --[no]svt                     Enable SVT HEVC encoder %s\n", OPT(param->bEnableSvtHevc));
-    H0("   --[no-]svt-hme                Enable Hierarchial motion estimation(HME) in SVT HEVC encoder \n");
-    H0("   --svt-search-width            Motion estimation search area width for SVT HEVC encoder \n");
-    H0("   --svt-search-height           Motion estimation search area height for SVT HEVC encoder \n");
-    H0("   --[no-]svt-compressed-ten-bit-format  Enable 8+2 encoding mode for 10bit input in SVT HEVC encoder \n");
-    H0("   --[no-]svt-speed-control      Enable speed control functionality to achieve real time encoding speed for  SVT HEVC encoder \n");
-    H0("   --svt-preset-tuner            Enable additional faster presets of SVT; This only has to be used on top of x265's ultrafast preset. Accepts values in the range of 0-2 \n");
-    H0("   --svt-hierarchical-level      Hierarchical layer for SVT-HEVC encoder; Accepts inputs in the range 0-3 \n");
-    H0("   --svt-base-layer-switch-mode  Select whether B/P slice should be used in base layer for SVT-HEVC encoder. 0-Use B-frames; 1-Use P frames in the base layer \n");
-    H0("   --svt-pred-struct             Select pred structure for SVT HEVC encoder;  Accepts inputs in the range 0-2 \n");
-    H0("   --[no-]svt-fps-in-vps         Enable VPS timing info for SVT HEVC encoder  \n");
-#endif
-    H1("\nExecutable return codes:\n");
-    H1("    0 - encode successful\n");
-    H1("    1 - unable to parse command line\n");
-    H1("    2 - unable to open encoder\n");
-    H1("    3 - unable to generate stream headers\n");
-    H1("    4 - encoder abort\n");
-#undef OPT
-#undef H0
-#undef H1
-    if (level < X265_LOG_DEBUG)
-        printf("\nUse --fullhelp for a full listing (or --log-level full --help)\n");
-    printf("\n\nComplete documentation may be found at http://x265.readthedocs.org/en/default/cli.html\n");
-    exit(1);
-}
+        int argCnt;
+        char** argString;
 
+        /* ABR ladder settings */
+        bool enableScaler;
+        char*    encName;
+        char*    reuseName;
+        uint32_t encId;
+        int      refId;
+        uint32_t loadLevel;
+        uint32_t saveLevel;
+        uint32_t numRefs;
+
+        /* in microseconds */
+        static const int UPDATE_INTERVAL = 250000;
+        /* Default constructor: puts every member assigned below into a known
+         * state. Pointer members are set to NULL, counters and levels to
+         * zero, bProgress to true, refId to -1, and startTime to the value
+         * returned by x265_mdate(). */
+        CLIOptions()
+        {
+            input = NULL;
+            recon = NULL;
+            output = NULL;
+            qpfile = NULL;
+            zoneFile = NULL;
+            dolbyVisionRpu = NULL;
+            reconPlayCmd = NULL;
+            api = NULL;
+            param = NULL;
+            vmafData = NULL;
+            framesToBeEncoded = seek = 0;
+            totalbytes = 0;
+            bProgress = true;
+            bForceY4m = false;
+            startTime = x265_mdate();
+            prevUpdateTime = 0;
+            bDither = false;
+            enableScaler = false;
+            encName = NULL;
+            reuseName = NULL;
+            encId = 0;
+            refId = -1;
+            loadLevel = 0;
+            saveLevel = 0;
+            numRefs = 0;
+            argCnt = 0;
+            /* Keep argString consistent with argCnt == 0; without this the
+             * pointer would hold an indeterminate value until assigned. */
+            argString = NULL;
+        }
+
+        void destroy();
+        void printStatus(uint32_t frameNum);
+        bool parse(int argc, char **argv);
+        bool parseZoneParam(int argc, char **argv, x265_param* globalParam, int zonefileCount);
+        bool parseQPFile(x265_picture &pic_org);
+        bool parseZoneFile();
+        int rpuParser(x265_picture * pic);
+    };
 #ifdef __cplusplus
 }
 #endif