changeset 11929:752ed1108fce stable

Merge with default; prep for v2.6
author Pradeep Ramachandran <pradeep@multicorewareinc.com>
date Tue, 21 Nov 2017 09:50:45 +0530
parents 0b3ba15b33ea dae558b40d99
children dd11aa99f40a
files source/x265-extras.cpp source/x265-extras.h
diffstat 55 files changed, 3358 insertions(+-), 2046 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/msys-cl/make-Makefiles-64bit.sh	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,27 @@
+#!/bin/sh
+# This is to generate visual studio builds with required environment variables set in this shell, useful for ffmpeg integration
+# Run this from within an MSYS bash shell
+
+target_processor='amd64'
+path=$(which cl)
+
+if cl; then
+    echo
+else
+    echo "please launch 'visual studio command prompt' and run '..\vcvarsall.bat amd64'"
+    echo "and then launch msys bash shell from there"
+    exit 1
+fi
+
+if [[ $path  == *$target_processor* ]]; then
+    echo
+else
+    echo "64 bit target not set, please launch 'visual studio command prompt' and run '..\vcvarsall.bat amd64 | x86_amd64 | amd64_x86'"
+    exit 1
+fi
+
+cmake -G "NMake Makefiles" -DCMAKE_CXX_FLAGS="-DWIN32 -D_WINDOWS -W4 -GR -EHsc" -DCMAKE_C_FLAGS="-DWIN32 -D_WINDOWS -W4"  ../../source
+if [ -e Makefile ]
+then
+    nmake
+fi
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/msys-cl/make-Makefiles.sh	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,17 @@
+#!/bin/sh
+# This is to generate visual studio builds with required environment variables set in this shell, useful for ffmpeg integration
+# Run this from within an MSYS bash shell
+
+if cl; then
+    echo 
+else
+    echo "please launch msys from 'visual studio command prompt'"
+    exit 1
+fi
+
+cmake -G "NMake Makefiles" -DCMAKE_CXX_FLAGS="-DWIN32 -D_WINDOWS -W4 -GR -EHsc" -DCMAKE_C_FLAGS="-DWIN32 -D_WINDOWS -W4"  ../../source
+
+if [ -e Makefile ]
+then
+    nmake
+fi
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/vc15-x86/build-all.bat	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,14 @@
+@echo off
+if "%VS150COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 15 not detected"
+  exit 1
+)
+if not exist x265.sln (
+  call make-solutions.bat
+)
+if exist x265.sln (
+  call "%VS150COMNTOOLS%\..\..\VC\vcvarsall.bat"
+  MSBuild /property:Configuration="Release" x265.sln
+  MSBuild /property:Configuration="Debug" x265.sln
+  MSBuild /property:Configuration="RelWithDebInfo" x265.sln
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/vc15-x86/make-solutions.bat	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,6 @@
+@echo off
+::
+:: run this batch file to create a Visual Studio solution file for this project.
+:: See the cmake documentation for other generator targets
+::
+cmake -G "Visual Studio 15" ..\..\source && cmake-gui ..\..\source
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/vc15-x86_64/build-all.bat	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,14 @@
+@echo off
+if "%VS150COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 15 not detected"
+  exit 1
+)
+if not exist x265.sln (
+  call make-solutions.bat
+)
+if exist x265.sln (
+  call "%VS150COMNTOOLS%\..\..\VC\vcvarsall.bat"
+  MSBuild /property:Configuration="Release" x265.sln
+  MSBuild /property:Configuration="Debug" x265.sln
+  MSBuild /property:Configuration="RelWithDebInfo" x265.sln
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/vc15-x86_64/make-solutions.bat	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,6 @@
+@echo off
+::
+:: run this batch file to create a Visual Studio solution file for this project.
+:: See the cmake documentation for other generator targets
+::
+cmake -G "Visual Studio 15 Win64" ..\..\source && cmake-gui ..\..\source
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build/vc15-x86_64/multilib.bat	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,44 @@
+@echo off
+if "%VS150COMNTOOLS%" == "" (
+  msg "%username%" "Visual Studio 15 not detected"
+  exit 1
+)
+
+call "%VS150COMNTOOLS%\..\..\VC\vcvarsall.bat"
+
+@mkdir 12bit
+@mkdir 10bit
+@mkdir 8bit
+
+@cd 12bit
+cmake -G "Visual Studio 15 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF -DMAIN12=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main12.lib
+)
+
+@cd ..\10bit
+cmake -G "Visual Studio 15 Win64" ../../../source -DHIGH_BIT_DEPTH=ON -DEXPORT_C_API=OFF -DENABLE_SHARED=OFF -DENABLE_CLI=OFF
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  copy/y Release\x265-static.lib ..\8bit\x265-static-main10.lib
+)
+
+@cd ..\8bit
+if not exist x265-static-main10.lib (
+  msg "%username%" "10bit build failed"
+  exit 1
+)
+if not exist x265-static-main12.lib (
+  msg "%username%" "12bit build failed"
+  exit 1
+)
+cmake -G "Visual Studio 15 Win64" ../../../source -DEXTRA_LIB="x265-static-main10.lib;x265-static-main12.lib" -DLINKED_10BIT=ON -DLINKED_12BIT=ON
+if exist x265.sln (
+  MSBuild /property:Configuration="Release" x265.sln
+  :: combine static libraries (ignore warnings caused by winxp.cpp hacks)
+  move Release\x265-static.lib x265-static-main.lib
+  LIB.EXE /ignore:4006 /ignore:4221 /OUT:Release\x265-static.lib x265-static-main.lib x265-static-main10.lib x265-static-main12.lib
+)
+
+pause
--- a/doc/reST/api.rst	Tue Sep 05 11:21:24 2017 +0530
+++ b/doc/reST/api.rst	Tue Nov 21 09:50:45 2017 +0530
@@ -192,12 +192,36 @@ changes made to the parameters for auto-
 	 *      presets is not recommended without a more fine-grained breakdown of
 	 *      parameters to take this into account. */
 	int x265_encoder_reconfig(x265_encoder *, x265_param *);
-**x265_encoder_ctu_info**
-       /* x265_encoder_ctu_info:
-        *    Copy CTU information such as ctu address and ctu partition structure of all
-        *    CTUs in each frame. The function is invoked only if "--ctu-info" is enabled and
-        *    the encoder will wait for this copy to complete if enabled.
-        */
+
+**x265_get_slicetype_poc_and_scenecut()** may be used to fetch slice type, poc and scene cut information mid-encode::
+
+    /* x265_get_slicetype_poc_and_scenecut:
+     *     get the slice type, poc and scene cut information for the current frame,
+     *     returns negative on error, 0 on success.
+     *     This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check. */
+     int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int *slicetype, int *poc, int* sceneCut);
+
+**x265_get_ref_frame_list()** may be used to fetch forward and backward reference lists::
+
+    /* x265_get_ref_frame_list:
+     *     returns negative on error, 0 when access unit were output.
+     *     This API must be called after(poc >= lookaheadDepth + bframes + 2) condition check */
+     int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int);
+ 
+**x265_encoder_ctu_info** may be used to provide additional CTU-specific information to the encoder::
+
+    /* x265_encoder_ctu_info:
+     *    Copy CTU information such as ctu address and ctu partition structure of all
+     *    CTUs in each frame. The function is invoked only if "--ctu-info" is enabled and
+     *    the encoder will wait for this copy to complete if enabled.*/
+    int x265_encoder_ctu_info(x265_encoder *encoder, int poc, x265_ctu_info_t** ctu);
+
+**x265_set_analysis_data()** may be used to receive analysis information from an external application::
+
+    /* x265_set_analysis_data:
+     *     set the analysis data. The incoming analysis_data structure is assumed to be AVC-sized blocks.
+     *     returns negative on error, 0 on success.*/
+     int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
 
 Pictures
 ========
--- a/doc/reST/cli.rst	Tue Sep 05 11:21:24 2017 +0530
+++ b/doc/reST/cli.rst	Tue Nov 21 09:50:45 2017 +0530
@@ -399,6 +399,18 @@ Performance Options
 
 	Default: 1 slice per frame. **Experimental feature**
 
+.. option:: --copy-pic, --no-copy-pic
+
+	Allow encoder to copy input x265 pictures to internal frame buffers. When disabled,
+	x265 will not make an internal copy of the input picture and will work with the
+	application's buffers. While this allows for deeper integration, it is the responsibility
+	of the application to (a) ensure that the allocated picture has extra space for padding
+	that will be done by the library, and (b) the buffers aren't recycled until the library
+	has completed encoding this frame (which can be figured out by tracking NALs output by x265)
+
+	Default: enabled
+
+
 Input/Output File Options
 =========================
 
@@ -875,17 +887,26 @@ will not reuse analysis if slice type pa
 
 	Note that --analysis-reuse-level must be paired with analysis-reuse-mode.
 
-	+--------+-----------------------------------------+
-	| Level  | Description                             |
-	+========+=========================================+
-	| 1      | Lookahead information                   |
-	+--------+-----------------------------------------+
-	| 2 to 4 | Level 1 + intra/inter modes, ref's      |
-	+--------+-----------------------------------------+
-	| 5 to 9 | Level 2 + rect-amp                      |
-	+--------+-----------------------------------------+
-	| 10     | Level 5 + Full CU analysis-info         |
-	+--------+-----------------------------------------+
+    +--------------+------------------------------------------+
+    | Level        | Description                              |
+    +==============+==========================================+
+    | 1            | Lookahead information                    |
+    +--------------+------------------------------------------+
+    | 2 to 4       | Level 1 + intra/inter modes, ref's       |
+    +--------------+------------------------------------------+
+    | 5,6 and 9    | Level 2 + rect-amp                       |
+    +--------------+------------------------------------------+
+    | 7            | Level 5 + AVC size CU refinement         |
+    +--------------+------------------------------------------+
+    | 8            | Level 5 + AVC size Full CU analysis-info |
+    +--------------+------------------------------------------+
+    | 10           | Level 5 + Full CU analysis-info          |
+    +--------------+------------------------------------------+
+
+.. option:: --refine-mv-type <string>
+
+    Reuse MV information received through API call. Currently receives information for AVC size and the accepted 
+    string input is "avc". Default is disabled.
 
 .. option:: --scale-factor
 
@@ -893,28 +914,44 @@ will not reuse analysis if slice type pa
        This option should be coupled with analysis-reuse-mode option, --analysis-reuse-level 10.
        The ctu size of load should be double the size of save. Default 0.
 
-.. option:: --refine-intra <0|1|2>
+.. option:: --refine-intra <0..3>
 	
 	Enables refinement of intra blocks in current encode. 
 	
-	Level 0 - Forces both mode and depth from the previous encode.
+	Level 0 - Forces both mode and depth from the save encode.
 	
-	Level 1 - Evaluates all intra modes for blocks of size one smaller than 
-	the min-cu-size of the incoming analysis data from the previous encode, 
-	forces modes for blocks of larger size.
+	Level 1 - Evaluates all intra modes at current depth(n) and at depth 
+	(n+1) when current block size is one greater than the min-cu-size.
+	Forces modes for larger blocks.
 	
-	Level 2 - Evaluates all intra modes for	blocks of size one smaller than 
-	the min-cu-size of the incoming analysis data from the previous encode. 
-	For larger blocks, force only depth when angular mode is chosen by the 
-	previous encode, force depth and mode when other intra modes are chosen.
+	Level 2 - In addition to the functionality of level 1, at all depths, force 
+	(a) only depth when angular mode is chosen by the save encode.
+	(b) depth and mode when other intra modes are chosen by the save encode.
+	
+	Level 3 - Perform analysis of intra modes for depth reused from first encode.
 	
 	Default 0.
 	
-.. option:: --refine-inter-depth
-
-	Enables refinement of inter blocks in current encode. Evaluates all 
-	inter modes for blocks of size one smaller than the min-cu-size of the 
-	incoming analysis data from the previous encode. Default disabled.
+.. option:: --refine-inter <0..3>
+
+	Enables refinement of inter blocks in current encode. 
+	
+	Level 0 - Forces both mode and depth from the save encode.
+	
+	Level 1 - Evaluates all inter modes at current depth(n) and at depth 
+	(n+1) when current block size is one greater than the min-cu-size.
+	Forces modes for larger blocks.
+	
+	Level 2 - In addition to the functionality of level 1, restricts the modes 
+	evaluated when specific modes are decided as the best mode by the save encode.
+	
+	2nx2n in save encode - disable re-evaluation of rect and amp.
+	
+	skip in save encode  - re-evaluates only skip, merge and 2nx2n modes.
+	
+	Level 3 - Perform analysis of inter modes while reusing depths from the save encode.
+	
+	Default 0.
 
 .. option:: --refine-mv
 	
@@ -1405,6 +1442,16 @@ Slice decision options
 .. option:: --b-pyramid, --no-b-pyramid
 
 	Use B-frames as references, when possible. Default enabled
+	
+.. option:: --force-flush <integer>
+
+	Force the encoder to flush frames. Default is 0.
+	
+	Values:
+	0 - flush the encoder only when all the input pictures are over.
+	1 - flush all the frames even when the input is not over. 
+	    slicetype decision may change with this option.
+	2 - flush the slicetype decided frames only.     
 
 Quality, rate control and rate distortion options
 =================================================
@@ -1470,6 +1517,24 @@ Quality, rate control and rate distortio
 	Default 0.9
 
 	**Range of values:** fractional: 0 - 1.0, or kbits: 2 .. bufsize
+	
+.. option:: --vbv-end <float>
+
+	Final buffer emptiness. The portion of the decode buffer that must be 
+	available after all the specified frames have been inserted into the 
+	decode buffer. Specified as a fractional value between 0 and 1, or in 
+	kbits. Default 0 (disabled)
+	
+	This enables basic support for chunk-parallel encoding where each segment 
+	can specify the starting and ending state of the VBV buffer so that VBV 
+	compliance can be maintained when chunks are independently encoded and 
+	stitched together.
+	
+.. option:: --vbv-end-fr-adj <float>
+
+	Frame from which qp has to be adjusted to achieve final decode buffer
+	emptiness. Specified as a fraction of the total frames. Fractions > 0 are 
+	supported only when the total number of frames is known. Default 0.
 
 .. option:: --qp, -q <integer>
 
@@ -1529,7 +1594,7 @@ Quality, rate control and rate distortio
 	Enable adaptive quantization for sub-CTUs. This parameter specifies 
 	the minimum CU size at which QP can be adjusted, ie. Quantization Group
 	size. Allowed range of values are 64, 32, 16, 8 provided this falls within 
-	the inclusive range [maxCUSize, minCUSize]. Experimental.
+	the inclusive range [maxCUSize, minCUSize].
 	Default: same as maxCUSize
 
 .. option:: --cutree, --no-cutree
@@ -1618,7 +1683,7 @@ Quality, rate control and rate distortio
 	conservative, waiting until there is enough feedback in terms of 
 	encoded frames to control QP. strict-cbr allows the encoder to be 
 	more aggressive in hitting the target bitrate even for short segment 
-	videos. Experimental.
+	videos.
 	
 .. option:: --cbqpoffs <integer>
 
@@ -1878,7 +1943,7 @@ VUI fields must be manually specified.
 	undefined (not signaled)
 
 	1. bt709
-	2. undef
+	2. unknown
 	3. **reserved**
 	4. bt470m
 	5. bt470bg
@@ -1886,13 +1951,16 @@ VUI fields must be manually specified.
 	7. smpte240m
 	8. film
 	9. bt2020
+	10. smpte428
+	11. smpte431
+	12. smpte432
 
 .. option:: --transfer <integer|string>
 
 	Specify transfer characteristics. Default undefined (not signaled)
 
 	1. bt709
-	2. undef
+	2. unknown
 	3. **reserved**
 	4. bt470m
 	5. bt470bg
@@ -1906,8 +1974,8 @@ VUI fields must be manually specified.
 	13. iec61966-2-1
 	14. bt2020-10
 	15. bt2020-12
-	16. smpte-st-2084
-	17. smpte-st-428
+	16. smpte2084
+	17. smpte428
 	18. arib-std-b67
 
 .. option:: --colormatrix <integer|string>
@@ -1926,6 +1994,10 @@ VUI fields must be manually specified.
 	8. YCgCo
 	9. bt2020nc
 	10. bt2020c
+	11. smpte2085
+	12. chroma-derived-nc
+	13. chroma-derived-c
+	14. ictcp
 
 .. option:: --chromaloc <0..5>
 
@@ -1976,15 +2048,15 @@ VUI fields must be manually specified.
 .. option:: --hdr, --no-hdr
 
 	Force signalling of HDR parameters in SEI packets. Enabled
-	automatically when :option`--master-display` or :option`--max-cll` is
+	automatically when :option:`--master-display` or :option:`--max-cll` is
 	specified. Useful when there is a desire to signal 0 values for max-cll
 	and max-fall. Default disabled.
 	
 .. option:: --hdr-opt, --no-hdr-opt
 
 	Add luma and chroma offsets for HDR/WCG content.
-	Input video should be 10 bit 4:2:0. Applicable for HDR content.
-	Default disabled. **Experimental Feature**
+	Input video should be 10 bit 4:2:0. Applicable for HDR content. It is recommended
+	that AQ-mode be enabled along with this feature. Default disabled.
 	
 .. option:: --dhdr10-info <filename>
 
@@ -2004,12 +2076,12 @@ VUI fields must be manually specified.
 .. option:: --min-luma <integer>
 
 	Minimum luma value allowed for input pictures. Any values below min-luma
-	are clipped. Experimental. No default.
+	are clipped.  No default.
 
 .. option:: --max-luma <integer>
 
 	Maximum luma value allowed for input pictures. Any values above max-luma
-	are clipped. Experimental. No default.
+	are clipped.  No default.
 
 Bitstream options
 =================
@@ -2091,12 +2163,12 @@ Bitstream options
 .. option:: --opt-qp-pps, --no-opt-qp-pps
 
 	Optimize QP in PPS (instead of default value of 26) based on the QP values
-	observed in last GOP. Default enabled.
+	observed in last GOP. Default disabled.
 
 .. option:: --opt-ref-list-length-pps, --no-opt-ref-list-length-pps
 
 	Optimize L0 and L1 ref list length in PPS (instead of default value of 0)
-	based on the lengths observed in the last GOP. Default enabled.
+	based on the lengths observed in the last GOP. Default disabled.
 
 .. option:: --multi-pass-opt-rps, --no-multi-pass-opt-rps
 
@@ -2109,6 +2181,21 @@ Bitstream options
 
 	Only effective at RD levels 5 and 6
 
+DCT Approximations
+==================
+
+.. option:: --lowpass-dct
+
+    If enabled, x265 will use low-pass subband dct approximation instead of the
+    standard dct for 16x16 and 32x32 blocks. This approximation is less computationally
+    intensive but it generates truncated coefficient matrices for the transformed block.
+    Empirical analysis shows marginal loss in compression and performance gains up to 10%,
+    particularly at moderate bit-rates.
+
+    This approximation should be considered for platforms with performance and time
+    constraints.
+
+    Default disabled. **Experimental feature**
 
 Debugging options
 =================
--- a/source/CMakeLists.txt	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/CMakeLists.txt	Tue Nov 21 09:50:45 2017 +0530
@@ -29,7 +29,7 @@ option(NATIVE_BUILD "Target the build CP
 option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF)
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 130)
+set(X265_BUILD 146)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -184,6 +184,14 @@ if(CC STREQUAL "xlc")
 endif()
 # this option is to enable the inclusion of dynamic HDR10 library to the libx265 compilation
 option(ENABLE_HDR10_PLUS "Enable dynamic HDR10 compilation" OFF)
+if(MSVC AND (MSVC_VERSION LESS 1800) AND ENABLE_HDR10_PLUS)
+    message(FATAL_ERROR "MSVC version 12.0 or above required to support hdr10plus")
+endif()
+if(WIN32 AND (MSVC_VERSION GREATER 1800))
+    if(CMAKE_VERSION VERSION_LESS 3.7)
+        message(FATAL_ERROR "cmake version not compatible for VS 2017. Update the cmake to versions 3.7 or above")
+    endif()
+endif()
 if(GCC)
     add_definitions(-Wall -Wextra -Wshadow)
     add_definitions(-D__STDC_LIMIT_MACROS=1)
@@ -539,6 +547,13 @@ if(ENABLE_HDR10_PLUS)
 endif()
 install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include)
 
+if(WIN32)
+    install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug)
+    install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo)
+    install(FILES "${PROJECT_BINARY_DIR}/Debug/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY)
+    install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY)
+endif()
+
 if(CMAKE_RC_COMPILER)
     # The resource compiler does not need CFLAGS or macro defines. It
     # often breaks them
@@ -639,13 +654,11 @@ if(X265_LATEST_TAG)
             DESTINATION "${LIB_INSTALL_DIR}/pkgconfig")
 endif()
 
-if(NOT WIN32)
-    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
-                   "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake"
-                   IMMEDIATE @ONLY)
-    add_custom_target(uninstall
-                      "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake")
-endif()
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
+               "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake"
+               IMMEDIATE @ONLY)
+add_custom_target(uninstall
+                  "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake")
 
 # Main CLI application
 set(ENABLE_CLI ON CACHE BOOL "Build standalone CLI application")
--- a/source/cmake/cmake_uninstall.cmake.in	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/cmake/cmake_uninstall.cmake.in	Tue Nov 21 09:50:45 2017 +0530
@@ -17,3 +17,7 @@ foreach(file ${files})
         message(STATUS "File '$ENV{DESTDIR}${file}' does not exist.")
     endif()
 endforeach(file)
+
+if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+    file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt")
+endif()
--- a/source/common/CMakeLists.txt	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/CMakeLists.txt	Tue Nov 21 09:50:45 2017 +0530
@@ -131,7 +131,7 @@ endif(WIN32)
 add_library(common OBJECT
     ${ASM_PRIMITIVES} ${VEC_PRIMITIVES} ${ALTIVEC_PRIMITIVES} ${WINXP}
     primitives.cpp primitives.h
-    pixel.cpp dct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
+    pixel.cpp dct.cpp lowpassdct.cpp ipfilter.cpp intrapred.cpp loopfilter.cpp
     constants.cpp constants.h
     cpu.cpp cpu.h version.cpp
     threading.cpp threading.h
--- a/source/common/common.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/common.h	Tue Nov 21 09:50:45 2017 +0530
@@ -207,7 +207,6 @@ typedef int16_t  coeff_t;      // transf
 
 // arbitrary, but low because SATD scores are 1/4 normal
 #define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET)
-#define X265_LOOKAHEAD_MAX 250
 
 // Use the same size blocks as x264.  Using larger blocks seems to give artificially
 // high cost estimates (intra and inter both suffer)
--- a/source/common/cudata.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/cudata.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -201,6 +201,8 @@ void CUData::initialize(const CUDataMemP
         m_cuDepth            = charBuf; charBuf += m_numPartitions;
         m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
         m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_skipFlag[0]        = charBuf; charBuf += m_numPartitions;
+        m_skipFlag[1]        = charBuf; charBuf += m_numPartitions;
         m_mergeFlag          = charBuf; charBuf += m_numPartitions;
         m_interDir           = charBuf; charBuf += m_numPartitions;
         m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
@@ -239,6 +241,8 @@ void CUData::initialize(const CUDataMemP
         m_cuDepth            = charBuf; charBuf += m_numPartitions;
         m_predMode           = charBuf; charBuf += m_numPartitions; /* the order up to here is important in initCTU() and initSubCU() */
         m_partSize           = charBuf; charBuf += m_numPartitions;
+        m_skipFlag[0]        = charBuf; charBuf += m_numPartitions;
+        m_skipFlag[1]        = charBuf; charBuf += m_numPartitions;
         m_mergeFlag          = charBuf; charBuf += m_numPartitions;
         m_interDir           = charBuf; charBuf += m_numPartitions;
         m_mvpIdx[0]          = charBuf; charBuf += m_numPartitions;
--- a/source/common/cudata.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/cudata.h	Tue Nov 21 09:50:45 2017 +0530
@@ -199,13 +199,14 @@ public:
     uint8_t*      m_predMode;         // array of prediction modes
     uint8_t*      m_partSize;         // array of partition sizes
     uint8_t*      m_mergeFlag;        // array of merge flags
+    uint8_t*      m_skipFlag[2];
     uint8_t*      m_interDir;         // array of inter directions
     uint8_t*      m_mvpIdx[2];        // array of motion vector predictor candidates or merge candidate indices [0]
     uint8_t*      m_tuDepth;          // array of transform indices
     uint8_t*      m_transformSkip[3]; // array of transform skipping flags per plane
     uint8_t*      m_cbf[3];           // array of coded block flags (CBF) per plane
     uint8_t*      m_chromaIntraDir;   // array of intra directions (chroma)
-    enum { BytesPerPartition = 21 };  // combined sizeof() of all per-part data
+    enum { BytesPerPartition = 23 };  // combined sizeof() of all per-part data
 
     sse_t*        m_distortion;
     coeff_t*      m_trCoeff[3];       // transformed coefficient buffer per plane
--- a/source/common/frame.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/frame.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -77,7 +77,15 @@ bool Frame::create(x265_param *param, fl
         }
     }
 
-    if (m_fencPic->create(param) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize))
+    if (param->bMVType == AVC_INFO)
+    {
+        m_analysisData.wt = NULL;
+        m_analysisData.intraData = NULL;
+        m_analysisData.interData = NULL;
+        m_analysis2Pass.analysisFramedata = NULL;
+    }
+
+    if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize))
     {
         X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized");
         m_numRows = (m_fencPic->m_picHeight + param->maxCUSize - 1)  / param->maxCUSize;
@@ -150,7 +158,8 @@ void Frame::destroy()
 
     if (m_fencPic)
     {
-        m_fencPic->destroy();
+        if (m_param->bCopyPicToFrame)
+            m_fencPic->destroy();
         delete m_fencPic;
         m_fencPic = NULL;
     }
--- a/source/common/frame.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/frame.h	Tue Nov 21 09:50:45 2017 +0530
@@ -98,6 +98,7 @@ public:
 
     float*                 m_quantOffsets;       // points to quantOffsets in x265_picture
     x265_sei               m_userSEI;
+    Event                  m_reconEncoded;
 
     /* Frame Parallelism - notification between FrameEncoders of available motion reference rows */
     ThreadSafeInteger*     m_reconRowFlag;       // flag of CTU rows completely reconstructed and extended for motion reference
@@ -112,6 +113,8 @@ public:
     x265_analysis_2Pass    m_analysis2Pass;
     RcStats*               m_rcData;
 
+    Event                  m_copyMVType;
+
     x265_ctu_info_t**      m_ctuInfo;
     Event                  m_copied;
     int*                   m_prevCtuInfoChange;
--- a/source/common/framedata.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/framedata.h	Tue Nov 21 09:50:45 2017 +0530
@@ -195,6 +195,7 @@ struct analysis_inter_data
     uint8_t*    mvpIdx[2];
     int8_t*     refIdx[2];
     MV*         mv[2];
+   int64_t*     sadCost;
 };
 
 struct analysis2PassFrameData
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/lowpassdct.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -0,0 +1,127 @@
+/*****************************************************************************
+ * Copyright (C) 2017 
+ *
+ * Authors: Humberto Ribeiro Filho <mont3z.claro5@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+
+using namespace X265_NS;
+
+/* standard dct transformations */
+static dct_t* s_dct4x4;
+static dct_t* s_dct8x8;
+static dct_t* s_dct16x16;
+/* Low-pass 8x8 DCT approximation: 4x4 DCT of the 2x2-averaged input, zero-padded to 8x8; dst[0] is then set from the full block sum. */
+static void lowPassDct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[4 * 4]);
+    ALIGN_VAR_32(int16_t, avgBlock[4 * 4]);
+    int32_t totalSum = 0; // 32-bit like the 16x16/32x32 variants: the sum of 64 samples can exceed the int16_t range
+    int16_t sum = 0;
+    // src is read as an 8x8 block with row stride srcStride; avgBlock receives the 4x4 down-sampled block
+    for (int i = 0; i < 4; i++)
+        for (int j =0; j < 4; j++)
+        {
+            // Calculate average of 2x2 cells
+            sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1]
+                    + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1];
+            avgBlock[i*4 + j] = sum >> 2;
+
+            totalSum += sum; // use to calculate total block average
+        }
+
+    // dct4 on the averaged block, then copy each 4-entry coef row to the start of the matching 8-entry dst row
+    (*s_dct4x4)(avgBlock, coef, 4);
+    memset(dst, 0, 64 * sizeof(int16_t));
+    for (int i = 0; i < 4; i++)
+    {
+        memcpy(&dst[i * 8], &coef[i * 4], 4 * sizeof(int16_t));
+    }
+
+    // replace first coef with twice the block sum; multiply instead of shifting so a negative sum is well defined
+    dst[0] = static_cast<int16_t>(totalSum * 2);
+}
+/* Low-pass 16x16 DCT approximation: 8x8 DCT of the 2x2-averaged input, zero-padded to 16x16; dst[0] is then set from the full block sum. */
+static void lowPassDct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[8 * 8]);
+    ALIGN_VAR_32(int16_t, avgBlock[8 * 8]);
+    int32_t totalSum = 0; // running sum of every sample read from src
+    int16_t sum = 0;      // sum of the current 2x2 cell
+    for (int i = 0; i < 8; i++)
+        for (int j =0; j < 8; j++)
+        {
+            sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1]
+                    + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1];
+            avgBlock[i*8 + j] = sum >> 2; // mean of the 2x2 cell
+
+            totalSum += sum;
+        }
+
+    (*s_dct8x8)(avgBlock, coef, 8); // half-size transform through the pointer stored by setupLowPassPrimitives_c
+    memset(dst, 0, 256 * sizeof(int16_t));
+    for (int i = 0; i < 8; i++)
+    {
+        memcpy(&dst[i * 16], &coef[i * 8], 8 * sizeof(int16_t)); // coef row i -> first 8 entries of dst row i
+    }
+    dst[0] = static_cast<int16_t>(totalSum >> 1); // overwrite the first coefficient with the halved block sum
+}
+/* Low-pass 32x32 DCT approximation: 16x16 DCT of the 2x2-averaged input, zero-padded to 32x32; dst[0] is then set from the full block sum. */
+static void lowPassDct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
+{
+    ALIGN_VAR_32(int16_t, coef[16 * 16]);
+    ALIGN_VAR_32(int16_t, avgBlock[16 * 16]);
+    int32_t totalSum = 0; // running sum of every sample read from src
+    int16_t sum = 0;      // sum of the current 2x2 cell
+    for (int i = 0; i < 16; i++)
+        for (int j =0; j < 16; j++)
+        {
+            sum = src[2*i*srcStride + 2*j] + src[2*i*srcStride + 2*j + 1]
+                    + src[(2*i+1)*srcStride + 2*j] + src[(2*i+1)*srcStride + 2*j + 1];
+            avgBlock[i*16 + j] = sum >> 2; // mean of the 2x2 cell
+
+            totalSum += sum;
+        }
+
+    (*s_dct16x16)(avgBlock, coef, 16); // half-size transform through the pointer stored by setupLowPassPrimitives_c
+    memset(dst, 0, 1024 * sizeof(int16_t));
+    for (int i = 0; i < 16; i++)
+    {
+        memcpy(&dst[i * 32], &coef[i * 16], 16 * sizeof(int16_t)); // coef row i -> first 16 entries of dst row i
+    }
+    dst[0] = static_cast<int16_t>(totalSum >> 3); // overwrite the first coefficient with the block sum divided by 8
+}
+
+namespace X265_NS {
+// x265 private namespace
+/* Installs the C low-pass DCT functions into p and remembers where p keeps the standard DCTs they delegate to. */
+void setupLowPassPrimitives_c(EncoderPrimitives& p)
+{
+    // Addresses of the table entries are stored (not copies of the function pointers); the low-pass
+    // functions dereference them on every call, so p has to stay valid for as long as they are used.
+    s_dct4x4 = &(p.cu[BLOCK_4x4].standard_dct);
+    s_dct8x8 = &(p.cu[BLOCK_8x8].standard_dct);
+    s_dct16x16 = &(p.cu[BLOCK_16x16].standard_dct);
+
+    // each low-pass variant runs the standard DCT of half its own block size
+    p.cu[BLOCK_8x8].lowpass_dct = lowPassDct8_c;
+    p.cu[BLOCK_16x16].lowpass_dct = lowPassDct16_c;
+    p.cu[BLOCK_32x32].lowpass_dct = lowPassDct32_c;
+}
+}
--- a/source/common/lowres.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/lowres.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -160,6 +160,9 @@ void Lowres::init(PicYuv *origPic, int p
 
     for (int i = 0; i < bframes + 2; i++)
         intraMbs[i] = 0;
+    if (origPic->m_param->rc.vbvBufferSize)
+        for (int i = 0; i < X265_LOOKAHEAD_MAX + 1; i++)
+            plannedType[i] = X265_TYPE_AUTO;
 
     /* downscale and generate 4 hpel planes for lookahead */
     primitives.frameInitLowres(origPic->m_picOrg[0],
--- a/source/common/param.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/param.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -157,6 +157,7 @@ void x265_param_default(x265_param* para
     param->bEnableConstrainedIntra = 0;
     param->bEnableStrongIntraSmoothing = 1;
     param->bEnableFastIntra = 0;
+    param->bEnableSplitRdSkip = 0;
 
     /* Inter Coding tools */
     param->searchMethod = X265_HEX_SEARCH;
@@ -211,6 +212,8 @@ void x265_param_default(x265_param* para
     param->rc.vbvMaxBitrate = 0;
     param->rc.vbvBufferSize = 0;
     param->rc.vbvBufferInit = 0.9;
+    param->vbvBufferEnd = 0;
+    param->vbvEndFrameAdjust = 0;
     param->rc.rfConstant = 28;
     param->rc.bitrate = 0;
     param->rc.qCompress = 0.6;
@@ -268,8 +271,8 @@ void x265_param_default(x265_param* para
 
     param->bEmitVUITimingInfo   = 1;
     param->bEmitVUIHRDInfo      = 1;
-    param->bOptQpPPS            = 1;
-    param->bOptRefListLengthPPS = 1;
+    param->bOptQpPPS            = 0;
+    param->bOptRefListLengthPPS = 0;
     param->bOptCUDeltaQP        = 0;
     param->bAQMotion = 0;
     param->bHDROpt = 0;
@@ -285,6 +288,13 @@ void x265_param_default(x265_param* para
     param->mvRefine = 0;
     param->bUseAnalysisFile = 1;
     param->csvfpt = NULL;
+    param->forceFlush = 0;
+    param->bDisableLookahead = 0;
+    param->bCopyPicToFrame = 1;
+
+    /* DCT Approximations */
+    param->bLowPassDct = 0;
+    param->bMVType = 0;
 }
 
 int x265_param_default_preset(x265_param* param, const char* preset, const char* tune)
@@ -971,8 +981,29 @@ int x265_param_parse(x265_param* p, cons
         OPT("ctu-info") p->bCTUInfo = atoi(value);
         OPT("scale-factor") p->scaleFactor = atoi(value);
         OPT("refine-intra")p->intraRefine = atoi(value);
-        OPT("refine-inter")p->interRefine = atobool(value);
+        OPT("refine-inter")p->interRefine = atoi(value);
         OPT("refine-mv")p->mvRefine = atobool(value);
+        OPT("force-flush")p->forceFlush = atoi(value);
+        OPT("splitrd-skip") p->bEnableSplitRdSkip = atobool(value);
+		OPT("lowpass-dct") p->bLowPassDct = atobool(value);
+        OPT("vbv-end") p->vbvBufferEnd = atof(value);
+        OPT("vbv-end-fr-adj") p->vbvEndFrameAdjust = atof(value);
+        OPT("copy-pic") p->bCopyPicToFrame = atobool(value);
+        OPT("refine-mv-type")
+        {
+            /* Accepts "avc" or "off"; any other text sets bError.
+             * value is only read here, so it is compared in place: a
+             * strdup() copy made just for strcmp() would never be freed
+             * and would hand strcmp() a NULL if the allocation failed. */
+            if (strcmp(value, "avc") == 0)
+                p->bMVType = AVC_INFO;
+            else if (strcmp(value, "off") == 0)
+                p->bMVType = NO_INFO;
+            else
+            {
+                bError = true;
+            }
+        }
         else
             return X265_PARAM_BAD_NAME;
     }
@@ -1236,10 +1267,10 @@ int x265_check_params(x265_param* param)
           "Video Format must be component,"
           " pal, ntsc, secam, mac or undef");
     CHECK(param->vui.colorPrimaries < 0
-          || param->vui.colorPrimaries > 9
+          || param->vui.colorPrimaries > 12
           || param->vui.colorPrimaries == 3,
           "Color Primaries must be undef, bt709, bt470m,"
-          " bt470bg, smpte170m, smpte240m, film or bt2020");
+          " bt470bg, smpte170m, smpte240m, film, bt2020, smpte-st-428, smpte-rp-431 or smpte-eg-432");
     CHECK(param->vui.transferCharacteristics < 0
           || param->vui.transferCharacteristics > 18
           || param->vui.transferCharacteristics == 3,
@@ -1247,10 +1278,10 @@ int x265_check_params(x265_param* param)
           " smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, bt1361e,"
           " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428 or arib-std-b67");
     CHECK(param->vui.matrixCoeffs < 0
-          || param->vui.matrixCoeffs > 10
+          || param->vui.matrixCoeffs > 14
           || param->vui.matrixCoeffs == 3,
           "Matrix Coefficients must be undef, bt709, fcc, bt470bg, smpte170m,"
-          " smpte240m, GBR, YCgCo, bt2020nc or bt2020c");
+          " smpte240m, GBR, YCgCo, bt2020nc, bt2020c, smpte-st-2085, chroma-nc, chroma-c or ictcp");
     CHECK(param->vui.chromaSampleLocTypeTopField < 0
           || param->vui.chromaSampleLocTypeTopField > 5,
           "Chroma Sample Location Type Top Field must be 0-5");
@@ -1291,6 +1322,12 @@ int x265_check_params(x265_param* param)
           "Maximum local bit rate can not be less than zero");
     CHECK(param->rc.vbvBufferInit < 0,
           "Valid initial VBV buffer occupancy must be a fraction 0 - 1, or size in kbits");
+    CHECK(param->vbvBufferEnd < 0,
+        "Valid final VBV buffer emptiness must be a fraction 0 - 1, or size in kbits");
+    CHECK(param->vbvEndFrameAdjust < 0,
+        "Valid vbv-end-fr-adj must be a fraction 0 - 1");
+    CHECK(!param->totalFrames && param->vbvEndFrameAdjust,
+        "vbv-end-fr-adj cannot be enabled when total number of frames is unknown");
     CHECK(param->rc.bitrate < 0,
           "Target bitrate can not be less than zero");
     CHECK(param->rc.qCompress < 0.5 || param->rc.qCompress > 1.0,
@@ -1316,6 +1353,10 @@ int x265_check_params(x265_param* param)
         "Supported range for log2MaxPocLsb is 4 to 16");
     CHECK(param->bCTUInfo < 0 || (param->bCTUInfo != 0 && param->bCTUInfo != 1 && param->bCTUInfo != 2 && param->bCTUInfo != 4 && param->bCTUInfo != 6) || param->bCTUInfo > 6,
         "Supported values for bCTUInfo are 0, 1, 2, 4, 6");
+    CHECK(param->interRefine > 3 || param->interRefine < 0,
+        "Invalid refine-inter value, refine-inter levels 0 to 3 supported");
+    CHECK(param->intraRefine > 3 || param->intraRefine < 0,
+        "Invalid refine-intra value, refine-intra levels 0 to 3 supported");
 #if !X86_64
     CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 || param->sourceHeight > 480),
         "SEA motion search does not support resolutions greater than 480p in 32 bit build");
@@ -1410,9 +1451,15 @@ void x265_print_params(x265_param* param
     }
 
     if (param->rc.vbvBufferSize)
-        x265_log(param, X265_LOG_INFO, "VBV/HRD buffer / max-rate / init    : %d / %d / %.3f\n",
-                 param->rc.vbvBufferSize, param->rc.vbvMaxBitrate, param->rc.vbvBufferInit);
-
+    {
+        if (param->vbvBufferEnd)
+            x265_log(param, X265_LOG_INFO, "VBV/HRD buffer / max-rate / init / end / fr-adj: %d / %d / %.3f / %.3f / %.3f\n",
+            param->rc.vbvBufferSize, param->rc.vbvMaxBitrate, param->rc.vbvBufferInit, param->vbvBufferEnd, param->vbvEndFrameAdjust);
+        else
+            x265_log(param, X265_LOG_INFO, "VBV/HRD buffer / max-rate / init    : %d / %d / %.3f\n",
+            param->rc.vbvBufferSize, param->rc.vbvMaxBitrate, param->rc.vbvBufferInit);
+    }
+    
     char buf[80] = { 0 };
     char tmp[40];
 #define TOOLOPT(FLAG, STR) if (FLAG) appendtool(param, buf, sizeof(buf), STR);
@@ -1429,6 +1476,7 @@ void x265_print_params(x265_param* param
     TOOLOPT(param->bEnableRdRefine, "rd-refine");
     TOOLOPT(param->bEnableEarlySkip, "early-skip");
     TOOLOPT(param->bEnableRecursionSkip, "rskip");
+    TOOLOPT(param->bEnableSplitRdSkip, "splitrd-skip");
     TOOLVAL(param->noiseReductionIntra, "nr-intra=%d");
     TOOLVAL(param->noiseReductionInter, "nr-inter=%d");
     TOOLOPT(param->bEnableTSkipFast, "tskip-fast");
@@ -1444,6 +1492,8 @@ void x265_print_params(x265_param* param
     TOOLVAL(param->lookaheadSlices, "lslices=%d");
     TOOLVAL(param->lookaheadThreads, "lthreads=%d")
     TOOLVAL(param->bCTUInfo, "ctu-info=%d");
+    if (param->bMVType == AVC_INFO)
+        TOOLOPT(param->bMVType, "refine-mv-type=avc");
     if (param->maxSlices > 1)
         TOOLVAL(param->maxSlices, "slices=%d");
     if (param->bEnableLoopFilter)
@@ -1558,6 +1608,7 @@ char *x265_param2string(x265_param* p, i
     BOOL(p->bEnableTSkipFast, "tskip-fast");
     BOOL(p->bCULossless, "cu-lossless");
     BOOL(p->bIntraInBFrames, "b-intra");
+    BOOL(p->bEnableSplitRdSkip, "splitrd-skip");
     s += sprintf(s, " rdpenalty=%d", p->rdPenalty);
     s += sprintf(s, " psy-rd=%.2f", p->psyRd);
     s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq);
@@ -1587,8 +1638,10 @@ char *x265_param2string(x265_param* p, i
         {
             s += sprintf(s, " vbv-maxrate=%d vbv-bufsize=%d vbv-init=%.1f",
                  p->rc.vbvMaxBitrate, p->rc.vbvBufferSize, p->rc.vbvBufferInit);
+            if (p->vbvBufferEnd)
+                s += sprintf(s, " vbv-end=%.1f vbv-end-fr-adj=%.1f", p->vbvBufferEnd, p->vbvEndFrameAdjust);
             if (p->rc.rateControlMode == X265_RC_CRF)
-                s += sprintf(s, " crf-max=%.1f crf-min=%.1f", p->rc.rfConstantMax, p->rc.rfConstantMin);
+                s += sprintf(s, " crf-max=%.1f crf-min=%.1f", p->rc.rfConstantMax, p->rc.rfConstantMin);   
         }
     }
     else if (p->rc.rateControlMode == X265_RC_CQP)
@@ -1665,6 +1718,9 @@ char *x265_param2string(x265_param* p, i
     s += sprintf(s, " refine-mv=%d", p->mvRefine);
     BOOL(p->bLimitSAO, "limit-sao");
     s += sprintf(s, " ctu-info=%d", p->bCTUInfo);
+    BOOL(p->bLowPassDct, "lowpass-dct");
+    s += sprintf(s, " refine-mv-type=%d", p->bMVType);
+    s += sprintf(s, " copy-pic=%d", p->bCopyPicToFrame);
 #undef BOOL
     return buf;
 }
--- a/source/common/piclist.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/piclist.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -117,6 +117,15 @@ Frame *PicList::popBack()
         return NULL;
 }
 
+Frame* PicList::getCurFrame(void)
+{
+    /* The current frame is whatever m_start points at. When m_start is
+     * NULL the result is NULL as well, so the pointer is handed back
+     * unchanged without a separate NULL branch. */
+    Frame* head = m_start;
+    return head;
+}
+
 void PicList::remove(Frame& curFrame)
 {
 #if _DEBUG
--- a/source/common/piclist.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/piclist.h	Tue Nov 21 09:50:45 2017 +0530
@@ -62,6 +62,9 @@ public:
     /** Find frame with specified POC */
     Frame* getPOC(int poc);
 
+    /** Get the current Frame from the list **/
+    Frame* getCurFrame(void);
+
     /** Remove picture from list */
     void remove(Frame& pic);
 
--- a/source/common/picyuv.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/picyuv.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -69,7 +69,7 @@ PicYuv::PicYuv()
     m_vChromaShift = 0;
 }
 
-bool PicYuv::create(x265_param* param, pixel *pixelbuf)
+bool PicYuv::create(x265_param* param, bool picAlloc, pixel *pixelbuf)
 {
     m_param = param;
     uint32_t picWidth = m_param->sourceWidth;
@@ -93,8 +93,11 @@ bool PicYuv::create(x265_param* param, p
         m_picOrg[0] = pixelbuf;
     else
     {
-        CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
-        m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+        if (picAlloc)
+        {
+            CHECKED_MALLOC(m_picBuf[0], pixel, m_stride * (maxHeight + (m_lumaMarginY * 2)));
+            m_picOrg[0] = m_picBuf[0] + m_lumaMarginY * m_stride + m_lumaMarginX;
+        }
     }
 
     if (picCsp != X265_CSP_I400)
@@ -102,12 +105,14 @@ bool PicYuv::create(x265_param* param, p
         m_chromaMarginX = m_lumaMarginX;  // keep 16-byte alignment for chroma CTUs
         m_chromaMarginY = m_lumaMarginY >> m_vChromaShift;
         m_strideC = ((numCuInWidth * m_param->maxCUSize) >> m_hChromaShift) + (m_chromaMarginX * 2);
+        if (picAlloc)
+        {
+            CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
+            CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
 
-        CHECKED_MALLOC(m_picBuf[1], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
-        CHECKED_MALLOC(m_picBuf[2], pixel, m_strideC * ((maxHeight >> m_vChromaShift) + (m_chromaMarginY * 2)));
-
-        m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
-        m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+            m_picOrg[1] = m_picBuf[1] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+            m_picOrg[2] = m_picBuf[2] + m_chromaMarginY * m_strideC + m_chromaMarginX;
+        }
     }
     else
     {
@@ -236,8 +241,10 @@ void PicYuv::copyFromPicture(const x265_
     uint64_t crSum;
     lumaSum = cbSum = crSum = 0;
 
-    if (pic.bitDepth == 8)
+    if (m_param->bCopyPicToFrame)
     {
+        if (pic.bitDepth == 8)
+        {
 #if (X265_DEPTH > 8)
         {
             pixel *yPixel = m_picOrg[0];
@@ -260,7 +267,7 @@ void PicYuv::copyFromPicture(const x265_
             }
         }
 #else /* Case for (X265_DEPTH == 8) */
-        // TODO: Does we need this path? may merge into above in future
+            // TODO: Do we need this path? It may be merged into the one above in future
         {
             pixel *yPixel = m_picOrg[0];
             uint8_t *yChar = (uint8_t*)pic.planes[0];
@@ -294,47 +301,54 @@ void PicYuv::copyFromPicture(const x265_
             }
         }
 #endif /* (X265_DEPTH > 8) */
-    }
-    else /* pic.bitDepth > 8 */
-    {
-        /* defensive programming, mask off bits that are supposed to be zero */
-        uint16_t mask = (1 << X265_DEPTH) - 1;
-        int shift = abs(pic.bitDepth - X265_DEPTH);
-        pixel *yPixel = m_picOrg[0];
+        }
+        else /* pic.bitDepth > 8 */
+        {
+            /* defensive programming, mask off bits that are supposed to be zero */
+            uint16_t mask = (1 << X265_DEPTH) - 1;
+            int shift = abs(pic.bitDepth - X265_DEPTH);
+            pixel *yPixel = m_picOrg[0];
 
-        uint16_t *yShort = (uint16_t*)pic.planes[0];
-
-        if (pic.bitDepth > X265_DEPTH)
-        {
-            /* shift right and mask pixels to final size */
-            primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
-        }
-        else /* Case for (pic.bitDepth <= X265_DEPTH) */
-        {
-            /* shift left and mask pixels to final size */
-            primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
-        }
-
-        if (param.internalCsp != X265_CSP_I400)
-        {
-            pixel *uPixel = m_picOrg[1];
-            pixel *vPixel = m_picOrg[2];
-
-            uint16_t *uShort = (uint16_t*)pic.planes[1];
-            uint16_t *vShort = (uint16_t*)pic.planes[2];
+            uint16_t *yShort = (uint16_t*)pic.planes[0];
 
             if (pic.bitDepth > X265_DEPTH)
             {
-                primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
-                primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                /* shift right and mask pixels to final size */
+                primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
             }
             else /* Case for (pic.bitDepth <= X265_DEPTH) */
             {
-                primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
-                primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                /* shift left and mask pixels to final size */
+                primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+            }
+
+            if (param.internalCsp != X265_CSP_I400)
+            {
+                pixel *uPixel = m_picOrg[1];
+                pixel *vPixel = m_picOrg[2];
+
+                uint16_t *uShort = (uint16_t*)pic.planes[1];
+                uint16_t *vShort = (uint16_t*)pic.planes[2];
+
+                if (pic.bitDepth > X265_DEPTH)
+                {
+                    primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                    primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                }
+                else /* Case for (pic.bitDepth <= X265_DEPTH) */
+                {
+                    primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                    primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+                }
             }
         }
     }
+    else
+    {
+        m_picOrg[0] = (pixel*)pic.planes[0];
+        m_picOrg[1] = (pixel*)pic.planes[1];
+        m_picOrg[2] = (pixel*)pic.planes[2];
+    }
 
     pixel *Y = m_picOrg[0];
     pixel *U = m_picOrg[1];
--- a/source/common/picyuv.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/picyuv.h	Tue Nov 21 09:50:45 2017 +0530
@@ -27,6 +27,7 @@
 #include "common.h"
 #include "md5.h"
 #include "x265.h"
+struct x265_picyuv {};
 
 namespace X265_NS {
 // private namespace
@@ -34,7 +35,7 @@ namespace X265_NS {
 class ShortYuv;
 struct SPS;
 
-class PicYuv
+class PicYuv : public x265_picyuv
 {
 public:
 
@@ -75,7 +76,7 @@ public:
 
     PicYuv();
 
-    bool  create(x265_param* param, pixel *pixelbuf = NULL);
+    bool  create(x265_param* param, bool picAlloc = true, pixel *pixelbuf = NULL);
     bool  createOffsets(const SPS& sps);
     void  destroy();
     int   getLumaBufLen(uint32_t picWidth, uint32_t picHeight, uint32_t picCsp);
--- a/source/common/primitives.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/primitives.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -58,11 +58,13 @@ void setupIntraPrimitives_c(EncoderPrimi
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
 void setupSaoPrimitives_c(EncoderPrimitives &p);
 void setupSeaIntegralPrimitives_c(EncoderPrimitives &p);
+void setupLowPassPrimitives_c(EncoderPrimitives& p);
 
 void setupCPrimitives(EncoderPrimitives &p)
 {
     setupPixelPrimitives_c(p);      // pixel.cpp
     setupDCTPrimitives_c(p);        // dct.cpp
+    setupLowPassPrimitives_c(p);    // lowpassdct.cpp
     setupFilterPrimitives_c(p);     // ipfilter.cpp
     setupIntraPrimitives_c(p);      // intrapred.cpp
     setupLoopFilterPrimitives_c(p); // loopfilter.cpp
@@ -70,6 +72,19 @@ void setupCPrimitives(EncoderPrimitives 
     setupSeaIntegralPrimitives_c(p);  // framefilter.cpp
 }
 
+void enableLowpassDCTPrimitives(EncoderPrimitives &p)
+{
+    // Snapshot the currently active DCT of every size into standard_dct
+    // before any dct pointer is replaced below.
+    const int sizes[] = { BLOCK_4x4, BLOCK_8x8, BLOCK_16x16, BLOCK_32x32 };
+    for (int k = 0; k < 4; k++)
+        p.cu[sizes[k]].standard_dct = p.cu[sizes[k]].dct;
+
+    // Only the 16x16 and 32x32 transforms are switched to the low-pass version.
+    for (int k = 2; k < 4; k++)
+        p.cu[sizes[k]].dct = p.cu[sizes[k]].lowpass_dct;
+}
+
 void setupAliasPrimitives(EncoderPrimitives &p)
 {
 #if HIGH_BIT_DEPTH
@@ -256,6 +271,11 @@ void x265_setup_primitives(x265_param *p
 #endif
 
         setupAliasPrimitives(primitives);
+
+        if (param->bLowPassDct)
+        {
+            enableLowpassDCTPrimitives(primitives); 
+        }
     }
 
     x265_report_simd(param);
--- a/source/common/primitives.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/primitives.h	Tue Nov 21 09:50:45 2017 +0530
@@ -259,8 +259,12 @@ struct EncoderPrimitives
      * primitives will leave 64x64 pointers NULL.  Indexed by LumaCU */
     struct CU
     {
-        dct_t           dct;
-        idct_t          idct;
+        dct_t           dct;    // active dct transformation
+        idct_t          idct;   // active idct transformation
+
+        dct_t           standard_dct;   // original dct function, used by lowpass_dct
+        dct_t           lowpass_dct;    // lowpass dct approximation
+
         calcresidual_t  calcresidual;
         pixel_sub_ps_t  sub_ps;
         pixel_add_ps_t  add_ps;
--- a/source/common/wavefront.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/wavefront.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -43,11 +43,17 @@ bool WaveFront::init(int numRows)
     if (m_externalDependencyBitmap)
         memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
 
+    m_row_to_idx = X265_MALLOC(uint32_t, m_numRows);
+    m_idx_to_row = X265_MALLOC(uint32_t, m_numRows);
+
     return m_internalDependencyBitmap && m_externalDependencyBitmap;
 }
 
 WaveFront::~WaveFront()
 {
+    x265_free((void*)m_row_to_idx);
+    x265_free((void*)m_idx_to_row);
+
     x265_free((void*)m_internalDependencyBitmap);
     x265_free((void*)m_externalDependencyBitmap);
 }
--- a/source/common/wavefront.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/common/wavefront.h	Tue Nov 21 09:50:45 2017 +0530
@@ -52,6 +52,10 @@ private:
 
     int m_numRows;
 
+protected:
+    uint32_t *m_row_to_idx;
+    uint32_t *m_idx_to_row;
+
 public:
 
     WaveFront()
--- a/source/dynamicHDR10/metadataFromJson.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/dynamicHDR10/metadataFromJson.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -32,11 +32,7 @@
 
 #include "BasicStructures.h"
 #include "SeiMetadataDictionary.h"
-
-#define M_PI 3.14159265358979323846
-
 using namespace SeiMetadataDictionary;
-
 class metadataFromJson::DynamicMetaIO
 {
 public:
--- a/source/encoder/CMakeLists.txt	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/CMakeLists.txt	Tue Nov 21 09:50:45 2017 +0530
@@ -43,5 +43,4 @@ add_library(encoder OBJECT ../x265.h
     reference.cpp reference.h
     encoder.cpp encoder.h
     api.cpp
-    weightPrediction.cpp
-    ../x265-extras.cpp ../x265-extras.h)
+    weightPrediction.cpp)
--- a/source/encoder/analysis.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/analysis.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -75,6 +75,10 @@ Analysis::Analysis()
     m_reuseInterDataCTU = NULL;
     m_reuseRef = NULL;
     m_bHD = false;
+    m_modeFlag[0] = false;
+    m_modeFlag[1] = false;
+    m_checkMergeAndSkipOnly[0] = false;
+    m_checkMergeAndSkipOnly[1] = false;
     m_evaluateInter = 0;
 }
 
@@ -235,6 +239,32 @@ Mode& Analysis::compressCTU(CUData& ctu,
     }
     else
     {
+        bool bCopyAnalysis = ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) || (m_param->bMVType && m_param->analysisReuseLevel >= 7 && ctu.m_numPartitions <= 16));
+        bool BCompressInterCUrd0_4 = (m_param->bMVType && m_param->analysisReuseLevel >= 7 && m_param->rdLevel <= 4);
+        bool BCompressInterCUrd5_6 = (m_param->bMVType && m_param->analysisReuseLevel >= 7 && m_param->rdLevel >= 5 && m_param->rdLevel <= 6);
+        bCopyAnalysis = bCopyAnalysis || BCompressInterCUrd0_4 || BCompressInterCUrd5_6;
+
+        if (bCopyAnalysis)
+        {
+            analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
+            int posCTU = ctu.m_cuAddr * numPartition;
+            memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
+            memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
+            for (int list = 0; list < m_slice->isInterB() + 1; list++)
+                memcpy(ctu.m_skipFlag[list], &m_frame->m_analysisData.modeFlag[list][posCTU], sizeof(uint8_t) * numPartition);
+
+            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !m_param->bMVType)
+            {
+                analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
+                memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
+                memcpy(ctu.m_chromaIntraDir, &intraDataCTU->chromaModes[posCTU], sizeof(uint8_t) * numPartition);
+            }
+            //Calculate log2CUSize from depth
+            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                ctu.m_log2CUSize[i] = (uint8_t)m_param->maxLog2CUSize - ctu.m_cuDepth[i];
+        }
+
         if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&
             ctu.m_cuPelX / m_param->maxCUSize >= frame.m_encData->m_pir.pirStartCol
             && ctu.m_cuPelX / m_param->maxCUSize < frame.m_encData->m_pir.pirEndCol)
@@ -250,14 +280,14 @@ Mode& Analysis::compressCTU(CUData& ctu,
             /* generate residual for entire CTU at once and copy to reconPic */
             encodeResidue(ctu, cuGeom);
         }
-        else if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10)
+        else if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel == 10) || ((m_param->bMVType == AVC_INFO) && m_param->analysisReuseLevel >= 7))
         {
             analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
             int posCTU = ctu.m_cuAddr * numPartition;
             memcpy(ctu.m_cuDepth, &interDataCTU->depth[posCTU], sizeof(uint8_t) * numPartition);
             memcpy(ctu.m_predMode, &interDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
             memcpy(ctu.m_partSize, &interDataCTU->partSize[posCTU], sizeof(uint8_t) * numPartition);
-            if (m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames)
+            if ((m_slice->m_sliceType == P_SLICE || m_param->bIntraInBFrames) && !(m_param->bMVType == AVC_INFO))
             {
                 analysis_intra_data* intraDataCTU = (analysis_intra_data*)m_frame->m_analysisData.intraData;
                 memcpy(ctu.m_lumaIntraDir, &intraDataCTU->modes[posCTU], sizeof(uint8_t) * numPartition);
@@ -306,11 +336,10 @@ void Analysis::collectPUStatistics(const
                 mode = 2;
             else if (ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnU || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_2NxnD || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nLx2N || ctu.m_partSize[puabsPartIdx + absPartIdx] == SIZE_nRx2N)
                  mode = 3;
-
             if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_SKIP)
             {
-                ctu.m_encData->m_frameStats.cntSkipPu[depth] += (uint64_t)(1 << shift);
-                ctu.m_encData->m_frameStats.totalPu[depth] += (uint64_t)(1 << shift);
+                ctu.m_encData->m_frameStats.cntSkipPu[depth] += 1ULL << shift;
+                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
             }
             else if (ctu.m_predMode[puabsPartIdx + absPartIdx] == MODE_INTRA)
             {
@@ -321,14 +350,14 @@ void Analysis::collectPUStatistics(const
                 }
                 else
                 {
-                    ctu.m_encData->m_frameStats.cntIntraPu[depth] += (uint64_t)(1 << shift);
-                    ctu.m_encData->m_frameStats.totalPu[depth] += (uint64_t)(1 << shift);
+                    ctu.m_encData->m_frameStats.cntIntraPu[depth] += 1ULL << shift;
+                    ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
                 }
             }
             else if (mode == 3)
             {
-                ctu.m_encData->m_frameStats.cntAmp[depth] += (uint64_t)(1 << shift);
-                ctu.m_encData->m_frameStats.totalPu[depth] += (uint64_t)(1 << shift);
+                ctu.m_encData->m_frameStats.cntAmp[depth] += 1ULL << shift;
+                ctu.m_encData->m_frameStats.totalPu[depth] += 1ULL << shift;
                 break;
             }
             else
@@ -485,7 +514,7 @@ void Analysis::qprdRefine(const CUData& 
     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
 }
 
-void Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
+uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
 {
     uint32_t depth = cuGeom.depth;
     ModeDepth& md = m_modeDepth[depth];
@@ -511,7 +540,9 @@ void Analysis::compressIntraCU(const CUD
             Mode& mode = md.pred[0];
             md.bestMode = &mode;
             mode.cu.initSubCU(parentCTU, cuGeom, qp);
-            if (m_param->intraRefine != 2 || parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] <= 1)
+            bool reuseModes = !((m_param->intraRefine == 3) ||
+                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
+            if (reuseModes)
             {
                 memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                 memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
@@ -560,6 +591,8 @@ void Analysis::compressIntraCU(const CUD
         invalidateContexts(nextDepth);
         Entropy* nextContext = &m_rqt[depth].cur;
         int32_t nextQP = qp;
+        uint64_t curCost = 0;
+        int skipSplitCheck = 0;
 
         for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
         {
@@ -572,7 +605,17 @@ void Analysis::compressIntraCU(const CUD
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
 
-                compressIntraCU(parentCTU, childGeom, nextQP);
+                if (m_param->bEnableSplitRdSkip)
+                {
+                    curCost += compressIntraCU(parentCTU, childGeom, nextQP);
+                    if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
+                    {
+                        skipSplitCheck = 1;
+                        break;
+                    }
+                }
+                else
+                    compressIntraCU(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU
                 splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
@@ -590,14 +633,17 @@ void Analysis::compressIntraCU(const CUD
                     memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
             }
         }
-        nextContext->store(splitPred->contexts);
-        if (mightNotSplit)
-            addSplitFlagCost(*splitPred, cuGeom.depth);
-        else
-            updateModeCost(*splitPred);
-
-        checkDQPForSplitPred(*splitPred, cuGeom);
-        checkBestMode(*splitPred, depth);
+        if (!skipSplitCheck)
+        {
+            nextContext->store(splitPred->contexts);
+            if (mightNotSplit)
+                addSplitFlagCost(*splitPred, cuGeom.depth);
+            else
+                updateModeCost(*splitPred);
+
+            checkDQPForSplitPred(*splitPred, cuGeom);
+            checkBestMode(*splitPred, depth);
+        }
     }
 
     if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
@@ -620,6 +666,8 @@ void Analysis::compressIntraCU(const CUD
     md.bestMode->cu.copyToPic(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
+
+    return md.bestMode->rdCost;
 }
 
 void Analysis::PMODE::processTasks(int workerThreadId)
@@ -1106,7 +1154,7 @@ SplitData Analysis::compressInterCU_rd0_
     uint32_t depth = cuGeom.depth;
     uint32_t cuAddr = parentCTU.m_cuAddr;
     ModeDepth& md = m_modeDepth[depth];
-    md.bestMode = NULL;
+
 
     if (m_param->searchMethod == X265_SEA)
     {
@@ -1119,420 +1167,941 @@ SplitData Analysis::compressInterCU_rd0_
     }
 
     PicYuv& reconPic = *m_frame->m_reconPic;
-
-    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
-    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
-    uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
-    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
-    bool skipModes = false; /* Skip any remaining mode analyses at current depth */
-    bool skipRecursion = false; /* Skip recursion */
-    bool splitIntra = true;
-    bool skipRectAmp = false;
-    bool chooseMerge = false;
-    bool bCtuInfoCheck = false;
-    int sameContentRef = 0;
-
-    if (m_evaluateInter == 1)
+    SplitData splitCUData;
+
+    bool bHEVCBlockAnalysis = (m_param->bMVType && cuGeom.numPartitions > 16);
+    bool bRefineAVCAnalysis = (m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
+    bool bNooffloading = !m_param->bMVType;
+
+    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
     {
-        skipRectAmp = !!md.bestMode;
-        mightSplit &= false;
-        minDepth = depth;
-    }
-
-    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
-        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
-
-    SplitData splitData[4];
-    splitData[0].initSplitCUData();
-    splitData[1].initSplitCUData();
-    splitData[2].initSplitCUData();
-    splitData[3].initSplitCUData();
-
-    // avoid uninitialize value in below reference
-    if (m_param->limitModes)
-    {
-        md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
-        md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
-        md.pred[PRED_2Nx2N].sa8dCost = 0;
-    }
-
-    if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
-    {
-        if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
-            sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
-        if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
+        md.bestMode = NULL;
+        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
+        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
+        uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);
+        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
+        bool skipModes = false; /* Skip any remaining mode analyses at current depth */
+        bool skipRecursion = false; /* Skip recursion */
+        bool splitIntra = true;
+        bool skipRectAmp = false;
+        bool chooseMerge = false;
+        bool bCtuInfoCheck = false;
+        int sameContentRef = 0;
+
+        if (m_evaluateInter)
         {
-            mightNotSplit &= bDecidedDepth;
-            bCtuInfoCheck = skipRecursion = false;
-            skipModes = true;
+            if (m_param->interRefine == 2)
+            {
+                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
+                    skipModes = true;
+                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
+                    skipRectAmp = true;
+            }
+            mightSplit &= false;
+            minDepth = depth;
         }
-        else if (mightNotSplit && bDecidedDepth)
+
+        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
+        SplitData splitData[4];
+        splitData[0].initSplitCUData();
+        splitData[1].initSplitCUData();
+        splitData[2].initSplitCUData();
+        splitData[3].initSplitCUData();
+
+        // avoid uninitialize value in below reference
+        if (m_param->limitModes)
         {
-            if (m_additionalCtuInfo[cuGeom.absPartIdx])
+            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
+            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
+            md.pred[PRED_2Nx2N].sa8dCost = 0;
+        }
+
+        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
+        {
+            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
+                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
+            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
             {
-                bCtuInfoCheck = skipRecursion = true;
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-                if (!sameContentRef)
+                mightNotSplit &= bDecidedDepth;
+                bCtuInfoCheck = skipRecursion = false;
+                skipModes = true;
+            }
+            else if (mightNotSplit && bDecidedDepth)
+            {
+                if (m_additionalCtuInfo[cuGeom.absPartIdx])
                 {
-                    if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
+                    bCtuInfoCheck = skipRecursion = true;
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+                    if (!sameContentRef)
                     {
-                        qp -= int32_t(0.04 * qp);
-                        setLambdaFromQP(parentCTU, qp);
+                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
+                        {
+                            qp -= int32_t(0.04 * qp);
+                            setLambdaFromQP(parentCTU, qp);
+                        }
+                        if (m_param->bCTUInfo & 4)
+                            skipModes = false;
                     }
-                    if (m_param->bCTUInfo & 4)
-                        skipModes = false;
+                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
+                    {
+                        if (m_param->rdLevel)
+                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
+                        if ((m_param->bCTUInfo & 4) && sameContentRef)
+                            skipModes = md.bestMode && true;
+                    }
                 }
-                if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
+                else
                 {
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                     if (m_param->rdLevel)
                         skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
-                    if ((m_param->bCTUInfo & 4) && sameContentRef)
-                        skipModes = md.bestMode && true;
                 }
+                mightSplit &= !bDecidedDepth;
             }
-            else
+        }
+        if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10))
+        {
+            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
             {
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-                if (m_param->rdLevel)
-                    skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
-            }
-            mightSplit &= !bDecidedDepth;
-        }
-    }
-    if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
-    {
-        if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
-        {
-            if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
-            {
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-
-                skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
-                if (m_param->rdLevel)
-                    skipModes = m_param->bEnableEarlySkip && md.bestMode;
-            }
-            if (m_param->analysisReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
-            {
-                if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
+                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
                 {
-                    skipRectAmp = true && !!md.bestMode;
-                    chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+
+                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    if (m_param->rdLevel)
+                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
+                }
+                if (m_param->analysisReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
+                {
+                    if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4)
+                    {
+                        skipRectAmp = true && !!md.bestMode;
+                        chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;
+                    }
                 }
             }
         }
-    }
-    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
-    {
-        if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
+        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
         {
-            if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
+            if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
             {
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-
-                skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
-                if (m_param->rdLevel)
-                    skipModes = m_param->bEnableEarlySkip && md.bestMode;
+                if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
+                {
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+
+                    skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
+                    if (m_param->rdLevel)
+                        skipModes = m_param->bEnableEarlySkip && md.bestMode;
+                }
+            }
+        }
+
+        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
+        if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bMVType && (m_modeFlag[0] || m_modeFlag[1]))) /* TODO: Re-evaluate if analysis load/save still works */
+        {
+            /* Compute Merge Cost */
+            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+            checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+            if (m_param->rdLevel)
+                skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2)
+                && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
+        }
+
+        if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && (m_modeFlag[0] || m_modeFlag[1])))
+        {
+            skipRecursion = md.bestMode->cu.isSkipped(0);
+            if (mightSplit && depth >= minDepth && !skipRecursion)
+            {
+                if (depth)
+                    skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
+                if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
+                    skipRecursion = complexityCheckCU(*md.bestMode);
+            }
+        }
+
+        if (m_param->bMVType && md.bestMode && cuGeom.numPartitions <= 16)
+            skipRecursion = true;
+
+        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
+        if (mightSplit && !skipRecursion)
+        {
+            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
+                qp = int((1 / 0.96) * qp + 0.5);
+            Mode* splitPred = &md.pred[PRED_SPLIT];
+            splitPred->initCosts();
+            CUData* splitCU = &splitPred->cu;
+            splitCU->initSubCU(parentCTU, cuGeom, qp);
+
+            uint32_t nextDepth = depth + 1;
+            ModeDepth& nd = m_modeDepth[nextDepth];
+            invalidateContexts(nextDepth);
+            Entropy* nextContext = &m_rqt[depth].cur;
+            int nextQP = qp;
+            splitIntra = false;
+
+            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+            {
+                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+                if (childGeom.flags & CUGeom::PRESENT)
+                {
+                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
+                    m_rqt[nextDepth].cur.load(*nextContext);
+
+                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
+                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
+
+                    splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
+
+                    // Save best CU and pred data for this sub CU
+                    splitIntra |= nd.bestMode->cu.isIntra(0);
+                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                    splitPred->addSubCosts(*nd.bestMode);
+
+                    if (m_param->rdLevel)
+                        nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
+                    else
+                        nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
+                    if (m_param->rdLevel > 1)
+                        nextContext = &nd.bestMode->contexts;
+                }
+                else
+                    splitCU->setEmptyPart(childGeom, subPartIdx);
+            }
+            nextContext->store(splitPred->contexts);
+
+            if (mightNotSplit)
+                addSplitFlagCost(*splitPred, cuGeom.depth);
+            else if (m_param->rdLevel > 1)
+                updateModeCost(*splitPred);
+            else
+                splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
+        }
+
+        /* If analysis mode is simple do not Evaluate other modes */
+        if ((m_param->bMVType && cuGeom.numPartitions <= 16) && (m_slice->m_sliceType == P_SLICE || m_slice->m_sliceType == B_SLICE))
+            mightNotSplit = !(m_checkMergeAndSkipOnly[0] || (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1]));
+
+        /* Split CUs
+         *   0  1
+         *   2  3 */
+        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
+        if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
+        {
+            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
+                setLambdaFromQP(parentCTU, qp);
+
+            if (!skipModes)
+            {
+                uint32_t refMasks[2];
+                refMasks[0] = allSplitRefs;
+                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+
+                if (m_param->limitReferences & X265_REF_LIMIT_CU)
+                {
+                    CUData& cu = md.pred[PRED_2Nx2N].cu;
+                    uint32_t refMask = cu.getBestRefIdx(0);
+                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
+                }
+
+                if (m_slice->m_sliceType == B_SLICE)
+                {
+                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+                }
+
+                Mode *bestInter = &md.pred[PRED_2Nx2N];
+                if (!skipRectAmp)
+                {
+                    if (m_param->bEnableRectInter)
+                    {
+                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                        uint32_t threshold_2NxN, threshold_Nx2N;
+
+                        if (m_slice->m_sliceType == P_SLICE)
+                        {
+                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                        }
+                        else
+                        {
+                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
+                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
+                        }
+
+                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
+                        if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_2NxN];
+                        }
+
+                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
+                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
+                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
+                            if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_Nx2N];
+                        }
+
+                        if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                            if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
+                                bestInter = &md.pred[PRED_2NxN];
+                        }
+                    }
+
+                    if (m_slice->m_sps->maxAMPDepth > depth)
+                    {
+                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
+
+                        if (m_slice->m_sliceType == P_SLICE)
+                        {
+                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
+
+                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
+                        }
+                        else
+                        {
+                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
+                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
+                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
+
+                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
+                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
+                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
+                        }
+
+                        bool bHor = false, bVer = false;
+                        if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
+                            bHor = true;
+                        else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
+                            bVer = true;
+                        else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
+                            md.bestMode && md.bestMode->cu.getQtRootCbf(0))
+                        {
+                            bHor = true;
+                            bVer = true;
+                        }
+
+                        if (bHor)
+                        {
+                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
+                            if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% top */
+                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
+                                    bestInter = &md.pred[PRED_2NxnD];
+                            }
+
+                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
+                            {
+                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
+                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
+                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
+                                if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
+                                    bestInter = &md.pred[PRED_2NxnU];
+                            }
+
+                            if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% top */
+                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                                if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
+                                    bestInter = &md.pred[PRED_2NxnD];
+                            }
+                        }
+                        if (bVer)
+                        {
+                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
+                            if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
+                                    bestInter = &md.pred[PRED_nRx2N];
+                            }
+
+                            if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
+                            {
+                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
+                                refMasks[1] = allSplitRefs;                                    /* 75% right */
+                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
+                                if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
+                                    bestInter = &md.pred[PRED_nLx2N];
+                            }
+
+                            if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                                if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
+                                    bestInter = &md.pred[PRED_nRx2N];
+                            }
+                        }
+                    }
+                }
+                bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
+                if (m_param->rdLevel >= 3)
+                {
+                    /* Calculate RD cost of best inter option */
+                    if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
+                    {
+                        uint32_t numPU = bestInter->cu.getNumPartInter(0);
+                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                        {
+                            PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
+                            motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
+                        }
+                    }
+
+                    if (!chooseMerge)
+                    {
+                        encodeResAndCalcRdInterCU(*bestInter, cuGeom);
+                        checkBestMode(*bestInter, depth);
+
+                        /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
+                        if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
+                            md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                        {
+                            uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
+                            if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
+                                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                                {
+                                    PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
+                                    motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
+                                }
+                            encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
+                            checkBestMode(md.pred[PRED_BIDIR], depth);
+                        }
+                    }
+
+                    if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
+                        md.bestMode->sa8dCost == MAX_INT64)
+                    {
+                        if (!m_param->limitReferences || splitIntra)
+                        {
+                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
+                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
+                            encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
+                            checkBestMode(md.pred[PRED_INTRA], depth);
+                        }
+                        else
+                        {
+                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
+                        }
+                    }
+                }
+                else
+                {
+                    /* SA8D choice between merge/skip, inter, bidir, and intra */
+                    if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
+                        md.bestMode = bestInter;
+
+                    if (m_slice->m_sliceType == B_SLICE &&
+                        md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
+                        md.bestMode = &md.pred[PRED_BIDIR];
+
+                    if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
+                    {
+                        if (!m_param->limitReferences || splitIntra)
+                        {
+                            ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
+                            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
+                            if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
+                                md.bestMode = &md.pred[PRED_INTRA];
+                        }
+                        else
+                        {
+                            ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
+                        }
+                    }
+
+                    /* finally code the best mode selected by SA8D costs:
+                     * RD level 2 - fully encode the best mode
+                     * RD level 1 - generate recon pixels
+                     * RD level 0 - generate chroma prediction */
+                    if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
+                    {
+                        /* prediction already generated for this CU, and if rd level
+                         * is not 0, it is already fully encoded */
+                    }
+                    else if (md.bestMode->cu.isInter(0))
+                    {
+                        uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
+                        if (m_csp != X265_CSP_I400)
+                        {
+                            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                            {
+                                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
+                            }
+                        }
+                        if (m_param->rdLevel == 2)
+                            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
+                        else if (m_param->rdLevel == 1)
+                        {
+                            /* generate recon pixels with no rate distortion considerations */
+                            CUData& cu = md.bestMode->cu;
+
+                            uint32_t tuDepthRange[2];
+                            cu.getInterTUQtDepthRange(tuDepthRange, 0);
+                            m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
+                            residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
+                            if (cu.getQtRootCbf(0))
+                                md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
+                            else
+                            {
+                                md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
+                                if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
+                                    cu.setPredModeSubParts(MODE_SKIP);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (m_param->rdLevel == 2)
+                            encodeIntraInInter(*md.bestMode, cuGeom);
+                        else if (m_param->rdLevel == 1)
+                        {
+                            /* generate recon pixels with no rate distortion considerations */
+                            CUData& cu = md.bestMode->cu;
+
+                            uint32_t tuDepthRange[2];
+                            cu.getIntraTUQtDepthRange(tuDepthRange, 0);
+
+                            residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
+                            if (m_csp != X265_CSP_I400)
+                            {
+                                getBestIntraModeChroma(*md.bestMode, cuGeom);
+                                residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+                            }
+                            md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
+                        }
+                    }
+                }
+            } // !earlyskip
+
+            if (m_bTryLossless)
+                tryLossless(cuGeom);
+
+            if (mightSplit)
+                addSplitFlagCost(*md.bestMode, cuGeom.depth);
+        }
+
+        if (mightSplit && !skipRecursion)
+        {
+            Mode* splitPred = &md.pred[PRED_SPLIT];
+            if (!md.bestMode)
+                md.bestMode = splitPred;
+            else if (m_param->rdLevel > 1)
+                checkBestMode(*splitPred, cuGeom.depth);
+            else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
+                md.bestMode = splitPred;
+
+            checkDQPForSplitPred(*md.bestMode, cuGeom);
+        }
+
+        /* determine which motion references the parent CU should search */
+        splitCUData.initSplitCUData();
+
+        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
+        {
+            if (md.bestMode == &md.pred[PRED_SPLIT])
+                splitCUData.splitRefs = allSplitRefs;
+            else
+            {
+                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
+                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
+                uint32_t numPU = cu.getNumPartInter(0);
+                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
+                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
+            }
+        }
+
+        if (m_param->limitModes)
+        {
+            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
+            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
+            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
+        }
+
+        if (mightNotSplit && md.bestMode->cu.isSkipped(0))
+        {
+            FrameData& curEncData = *m_frame->m_encData;
+            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
+            uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
+            cuStat.count[depth] += 1;
+            cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
+        }
+
+        /* Copy best data to encData CTU and recon */
+        md.bestMode->cu.copyToPic(depth);
+        if (m_param->rdLevel)
+            md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
+
+        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+        {
+            if (mightNotSplit)
+            {
+                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+                int8_t maxTUDepth = -1;
+                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
+                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
             }
         }
     }
-
-    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */
-    if (mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) /* TODO: Re-evaluate if analysis load/save still works */
+    else
     {
-        /* Compute Merge Cost */
-        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-        if (m_param->rdLevel)
-            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
-    }
-
-    if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck)
-    {
-        skipRecursion = md.bestMode->cu.isSkipped(0);
-        if (mightSplit && depth >= minDepth && !skipRecursion)
+        if (m_param->bMVType && cuGeom.numPartitions <= 16)
         {
-            if (depth)
-                skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);
-            if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)
-                skipRecursion = complexityCheckCU(*md.bestMode);
+            qprdRefine(parentCTU, cuGeom, qp, qp);
+
+            SplitData splitData[4];
+            splitData[0].initSplitCUData();
+            splitData[1].initSplitCUData();
+            splitData[2].initSplitCUData();
+            splitData[3].initSplitCUData();
+
+            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+
+            splitCUData.initSplitCUData();
+
+            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
+            {
+                if (md.bestMode == &md.pred[PRED_SPLIT])
+                    splitCUData.splitRefs = allSplitRefs;
+                else
+                {
+                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
+                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
+                    uint32_t numPU = cu.getNumPartInter(0);
+                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
+                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
+                }
+            }
+
+            if (m_param->limitModes)
+            {
+                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
+                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
+                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;
+            }
         }
     }
 
-    /* Step 2. Evaluate each of the 4 split sub-blocks in series */
-    if (mightSplit && !skipRecursion)
+    return splitCUData;
+}
+
+SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
+{
+    if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
+        return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
+
+    uint32_t depth = cuGeom.depth;
+    ModeDepth& md = m_modeDepth[depth];
+    md.bestMode = NULL;
+
+    if (m_param->searchMethod == X265_SEA)
     {
-        if (bCtuInfoCheck && m_param->bCTUInfo & 2)
-            qp = int((1 / 0.96) * qp + 0.5);
-        Mode* splitPred = &md.pred[PRED_SPLIT];
-        splitPred->initCosts();
-        CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
-
-        uint32_t nextDepth = depth + 1;
-        ModeDepth& nd = m_modeDepth[nextDepth];
-        invalidateContexts(nextDepth);
-        Entropy* nextContext = &m_rqt[depth].cur;
-        int nextQP = qp;
-        splitIntra = false;
-
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
+        int numPredDir = m_slice->isInterP() ? 1 : 2;
+        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
+        for (int list = 0; list < numPredDir; list++)
+            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
+                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
+                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
+    }
+
+    SplitData splitCUData;
+
+    bool bHEVCBlockAnalysis = (m_param->bMVType && cuGeom.numPartitions > 16);
+    bool bRefineAVCAnalysis = (m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));
+    bool bNooffloading = !m_param->bMVType;
+
+    if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading)
+    {
+        bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
+        bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
+        bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
+        bool skipRecursion = false;
+        bool skipModes = false;
+        bool splitIntra = true;
+        bool skipRectAmp = false;
+        bool bCtuInfoCheck = false;
+        int sameContentRef = 0;
+
+        if (m_evaluateInter)
         {
-            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childGeom.flags & CUGeom::PRESENT)
+            if (m_param->interRefine == 2)
             {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
-                m_rqt[nextDepth].cur.load(*nextContext);
-
-                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
-                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
-
-                splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
-
-                // Save best CU and pred data for this sub CU
-                splitIntra |= nd.bestMode->cu.isIntra(0);
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
-                splitPred->addSubCosts(*nd.bestMode);
-
-                if (m_param->rdLevel)
-                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
+                if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)
+                    skipModes = true;
+                if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
+                    skipRectAmp = true;
+            }
+            mightSplit &= false;
+        }
+
+        // avoid reading uninitialized values when these fields are referenced below
+        if (m_param->limitModes)
+        {
+            md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
+            md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
+            md.pred[PRED_2Nx2N].rdCost = 0;
+        }
+
+        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+            m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
+
+        SplitData splitData[4];
+        splitData[0].initSplitCUData();
+        splitData[1].initSplitCUData();
+        splitData[2].initSplitCUData();
+        splitData[3].initSplitCUData();
+        uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+        uint32_t refMasks[2];
+        if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
+        {
+            if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
+                sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
+            if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
+            {
+                mightNotSplit &= bDecidedDepth;
+                bCtuInfoCheck = skipRecursion = false;
+                skipModes = true;
+            }
+            else if (mightNotSplit && bDecidedDepth)
+            {
+                if (m_additionalCtuInfo[cuGeom.absPartIdx])
+                {
+                    bCtuInfoCheck = skipRecursion = true;
+                    refMasks[0] = allSplitRefs;
+                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+                    if (!sameContentRef)
+                    {
+                        if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
+                        {
+                            qp -= int32_t(0.04 * qp);
+                            setLambdaFromQP(parentCTU, qp);
+                        }
+                        if (m_param->bCTUInfo & 4)
+                            skipModes = false;
+                    }
+                    if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
+                    {
+                        if (m_param->rdLevel)
+                            skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
+                        if ((m_param->bCTUInfo & 4) && sameContentRef)
+                            skipModes = md.bestMode && true;
+                    }
+                }
                 else
-                    nd.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);
-                if (m_param->rdLevel > 1)
-                    nextContext = &nd.bestMode->contexts;
+                {
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
+                    refMasks[0] = allSplitRefs;
+                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+                }
+                mightSplit &= !bDecidedDepth;
             }
-            else
-                splitCU->setEmptyPart(childGeom, subPartIdx);
         }
-        nextContext->store(splitPred->contexts);
-
-        if (mightNotSplit)
-            addSplitFlagCost(*splitPred, cuGeom.depth);
-        else if (m_param->rdLevel > 1)
-            updateModeCost(*splitPred);
-        else
-            splitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);
-    }
-
-    /* Split CUs
-     *   0  1
-     *   2  3 */
-    uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
-    /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
-    if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode)))
-    {
-        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
-            setLambdaFromQP(parentCTU, qp);
-
-        if (!skipModes)
+        if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
         {
-            uint32_t refMasks[2];
+            if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
+            {
+                if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
+                {
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
+                    refMasks[0] = allSplitRefs;
+                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+
+                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+                }
+                if (m_param->analysisReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
+                    skipRectAmp = true && !!md.bestMode;
+            }
+        }
+
+        if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
+        {
+            if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
+            {
+                if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
+                {
+                    md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                    md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+
+                    skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
+                    refMasks[0] = allSplitRefs;
+                    md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+                    checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+
+                    if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                        skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+                }
+            }
+        }
+
+        /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
+        if ((mightNotSplit && !md.bestMode && !bCtuInfoCheck) ||
+            (m_param->bMVType && (m_modeFlag[0] || m_modeFlag[1])))
+        {
+            md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+            md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+            checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
+            skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) &&
+                md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
             refMasks[0] = allSplitRefs;
             md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-            checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-
-            if (m_param->limitReferences & X265_REF_LIMIT_CU)
+            checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
+            checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
+
+            if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
+                skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+        }
+
+        if (m_param->bMVType && md.bestMode && cuGeom.numPartitions <= 16)
+            skipRecursion = true;
+
+        // estimate split cost
+        /* Step 2. Evaluate each of the 4 split sub-blocks in series */
+        if (mightSplit && !skipRecursion)
+        {
+            if (bCtuInfoCheck && m_param->bCTUInfo & 2)
+                qp = int((1 / 0.96) * qp + 0.5);
+            Mode* splitPred = &md.pred[PRED_SPLIT];
+            splitPred->initCosts();
+            CUData* splitCU = &splitPred->cu;
+            splitCU->initSubCU(parentCTU, cuGeom, qp);
+
+            uint32_t nextDepth = depth + 1;
+            ModeDepth& nd = m_modeDepth[nextDepth];
+            invalidateContexts(nextDepth);
+            Entropy* nextContext = &m_rqt[depth].cur;
+            int nextQP = qp;
+            splitIntra = false;
+
+            for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
             {
-                CUData& cu = md.pred[PRED_2Nx2N].cu;
-                uint32_t refMask = cu.getBestRefIdx(0);
-                allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
-            }
-
-            if (m_slice->m_sliceType == B_SLICE)
-            {
-                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
-            }
-
-            Mode *bestInter = &md.pred[PRED_2Nx2N];
-            if (!skipRectAmp)
-            {
-                if (m_param->bEnableRectInter)
+                const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
+                if (childGeom.flags & CUGeom::PRESENT)
                 {
-                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
-                    uint32_t threshold_2NxN, threshold_Nx2N;
-
-                    if (m_slice->m_sliceType == P_SLICE)
-                    {
-                        threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
-                        threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                    }
-                    else
-                    {
-                        threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
-                                       + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                        threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
-                                       + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                    }
-
-                    int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
-                    if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
-                    {
-                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
-                        if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_2NxN];
-                    }
-
-                    if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N)
-                    {
-                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
-                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
-                        md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
-                        if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_Nx2N];
-                    }
-
-                    if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN)
-                    {
-                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
-                        if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
-                            bestInter = &md.pred[PRED_2NxN];
-                    }
+                    m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
+                    m_rqt[nextDepth].cur.load(*nextContext);
+
+                    if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
+                        nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
+
+                    splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
+
+                    // Save best CU and pred data for this sub CU
+                    splitIntra |= nd.bestMode->cu.isIntra(0);
+                    splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                    splitPred->addSubCosts(*nd.bestMode);
+                    nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
+                    nextContext = &nd.bestMode->contexts;
                 }
-
-                if (m_slice->m_sps->maxAMPDepth > depth)
+                else
                 {
-                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
-                    uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
-
-                    if (m_slice->m_sliceType == P_SLICE)
-                    {
-                        threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
-                        threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
-
-                        threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                        threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
-                    }
-                    else
-                    {
-                        threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
-                                         + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                        threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
-                                         + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-
-                        threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
-                                        + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                        threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
-                                        + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-                    }
-
-                    bool bHor = false, bVer = false;
-                    if (bestInter->cu.m_partSize[0] == SIZE_2NxN)
-                        bHor = true;
-                    else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)
-                        bVer = true;
-                    else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&
-                        md.bestMode && md.bestMode->cu.getQtRootCbf(0))
-                    {
-                        bHor = true;
-                        bVer = true;
-                    }
-
-                    if (bHor)
-                    {
-                        int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
-                        if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% top */
-                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                            if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
-                                bestInter = &md.pred[PRED_2NxnD];
-                        }
-
-                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU)
-                        {
-                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
-                            refMasks[1] = allSplitRefs;                                    /* 75% bot */
-                            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
-                            if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
-                                bestInter = &md.pred[PRED_2NxnU];
-                        }
-
-                        if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% top */
-                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                            if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
-                                bestInter = &md.pred[PRED_2NxnD];
-                        }
-                    }
-                    if (bVer)
-                    {
-                        int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
-                        if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                            if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
-                                bestInter = &md.pred[PRED_nRx2N];
-                        }
-
-                        if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N)
-                        {
-                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
-                            refMasks[1] = allSplitRefs;                                    /* 75% right */
-                            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
-                            if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
-                                bestInter = &md.pred[PRED_nLx2N];
-                        }
-
-                        if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                            if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
-                                bestInter = &md.pred[PRED_nRx2N];
-                        }
-                    }
+                    splitCU->setEmptyPart(childGeom, subPartIdx);
                 }
             }
-            bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);
-            if (m_param->rdLevel >= 3)
+            nextContext->store(splitPred->contexts);
+            if (mightNotSplit)
+                addSplitFlagCost(*splitPred, cuGeom.depth);
+            else
+                updateModeCost(*splitPred);
+
+            checkDQPForSplitPred(*splitPred, cuGeom);
+        }
+
+        /* If analysis mode is simple do not Evaluate other modes */
+        if ((m_param->bMVType && cuGeom.numPartitions <= 16) && (m_slice->m_sliceType == P_SLICE || m_slice->m_sliceType == B_SLICE))
+            mightNotSplit = !(m_checkMergeAndSkipOnly[0] || (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1]));
+
+        /* Split CUs
+         *   0  1
+         *   2  3 */
+        allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+        /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
+        if (mightNotSplit)
+        {
+            if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
+                setLambdaFromQP(parentCTU, qp);
+
+            if (!skipModes)
             {
-                /* Calculate RD cost of best inter option */
-                if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
+                refMasks[0] = allSplitRefs;
+
+                if (m_param->limitReferences & X265_REF_LIMIT_CU)
                 {
-                    uint32_t numPU = bestInter->cu.getNumPartInter(0);
-                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
-                    {
-                        PredictionUnit pu(bestInter->cu, cuGeom, puIdx);
-                        motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);
-                    }
+                    CUData& cu = md.pred[PRED_2Nx2N].cu;
+                    uint32_t refMask = cu.getBestRefIdx(0);
+                    allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
                 }
 
-                if (!chooseMerge)
+                if (m_slice->m_sliceType == B_SLICE)
                 {
-                    encodeResAndCalcRdInterCU(*bestInter, cuGeom);
-                    checkBestMode(*bestInter, depth);
-
-                    /* If BIDIR is available and within 17/16 of best inter option, choose by RDO */
-                    if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&
-                        md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17)
+                    md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
+                    checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
+                    if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                     {
                         uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
                         if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
@@ -1542,20 +2111,176 @@ SplitData Analysis::compressInterCU_rd0_
                                 motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
                             }
                         encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
-                        checkBestMode(md.pred[PRED_BIDIR], depth);
+                        checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
                     }
                 }
 
-                if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||
-                    md.bestMode->sa8dCost == MAX_INT64)
+                if (!skipRectAmp)
+                {
+                    if (m_param->bEnableRectInter)
+                    {
+                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                        uint32_t threshold_2NxN, threshold_Nx2N;
+
+                        if (m_slice->m_sliceType == P_SLICE)
+                        {
+                            threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                            threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                        }
+                        else
+                        {
+                            threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
+                            threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
+                        }
+
+                        int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
+                        if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
+                        }
+
+                        if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
+                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
+                            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
+                            checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
+                        }
+
+                        if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
+                        {
+                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
+                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
+                            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
+                            checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
+                        }
+                    }
+
+                    // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
+                    if (m_slice->m_sps->maxAMPDepth > depth)
+                    {
+                        uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
+                        uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
+
+                        if (m_slice->m_sliceType == P_SLICE)
+                        {
+                            threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
+                            threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
+
+                            threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
+                            threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
+                        }
+                        else
+                        {
+                            threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
+                            threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
+                                + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
+
+                            threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
+                                + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
+                            threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
+                                + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
+                        }
+
+                        bool bHor = false, bVer = false;
+                        if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
+                            bHor = true;
+                        else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
+                            bVer = true;
+                        else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
+                        {
+                            bHor = true;
+                            bVer = true;
+                        }
+
+                        if (bHor)
+                        {
+                            int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
+                            if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% top */
+                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
+                            }
+
+                            if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
+                            {
+                                refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
+                                refMasks[1] = allSplitRefs;                                    /* 75% bot */
+                                md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
+                                checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
+                            }
+
+                            if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% top */
+                                refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
+                                md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
+                                checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
+                            }
+                        }
+
+                        if (bVer)
+                        {
+                            int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
+                            if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
+                            }
+
+                            if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
+                            {
+                                refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
+                                refMasks[1] = allSplitRefs;                                    /* 75% right */
+                                md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
+                                checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
+                            }
+
+                            if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
+                            {
+                                refMasks[0] = allSplitRefs;                                    /* 75% left  */
+                                refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
+                                md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                                checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
+                                checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
+                            }
+                        }
+                    }
+                }
+
+                if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
                 {
                     if (!m_param->limitReferences || splitIntra)
                     {
                         ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
                         md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
-                        encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
+                        checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
                         checkBestMode(md.pred[PRED_INTRA], depth);
+
+                        if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
+                        {
+                            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                            checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
+                            checkBestMode(md.pred[PRED_INTRA_NxN], depth);
+                        }
                     }
                     else
                     {
@@ -1563,669 +2288,112 @@ SplitData Analysis::compressInterCU_rd0_
                     }
                 }
             }
+
+            if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
+            {
+                uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
+
+                for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                {
+                    PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                    motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
+                }
+                encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
+            }
+            if (m_bTryLossless)
+                tryLossless(cuGeom);
+
+            if (mightSplit)
+                addSplitFlagCost(*md.bestMode, cuGeom.depth);
+        }
+
+        if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
+        {
+            if (mightNotSplit)
+            {
+                CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
+                int8_t maxTUDepth = -1;
+                for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
+                    maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
+                ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
+            }
+        }
+
+        /* compare split RD cost against best cost */
+        if (mightSplit && !skipRecursion)
+            checkBestMode(md.pred[PRED_SPLIT], depth);
+
+        if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
+        {
+            int cuIdx = (cuGeom.childOffset - 1) / 3;
+            cacheCost[cuIdx] = md.bestMode->rdCost;
+        }
+
+        /* determine which motion references the parent CU should search */
+        splitCUData.initSplitCUData();
+        if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
+        {
+            if (md.bestMode == &md.pred[PRED_SPLIT])
+                splitCUData.splitRefs = allSplitRefs;
             else
             {
-                /* SA8D choice between merge/skip, inter, bidir, and intra */
-                if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)
-                    md.bestMode = bestInter;
-
-                if (m_slice->m_sliceType == B_SLICE &&
-                    md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)
-                    md.bestMode = &md.pred[PRED_BIDIR];
-
-                if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64)
-                {
-                    if (!m_param->limitReferences || splitIntra)
-                    {
-                        ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
-                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
-                        if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
-                            md.bestMode = &md.pred[PRED_INTRA];
-                    }
-                    else
-                    {
-                        ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
-                    }
-                }
-
-                /* finally code the best mode selected by SA8D costs:
-                 * RD level 2 - fully encode the best mode
-                 * RD level 1 - generate recon pixels
-                 * RD level 0 - generate chroma prediction */
-                if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)
-                {
-                    /* prediction already generated for this CU, and if rd level
-                     * is not 0, it is already fully encoded */
-                }
-                else if (md.bestMode->cu.isInter(0))
-                {
-                    uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
-                    if (m_csp != X265_CSP_I400)
-                    {
-                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
-                        {
-                            PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
-                            motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
-                        }
-                    }
-                    if (m_param->rdLevel == 2)
-                        encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
-                    else if (m_param->rdLevel == 1)
-                    {
-                        /* generate recon pixels with no rate distortion considerations */
-                        CUData& cu = md.bestMode->cu;
-
-                        uint32_t tuDepthRange[2];
-                        cu.getInterTUQtDepthRange(tuDepthRange, 0);
-                        m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);
-                        residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
-                        if (cu.getQtRootCbf(0))
-                            md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);
-                        else
-                        {
-                            md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);
-                            if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)
-                                cu.setPredModeSubParts(MODE_SKIP);
-                        }
-                    }
-                }
+                /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
+                CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
+                uint32_t numPU = cu.getNumPartInter(0);
+                for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
+                    splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
+            }
+        }
+
+        if (m_param->limitModes)
+        {
+            splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
+            splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
+            splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
+        }
+
+        /* Copy best data to encData CTU and recon */
+        md.bestMode->cu.copyToPic(depth);
+        md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
+    }
+    else
+    {
+        if (m_param->bMVType && cuGeom.numPartitions <= 16)
+        {
+            qprdRefine(parentCTU, cuGeom, qp, qp);
+
+            SplitData splitData[4];
+            splitData[0].initSplitCUData();
+            splitData[1].initSplitCUData();
+            splitData[2].initSplitCUData();
+            splitData[3].initSplitCUData();
+
+            uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
+
+            splitCUData.initSplitCUData();
+            if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
+            {
+                if (md.bestMode == &md.pred[PRED_SPLIT])
+                    splitCUData.splitRefs = allSplitRefs;
                 else
                 {
-                    if (m_param->rdLevel == 2)
-                        encodeIntraInInter(*md.bestMode, cuGeom);
-                    else if (m_param->rdLevel == 1)
-                    {
-                        /* generate recon pixels with no rate distortion considerations */
-                        CUData& cu = md.bestMode->cu;
-
-                        uint32_t tuDepthRange[2];
-                        cu.getIntraTUQtDepthRange(tuDepthRange, 0);
-
-                        residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
-                        if (m_csp != X265_CSP_I400)
-                        {
-                            getBestIntraModeChroma(*md.bestMode, cuGeom);
-                            residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
-                        }
-                        md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
-                    }
+                    /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
+                    CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
+                    uint32_t numPU = cu.getNumPartInter(0);
+                    for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
+                        splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
                 }
             }
-        } // !earlyskip
-
-        if (m_bTryLossless)
-            tryLossless(cuGeom);
-
-        if (mightSplit)
-            addSplitFlagCost(*md.bestMode, cuGeom.depth);
-    }
-
-    if (mightSplit && !skipRecursion)
-    {
-        Mode* splitPred = &md.pred[PRED_SPLIT];
-        if (!md.bestMode)
-            md.bestMode = splitPred;
-        else if (m_param->rdLevel > 1)
-            checkBestMode(*splitPred, cuGeom.depth);
-        else if (splitPred->sa8dCost < md.bestMode->sa8dCost)
-            md.bestMode = splitPred;
-
-        checkDQPForSplitPred(*md.bestMode, cuGeom);
-    }
-
-    /* determine which motion references the parent CU should search */
-    SplitData splitCUData;
-    splitCUData.initSplitCUData();
-
-    if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
-    {
-        if (md.bestMode == &md.pred[PRED_SPLIT])
-            splitCUData.splitRefs = allSplitRefs;
-        else 
-        {
-            /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
-            CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
-            uint32_t numPU = cu.getNumPartInter(0);
-            for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
-                splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
-        }
-    }
-
-    if (m_param->limitModes)
-    {
-        splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
-        splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
-        splitCUData.sa8dCost    = md.pred[PRED_2Nx2N].sa8dCost;
-    }
-    
-    if (mightNotSplit && md.bestMode->cu.isSkipped(0))
-    {
-        FrameData& curEncData = *m_frame->m_encData;
-        FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];
-        uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];
-        cuStat.count[depth] += 1;
-        cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];
-    }
-
-    /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
-    if (m_param->rdLevel)
-        md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
-
-    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
-    {
-        if (mightNotSplit)
-        {
-            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
-            int8_t maxTUDepth = -1;
-            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
-                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
-            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
-        }
-    }
-
-    return splitCUData;
-}
-
-SplitData Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
-{
-    if (parentCTU.m_vbvAffected && !calculateQpforCuSize(parentCTU, cuGeom, 1))
-        return compressInterCU_rd0_4(parentCTU, cuGeom, qp);
-
-    uint32_t depth = cuGeom.depth;
-    ModeDepth& md = m_modeDepth[depth];
-    md.bestMode = NULL;
-
-    if (m_param->searchMethod == X265_SEA)
-    {
-        int numPredDir = m_slice->isInterP() ? 1 : 2;
-        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
-        for (int list = 0; list < numPredDir; list++)
-            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)
-                for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
-                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;
-    }
-
-    bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
-    bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
-    bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
-    bool skipRecursion = false;
-    bool skipModes = false;
-    bool splitIntra = true;
-    bool skipRectAmp = false;
-    bool bCtuInfoCheck = false;
-    int sameContentRef = 0;
-
-    if (m_evaluateInter == 1)
-    {
-        skipRectAmp = !!md.bestMode;
-        mightSplit &= false;
-    }
-
-    // avoid uninitialize value in below reference
-    if (m_param->limitModes)
-    {
-        md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0
-        md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1
-        md.pred[PRED_2Nx2N].rdCost = 0;
-    }
-
-    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
-        m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);
-
-    SplitData splitData[4];
-    splitData[0].initSplitCUData();
-    splitData[1].initSplitCUData();
-    splitData[2].initSplitCUData();
-    splitData[3].initSplitCUData();
-    uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
-    uint32_t refMasks[2];
-    if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx])
-    {
-        if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])
-            sameContentRef = findSameContentRefCount(parentCTU, cuGeom);
-        if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx])
-        {
-            mightNotSplit &= bDecidedDepth;
-            bCtuInfoCheck = skipRecursion = false;
-            skipModes = true;
-        }
-        else if (mightNotSplit && bDecidedDepth)
-        {
-            if (m_additionalCtuInfo[cuGeom.absPartIdx])
+
+            if (m_param->limitModes)
             {
-                bCtuInfoCheck = skipRecursion = true;
-                refMasks[0] = allSplitRefs;
-                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-                checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
-                if (!sameContentRef)
-                {
-                    if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth))
-                    {
-                        qp -= int32_t(0.04 * qp);
-                        setLambdaFromQP(parentCTU, qp);
-                    }
-                    if (m_param->bCTUInfo & 4)
-                        skipModes = false;
-                }
-                if (sameContentRef || (!sameContentRef && !(m_param->bCTUInfo & 4)))
-                {
-                    if (m_param->rdLevel)
-                        skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);
-                    if ((m_param->bCTUInfo & 4) && sameContentRef)
-                        skipModes = md.bestMode && true;
-                }
-            }
-            else
-            {
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-                skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
-                refMasks[0] = allSplitRefs;
-                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-                checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
-            }
-            mightSplit &= !bDecidedDepth;
-        }
-    }
-    if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
-    {
-        if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx])
-        {
-            if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
-            {
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-                skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
-                refMasks[0] = allSplitRefs;
-                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-                checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
-
-                if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
-                    skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
-            }
-            if (m_param->analysisReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N)
-                skipRectAmp = true && !!md.bestMode;
-        }
-    }
-
-    if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_multipassAnalysis)
-    {
-        if (mightNotSplit && depth == m_multipassDepth[cuGeom.absPartIdx])
-        {
-            if (m_multipassModes[cuGeom.absPartIdx] == MODE_SKIP)
-            {
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-
-                skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
-                refMasks[0] = allSplitRefs;
-                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-                checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
-
-                if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
-                    skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
+                splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
+                splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
+                splitCUData.sa8dCost = md.pred[PRED_2Nx2N].rdCost;
             }
         }
     }
 
-    /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
-    if (mightNotSplit && !md.bestMode && !bCtuInfoCheck)
-    {
-        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
-        skipModes = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
-        refMasks[0] = allSplitRefs;
-        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-        checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
-        checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
-
-        if (m_param->bEnableRecursionSkip && depth && m_modeDepth[depth - 1].bestMode)
-            skipRecursion = md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
-    }
-
-    // estimate split cost
-    /* Step 2. Evaluate each of the 4 split sub-blocks in series */
-    if (mightSplit && !skipRecursion)
-    {
-        if (bCtuInfoCheck && m_param->bCTUInfo & 2)
-            qp = int((1 / 0.96) * qp + 0.5);
-        Mode* splitPred = &md.pred[PRED_SPLIT];
-        splitPred->initCosts();
-        CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
-
-        uint32_t nextDepth = depth + 1;
-        ModeDepth& nd = m_modeDepth[nextDepth];
-        invalidateContexts(nextDepth);
-        Entropy* nextContext = &m_rqt[depth].cur;
-        int nextQP = qp;
-        splitIntra = false;
-
-        for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
-        {
-            const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
-            if (childGeom.flags & CUGeom::PRESENT)
-            {
-                m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
-                m_rqt[nextDepth].cur.load(*nextContext);
-
-                if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
-                    nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
-
-                splitData[subPartIdx] = compressInterCU_rd5_6(parentCTU, childGeom, nextQP);
-
-                // Save best CU and pred data for this sub CU
-                splitIntra |= nd.bestMode->cu.isIntra(0);
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
-                splitPred->addSubCosts(*nd.bestMode);
-                nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
-                nextContext = &nd.bestMode->contexts;
-            }
-            else
-            {
-                splitCU->setEmptyPart(childGeom, subPartIdx);
-            }
-        }
-        nextContext->store(splitPred->contexts);
-        if (mightNotSplit)
-            addSplitFlagCost(*splitPred, cuGeom.depth);
-        else
-            updateModeCost(*splitPred);
-
-        checkDQPForSplitPred(*splitPred, cuGeom);
-    }
-
-    /* Split CUs
-     *   0  1
-     *   2  3 */
-    allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;
-    /* Step 3. Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */
-    if (mightNotSplit)
-    {
-        if (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)
-            setLambdaFromQP(parentCTU, qp);
-
-        if (!skipModes)
-        {
-            refMasks[0] = allSplitRefs;
-
-            if (m_param->limitReferences & X265_REF_LIMIT_CU)
-            {
-                CUData& cu = md.pred[PRED_2Nx2N].cu;
-                uint32_t refMask = cu.getBestRefIdx(0);
-                allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;
-            }
-
-            if (m_slice->m_sliceType == B_SLICE)
-            {
-                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
-                checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
-                if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
-                {
-                    uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);
-                    if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)
-                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
-                        {
-                            PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);
-                            motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);
-                        }
-                    encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);
-                    checkBestMode(md.pred[PRED_BIDIR], cuGeom.depth);
-                }
-            }
-
-            if (!skipRectAmp)
-            {
-                if (m_param->bEnableRectInter)
-                {
-                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
-                    uint32_t threshold_2NxN, threshold_Nx2N;
-
-                    if (m_slice->m_sliceType == P_SLICE)
-                    {
-                        threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];
-                        threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                    }
-                    else
-                    {
-                        threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
-                                       + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                        threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
-                                       + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                    }
-
-                    int try_2NxN_first = threshold_2NxN < threshold_Nx2N;
-                    if (try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
-                    {
-                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
-                        checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
-                    }
-
-                    if (splitCost < md.bestMode->rdCost + threshold_Nx2N)
-                    {
-                        refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
-                        refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
-                        md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
-                        checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
-                    }
-
-                    if (!try_2NxN_first && splitCost < md.bestMode->rdCost + threshold_2NxN)
-                    {
-                        refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
-                        refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
-                        checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
-                    }
-                }
-
-                // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
-                if (m_slice->m_sps->maxAMPDepth > depth)
-                {
-                    uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;
-                    uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;
-
-                    if (m_slice->m_sliceType == P_SLICE)
-                    {
-                        threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];
-                        threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];
-
-                        threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];
-                        threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];
-                    }
-                    else
-                    {
-                        threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]
-                                        + splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;
-                        threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]
-                                        + splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-
-                        threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]
-                                        + splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;
-                        threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]
-                                        + splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;
-                    }
-
-                    bool bHor = false, bVer = false;
-                    if (md.bestMode->cu.m_partSize[0] == SIZE_2NxN)
-                        bHor = true;
-                    else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N)
-                        bVer = true;
-                    else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0])
-                    {
-                        bHor = true;
-                        bVer = true;
-                    }
-
-                    if (bHor)
-                    {
-                        int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;
-                        if (try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% top */
-                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                            checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
-                        }
-
-                        if (splitCost < md.bestMode->rdCost + threshold_2NxnU)
-                        {
-                            refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
-                            refMasks[1] = allSplitRefs;                                    /* 75% bot */
-                            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
-                            checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
-                        }
-
-                        if (!try_2NxnD_first && splitCost < md.bestMode->rdCost + threshold_2NxnD)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% top */
-                            refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
-                            checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
-                        }
-                    }
-
-                    if (bVer)
-                    {
-                        int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;
-                        if (try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                            checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
-                        }
-
-                        if (splitCost < md.bestMode->rdCost + threshold_nLx2N)
-                        {
-                            refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
-                            refMasks[1] = allSplitRefs;                                    /* 75% right */
-                            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
-                            checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
-                        }
-
-                        if (!try_nRx2N_first && splitCost < md.bestMode->rdCost + threshold_nRx2N)
-                        {
-                            refMasks[0] = allSplitRefs;                                    /* 75% left  */
-                            refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
-                            checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
-                            checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
-                        }
-                    }
-                }
-            }
-
-            if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE) && !((m_param->bCTUInfo & 4) && bCtuInfoCheck))
-            {
-                if (!m_param->limitReferences || splitIntra)
-                {
-                    ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
-                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
-                    checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
-                    checkBestMode(md.pred[PRED_INTRA], depth);
-
-                    if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
-                    {
-                        md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
-                        checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
-                        checkBestMode(md.pred[PRED_INTRA_NxN], depth);
-                    }
-                }
-                else
-                {
-                    ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);
-                }
-            }
-        }
-
-        if ((md.bestMode->cu.isInter(0) && !(md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N)) && (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400))
-        {
-            uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
-
-            for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
-            {
-                PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
-                motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, m_csp != X265_CSP_I400);
-            }
-            encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
-        }
-        if (m_bTryLossless)
-            tryLossless(cuGeom);
-
-        if (mightSplit)
-            addSplitFlagCost(*md.bestMode, cuGeom.depth);
-    }
-
-    if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
-    {
-        if (mightNotSplit)
-        {
-            CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
-            int8_t maxTUDepth = -1;
-            for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
-                maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
-            ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
-        }
-    }
-
-    /* compare split RD cost against best cost */
-    if (mightSplit && !skipRecursion)
-        checkBestMode(md.pred[PRED_SPLIT], depth);
-
-    if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
-    {
-        int cuIdx = (cuGeom.childOffset - 1) / 3;
-        cacheCost[cuIdx] = md.bestMode->rdCost;
-    }
-
-       /* determine which motion references the parent CU should search */
-    SplitData splitCUData;
-    splitCUData.initSplitCUData();
-    if (m_param->limitReferences & X265_REF_LIMIT_DEPTH)
-    {
-        if (md.bestMode == &md.pred[PRED_SPLIT])
-            splitCUData.splitRefs = allSplitRefs;
-        else
-        {
-            /* use best merge/inter mode, in case of intra use 2Nx2N inter references */
-            CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;
-            uint32_t numPU = cu.getNumPartInter(0);
-            for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))
-                splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);
-        }
-    }
-
-    if (m_param->limitModes)
-    {
-        splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0
-        splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1
-        splitCUData.sa8dCost    = md.pred[PRED_2Nx2N].rdCost;
-    }
-
-    /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
-    md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
-
     return splitCUData;
 }
 
@@ -2240,8 +2408,7 @@ void Analysis::recodeCU(const CUData& pa
     bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
     bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
 
-    int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1)
-                && bDecidedDepth && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP);
+    int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth);
 
     if (bDecidedDepth)
     {
@@ -2251,23 +2418,25 @@ void Analysis::recodeCU(const CUData& pa
         md.bestMode = &mode;
         mode.cu.initSubCU(parentCTU, cuGeom, qp);
         PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
-        if (parentCTU.isIntra(cuGeom.absPartIdx))
+        if (parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2)
         {
-            if (m_param->intraRefine != 2 || parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] <= 1)
+            bool reuseModes = !((m_param->intraRefine == 3) ||
+                                (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
+            if (reuseModes)
             {
                 memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
                 memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
             }
             checkIntra(mode, cuGeom, size);
         }
-        else
+        else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2)
         {
             mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
             uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx);
             for (uint32_t part = 0; part < numPU; part++)
             {
                 PredictionUnit pu(mode.cu, cuGeom, part);
-                if (m_param->analysisReuseLevel == 10)
+                if (m_param->analysisReuseLevel >= 7)
                 {
                     analysis_inter_data* interDataCTU = (analysis_inter_data*)m_frame->m_analysisData.interData;
                     int cuIdx = (mode.cu.m_cuAddr * parentCTU.m_numPartitions) + cuGeom.absPartIdx;
@@ -2328,19 +2497,39 @@ void Analysis::recodeCU(const CUData& pa
                 checkDQP(mode, cuGeom);
         }
 
-        if (m_bTryLossless)
-            tryLossless(cuGeom);
-
-        if (mightSplit)
-            addSplitFlagCost(*md.bestMode, cuGeom.depth);
-
-        if (mightSplit && m_param->rdLevel < 5)
-            checkDQPForSplitPred(*md.bestMode, cuGeom);
-
-        if (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0))
+        if (m_param->interRefine < 2)
+        {
+            if (m_bTryLossless)
+                tryLossless(cuGeom);
+
+            if (mightSplit)
+                addSplitFlagCost(*md.bestMode, cuGeom.depth);
+
+            if (mightSplit && m_param->rdLevel < 5)
+                checkDQPForSplitPred(*md.bestMode, cuGeom);
+        }
+
+        if (m_param->bMVType && m_param->analysisReuseLevel == 7)
+        {
+            for (int list = 0; list < m_slice->isInterB() + 1; list++)
+            {
+                m_modeFlag[list] = true;
+                if (parentCTU.m_skipFlag[list][cuGeom.absPartIdx] == 1 && cuGeom.numPartitions <= 16)
+                    m_checkMergeAndSkipOnly[list] = true;
+            }
+            m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
+            for (int list = 0; list < m_slice->isInterB() + 1; list++)
+            {
+                m_modeFlag[list] = false;
+                m_checkMergeAndSkipOnly[list] = false;
+            }
+        }
+
+        if (m_param->interRefine > 1 || (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP  && !mode.cu.isSkipped(0)))
         {
             m_evaluateInter = 1;
             m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp);
+            m_evaluateInter = 0;
         }
     }
     if (!bDecidedDepth || split)
@@ -2369,7 +2558,7 @@ void Analysis::recodeCU(const CUData& pa
                 if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
                     nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
 
-                int lamdaQP = m_param->analysisReuseLevel == 10 ? nextQP : lqp;
+                int lamdaQP = (m_param->analysisReuseLevel >= 7) ? nextQP : lqp;
 
                 if (split)
                     m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, childGeom, nextQP) : compressInterCU_rd0_4(parentCTU, childGeom, nextQP);
--- a/source/encoder/analysis.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/analysis.h	Tue Nov 21 09:50:45 2017 +0530
@@ -110,6 +110,9 @@ public:
     bool      m_bChromaSa8d;
     bool      m_bHD;
 
+    bool      m_modeFlag[2];
+    bool      m_checkMergeAndSkipOnly[2];
+
     Analysis();
 
     bool create(ThreadLocalData* tld);
@@ -145,7 +148,7 @@ protected:
     void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp);
 
     /* full analysis for an I-slice CU */
-    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    uint64_t compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
 
     /* full analysis for a P or B slice CU */
     uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
--- a/source/encoder/api.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/api.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -30,7 +30,6 @@
 #include "level.h"
 #include "nal.h"
 #include "bitcost.h"
-#include "x265-extras.h"
 
 /* multilib namespace reflectors */
 #if LINKED_8BIT
@@ -63,6 +62,14 @@ extern "C" {
 namespace X265_NS {
 #endif
 
+static const char* summaryCSVHeader =
+    "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
+    "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
+    "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
+    "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
+    "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
+    "MaxCLL, MaxFALL, Version\n";
+
 x265_encoder *x265_encoder_open(x265_param *p)
 {
     if (!p)
@@ -120,7 +127,7 @@ x265_encoder *x265_encoder_open(x265_par
     /* Try to open CSV file handle */
     if (encoder->m_param->csvfn)
     {
-        encoder->m_param->csvfpt = x265_csvlog_open(*encoder->m_param, encoder->m_param->csvfn, encoder->m_param->csvLogLevel);
+        encoder->m_param->csvfpt = x265_csvlog_open(encoder->m_param);
         if (!encoder->m_param->csvfpt)
         {
             x265_log(encoder->m_param, X265_LOG_ERROR, "Unable to open CSV log file <%s>, aborting\n", encoder->m_param->csvfn);
@@ -188,7 +195,10 @@ int x265_encoder_reconfig(x265_encoder* 
 
     x265_param save;
     Encoder* encoder = static_cast<Encoder*>(enc);
-    if (encoder->m_reconfigure || encoder->m_reconfigureRc) /* Reconfigure in progress */
+    if (encoder->m_latestParam->forceFlush != param_in->forceFlush)
+        return encoder->reconfigureParam(encoder->m_latestParam, param_in);
+    bool isReconfigureRc = encoder->isReconfigureRc(encoder->m_latestParam, param_in);
+    if ((encoder->m_reconfigure && !isReconfigureRc) || (encoder->m_reconfigureRc && isReconfigureRc)) /* Reconfigure in progress */
         return 1;
     memcpy(&save, encoder->m_latestParam, sizeof(x265_param));
     int ret = encoder->reconfigureParam(encoder->m_latestParam, param_in);
@@ -205,16 +215,22 @@ int x265_encoder_reconfig(x265_encoder* 
             if (encoder->m_param->bRepeatHeaders)
             {
                 if (encoder->m_scalingList.parseScalingList(encoder->m_latestParam->scalingLists))
+                {
+                    memcpy(encoder->m_latestParam, &save, sizeof(x265_param));
                     return -1;
+                }
                 encoder->m_scalingList.setupQuantMatrices(encoder->m_param->internalCsp);
             }
             else
             {
                 x265_log(encoder->m_param, X265_LOG_ERROR, "Repeat headers is turned OFF, cannot reconfigure scalinglists\n");
+                memcpy(encoder->m_latestParam, &save, sizeof(x265_param));
                 return -1;
             }
         }
-        if (encoder->m_reconfigureRc)
+        if (!isReconfigureRc)
+            encoder->m_reconfigure = true;
+        else if (encoder->m_reconfigureRc)
         {
             VPS saveVPS;
             memcpy(&saveVPS.ptl, &encoder->m_vps.ptl, sizeof(saveVPS.ptl));
@@ -225,11 +241,11 @@ int x265_encoder_reconfig(x265_encoder* 
                 x265_log(encoder->m_param, X265_LOG_WARNING, "Profile/Level/Tier has changed from %d/%d/%s to %d/%d/%s.Cannot reconfigure rate-control.\n",
                          saveVPS.ptl.profileIdc, saveVPS.ptl.levelIdc, saveVPS.ptl.tierFlag ? "High" : "Main", encoder->m_vps.ptl.profileIdc,
                          encoder->m_vps.ptl.levelIdc, encoder->m_vps.ptl.tierFlag ? "High" : "Main");
+                memcpy(encoder->m_latestParam, &save, sizeof(x265_param));
+                memcpy(&encoder->m_vps.ptl, &saveVPS.ptl, sizeof(saveVPS.ptl));
                 encoder->m_reconfigureRc = false;
             }
         }
-        else
-            encoder->m_reconfigure = true;
         encoder->printReconfigureParams();
     }
     return ret;
@@ -248,7 +264,9 @@ int x265_encoder_encode(x265_encoder *en
     {
         numEncoded = encoder->encode(pic_in, pic_out);
     }
-    while (numEncoded == 0 && !pic_in && encoder->m_numDelayedPic);
+    while ((numEncoded == 0 && !pic_in && encoder->m_numDelayedPic && !encoder->m_latestParam->forceFlush) && !encoder->m_externalFlush);
+    if (numEncoded)
+        encoder->m_externalFlush = false;
 
     // do not allow reuse of these buffers for more than one picture. The
     // encoder now owns these analysisData buffers.
@@ -269,7 +287,7 @@ int x265_encoder_encode(x265_encoder *en
         *pi_nal = 0;
 
     if (numEncoded && encoder->m_param->csvLogLevel)
-        x265_csvlog_frame(encoder->m_param->csvfpt, *encoder->m_param, *pic_out, encoder->m_param->csvLogLevel);
+        x265_csvlog_frame(encoder->m_param, pic_out);
 
     if (numEncoded < 0)
         encoder->m_aborted = true;
@@ -292,11 +310,8 @@ void x265_encoder_log(x265_encoder* enc,
     {
         Encoder *encoder = static_cast<Encoder*>(enc);
         x265_stats stats;
-        int padx = encoder->m_sps.conformanceWindow.rightOffset;
-        int pady = encoder->m_sps.conformanceWindow.bottomOffset;
         encoder->fetchStats(&stats, sizeof(stats));
-        const x265_api * api = x265_api_get(0);
-        x265_csvlog_encode(encoder->m_param->csvfpt, api->version_str, *encoder->m_param, padx, pady, stats, encoder->m_param->csvLogLevel, argc, argv);
+        x265_csvlog_encode(enc, &stats, argc, argv);
     }
 }
 
@@ -331,6 +346,37 @@ int x265_encoder_ctu_info(x265_encoder *
     return 0;
 }
 
+int x265_get_slicetype_poc_and_scenecut(x265_encoder *enc, int *slicetype, int *poc, int *sceneCut)
+{
+    /* 0 when the encoder filled the outputs; -1 for a NULL handle or a non-zero encoder result */
+    if (!enc)
+        return -1;
+    Encoder *impl = static_cast<Encoder*>(enc);
+    const int status = impl->copySlicetypePocAndSceneCut(slicetype, poc, sceneCut);
+    return status ? -1 : 0;
+}
+
+int x265_get_ref_frame_list(x265_encoder *enc, x265_picyuv** l0, x265_picyuv** l1, int sliceType, int poc)
+{
+    /* A NULL handle yields -1; otherwise the lists are cast to the internal PicYuv
+     * type and the result of Encoder::getRefFrameList is returned unchanged. */
+    PicYuv** list0 = (PicYuv**)l0;
+    PicYuv** list1 = (PicYuv**)l1;
+    return enc ? static_cast<Encoder*>(enc)->getRefFrameList(list0, list1, sliceType, poc) : -1;
+}
+
+int x265_set_analysis_data(x265_encoder *enc, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes)
+{
+    /* Hands caller-supplied analysis data for picture 'poc' to the encoder.
+     * Returns 0 when Encoder::setAnalysisData reports 0; -1 for a NULL handle
+     * or any non-zero encoder result. */
+    if (!enc)
+        return -1;
+
+    Encoder *impl = static_cast<Encoder*>(enc);
+    return impl->setAnalysisData(analysis_data, poc, cuBytes) ? -1 : 0;
+}
+
 void x265_cleanup(void)
 {
     BitCost::destroy();
@@ -352,7 +398,7 @@ void x265_picture_init(x265_param *param
     pic->userSEI.payloads = NULL;
     pic->userSEI.numPayloads = 0;
 
-    if (param->analysisReuseMode)
+    if (param->analysisReuseMode || (param->bMVType == AVC_INFO))
     {
         uint32_t widthInCU = (param->sourceWidth + param->maxCUSize - 1) >> param->maxLog2CUSize;
         uint32_t heightInCU = (param->sourceHeight + param->maxCUSize - 1) >> param->maxLog2CUSize;
@@ -404,6 +450,13 @@ static const x265_api libapi =
     sizeof(x265_frame_stats),
     &x265_encoder_intra_refresh,
     &x265_encoder_ctu_info,
+    &x265_get_slicetype_poc_and_scenecut,
+    &x265_get_ref_frame_list,
+    &x265_csvlog_open,
+    &x265_csvlog_frame,
+    &x265_csvlog_encode,
+    &x265_dither_image,
+    &x265_set_analysis_data
 };
 
 typedef const x265_api* (*api_get_func)(int bitDepth);
@@ -598,4 +651,422 @@ const x265_api* x265_api_query(int bitDe
     return &libapi;
 }
 
+FILE* x265_csvlog_open(const x265_param* param)
+{
+    FILE *csvfp = x265_fopen(param->csvfn, "r"); /* probe: an openable file is appended to, otherwise a new one gets a header */
+    if (csvfp)
+    {
+        /* file already exists: close the probe handle and re-open for append (no header is written) */
+        fclose(csvfp);
+        return x265_fopen(param->csvfn, "ab");
+    }
+    else
+    {
+        /* new CSV file: create it and write the column header row */
+        csvfp = x265_fopen(param->csvfn, "wb");
+        if (csvfp)
+        {
+            if (param->csvLogLevel) /* per-frame log: column order mirrors the rows written by x265_csvlog_frame */
+            {
+                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
+                if (param->csvLogLevel >= 2)
+                    fprintf(csvfp, "I/P cost ratio, ");
+                if (param->rc.rateControlMode == X265_RC_CRF)
+                    fprintf(csvfp, "RateFactor, ");
+                if (param->rc.vbvBufferSize)
+                    fprintf(csvfp, "BufferFill, ");
+                if (param->bEnablePsnr)
+                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
+                if (param->bEnableSsim)
+                    fprintf(csvfp, "SSIM, SSIM(dB), ");
+                fprintf(csvfp, "Latency, ");
+                fprintf(csvfp, "List 0, List 1");
+                uint32_t size = param->maxCUSize; /* halved once per depth in each of the loops below */
+                for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
+                    size /= 2;
+                }
+                fprintf(csvfp, ", 4x4");
+                size = param->maxCUSize;
+                if (param->bEnableRectInter)
+                {
+                    for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+                    {
+                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
+                        if (param->bEnableAMP)
+                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
+                        size /= 2;
+                    }
+                }
+                else
+                {
+                    for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+                    {
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
+                        size /= 2;
+                    }
+                }
+                size = param->maxCUSize;
+                for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Skip %dx%d", size, size);
+                    size /= 2;
+                }
+                size = param->maxCUSize;
+                for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+                {
+                    fprintf(csvfp, ", Merge %dx%d", size, size);
+                    size /= 2;
+                }
+
+                if (param->csvLogLevel >= 2)
+                {
+                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Residual Energy,"
+                        " Min Luma Level, Max Luma Level, Avg Luma Level");
+
+                    if (param->internalCsp != X265_CSP_I400)
+                        fprintf(csvfp, ", Min Cb Level, Max Cb Level, Avg Cb Level, Min Cr Level, Max Cr Level, Avg Cr Level");
+
+                    /* PU statistics: one column group per size step, halving from maxCUSize */
+                    size = param->maxCUSize;
+                    for (uint32_t i = 0; i< param->maxLog2CUSize - (uint32_t)g_log2Size[param->minCUSize] + 1; i++)
+                    {
+                        fprintf(csvfp, ", Intra %dx%d", size, size);
+                        fprintf(csvfp, ", Skip %dx%d", size, size);
+                        fprintf(csvfp, ", AMP %d", size);
+                        fprintf(csvfp, ", Inter %dx%d", size, size);
+                        fprintf(csvfp, ", Merge %dx%d", size, size);
+                        fprintf(csvfp, ", Inter %dx%d", size, size / 2);
+                        fprintf(csvfp, ", Merge %dx%d", size, size / 2);
+                        fprintf(csvfp, ", Inter %dx%d", size / 2, size);
+                        fprintf(csvfp, ", Merge %dx%d", size / 2, size);
+                        size /= 2;
+                    }
+
+                    if ((uint32_t)g_log2Size[param->minCUSize] == 3)
+                        fprintf(csvfp, ", 4x4");
+
+                    /* detailed performance statistics */
+                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms),"
+                        "Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks");
+                }
+                fprintf(csvfp, "\n");
+            }
+            else
+                fputs(summaryCSVHeader, csvfp); /* csvLogLevel == 0: summary-only header */
+        }
+        return csvfp; /* result of x265_fopen as-is; the header is written only when it is non-NULL */
+    }
+}
+
+// Per-frame CSV logging: appends one row built from pic->frameData to param->csvfpt (no-op when that file is not open).
+void x265_csvlog_frame(const x265_param* param, const x265_picture* pic)
+{
+    if (!param->csvfpt)
+        return;
+
+    const x265_frame_stats* frameStats = &pic->frameData;
+    fprintf(param->csvfpt, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc,
+                                                                   frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
+    if (param->csvLogLevel >= 2)
+        fprintf(param->csvfpt, "%.2f,", frameStats->ipCostRatio);
+    if (param->rc.rateControlMode == X265_RC_CRF)
+        fprintf(param->csvfpt, "%.3lf,", frameStats->rateFactor);
+    if (param->rc.vbvBufferSize)
+        fprintf(param->csvfpt, "%.3lf,", frameStats->bufferFill);
+    if (param->bEnablePsnr)
+        fprintf(param->csvfpt, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
+    if (param->bEnableSsim)
+        fprintf(param->csvfpt, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
+    fprintf(param->csvfpt, "%d, ", frameStats->frameLatency);
+    if (frameStats->sliceType == 'I' || frameStats->sliceType == 'i')
+        fputs(" -, -,", param->csvfpt);
+    else
+    {
+        int i = 0; /* both POC lists are terminated by -1 */
+        while (frameStats->list0POC[i] != -1)
+            fprintf(param->csvfpt, "%d ", frameStats->list0POC[i++]);
+        fprintf(param->csvfpt, ",");
+        if (frameStats->sliceType != 'P')
+        {
+            i = 0;
+            while (frameStats->list1POC[i] != -1)
+                fprintf(param->csvfpt, "%d ", frameStats->list1POC[i++]);
+            fprintf(param->csvfpt, ",");
+        }
+        else
+            fputs(" -,", param->csvfpt);
+    }
+
+    if (param->csvLogLevel)
+    {
+        for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+            fprintf(param->csvfpt, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0],
+                                                                    frameStats->cuStats.percentIntraDistribution[depth][1],
+                                                                    frameStats->cuStats.percentIntraDistribution[depth][2]);
+        fprintf(param->csvfpt, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
+        if (param->bEnableRectInter)
+        {
+            for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+            {
+                fprintf(param->csvfpt, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0],
+                                                               frameStats->cuStats.percentInterDistribution[depth][1]);
+                if (param->bEnableAMP)
+                    fprintf(param->csvfpt, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
+            }
+        }
+        else
+        {
+            for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+                fprintf(param->csvfpt, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
+        }
+        for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+            fprintf(param->csvfpt, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
+        for (uint32_t depth = 0; depth <= param->maxCUDepth; depth++)
+            fprintf(param->csvfpt, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
+    }
+
+    if (param->csvLogLevel >= 2)
+    {
+        fprintf(param->csvfpt, ", %.2lf, %.2lf, %.2lf, %.2lf ", frameStats->avgLumaDistortion,
+                                                                frameStats->avgChromaDistortion,
+                                                                frameStats->avgPsyEnergy,
+                                                                frameStats->avgResEnergy);
+
+        fprintf(param->csvfpt, ", %d, %d, %.2lf", frameStats->minLumaLevel, frameStats->maxLumaLevel, frameStats->avgLumaLevel);
+
+        if (param->internalCsp != X265_CSP_I400)
+        {
+            fprintf(param->csvfpt, ", %d, %d, %.2lf", frameStats->minChromaULevel, frameStats->maxChromaULevel, frameStats->avgChromaULevel);
+            fprintf(param->csvfpt, ", %d, %d, %.2lf", frameStats->minChromaVLevel, frameStats->maxChromaVLevel, frameStats->avgChromaVLevel);
+        }
+
+        for (uint32_t i = 0; i < param->maxLog2CUSize - (uint32_t)g_log2Size[param->minCUSize] + 1; i++)
+        {
+            fprintf(param->csvfpt, ", %.2lf%%", frameStats->puStats.percentIntraPu[i]);
+            fprintf(param->csvfpt, ", %.2lf%%", frameStats->puStats.percentSkipPu[i]);
+            fprintf(param->csvfpt, ",%.2lf%%", frameStats->puStats.percentAmpPu[i]);
+            for (uint32_t j = 0; j < 3; j++)
+            {
+                fprintf(param->csvfpt, ", %.2lf%%", frameStats->puStats.percentInterPu[i][j]);
+                fprintf(param->csvfpt, ", %.2lf%%", frameStats->puStats.percentMergePu[i][j]);
+            }
+        }
+        if ((uint32_t)g_log2Size[param->minCUSize] == 3)
+            fprintf(param->csvfpt, ",%.2lf%%", frameStats->puStats.percentNxN);
+
+        fprintf(param->csvfpt, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime,
+                                                                                     frameStats->wallTime, frameStats->refWaitWallTime,
+                                                                                     frameStats->totalCTUTime, frameStats->stallTime,
+                                                                                     frameStats->totalFrameTime);
+
+        fprintf(param->csvfpt, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
+    }
+    fprintf(param->csvfpt, "\n");
+    fflush(param->csvfpt); /* flush the CSV stream this function writes, so the finished row reaches the file */
+}
+
+void x265_csvlog_encode(x265_encoder *enc, const x265_stats* stats, int argc, char** argv)
+{
+    if (enc) /* a NULL encoder handle makes the whole call a no-op */
+    {
+        Encoder *encoder = static_cast<Encoder*>(enc);
+        int padx = encoder->m_sps.conformanceWindow.rightOffset;
+        int pady = encoder->m_sps.conformanceWindow.bottomOffset;
+        const x265_api * api = x265_api_get(0);
+
+        if (!encoder->m_param->csvfpt)
+            return;
+        // Appends one summary row: options, timestamp, overall rate/quality, per-slice-type stats, version.
+        if (encoder->m_param->csvLogLevel)
+        {
+            // adding summary to a per-frame csv log file, so it needs a summary header
+            fprintf(encoder->m_param->csvfpt, "\nSummary\n");
+            fputs(summaryCSVHeader, encoder->m_param->csvfpt);
+        }
+
+        // first column: the quoted command line when argc is non-zero, else the quoted parameter string
+        if (argc)
+        {
+            fputc('"', encoder->m_param->csvfpt);
+            for (int i = 1; i < argc; i++) /* argv[0] is skipped */
+            {
+                fputc(' ', encoder->m_param->csvfpt);
+                fputs(argv[i], encoder->m_param->csvfpt);
+            }
+            fputc('"', encoder->m_param->csvfpt);
+        }
+        else
+        {
+            const x265_param* paramTemp = encoder->m_param;
+            char *opts = x265_param2string((x265_param*)paramTemp, padx, pady); /* NOTE(review): opts is never released here -- confirm ownership */
+            if (opts)
+            {
+                fputc('"', encoder->m_param->csvfpt);
+                fputs(opts, encoder->m_param->csvfpt);
+                fputc('"', encoder->m_param->csvfpt);
+            }
+        }
+
+        // current date and time
+        time_t now;
+        struct tm* timeinfo;
+        time(&now);
+        timeinfo = localtime(&now);
+        char buffer[200];
+        strftime(buffer, 128, "%c", timeinfo); /* NOTE(review): limit is 128 although buffer holds 200 bytes */
+        fprintf(encoder->m_param->csvfpt, ", %s, ", buffer);
+
+        // elapsed time, fps, bitrate
+        fprintf(encoder->m_param->csvfpt, "%.2f, %.2f, %.2f,",
+            stats->elapsedEncodeTime, stats->encodedPictureCount / stats->elapsedEncodeTime, stats->bitrate);
+
+        if (encoder->m_param->bEnablePsnr)
+            fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf, %.3lf,",
+            stats->globalPsnrY / stats->encodedPictureCount, stats->globalPsnrU / stats->encodedPictureCount,
+            stats->globalPsnrV / stats->encodedPictureCount, stats->globalPsnr);
+        else
+            fprintf(encoder->m_param->csvfpt, " -, -, -, -,");
+        if (encoder->m_param->bEnableSsim)
+            fprintf(encoder->m_param->csvfpt, " %.6f, %6.3f,", stats->globalSsim, x265_ssim2dB(stats->globalSsim));
+        else
+            fprintf(encoder->m_param->csvfpt, " -, -,");
+        // I-slice columns, or seven placeholder dashes when no I pictures were counted
+        if (stats->statsI.numPics)
+        {
+            fprintf(encoder->m_param->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsI.numPics, stats->statsI.avgQp, stats->statsI.bitrate);
+            if (encoder->m_param->bEnablePsnr)
+                fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsI.psnrY, stats->statsI.psnrU, stats->statsI.psnrV);
+            else
+                fprintf(encoder->m_param->csvfpt, " -, -, -,");
+            if (encoder->m_param->bEnableSsim)
+                fprintf(encoder->m_param->csvfpt, " %.3lf,", stats->statsI.ssim);
+            else
+                fprintf(encoder->m_param->csvfpt, " -,");
+        }
+        else
+            fprintf(encoder->m_param->csvfpt, " -, -, -, -, -, -, -,");
+        // P-slice columns, same layout
+        if (stats->statsP.numPics)
+        {
+            fprintf(encoder->m_param->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsP.numPics, stats->statsP.avgQp, stats->statsP.bitrate);
+            if (encoder->m_param->bEnablePsnr)
+                fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsP.psnrY, stats->statsP.psnrU, stats->statsP.psnrV);
+            else
+                fprintf(encoder->m_param->csvfpt, " -, -, -,");
+            if (encoder->m_param->bEnableSsim)
+                fprintf(encoder->m_param->csvfpt, " %.3lf,", stats->statsP.ssim);
+            else
+                fprintf(encoder->m_param->csvfpt, " -,");
+        }
+        else
+            fprintf(encoder->m_param->csvfpt, " -, -, -, -, -, -, -,");
+        // B-slice columns, same layout
+        if (stats->statsB.numPics)
+        {
+            fprintf(encoder->m_param->csvfpt, " %-6u, %2.2lf, %-8.2lf,", stats->statsB.numPics, stats->statsB.avgQp, stats->statsB.bitrate);
+            if (encoder->m_param->bEnablePsnr)
+                fprintf(encoder->m_param->csvfpt, " %.3lf, %.3lf, %.3lf,", stats->statsB.psnrY, stats->statsB.psnrU, stats->statsB.psnrV);
+            else
+                fprintf(encoder->m_param->csvfpt, " -, -, -,");
+            if (encoder->m_param->bEnableSsim)
+                fprintf(encoder->m_param->csvfpt, " %.3lf,", stats->statsB.ssim);
+            else
+                fprintf(encoder->m_param->csvfpt, " -,");
+        }
+        else
+            fprintf(encoder->m_param->csvfpt, " -, -, -, -, -, -, -,");
+        // maxCLL, maxFALL and the library version string end the row
+        fprintf(encoder->m_param->csvfpt, " %-6u, %-6u, %s\n", stats->maxCLL, stats->maxFALL, api->version_str);
+    }
+}
+
+/* Error-diffusion dither (based on Sierra-2-4A) of one plane of 16-bit samples down to bitDepth.
+ * The plane is converted in place (no new buffer); errors must hold at least width + 1 entries. */
+static void ditherPlane(uint16_t *src, int srcStride, int width, int height, int16_t *errors, int bitDepth)
+{
+    const int lShift = 16 - bitDepth;
+    const int rShift = 16 - bitDepth + 2;
+    const int half = (1 << (16 - bitDepth + 1));
+    const int pixelMax = (1 << bitDepth) - 1;
+
+    memset(errors, 0, (width + 1) * sizeof(int16_t)); /* cleared once per plane; values then carry from row to row */
+
+    if (bitDepth == 8)
+    {
+        for (int y = 0; y < height; y++, src += srcStride)
+        {
+            uint8_t* dst = (uint8_t *)src; /* 8-bit results are packed into the start of the same row */
+            int16_t err = 0;
+            for (int x = 0; x < width; x++)
+            {
+                err = err * 2 + errors[x] + errors[x + 1];
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
+                dst[x] = (uint8_t)tmpDst; /* written only after src[x] has been read */
+            }
+        }
+    }
+    else
+    {
+        for (int y = 0; y < height; y++, src += srcStride)
+        {
+            int16_t err = 0;
+            for (int x = 0; x < width; x++)
+            {
+                err = err * 2 + errors[x] + errors[x + 1];
+                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
+                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
+                src[x] = (uint16_t)tmpDst; /* result replaces the 16-bit sample it was derived from */
+            }
+        }
+    }
+}
+
+void x265_dither_image(x265_picture* picIn, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth)
+{
+    const x265_api* api = x265_api_get(0);
+    /* Dithers every plane of picIn in place down to bitDepth; unsupported requests are reported on stderr and ignored. */
+    if (sizeof(x265_picture) != api->sizeof_picture)
+    {
+        fprintf(stderr, "extras [error]: structure size skew, unable to dither\n");
+        return;
+    }
+
+    if (picIn->bitDepth <= 8)
+    {
+        fprintf(stderr, "extras [error]: dither support enabled only for input bitdepth > 8\n");
+        return;
+    }
+
+    if (picIn->bitDepth == bitDepth)
+    {
+        fprintf(stderr, "extras[error]: dither support enabled only if encoder depth is different from picture depth\n");
+        return;
+    }
+
+    /* This portion of code is from readFrame in x264. */
+    for (int i = 0; i < x265_cli_csps[picIn->colorSpace].planes; i++)
+    {
+        if (picIn->bitDepth < 16)
+        {
+            /* upconvert non 16bit high depth planes to 16bit, in place, before dithering */
+            uint16_t *plane = (uint16_t*)picIn->planes[i];
+            uint32_t pixelCount = x265_picturePlaneSize(picIn->colorSpace, picWidth, picHeight, i);
+            int lShift = 16 - picIn->bitDepth;
+
+            /* This loop assumes width is equal to stride which
+             * happens to be true for file reader outputs */
+            for (uint32_t j = 0; j < pixelCount; j++)
+                plane[j] = plane[j] << lShift;
+        }
+
+        int height = (int)(picHeight >> x265_cli_csps[picIn->colorSpace].height[i]);
+        int width = (int)(picWidth >> x265_cli_csps[picIn->colorSpace].width[i]);
+        /* presumably stride[] is in bytes, hence / 2 to step in 16-bit samples -- confirm */
+        ditherPlane(((uint16_t*)picIn->planes[i]), picIn->stride[i] / 2, width, height, errorBuf, bitDepth);
+    }
+}
+
 } /* end namespace or extern "C" */
--- a/source/encoder/encoder.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/encoder.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -48,6 +48,12 @@ namespace X265_NS {
 const char g_sliceTypeToChar[] = {'B', 'P', 'I'};
 }
 
+/* Threshold for motion vectors, based on experimental results.
+ * TODO: come up with an algorithm for an adaptive threshold */
+
+#define MVTHRESHOLD 10
+#define PU_2Nx2N 1
+
 static const char* defaultAnalysisFileName = "x265_analysis.dat";
 
 using namespace X265_NS;
@@ -386,9 +392,7 @@ void Encoder::create()
             }
         }
     }
-
-    m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1;
-
+    m_bZeroLatency = !m_param->bframes && !m_param->lookaheadDepth && m_param->frameNumThreads == 1 && m_param->maxSlices == 1;
     m_aborted |= parseLambdaFile(m_param);
 
     m_encodeStartTime = x265_mdate();
@@ -429,6 +433,252 @@ void Encoder::stopJobs()
     }
 }
 
+int Encoder::copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut)
+{
+    /* Copies slice type, POC and scenecut flag of the frame returned by
+     * m_picList.getCurFrame() into the caller's outputs. Returns 0 on
+     * success, or -1 (after logging a warning) when no such frame exists. */
+    Frame *curFrame = m_dpb->m_picList.getCurFrame();
+    if (curFrame == NULL)
+    {
+        x265_log(NULL, X265_LOG_WARNING, "Frame is still in lookahead pipeline, this API must be called after (poc >= lookaheadDepth + bframes + 2) condition check\n");
+        return -1;
+    }
+    *slicetype = curFrame->m_lowres.sliceType;
+    *poc = curFrame->m_encData->m_slice->m_poc;
+    *sceneCut = curFrame->m_lowres.bScenecut;
+    return 0;
+}
+
+int Encoder::getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc)
+{   // Fills l0[]/l1[] with the reconstructed pictures referenced by the frame at 'poc'; -1 for I slice types, else 0.
+    if (!(IS_X265_TYPE_I(sliceType)))
+    {
+        Frame *framePtr = m_dpb->m_picList.getPOC(poc); /* frame whose reference lists are requested */
+        if (framePtr != NULL)
+        {
+            for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[0]; j++)    // check only for --ref=n number of frames.
+            {
+                if (framePtr->m_encData->m_slice->m_refFrameList[0][j] && framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_reconPic != NULL)
+                {
+                    int l0POC = framePtr->m_encData->m_slice->m_refFrameList[0][j]->m_poc;
+                    Frame* l0Fp = m_dpb->m_picList.getPOC(l0POC); /* NOTE(review): result is dereferenced below without a NULL check */
+                    if (l0Fp->m_reconPic->m_picOrg[0] == NULL)
+                        l0Fp->m_reconEncoded.wait(); /* If recon is not ready, current frame encoder need to wait. */
+                    l0[j] = l0Fp->m_reconPic; /* entries failing the test above leave l0[j] untouched */
+                }
+            }
+            for (int j = 0; j < framePtr->m_encData->m_slice->m_numRefIdx[1]; j++)    // check only for --ref=n number of frames.
+            {
+                if (framePtr->m_encData->m_slice->m_refFrameList[1][j] && framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_reconPic != NULL)
+                {
+                    int l1POC = framePtr->m_encData->m_slice->m_refFrameList[1][j]->m_poc;
+                    Frame* l1Fp = m_dpb->m_picList.getPOC(l1POC); /* NOTE(review): result is dereferenced below without a NULL check */
+                    if (l1Fp->m_reconPic->m_picOrg[0] == NULL)
+                        l1Fp->m_reconEncoded.wait(); /* If recon is not ready, current frame encoder need to wait. */
+                    l1[j] = l1Fp->m_reconPic; /* entries failing the test above leave l1[j] untouched */
+                }
+            }
+        }
+        else
+            x265_log(NULL, X265_LOG_WARNING, "Refrence List is not in piclist\n"); /* NOTE(review): this path still returns 0 */
+    }
+    else
+    {
+        x265_log(NULL, X265_LOG_ERROR, "I frames does not have a refrence List\n");
+        return -1;
+    }
+    return 0;
+}
+
+int Encoder::setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* curFrame)
+{
+    /* Re-maps analysis data stored per 16x16 block into curFrame's per-CTU partition layout; returns 0, or -1 when analysisReuseLevel < 7. */
+    int mbImageWidth = (curFrame->m_fencPic->m_picWidth + 16 - 1) >> 4; //AVC block sizes
+    int mbImageHeight = (curFrame->m_fencPic->m_picHeight + 16 - 1) >> 4;
+    if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I)
+    {
+        curFrame->m_analysisData.sliceType = X265_TYPE_I;
+        if (m_param->analysisReuseLevel < 7)
+            return -1;
+        curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+        int num16x16inCUWidth = m_param->maxCUSize >> 4;
+        uint32_t ctuAddr, offset, cuPos;
+        analysis_intra_data * intraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
+        analysis_intra_data * srcIntraData = (analysis_intra_data *)analysis_data->intraData;
+        for (int i = 0; i < mbImageHeight; i++)
+        {
+            for (int j = 0; j < mbImageWidth; j++)
+            {
+                int mbIndex = j + i * mbImageWidth;
+                ctuAddr = (j / num16x16inCUWidth + ((i / num16x16inCUWidth) * (mbImageWidth / num16x16inCUWidth)));
+                offset = ((i % num16x16inCUWidth) << 5) + ((j % num16x16inCUWidth) << 4);
+                if ((j % 4 >= 2) && m_param->maxCUSize == 64)
+                    offset += (2 * 16);
+                if ((i % 4 >= 2) && m_param->maxCUSize == 64)
+                    offset += (2 * 32);
+                cuPos = ctuAddr  * curFrame->m_analysisData.numPartitions + offset;
+                /* each 16x16 block contributes 16 consecutive entries, read from mbIndex * 16 and stored at cuPos */
+                memcpy(&(intraData)->depth[cuPos], &(srcIntraData)->depth[mbIndex * 16], 16);
+                memcpy(&(intraData)->chromaModes[cuPos], &(srcIntraData)->chromaModes[mbIndex * 16], 16);
+                memcpy(&(intraData)->partSizes[cuPos], &(srcIntraData)->partSizes[mbIndex * 16], 16);
+            }
+        }
+        memcpy((intraData)->modes, (srcIntraData)->modes, curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame); /* destination is the modes buffer, not the address of the member */
+    }
+    else
+    {
+        uint32_t numDir = analysis_data->sliceType == X265_TYPE_P ? 1 : 2;
+        if (m_param->analysisReuseLevel < 7)
+            return -1;
+        curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+        int num16x16inCUWidth = m_param->maxCUSize >> 4;
+        uint32_t ctuAddr, offset, cuPos;
+        analysis_inter_data * interData = (analysis_inter_data *)curFrame->m_analysisData.interData;
+        analysis_inter_data * srcInterData = (analysis_inter_data*)analysis_data->interData;
+        for (int i = 0; i < mbImageHeight; i++)
+        {
+            for (int j = 0; j < mbImageWidth; j++)
+            {
+                int mbIndex = j + i * mbImageWidth;
+                ctuAddr = (j / num16x16inCUWidth + ((i / num16x16inCUWidth) * (mbImageWidth / num16x16inCUWidth)));
+                offset = ((i % num16x16inCUWidth) << 5) + ((j % num16x16inCUWidth) << 4);
+                if ((j % 4 >= 2) && m_param->maxCUSize == 64)
+                    offset += (2 * 16);
+                if ((i % 4 >= 2) && m_param->maxCUSize == 64)
+                    offset += (2 * 32);
+                cuPos = ctuAddr  * curFrame->m_analysisData.numPartitions + offset;
+                memcpy(&(interData)->depth[cuPos], &(srcInterData)->depth[mbIndex * 16], 16);
+                memcpy(&(interData)->modes[cuPos], &(srcInterData)->modes[mbIndex * 16], 16);
+
+                memcpy(&(interData)->partSize[cuPos], &(srcInterData)->partSize[mbIndex * 16], 16);
+                /* 'bytes' = partitions covered by one CU at the stored depth; below 16 the block is walked as four CUs */
+                int bytes = curFrame->m_analysisData.numPartitions >> ((srcInterData)->depth[mbIndex * 16] * 2);
+                int cuCount = 1;
+                if (bytes < 16)
+                    cuCount = 4;
+                for (int cuI = 0; cuI < cuCount; cuI++)
+                {
+                    int numPU = nbPartsTable[(srcInterData)->partSize[mbIndex * 16 + cuI * bytes]];
+                    for (int pu = 0; pu < numPU; pu++)
+                    {
+                        int cuOffset = cuI * bytes + pu;
+                        (interData)->mergeFlag[cuPos + cuOffset] = (srcInterData)->mergeFlag[(mbIndex * 16) + cuOffset];
+
+                        (interData)->interDir[cuPos + cuOffset] = (srcInterData)->interDir[(mbIndex * 16) + cuOffset];
+                        for (uint32_t k = 0; k < numDir; k++)
+                        {
+                            (interData)->mvpIdx[k][cuPos + cuOffset] = (srcInterData)->mvpIdx[k][(mbIndex * 16) + cuOffset];
+                            (interData)->refIdx[k][cuPos + cuOffset] = (srcInterData)->refIdx[k][(mbIndex * 16) + cuOffset];
+                            memcpy(&(interData)->mv[k][cuPos + cuOffset], &(srcInterData)->mv[k][(mbIndex * 16) + cuOffset], sizeof(MV));
+                            if (m_param->analysisReuseLevel == 7) /* NOTE(review): mv is read from curFrame at the source index (mbIndex * 16), depth from src at cuPos -- confirm */
+                            {
+                                int mv_x = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].x;
+                                int mv_y = ((analysis_inter_data *)curFrame->m_analysisData.interData)->mv[k][(mbIndex * 16) + cuOffset].y;
+                                double mv = sqrt(mv_x*mv_x + mv_y*mv_y);
+                                if (numPU == PU_2Nx2N && ((srcInterData)->depth[cuPos + cuOffset] == (m_param->maxCUSize >> 5)) && mv <= MVTHRESHOLD)
+                                    memset(&curFrame->m_analysisData.modeFlag[k][cuPos + cuOffset], 1, bytes);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/* Install externally supplied analysis data on the frame whose POC is 'poc'.
+ *
+ * analysis_data  source description; it is copied by value into the frame's
+ *                m_analysisData and allocAnalysis() is then called on that copy
+ * poc            picture order count used to look the frame up in m_dpb->m_picList
+ * cuBytes        number of source entries to consume from the per-CU arrays
+ *                (only used when maxCUSize == 16)
+ *
+ * For maxCUSize == 16 the per-CU source entries are expanded over the 4x4
+ * partitions each CU covers; for every other CTU size the work is delegated to
+ * setAnalysisDataAfterZScan().
+ *
+ * Returns 0 once the data is installed and m_copyMVType has been triggered.
+ * Returns -1 when no frame with that POC is found, or when analysisReuseLevel
+ * is below 2 on the maxCUSize == 16 path. */
+int Encoder::setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_t cuBytes)
+{
+    uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+    uint32_t heightInCU = (m_param->sourceHeight + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
+
+    Frame* curFrame = m_dpb->m_picList.getPOC(poc);
+    if (curFrame == NULL)
+        return -1;
+
+    curFrame->m_analysisData = (*analysis_data);
+    curFrame->m_analysisData.numCUsInFrame = widthInCU * heightInCU;
+    curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+    allocAnalysis(&curFrame->m_analysisData);
+    if (m_param->maxCUSize == 16)
+    {
+        if (analysis_data->sliceType == X265_TYPE_IDR || analysis_data->sliceType == X265_TYPE_I)
+        {
+            curFrame->m_analysisData.sliceType = X265_TYPE_I;
+            if (m_param->analysisReuseLevel < 2)
+                return -1;
+
+            curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+            size_t count = 0;
+            analysis_intra_data * currIntraData = (analysis_intra_data *)curFrame->m_analysisData.intraData;
+            analysis_intra_data * intraData = (analysis_intra_data *)analysis_data->intraData;
+            for (uint32_t d = 0; d < cuBytes; d++)
+            {
+                /* Replicate the d-th source entry over every partition of the CU it describes */
+                int bytes = curFrame->m_analysisData.numPartitions >> ((intraData)->depth[d] * 2);
+                memset(&(currIntraData)->depth[count], (intraData)->depth[d], bytes);
+                memset(&(currIntraData)->chromaModes[count], (intraData)->chromaModes[d], bytes);
+                memset(&(currIntraData)->partSizes[count], (intraData)->partSizes[d], bytes);
+                count += bytes;
+            }
+            /* modes is copied as one flat array of numPartitions * numCUsInFrame bytes.
+             * The destination is the buffer itself, not the address of the 'modes' member. */
+            memcpy((currIntraData)->modes, (intraData)->modes, curFrame->m_analysisData.numPartitions * analysis_data->numCUsInFrame);
+        }
+        else
+        {
+            uint32_t numDir = analysis_data->sliceType == X265_TYPE_P ? 1 : 2;
+            if (m_param->analysisReuseLevel < 2)
+                return -1;
+
+            curFrame->m_analysisData.numPartitions = m_param->num4x4Partitions;
+            size_t count = 0;
+            analysis_inter_data * currInterData = (analysis_inter_data *)curFrame->m_analysisData.interData;
+            analysis_inter_data * interData = (analysis_inter_data *)analysis_data->interData;
+            for (uint32_t d = 0; d < cuBytes; d++)
+            {
+                int bytes = curFrame->m_analysisData.numPartitions >> ((interData)->depth[d] * 2);
+                memset(&(currInterData)->depth[count], (interData)->depth[d], bytes);
+                memset(&(currInterData)->modes[count], (interData)->modes[d], bytes);
+                /* NOTE(review): this copies 'bytes' bytes, not 'bytes' elements - confirm against the sadCost element size */
+                memcpy(&(currInterData)->sadCost[count], &(interData)->sadCost[d], bytes);
+                if (m_param->analysisReuseLevel > 4)
+                {
+                    memset(&(currInterData)->partSize[count], (interData)->partSize[d], bytes);
+                    /* The PU count comes from the source entry; the destination is indexed by
+                     * 'count', so reading it at 'd' would pick up an unrelated partition */
+                    int numPU = nbPartsTable[(interData)->partSize[d]];
+                    for (int pu = 0; pu < numPU; pu++)
+                    {
+                        /* The first PU shares the CU's entry; only later PUs consume an extra
+                         * source entry (the outer loop advances past the last one).
+                         * NOTE(review): assumes one source entry per PU - confirm against the producer */
+                        if (pu)
+                            d++;
+                        (currInterData)->mergeFlag[count + pu] = (interData)->mergeFlag[d];
+                        if (m_param->analysisReuseLevel >= 7)
+                        {
+                            (currInterData)->interDir[count + pu] = (interData)->interDir[d];
+                            for (uint32_t i = 0; i < numDir; i++)
+                            {
+                                (currInterData)->mvpIdx[i][count + pu] = (interData)->mvpIdx[i][d];
+                                (currInterData)->refIdx[i][count + pu] = (interData)->refIdx[i][d];
+                                memcpy(&(currInterData)->mv[i][count + pu], &(interData)->mv[i][d], sizeof(MV));
+                                if (m_param->analysisReuseLevel == 7)
+                                {
+                                    /* Flag the CU's partitions when the copied MV magnitude is within MVTHRESHOLD */
+                                    int mv_x = (currInterData)->mv[i][count + pu].x;
+                                    int mv_y = (currInterData)->mv[i][count + pu].y;
+                                    double mv = sqrt(mv_x*mv_x + mv_y*mv_y);
+                                    if (numPU == PU_2Nx2N && m_param->num4x4Partitions <= 16 && mv <= MVTHRESHOLD)
+                                        memset(&curFrame->m_analysisData.modeFlag[i][count + pu], 1, bytes);
+                                }
+                            }
+                        }
+                    }
+                }
+                count += bytes;
+            }
+        }
+    }
+    else
+        setAnalysisDataAfterZScan(analysis_data, curFrame);
+
+    /* Signal the waiter on m_copyMVType that analysis data for this frame is in place */
+    curFrame->m_copyMVType.trigger();
+    return 0;
+}
+
 void Encoder::destroy()
 {
 #if ENABLE_HDR10_PLUS
@@ -609,6 +859,17 @@ int Encoder::encode(const x265_picture* 
     }
     if (pic_in)
     {
+        if (m_latestParam->forceFlush == 1)
+        {
+            m_lookahead->setLookaheadQueue();
+            m_latestParam->forceFlush = 0;
+        }
+        if (m_latestParam->forceFlush == 2)
+        {
+            m_lookahead->m_filled = false;
+            m_latestParam->forceFlush = 0;
+        }
+
         x265_sei_payload toneMap;
         toneMap.payload = NULL;
 #if ENABLE_HDR10_PLUS
@@ -620,12 +881,12 @@ int Encoder::encode(const x265_picture* 
                 int32_t i = 0;
                 toneMap.payloadSize = 0;
                 while (m_cim[currentPOC][i] == 0xFF)
-                    toneMap.payloadSize += m_cim[currentPOC][i++] + 1;
-                toneMap.payloadSize += m_cim[currentPOC][i] + 1;
+                    toneMap.payloadSize += m_cim[currentPOC][i++];
+                toneMap.payloadSize += m_cim[currentPOC][i];
 
                 toneMap.payload = (uint8_t*)x265_malloc(sizeof(uint8_t) * toneMap.payloadSize);
                 toneMap.payloadType = USER_DATA_REGISTERED_ITU_T_T35;
-                memcpy(toneMap.payload, m_cim[currentPOC], toneMap.payloadSize);
+                memcpy(toneMap.payload, &m_cim[currentPOC][i+1], toneMap.payloadSize);
             }
         }
 #endif
@@ -779,9 +1040,22 @@ int Encoder::encode(const x265_picture* 
         {
             /* readAnalysisFile reads analysis data for the frame and allocates memory based on slicetype */
             readAnalysisFile(&inFrame->m_analysisData, inFrame->m_poc, pic_in);
+            inFrame->m_poc = inFrame->m_analysisData.poc;
             sliceType = inFrame->m_analysisData.sliceType;
             inFrame->m_lowres.bScenecut = !!inFrame->m_analysisData.bScenecut;
             inFrame->m_lowres.satdCost = inFrame->m_analysisData.satdCost;
+            if (m_param->bDisableLookahead)
+            {
+                inFrame->m_lowres.sliceType = sliceType;
+                inFrame->m_lowres.bKeyframe = !!inFrame->m_analysisData.lookahead.keyframe;
+                inFrame->m_lowres.bLastMiniGopBFrame = !!inFrame->m_analysisData.lookahead.lastMiniGopBFrame;
+                int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
+                for (int index = 0; index < vbvCount; index++)
+                {
+                    inFrame->m_lowres.plannedSatd[index] = inFrame->m_analysisData.lookahead.plannedSatd[index];
+                    inFrame->m_lowres.plannedType[index] = inFrame->m_analysisData.lookahead.plannedType[index];
+                }
+            }
         }
         if (m_param->bUseRcStats && pic_in->rcData)
         {
@@ -815,6 +1089,8 @@ int Encoder::encode(const x265_picture* 
         m_lookahead->addPicture(*inFrame, sliceType);
         m_numDelayedPic++;
     }
+    else if (m_latestParam->forceFlush == 2)
+        m_lookahead->m_filled = true;
     else
         m_lookahead->flush();
 
@@ -842,7 +1118,7 @@ int Encoder::encode(const x265_picture* 
             x265_frame_stats* frameData = NULL;
 
             /* Free up pic_in->analysisData since it has already been used */
-            if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD)
+            if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD || (m_param->bMVType && slice->m_sliceType != I_SLICE))
                 freeAnalysis(&outFrame->m_analysisData);
 
             if (pic_out)
@@ -873,12 +1149,43 @@ int Encoder::encode(const x265_picture* 
                     pic_out->analysisData.poc = pic_out->poc;
                     pic_out->analysisData.sliceType = pic_out->sliceType;
                     pic_out->analysisData.bScenecut = outFrame->m_lowres.bScenecut;
-                    pic_out->analysisData.satdCost  = outFrame->m_lowres.satdCost;                    
+                    pic_out->analysisData.satdCost  = outFrame->m_lowres.satdCost;
                     pic_out->analysisData.numCUsInFrame = outFrame->m_analysisData.numCUsInFrame;
                     pic_out->analysisData.numPartitions = outFrame->m_analysisData.numPartitions;
                     pic_out->analysisData.wt = outFrame->m_analysisData.wt;
                     pic_out->analysisData.interData = outFrame->m_analysisData.interData;
                     pic_out->analysisData.intraData = outFrame->m_analysisData.intraData;
+                    if (m_param->bDisableLookahead)
+                    {
+                        int factor = 1;
+                        if (m_param->scaleFactor)
+                            factor = m_param->scaleFactor * 2;
+                        pic_out->analysisData.numCuInHeight = outFrame->m_analysisData.numCuInHeight;
+                        pic_out->analysisData.lookahead.dts = outFrame->m_dts;
+                        pic_out->analysisData.satdCost *= factor;
+                        pic_out->analysisData.lookahead.keyframe = outFrame->m_lowres.bKeyframe;
+                        pic_out->analysisData.lookahead.lastMiniGopBFrame = outFrame->m_lowres.bLastMiniGopBFrame;
+                        int vbvCount = m_param->lookaheadDepth + m_param->bframes + 2;
+                        for (int index = 0; index < vbvCount; index++)
+                        {
+                            pic_out->analysisData.lookahead.plannedSatd[index] = outFrame->m_lowres.plannedSatd[index] * factor;
+                            pic_out->analysisData.lookahead.plannedType[index] = outFrame->m_lowres.plannedType[index];
+                        }
+                        for (uint32_t index = 0; index < pic_out->analysisData.numCuInHeight; index++)
+                        {
+                            outFrame->m_analysisData.lookahead.intraSatdForVbv[index] = outFrame->m_encData->m_rowStat[index].intraSatdForVbv * factor;
+                            outFrame->m_analysisData.lookahead.satdForVbv[index] = outFrame->m_encData->m_rowStat[index].satdForVbv * factor;
+                        }
+                        pic_out->analysisData.lookahead.intraSatdForVbv = outFrame->m_analysisData.lookahead.intraSatdForVbv;
+                        pic_out->analysisData.lookahead.satdForVbv = outFrame->m_analysisData.lookahead.satdForVbv;
+                        for (uint32_t index = 0; index < pic_out->analysisData.numCUsInFrame; index++)
+                        {
+                            outFrame->m_analysisData.lookahead.intraVbvCost[index] = outFrame->m_encData->m_cuStat[index].intraVbvCost * factor;
+                            outFrame->m_analysisData.lookahead.vbvCost[index] = outFrame->m_encData->m_cuStat[index].vbvCost * factor;
+                        }
+                        pic_out->analysisData.lookahead.intraVbvCost = outFrame->m_analysisData.lookahead.intraVbvCost;
+                        pic_out->analysisData.lookahead.vbvCost = outFrame->m_analysisData.lookahead.vbvCost;
+                    }
                     writeAnalysisFile(&pic_out->analysisData, *outFrame->m_encData);
                     if (m_param->bUseAnalysisFile)
                         freeAnalysis(&pic_out->analysisData);
@@ -1041,7 +1348,20 @@ int Encoder::encode(const x265_picture* 
                 slice->m_maxNumMergeCand = m_param->maxNumMergeCand;
                 slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * m_param->num4x4Partitions);
             }
-
+            if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead)
+            {
+                frameEnc->m_dts = frameEnc->m_analysisData.lookahead.dts;
+                for (uint32_t index = 0; index < frameEnc->m_analysisData.numCuInHeight; index++)
+                {
+                    frameEnc->m_encData->m_rowStat[index].intraSatdForVbv = frameEnc->m_analysisData.lookahead.intraSatdForVbv[index];
+                    frameEnc->m_encData->m_rowStat[index].satdForVbv = frameEnc->m_analysisData.lookahead.satdForVbv[index];
+                }
+                for (uint32_t index = 0; index < frameEnc->m_analysisData.numCUsInFrame; index++)
+                {
+                    frameEnc->m_encData->m_cuStat[index].intraVbvCost = frameEnc->m_analysisData.lookahead.intraVbvCost[index];
+                    frameEnc->m_encData->m_cuStat[index].vbvCost = frameEnc->m_analysisData.lookahead.vbvCost[index];
+                }
+            }
             if (m_param->searchMethod == X265_SEA && frameEnc->m_lowres.sliceType != X265_TYPE_B)
             {
                 int padX = m_param->maxCUSize + 32;
@@ -1094,16 +1414,19 @@ int Encoder::encode(const x265_picture* 
             frameEnc->m_encData->m_slice->m_iNumRPSInSPS = m_sps.spsrpsNum;
 
             curEncoder->m_rce.encodeOrder = frameEnc->m_encodeOrder = m_encodedFrameNum++;
-            if (m_bframeDelay)
+            if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || !m_param->bDisableLookahead)
             {
-                int64_t *prevReorderedPts = m_prevReorderedPts;
-                frameEnc->m_dts = m_encodedFrameNum > m_bframeDelay
-                    ? prevReorderedPts[(m_encodedFrameNum - m_bframeDelay) % m_bframeDelay]
-                    : frameEnc->m_reorderedPts - m_bframeDelayTime;
-                prevReorderedPts[m_encodedFrameNum % m_bframeDelay] = frameEnc->m_reorderedPts;
+                if (m_bframeDelay)
+                {
+                    int64_t *prevReorderedPts = m_prevReorderedPts;
+                    frameEnc->m_dts = m_encodedFrameNum > m_bframeDelay
+                        ? prevReorderedPts[(m_encodedFrameNum - m_bframeDelay) % m_bframeDelay]
+                        : frameEnc->m_reorderedPts - m_bframeDelayTime;
+                    prevReorderedPts[m_encodedFrameNum % m_bframeDelay] = frameEnc->m_reorderedPts;
+                }
+                else
+                    frameEnc->m_dts = frameEnc->m_reorderedPts;
             }
-            else
-                frameEnc->m_dts = frameEnc->m_reorderedPts;
 
             /* Allocate analysis data before encode in save mode. This is allocated in frameEnc */
             if (m_param->analysisReuseMode == X265_ANALYSIS_SAVE)
@@ -1116,6 +1439,7 @@ int Encoder::encode(const x265_picture* 
 
                 uint32_t numCUsInFrame   = widthInCU * heightInCU;
                 analysis->numCUsInFrame  = numCUsInFrame;
+                analysis->numCuInHeight = heightInCU;
                 analysis->numPartitions  = m_param->num4x4Partitions;
                 allocAnalysis(analysis);
             }
@@ -1141,48 +1465,62 @@ int Encoder::encode(const x265_picture* 
 
 int Encoder::reconfigureParam(x265_param* encParam, x265_param* param)
 {
-    encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
-    encParam->bEnableFastIntra = param->bEnableFastIntra;
-    encParam->bEnableEarlySkip = param->bEnableEarlySkip;
-    encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
-    encParam->searchMethod = param->searchMethod;
-    /* Scratch buffer prevents me_range from being increased for esa/tesa */
-    if (param->searchRange < encParam->searchRange)
-        encParam->searchRange = param->searchRange;
-    /* We can't switch out of subme=0 during encoding. */
-    if (encParam->subpelRefine)
-        encParam->subpelRefine = param->subpelRefine;
-    encParam->rdoqLevel = param->rdoqLevel;
-    encParam->rdLevel = param->rdLevel;
-    encParam->bEnableRectInter = param->bEnableRectInter;
-    encParam->maxNumMergeCand = param->maxNumMergeCand;
-    encParam->bIntraInBFrames = param->bIntraInBFrames;
-    if (param->scalingLists && !encParam->scalingLists)
-        encParam->scalingLists = strdup(param->scalingLists);
-    /* VBV can't be turned ON if it wasn't ON to begin with and can't be turned OFF if it was ON to begin with*/
-    if (param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0 &&
-        encParam->rc.vbvMaxBitrate > 0 && encParam->rc.vbvBufferSize > 0)
+    /* Rate-control changes and all other changes are handled in separate branches:
+     * when any of vbvMaxBitrate, vbvBufferSize, bitrate or rfConstant differs
+     * (see isReconfigureRc), only the rate-control fields are considered and
+     * m_reconfigureRc accumulates whether one of them changed; otherwise the
+     * remaining reconfigurable settings are copied instead. forceFlush is copied
+     * in both cases, and the result of x265_check_params(encParam) is returned. */
+    if (isReconfigureRc(encParam, param))
     {
-        m_reconfigureRc |= encParam->rc.vbvMaxBitrate != param->rc.vbvMaxBitrate;
-        m_reconfigureRc |= encParam->rc.vbvBufferSize != param->rc.vbvBufferSize;
-        if (m_reconfigureRc && m_param->bEmitHRDSEI)
-            x265_log(m_param, X265_LOG_WARNING, "VBV parameters cannot be changed when HRD is in use.\n");
-        else
+        /* VBV can't be turned ON if it wasn't ON to begin with and can't be turned OFF if it was ON to begin with*/
+        if (param->rc.vbvMaxBitrate > 0 && param->rc.vbvBufferSize > 0 &&
+            encParam->rc.vbvMaxBitrate > 0 && encParam->rc.vbvBufferSize > 0)
         {
-            encParam->rc.vbvMaxBitrate = param->rc.vbvMaxBitrate;
-            encParam->rc.vbvBufferSize = param->rc.vbvBufferSize;
+            m_reconfigureRc |= encParam->rc.vbvMaxBitrate != param->rc.vbvMaxBitrate;
+            m_reconfigureRc |= encParam->rc.vbvBufferSize != param->rc.vbvBufferSize;
+            if (m_reconfigureRc && m_param->bEmitHRDSEI)
+                x265_log(m_param, X265_LOG_WARNING, "VBV parameters cannot be changed when HRD is in use.\n");
+            else
+            {
+                encParam->rc.vbvMaxBitrate = param->rc.vbvMaxBitrate;
+                encParam->rc.vbvBufferSize = param->rc.vbvBufferSize;
+            }
         }
+        m_reconfigureRc |= encParam->rc.bitrate != param->rc.bitrate;
+        encParam->rc.bitrate = param->rc.bitrate;
+        m_reconfigureRc |= encParam->rc.rfConstant != param->rc.rfConstant;
+        encParam->rc.rfConstant = param->rc.rfConstant;
     }
-    m_reconfigureRc |= encParam->rc.bitrate != param->rc.bitrate;
-    encParam->rc.bitrate = param->rc.bitrate;
-    m_reconfigureRc |= encParam->rc.rfConstant != param->rc.rfConstant;
-    encParam->rc.rfConstant = param->rc.rfConstant; 
-
+    else
+    {
+        encParam->maxNumReferences = param->maxNumReferences; // never uses more refs than specified in stream headers
+        encParam->bEnableFastIntra = param->bEnableFastIntra;
+        encParam->bEnableEarlySkip = param->bEnableEarlySkip;
+        encParam->bEnableRecursionSkip = param->bEnableRecursionSkip;
+        encParam->searchMethod = param->searchMethod;
+        /* Scratch buffer prevents me_range from being increased for esa/tesa */
+        if (param->searchRange < encParam->searchRange)
+            encParam->searchRange = param->searchRange;
+        /* We can't switch out of subme=0 during encoding. */
+        if (encParam->subpelRefine)
+            encParam->subpelRefine = param->subpelRefine;
+        encParam->rdoqLevel = param->rdoqLevel;
+        encParam->rdLevel = param->rdLevel;
+        encParam->bEnableRectInter = param->bEnableRectInter;
+        encParam->maxNumMergeCand = param->maxNumMergeCand;
+        encParam->bIntraInBFrames = param->bIntraInBFrames;
+        if (param->scalingLists && !encParam->scalingLists)
+            encParam->scalingLists = strdup(param->scalingLists);
+    }
+    encParam->forceFlush = param->forceFlush;
     /* To add: Loop Filter/deblocking controls, transform skip, signhide require PPS to be resent */
     /* To add: SAO, temporal MVP, AMP, TU depths require SPS to be resent, at every CVS boundary */
     return x265_check_params(encParam);
 }
 
+/* Report whether any rate-control field (rc.vbvMaxBitrate, rc.vbvBufferSize,
+ * rc.bitrate or rc.rfConstant) differs between the two parameter sets. */
+bool Encoder::isReconfigureRc(x265_param* latestParam, x265_param* param_in)
+{
+    if (latestParam->rc.vbvMaxBitrate != param_in->rc.vbvMaxBitrate)
+        return true;
+    if (latestParam->rc.vbvBufferSize != param_in->rc.vbvBufferSize)
+        return true;
+    if (latestParam->rc.bitrate != param_in->rc.bitrate)
+        return true;
+    return latestParam->rc.rfConstant != param_in->rc.rfConstant;
+}
+
 void Encoder::copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc)
 {
     uint32_t widthInCU = (m_param->sourceWidth + m_param->maxCUSize - 1) >> m_param->maxLog2CUSize;
@@ -2107,6 +2445,10 @@ void Encoder::initPPS(PPS *pps)
 void Encoder::configure(x265_param *p)
 {
     this->m_param = p;
+    if (p->bMVType == AVC_INFO)
+        this->m_externalFlush = true;
+    else 
+        this->m_externalFlush = false;
     if (p->keyframeMax < 0)
     {
         /* A negative max GOP size indicates the user wants only one I frame at
@@ -2322,6 +2664,11 @@ void Encoder::configure(x265_param *p)
             x265_log(p, X265_LOG_WARNING, "MV refinement requires analysis load, analysis-reuse-level 10, scale factor. Disabling MV refine.\n");
             p->mvRefine = 0;
         }
+        else if (p->interRefine >= 2)
+        {
+            x265_log(p, X265_LOG_WARNING, "MVs are recomputed when refine-inter >= 2. MV refinement not applicable. Disabling MV refine\n");
+            p->mvRefine = 0;
+        }
     }
 
     if ((p->analysisMultiPassRefine || p->analysisMultiPassDistortion) && (p->bDistributeModeAnalysis || p->bDistributeMotionEstimation))
@@ -2673,6 +3020,13 @@ void Encoder::allocAnalysis(x265_analysi
 {
     X265_CHECK(analysis->sliceType, "invalid slice type\n");
     analysis->interData = analysis->intraData = NULL;
+    if (m_param->bDisableLookahead)
+    {
+        CHECKED_MALLOC_ZERO(analysis->lookahead.intraSatdForVbv, uint32_t, analysis->numCuInHeight);
+        CHECKED_MALLOC_ZERO(analysis->lookahead.satdForVbv, uint32_t, analysis->numCuInHeight);
+        CHECKED_MALLOC_ZERO(analysis->lookahead.intraVbvCost, uint32_t, analysis->numCUsInFrame);
+        CHECKED_MALLOC_ZERO(analysis->lookahead.vbvCost, uint32_t, analysis->numCUsInFrame);
+    }
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
     {
         if (m_param->analysisReuseLevel < 2)
@@ -2690,7 +3044,8 @@ void Encoder::allocAnalysis(x265_analysi
     {
         int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
         uint32_t numPlanes = m_param->internalCsp == X265_CSP_I400 ? 1 : 3;
-        CHECKED_MALLOC_ZERO(analysis->wt, WeightParam, numPlanes * numDir);
+        if (!(m_param->bMVType == AVC_INFO))
+            CHECKED_MALLOC_ZERO(analysis->wt, WeightParam, numPlanes * numDir);
         if (m_param->analysisReuseLevel < 2)
             return;
 
@@ -2704,7 +3059,7 @@ void Encoder::allocAnalysis(x265_analysi
             CHECKED_MALLOC(interData->mergeFlag, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
         }
 
-        if (m_param->analysisReuseLevel == 10)
+        if (m_param->analysisReuseLevel >= 7)
         {
             CHECKED_MALLOC(interData->interDir, uint8_t, analysis->numPartitions * analysis->numCUsInFrame);
             for (int dir = 0; dir < numDir; dir++)
@@ -2738,8 +3093,15 @@ fail:
 
 void Encoder::freeAnalysis(x265_analysis_data* analysis)
 {
+    if (m_param->bDisableLookahead)
+    {
+        X265_FREE(analysis->lookahead.satdForVbv);
+        X265_FREE(analysis->lookahead.intraSatdForVbv);
+        X265_FREE(analysis->lookahead.vbvCost);
+        X265_FREE(analysis->lookahead.intraVbvCost);
+    }
     /* Early exit freeing weights alone if level is 1 (when there is no analysis inter/intra) */
-    if (analysis->sliceType > X265_TYPE_I && analysis->wt)
+    if (analysis->sliceType > X265_TYPE_I && analysis->wt && !(m_param->bMVType == AVC_INFO))
         X265_FREE(analysis->wt);
     if (m_param->analysisReuseLevel < 2)
         return;
@@ -2774,15 +3136,17 @@ void Encoder::freeAnalysis(x265_analysis
                 X265_FREE(((analysis_inter_data*)analysis->interData)->mergeFlag);
                 X265_FREE(((analysis_inter_data*)analysis->interData)->partSize);
             }
-            if (m_param->analysisReuseLevel == 10)
+            if (m_param->analysisReuseLevel >= 7)
             {
                 X265_FREE(((analysis_inter_data*)analysis->interData)->interDir);
+                X265_FREE(((analysis_inter_data*)analysis->interData)->sadCost);
                 int numDir = analysis->sliceType == X265_TYPE_P ? 1 : 2;
                 for (int dir = 0; dir < numDir; dir++)
                 {
                     X265_FREE(((analysis_inter_data*)analysis->interData)->mvpIdx[dir]);
                     X265_FREE(((analysis_inter_data*)analysis->interData)->refIdx[dir]);
                     X265_FREE(((analysis_inter_data*)analysis->interData)->mv[dir]);
+                    X265_FREE(analysis->modeFlag[dir]);
                 }
             }
             else
@@ -2918,6 +3282,11 @@ void Encoder::readAnalysisFile(x265_anal
     X265_FREAD(&analysis->satdCost, sizeof(int64_t), 1, m_analysisFile, &(picData->satdCost));
     X265_FREAD(&analysis->numCUsInFrame, sizeof(int), 1, m_analysisFile, &(picData->numCUsInFrame));
     X265_FREAD(&analysis->numPartitions, sizeof(int), 1, m_analysisFile, &(picData->numPartitions));
+    if (m_param->bDisableLookahead)
+    {
+        X265_FREAD(&analysis->numCuInHeight, sizeof(uint32_t), 1, m_analysisFile, &(picData->numCuInHeight));
+        X265_FREAD(&analysis->lookahead, sizeof(x265_lookahead_data), 1, m_analysisFile, &(picData->lookahead));
+    }
     int scaledNumPartition = analysis->numPartitions;
     int factor = 1 << m_param->scaleFactor;
 
@@ -2926,7 +3295,13 @@ void Encoder::readAnalysisFile(x265_anal
 
     /* Memory is allocated for inter and intra analysis data based on the slicetype */
     allocAnalysis(analysis);
-
+    if (m_param->bDisableLookahead)
+    {
+        X265_FREAD(analysis->lookahead.intraVbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFile, picData->lookahead.intraVbvCost);
+        X265_FREAD(analysis->lookahead.vbvCost, sizeof(uint32_t), analysis->numCUsInFrame, m_analysisFile, picData->lookahead.vbvCost);
+        X265_FREAD(analysis->lookahead.satdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFile, picData->lookahead.satdForVbv);
+        X265_FREAD(analysis->lookahead.intraSatdForVbv, sizeof(uint32_t), analysis->numCuInHeight, m_analysisFile, picData->lookahead.intraSatdForVbv);
+    }
     if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I)
     {
         if (m_param->analysisReuseLevel < 2)
--- a/source/encoder/encoder.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/encoder.h	Tue Nov 21 09:50:45 2017 +0530
@@ -138,6 +138,7 @@ public:
     RateControl*       m_rateControl;
     Lookahead*         m_lookahead;
 
+    bool               m_externalFlush;
     /* Collect statistics globally */
     EncStats           m_analyzeAll;
     EncStats           m_analyzeI;
@@ -201,8 +202,18 @@ public:
 
     int reconfigureParam(x265_param* encParam, x265_param* param);
 
+    bool isReconfigureRc(x265_param* latestParam, x265_param* param_in);
+
     void copyCtuInfo(x265_ctu_info_t** frameCtuInfo, int poc);
 
+    int copySlicetypePocAndSceneCut(int *slicetype, int *poc, int *sceneCut);
+
+    int getRefFrameList(PicYuv** l0, PicYuv** l1, int sliceType, int poc);
+
+    int setAnalysisDataAfterZScan(x265_analysis_data *analysis_data, Frame* curFrame);
+
+    int setAnalysisData(x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
+
     void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
 
     void fetchStats(x265_stats* stats, size_t statsSizeBytes);
--- a/source/encoder/frameencoder.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/frameencoder.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -88,6 +88,7 @@ void FrameEncoder::destroy()
     delete[] m_outStreams;
     delete[] m_backupStreams;
     X265_FREE(m_sliceBaseRow);
+    X265_FREE(m_sliceMaxBlockRow);
     X265_FREE(m_cuGeoms);
     X265_FREE(m_ctuGeomMap);
     X265_FREE(m_substreamSizes);
@@ -118,6 +119,40 @@ bool FrameEncoder::init(Encoder *top, in
 
     m_sliceBaseRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
     ok &= !!m_sliceBaseRow;
+    m_sliceGroupSize = (uint16_t)(m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
+    uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;    
+    uint32_t rowSum = sliceGroupSizeAccu;
+    uint32_t sidx = 0;
+    for (uint32_t i = 0; i < m_numRows; i++)
+    {
+        const uint32_t rowRange = (rowSum >> 8);
+        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
+        {
+            rowSum += sliceGroupSizeAccu;
+            m_sliceBaseRow[++sidx] = i;
+        }        
+    }
+    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
+    m_sliceBaseRow[0] = 0;
+    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
+
+    m_sliceMaxBlockRow = X265_MALLOC(uint32_t, m_param->maxSlices + 1);
+    ok &= !!m_sliceMaxBlockRow;
+    uint32_t maxBlockRows = (m_param->sourceHeight + (16 - 1)) / 16;
+    sliceGroupSizeAccu = (maxBlockRows << 8) / m_param->maxSlices;
+    rowSum = sliceGroupSizeAccu;
+    sidx = 0;
+    for (uint32_t i = 0; i < maxBlockRows; i++)
+    {
+        const uint32_t rowRange = (rowSum >> 8);
+        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
+        {
+            rowSum += sliceGroupSizeAccu;
+            m_sliceMaxBlockRow[++sidx] = i;
+        }
+    }
+    m_sliceMaxBlockRow[0] = 0;
+    m_sliceMaxBlockRow[m_param->maxSlices] = maxBlockRows;
 
     /* determine full motion search range */
     int range  = m_param->searchRange;       /* fpel search */
@@ -300,8 +335,15 @@ void FrameEncoder::threadMain()
             while (!m_frame->m_ctuInfo)
                 m_frame->m_copied.wait();
         }
+        if ((m_param->bMVType == AVC_INFO) && !m_param->analysisReuseMode && !(IS_X265_TYPE_I(m_frame->m_lowres.sliceType)))
+        {
+            while (((m_frame->m_analysisData.interData == NULL && m_frame->m_analysisData.intraData == NULL) || (uint32_t)m_frame->m_poc != m_frame->m_analysisData.poc))
+                m_frame->m_copyMVType.wait();
+        }
         compressFrame();
         m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */
+        if (m_frame != NULL)
+            m_frame->m_reconEncoded.trigger();
         m_enable.wait();
     }
 }
@@ -341,6 +383,8 @@ void FrameEncoder::compressFrame()
     m_completionCount = 0;
     m_bAllRowsStop = false;
     m_vbvResetTriggerRow = -1;
+    m_rowSliceTotalBits[0] = 0;
+    m_rowSliceTotalBits[1] = 0;
 
     m_SSDY = m_SSDU = m_SSDV = 0;
     m_ssim = 0;
@@ -550,28 +594,13 @@ void FrameEncoder::compressFrame()
 
     /* reset entropy coders and compute slice id */
     m_entropyCoder.load(m_initSliceContext);
-    const uint32_t sliceGroupSize = (m_numRows + m_param->maxSlices - 1) / m_param->maxSlices;
-    const uint32_t sliceGroupSizeAccu = (m_numRows << 8) / m_param->maxSlices;
-    m_sliceGroupSize = (uint16_t)sliceGroupSize;
+	
+    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)   
+        for (uint32_t row = m_sliceBaseRow[sliceId]; row < m_sliceBaseRow[sliceId + 1]; row++)
+            m_rows[row].init(m_initSliceContext, sliceId);   
 
-    uint32_t rowSum = sliceGroupSizeAccu;
-    uint32_t sidx = 0;
-    for (uint32_t i = 0; i < m_numRows; i++)
-    {
-        const uint32_t rowRange = (rowSum >> 8);
-
-        if ((i >= rowRange) & (sidx != m_param->maxSlices - 1))
-        {
-            rowSum += sliceGroupSizeAccu;
-            m_sliceBaseRow[++sidx] = i;
-        }
-
-        m_rows[i].init(m_initSliceContext, sidx);
-    }
-    X265_CHECK(sidx < m_param->maxSlices, "sliceID check failed!");
-
-    m_sliceBaseRow[0] = 0;
-    m_sliceBaseRow[m_param->maxSlices] = m_numRows;
+    // reset slice counter for rate control update
+    m_sliceCnt = 0;
 
     uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : m_param->maxSlices;
     X265_CHECK(m_param->bEnableWavefront || (m_param->maxSlices == 1), "Multiple slices without WPP unsupport now!");
@@ -586,8 +615,10 @@ void FrameEncoder::compressFrame()
                 m_rows[i].rowGoOnCoder.setBitstream(&m_outStreams[i]);
     }
     else
+    {
         for (uint32_t i = 0; i < numSubstreams; i++)
             m_outStreams[i].resetBits();
+    }
 
     int prevBPSEI = m_rce.encodeOrder ? m_top->m_lastBPSEI : 0;
 
@@ -697,9 +728,26 @@ void FrameEncoder::compressFrame()
      * compressed in a wave-front pattern if WPP is enabled. Row based loop
      * filters runs behind the CTU compression and reconstruction */
 
-    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
+    for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)    
+        m_rows[m_sliceBaseRow[sliceId]].active = true;
+    
+    if (m_param->bEnableWavefront)
     {
-        m_rows[m_sliceBaseRow[sliceId]].active = true;
+        int i = 0;
+        for (uint32_t rowInSlice = 0; rowInSlice < m_sliceGroupSize; rowInSlice++)
+        {
+            for (uint32_t sliceId = 0; sliceId < m_param->maxSlices; sliceId++)
+            {
+                const uint32_t sliceStartRow = m_sliceBaseRow[sliceId];
+                const uint32_t sliceEndRow = m_sliceBaseRow[sliceId + 1] - 1;
+                const uint32_t row = sliceStartRow + rowInSlice;
+                if (row > sliceEndRow)
+                    continue;
+                m_row_to_idx[row] = i;
+                m_idx_to_row[i] = row;
+                i += 1;
+            }
+        }
     }
 
     if (m_param->bEnableWavefront)
@@ -735,11 +783,11 @@ void FrameEncoder::compressFrame()
                     }
                 }
 
-                enableRowEncoder(row); /* clear external dependency for this row */
+                enableRowEncoder(m_row_to_idx[row]); /* clear external dependency for this row */
                 if (!rowInSlice)
                 {
                     m_row0WaitTime = x265_mdate();
-                    enqueueRowEncoder(row); /* clear internal dependency, start wavefront */
+                    enqueueRowEncoder(m_row_to_idx[row]); /* clear internal dependency, start wavefront */
                 }
                 tryWakeOne();
             } // end of loop rowInSlice
@@ -964,9 +1012,8 @@ void FrameEncoder::compressFrame()
             // complete the slice header by writing WPP row-starts
             m_entropyCoder.setBitstream(&m_bs);
             if (slice->m_pps->bEntropyCodingSyncEnabled)
-            {
                 m_entropyCoder.codeSliceHeaderWPPEntryPoints(&m_substreamSizes[prevSliceRow], (nextSliceRow - prevSliceRow - 1), maxStreamSize);
-            }
+            
             m_bs.writeByteAlignment();
 
             m_nalList.serialize(slice->m_nalUnitType, m_bs);
@@ -1196,8 +1243,8 @@ void FrameEncoder::processRow(int row, i
     if (ATOMIC_INC(&m_activeWorkerCount) == 1 && m_stallStartTime)
         m_totalNoWorkerTime += x265_mdate() - m_stallStartTime;
 
-    const uint32_t realRow = row >> 1;
-    const uint32_t typeNum = row & 1;
+    const uint32_t realRow = m_idx_to_row[row >> 1]; /* upper bits of 'row' are the queue index; map it back to the CTU row */
+    const uint32_t typeNum = row & 1; /* low bit is the job type flag (0 selects processRowEncoder below), not an index into m_idx_to_row */
 
     if (!typeNum)
         processRowEncoder(realRow, m_tld[threadId]);
@@ -1207,7 +1254,7 @@ void FrameEncoder::processRow(int row, i
 
         // NOTE: Active next row
         if (realRow != m_sliceBaseRow[m_rows[realRow].sliceId + 1] - 1)
-            enqueueRowFilter(realRow + 1);
+            enqueueRowFilter(m_row_to_idx[realRow + 1]);
     }
 
     if (ATOMIC_DEC(&m_activeWorkerCount) == 0)
@@ -1252,23 +1299,17 @@ void FrameEncoder::processRowEncoder(int
     const uint32_t lineStartCUAddr = row * numCols;
     bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
 
+    const uint32_t sliceId = curRow.sliceId;
     uint32_t maxBlockCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16;
-    uint32_t maxBlockRows = (m_frame->m_fencPic->m_picHeight + (16 - 1)) / 16;
     uint32_t noOfBlocks = m_param->maxCUSize / 16;
     const uint32_t bFirstRowInSlice = ((row == 0) || (m_rows[row - 1].sliceId != curRow.sliceId)) ? 1 : 0;
     const uint32_t bLastRowInSlice = ((row == m_numRows - 1) || (m_rows[row + 1].sliceId != curRow.sliceId)) ? 1 : 0;
-    const uint32_t sliceId = curRow.sliceId;
     const uint32_t endRowInSlicePlus1 = m_sliceBaseRow[sliceId + 1];
     const uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
 
-    if (bFirstRowInSlice && !curRow.completed)
-    {
-        // Load SBAC coder context from previous row and initialize row state.
-        //rowCoder.copyState(m_initSliceContext);
-        //rowCoder.loadContexts(m_rows[row - 1].bufferedEntropy);
-        rowCoder.load(m_initSliceContext);
-        //m_rows[row - 1].bufferedEntropy.loadContexts(m_initSliceContext);
-    }
+    // Load SBAC coder context from previous row and initialize row state.
+    if (bFirstRowInSlice && !curRow.completed)        
+        rowCoder.load(m_initSliceContext);     
 
     // calculate mean QP for consistent deltaQP signalling calculation
     if (m_param->bOptCUDeltaQP)
@@ -1279,15 +1320,12 @@ void FrameEncoder::processRowEncoder(int
             if (m_param->bEnableWavefront || !row)
             {
                 double meanQPOff = 0;
-                uint32_t loopIncr, count = 0;
                 bool isReferenced = IS_REFERENCED(m_frame);
                 double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
                 if (qpoffs)
                 {
-                    if (m_param->rc.qgSize == 8)
-                        loopIncr = 8;
-                    else
-                        loopIncr = 16;
+                    uint32_t loopIncr = (m_param->rc.qgSize == 8) ? 8 : 16;
+
                     uint32_t cuYStart = 0, height = m_frame->m_fencPic->m_picHeight;
                     if (m_param->bEnableWavefront)
                     {
@@ -1297,6 +1335,7 @@ void FrameEncoder::processRowEncoder(int
 
                     uint32_t qgSize = m_param->rc.qgSize, width = m_frame->m_fencPic->m_picWidth;
                     uint32_t maxOffsetCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr;
+                    uint32_t count = 0;
                     for (uint32_t cuY = cuYStart; cuY < height && (cuY < m_frame->m_fencPic->m_picHeight); cuY += qgSize)
                     {
                         for (uint32_t cuX = 0; cuX < width; cuX += qgSize)
@@ -1328,9 +1367,7 @@ void FrameEncoder::processRowEncoder(int
             }
             curRow.avgQPComputed = 1;
         }
-    }
-
-    // TODO: specially case handle on first and last row
+    }    
 
     // Initialize restrict on MV range in slices
     tld.analysis.m_sliceMinY = -(int16_t)(rowInSlice * m_param->maxCUSize * 4) + 3 * 4;
@@ -1359,16 +1396,16 @@ void FrameEncoder::processRowEncoder(int
                 curRow.bufferedEntropy.copyState(rowCoder);
                 curRow.bufferedEntropy.loadContexts(rowCoder);
             }
-            if (!row && m_vbvResetTriggerRow != intRow)
+            if (bFirstRowInSlice && m_vbvResetTriggerRow != intRow)            
             {
                 curEncData.m_rowStat[row].rowQp = curEncData.m_avgQpRc;
                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(curEncData.m_avgQpRc);
             }
 
             FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];
-            if (m_param->bEnableWavefront && row >= col && row && m_vbvResetTriggerRow != intRow)
+            if (m_param->bEnableWavefront && rowInSlice >= col && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
                 cuStat.baseQp = curEncData.m_cuStat[cuAddr - numCols + 1].baseQp;
-            else if (!m_param->bEnableWavefront && row && m_vbvResetTriggerRow != intRow)
+            else if (!m_param->bEnableWavefront && !bFirstRowInSlice && m_vbvResetTriggerRow != intRow)
                 cuStat.baseQp = curEncData.m_rowStat[row - 1].rowQp;
             else
                 cuStat.baseQp = curEncData.m_rowStat[row].rowQp;
@@ -1376,17 +1413,20 @@ void FrameEncoder::processRowEncoder(int
             /* TODO: use defines from slicetype.h for lowres block size */
             uint32_t block_y = (ctu->m_cuPelY >> m_param->maxLog2CUSize) * noOfBlocks;
             uint32_t block_x = (ctu->m_cuPelX >> m_param->maxLog2CUSize) * noOfBlocks;
-            
-            cuStat.vbvCost = 0;
-            cuStat.intraVbvCost = 0;
-            for (uint32_t h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
+            if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || !m_param->bDisableLookahead)
             {
-                uint32_t idx = block_x + (block_y * maxBlockCols);
+                cuStat.vbvCost = 0;
+                cuStat.intraVbvCost = 0;
 
-                for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++, idx++)
+                for (uint32_t h = 0; h < noOfBlocks && block_y < m_sliceMaxBlockRow[sliceId + 1]; h++, block_y++)
                 {
-                    cuStat.vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
-                    cuStat.intraVbvCost += m_frame->m_lowres.intraCost[idx];
+                    uint32_t idx = block_x + (block_y * maxBlockCols);
+
+                    for (uint32_t w = 0; w < noOfBlocks && (block_x + w) < maxBlockCols; w++, idx++)
+                    {
+                        cuStat.vbvCost += m_frame->m_lowres.lowresCostForRc[idx] & LOWRES_COST_MASK;
+                        cuStat.intraVbvCost += m_frame->m_lowres.intraCost[idx];
+                    }
                 }
             }
         }
@@ -1426,15 +1466,10 @@ void FrameEncoder::processRowEncoder(int
         {
             // NOTE: in VBV mode, we may reencode anytime, so we can't do Deblock stage-Horizon and SAO
             if (!bIsVbv)
-            {
-                // TODO: Multiple Threading
-                // Delay ONE row to avoid Intra Prediction Conflict
+            {                
+                // Delay one row to avoid intra prediction conflict
                 if (m_pool && !bFirstRowInSlice)
-                {
-                    // Waitting last threading finish
-                    m_frameFilter.m_parallelFilter[row - 1].waitForExit();
-
-                    // Processing new group
+                {                    
                     int allowCol = col;
 
                     // avoid race condition on last column
@@ -1444,15 +1479,11 @@ void FrameEncoder::processRowEncoder(int
                                                                   : m_frameFilter.m_parallelFilter[row - 2].m_lastCol.get()), (int)col);
                     }
                     m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(allowCol);
-                    m_frameFilter.m_parallelFilter[row - 1].tryBondPeers(*this, 1);
                 }
 
                 // Last Row may start early
                 if (m_pool && bLastRowInSlice)
                 {
-                    // Waiting for the last thread to finish
-                    m_frameFilter.m_parallelFilter[row].waitForExit();
-
                     // Deblocking last row
                     int allowCol = col;
 
@@ -1463,7 +1494,6 @@ void FrameEncoder::processRowEncoder(int
                                                                   : m_frameFilter.m_parallelFilter[row - 1].m_lastCol.get()), (int)col);
                     }
                     m_frameFilter.m_parallelFilter[row].m_allowedCol.set(allowCol);
-                    m_frameFilter.m_parallelFilter[row].tryBondPeers(*this, 1);
                 }
             } // end of !bIsVbv
         }
@@ -1479,7 +1509,7 @@ void FrameEncoder::processRowEncoder(int
         FrameStats frameLog;
         curEncData.m_rowStat[row].sumQpAq += collectCTUStatistics(*ctu, &frameLog);
 
-        // copy no. of intra, inter Cu cnt per row into frame stats for 2 pass
+        // copy number of intra, inter cu per row into frame stats for 2 pass
         if (m_param->rc.bStatWrite)
         {
             curRow.rowStats.mvBits    += best.mvBits;
@@ -1492,10 +1522,8 @@ void FrameEncoder::processRowEncoder(int
                 int shift = 2 * (m_param->maxCUDepth - depth);
                 int cuSize = m_param->maxCUSize >> depth;
 
-                if (cuSize == 8)
-                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN);
-                else
-                    curRow.rowStats.intra8x8Cnt += (int)(frameLog.cntIntra[depth] << shift);
+                curRow.rowStats.intra8x8Cnt += (cuSize == 8) ? (int)(frameLog.cntIntra[depth] + frameLog.cntIntraNxN) :
+                                                               (int)(frameLog.cntIntra[depth] << shift);
 
                 curRow.rowStats.inter8x8Cnt += (int)(frameLog.cntInter[depth] << shift);
                 curRow.rowStats.skip8x8Cnt += (int)((frameLog.cntSkipCu[depth] + frameLog.cntMergeCu[depth]) << shift);
@@ -1525,21 +1553,21 @@ void FrameEncoder::processRowEncoder(int
         if (bIsVbv)
         {   
             // Update encoded bits, satdCost, baseQP for each CU if tune grain is disabled
-            if ((m_param->bEnableWavefront && (!cuAddr || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
+            FrameData::RCStatCU& cuStat = curEncData.m_cuStat[cuAddr];    
+            if ((m_param->bEnableWavefront && ((cuAddr == m_sliceBaseRow[sliceId] * numCols) || !m_param->rc.bEnableConstVbv)) || !m_param->bEnableWavefront)
             {
-                curEncData.m_rowStat[row].rowSatd += curEncData.m_cuStat[cuAddr].vbvCost;
-                curEncData.m_rowStat[row].rowIntraSatd += curEncData.m_cuStat[cuAddr].intraVbvCost;
-                curEncData.m_rowStat[row].encodedBits += curEncData.m_cuStat[cuAddr].totalBits;
-                curEncData.m_rowStat[row].sumQpRc += curEncData.m_cuStat[cuAddr].baseQp;
+                curEncData.m_rowStat[row].rowSatd += cuStat.vbvCost;
+                curEncData.m_rowStat[row].rowIntraSatd += cuStat.intraVbvCost;
+                curEncData.m_rowStat[row].encodedBits += cuStat.totalBits;
+                curEncData.m_rowStat[row].sumQpRc += cuStat.baseQp;
                 curEncData.m_rowStat[row].numEncodedCUs = cuAddr;
             }
             
             // If current block is at row end checkpoint, call vbv ratecontrol.
-
             if (!m_param->bEnableWavefront && col == numCols - 1)
             {
                 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
-                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase);
+                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
                 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
                 curEncData.m_rowStat[row].rowQp = qpBase;
                 curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
@@ -1564,18 +1592,17 @@ void FrameEncoder::processRowEncoder(int
                     curEncData.m_rowStat[row].sumQpAq = 0;
                 }
             }
-
             // If current block is at row diagonal checkpoint, call vbv ratecontrol.
-
-            else if (m_param->bEnableWavefront && row == col && row)
+            else if (m_param->bEnableWavefront && rowInSlice == col && !bFirstRowInSlice)
             {
                 if (m_param->rc.bEnableConstVbv)
                 {
-                    int32_t startCuAddr = numCols * row;
-                    int32_t EndCuAddr = startCuAddr + col;
-                    for (int32_t r = row; r >= 0; r--)
+                    uint32_t startCuAddr = numCols * row;
+                    uint32_t EndCuAddr = startCuAddr + col;
+
+                    for (int32_t r = row; r >= (int32_t)m_sliceBaseRow[sliceId]; r--)
                     {
-                        for (int32_t c = startCuAddr; c <= EndCuAddr && c <= (int32_t)numCols * (r + 1) - 1; c++)
+                        for (uint32_t c = startCuAddr; c <= EndCuAddr && c <= numCols * (r + 1) - 1; c++)
                         {
                             curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
                             curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
@@ -1588,10 +1615,10 @@ void FrameEncoder::processRowEncoder(int
                     }
                 }
                 double qpBase = curEncData.m_cuStat[cuAddr].baseQp;
-                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase);
+                int reEncode = m_top->m_rateControl->rowVbvRateControl(m_frame, row, &m_rce, qpBase, m_sliceBaseRow, sliceId);
                 qpBase = x265_clip3((double)m_param->rc.qpMin, (double)m_param->rc.qpMax, qpBase);
                 curEncData.m_rowStat[row].rowQp = qpBase;
-                curEncData.m_rowStat[row].rowQpScale =  x265_qp2qScale(qpBase);
+                curEncData.m_rowStat[row].rowQpScale = x265_qp2qScale(qpBase);
 
                 if (reEncode < 0)
                 {
@@ -1602,7 +1629,7 @@ void FrameEncoder::processRowEncoder(int
                     m_vbvResetTriggerRow = row;
                     m_bAllRowsStop = true;
 
-                    for (uint32_t r = m_numRows - 1; r >= row; r--)
+                    for (uint32_t r = m_sliceBaseRow[sliceId + 1] - 1; r >= row; r--)
                     {
                         CTURow& stopRow = m_rows[r];
 
@@ -1665,7 +1692,7 @@ void FrameEncoder::processRowEncoder(int
                 m_rows[row + 1].completed + 2 <= curRow.completed)
             {
                 m_rows[row + 1].active = true;
-                enqueueRowEncoder(row + 1);
+                enqueueRowEncoder(m_row_to_idx[row + 1]);
                 tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */
             }
         }
@@ -1681,14 +1708,14 @@ void FrameEncoder::processRowEncoder(int
         }
     }
 
-    /** this row of CTUs has been compressed **/
+    /* this row of CTUs has been compressed */
     if (m_param->bEnableWavefront && m_param->rc.bEnableConstVbv)
     {
-        if (row == m_numRows - 1)
+        if (bLastRowInSlice)       
         {
-            for (int32_t r = 0; r < (int32_t)m_numRows; r++)
+            for (uint32_t r = m_sliceBaseRow[sliceId]; r < m_sliceBaseRow[sliceId + 1]; r++)
             {
-                for (int32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < (int32_t)numCols * (r + 1); c++)
+                for (uint32_t c = curEncData.m_rowStat[r].numEncodedCUs + 1; c < numCols * (r + 1); c++)
                 {
                     curEncData.m_rowStat[r].rowSatd += curEncData.m_cuStat[c].vbvCost;
                     curEncData.m_rowStat[r].rowIntraSatd += curEncData.m_cuStat[c].intraVbvCost;
@@ -1706,26 +1733,42 @@ void FrameEncoder::processRowEncoder(int
      * after half the frame is encoded, but after this initial period we update
      * after refLagRows (the number of rows reference frames must have completed
      * before referencees may begin encoding) */
-    uint32_t rowCount = 0;
     if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv)
     {
+        uint32_t rowCount = 0;
+        uint32_t maxRows = m_sliceBaseRow[sliceId + 1] - m_sliceBaseRow[sliceId];
+
         if (!m_rce.encodeOrder)
-            rowCount = m_numRows - 1;
+            rowCount = maxRows - 1; 
         else if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom))
-            rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1);
+            rowCount = X265_MIN((maxRows + 1) / 2, maxRows - 1);
         else
-            rowCount = X265_MIN(m_refLagRows, m_numRows - 1);
-        if (row == rowCount)
+			rowCount = X265_MIN(m_refLagRows / m_param->maxSlices, maxRows - 1);
+
+        if (rowInSlice == rowCount)
         {
-            m_rce.rowTotalBits = 0;
+            m_rowSliceTotalBits[sliceId] = 0;
             if (bIsVbv)
-                for (uint32_t i = 0; i < rowCount; i++)
-                    m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits;
+            {                
+                for (uint32_t i = m_sliceBaseRow[sliceId]; i < rowCount + m_sliceBaseRow[sliceId]; i++)
+                    m_rowSliceTotalBits[sliceId] += curEncData.m_rowStat[i].encodedBits;
+            }
             else
-                for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++)
-                    m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits;
+            {
+                uint32_t startAddr = m_sliceBaseRow[sliceId] * numCols;
+				uint32_t finishAddr = startAddr + rowCount * numCols;
+                
+				for (uint32_t cuAddr = startAddr; cuAddr < finishAddr; cuAddr++)
+                    m_rowSliceTotalBits[sliceId] += curEncData.m_cuStat[cuAddr].totalBits;
+            }            
 
-            m_top->m_rateControl->rateControlUpdateStats(&m_rce);
+            if (ATOMIC_INC(&m_sliceCnt) == (int)m_param->maxSlices)
+            {
+                m_rce.rowTotalBits = 0;
+                for (uint32_t i = 0; i < m_param->maxSlices; i++)
+                    m_rce.rowTotalBits += m_rowSliceTotalBits[i];
+                m_top->m_rateControl->rateControlUpdateStats(&m_rce);
+            }
         }
     }
 
@@ -1738,13 +1781,10 @@ void FrameEncoder::processRowEncoder(int
     /* Processing left Deblock block with current threading */
     if ((m_param->bEnableLoopFilter | m_param->bEnableSAO) & (rowInSlice >= 2))
     {
-        /* TODO: Multiple Threading */
-
         /* Check conditional to start previous row process with current threading */
         if (m_frameFilter.m_parallelFilter[row - 2].m_lastDeblocked.get() == (int)numCols)
         {
             /* stop threading on current row and restart it */
-            m_frameFilter.m_parallelFilter[row - 1].waitForExit();
             m_frameFilter.m_parallelFilter[row - 1].m_allowedCol.set(numCols);
             m_frameFilter.m_parallelFilter[row - 1].processTasks(-1);
         }
@@ -1755,11 +1795,11 @@ void FrameEncoder::processRowEncoder(int
     {
         if (rowInSlice >= m_filterRowDelay)
         {
-            enableRowFilter(row - m_filterRowDelay);
+            enableRowFilter(m_row_to_idx[row - m_filterRowDelay]);
 
             /* NOTE: Activate filter if first row (row 0) */
             if (rowInSlice == m_filterRowDelay)
-                enqueueRowFilter(row - m_filterRowDelay);
+                enqueueRowFilter(m_row_to_idx[row - m_filterRowDelay]);
             tryWakeOne();
         }
 
@@ -1767,7 +1807,7 @@ void FrameEncoder::processRowEncoder(int
         {
             for (uint32_t i = endRowInSlicePlus1 - m_filterRowDelay; i < endRowInSlicePlus1; i++)
             {
-                enableRowFilter(i);
+                enableRowFilter(m_row_to_idx[i]);
             }
             tryWakeOne();
         }
@@ -1775,7 +1815,7 @@ void FrameEncoder::processRowEncoder(int
         // handle specially case - single row slice
         if  (bFirstRowInSlice & bLastRowInSlice)
         {
-            enqueueRowFilter(row);
+            enqueueRowFilter(m_row_to_idx[row]);
             tryWakeOne();
         }
     }
--- a/source/encoder/frameencoder.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/frameencoder.h	Tue Nov 21 09:50:45 2017 +0530
@@ -138,6 +138,7 @@ public:
     volatile bool            m_bAllRowsStop;
     volatile int             m_completionCount;
     volatile int             m_vbvResetTriggerRow;
+    volatile int             m_sliceCnt;
 
     uint32_t                 m_numRows;
     uint32_t                 m_numCols;
@@ -147,8 +148,10 @@ public:
 
     CTURow*                  m_rows;
     uint16_t                 m_sliceAddrBits;
-    uint16_t                 m_sliceGroupSize;
-    uint32_t*                m_sliceBaseRow;
+    uint32_t                 m_sliceGroupSize;
+    uint32_t*                m_sliceBaseRow;    
+    uint32_t*                m_sliceMaxBlockRow;
+    int64_t                  m_rowSliceTotalBits[2]; /* bit totals indexed by sliceId; NOTE(review): fixed at 2 entries but processRowEncoder indexes up to m_param->maxSlices - 1 -- confirm maxSlices <= 2 or allocate per slice */
     RateControlEntry         m_rce;
     SEIDecodedPictureHash    m_seiReconPictureDigest;
 
--- a/source/encoder/framefilter.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/framefilter.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -582,10 +582,7 @@ void FrameFilter::processRow(int row)
     CUData* ctu = encData.getPicCTU(m_parallelFilter[row].m_rowAddr);
 
     /* Processing left block Deblock with current threading */
-    {
-        /* stop threading on current row */
-        m_parallelFilter[row].waitForExit();
-
+    {        
         /* Check to avoid previous row process slower than current row */
         X265_CHECK(ctu->m_bFirstRowInSlice || m_parallelFilter[row - 1].m_lastDeblocked.get() == m_numCols, "previous row not finish");
 
@@ -618,7 +615,6 @@ void FrameFilter::processRow(int row)
     }
 
     // this row of CTUs has been encoded
-
     if (!ctu->m_bFirstRowInSlice)
         processPostRow(row - 1);
 
--- a/source/encoder/framefilter.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/framefilter.h	Tue Nov 21 09:50:45 2017 +0530
@@ -62,7 +62,7 @@ public:
     void*         m_ssimBuf;        /* Temp storage for ssim computation */
 
 #define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
-    class ParallelFilter : public BondedTaskGroup, public Deblock
+    class ParallelFilter : public Deblock
     {
     public:
         uint32_t            m_rowHeight;
@@ -104,10 +104,6 @@ public:
         {
             return m_rowHeight;
         }
-
-    protected:
-
-        ParallelFilter operator=(const ParallelFilter&);
     };
 
     ParallelFilter*     m_parallelFilter;
--- a/source/encoder/ratecontrol.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/ratecontrol.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -218,6 +218,7 @@ RateControl::RateControl(x265_param& p)
     m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize);
     m_param->rc.vbvMaxBitrate = x265_clip3(0, 2000000, m_param->rc.vbvMaxBitrate);
     m_param->rc.vbvBufferInit = x265_clip3(0.0, 2000000.0, m_param->rc.vbvBufferInit);
+    m_param->vbvBufferEnd = x265_clip3(0.0, 2000000.0, m_param->vbvBufferEnd);
     m_singleFrameVbv = 0;
     m_rateTolerance = 1.0;
 
@@ -255,6 +256,11 @@ RateControl::RateControl(x265_param& p)
         m_param->rc.vbvMaxBitrate = 0;
     }
     m_isVbv = m_param->rc.vbvMaxBitrate > 0 && m_param->rc.vbvBufferSize > 0;
+    if (m_param->vbvBufferEnd && !m_isVbv)
+    {
+        x265_log(m_param, X265_LOG_WARNING, "vbv-end requires VBV parameters, ignored\n");
+        m_param->vbvBufferEnd = 0;
+    }
     if (m_param->bEmitHRDSEI && !m_isVbv)
     {
         x265_log(m_param, X265_LOG_WARNING, "NAL HRD parameters require VBV parameters, ignored\n");
@@ -339,6 +345,10 @@ bool RateControl::init(const SPS& sps)
 
         if (m_param->rc.vbvBufferInit > 1.)
             m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, m_param->rc.vbvBufferInit / m_param->rc.vbvBufferSize);
+        if (m_param->vbvBufferEnd > 1.)
+            m_param->vbvBufferEnd = x265_clip3(0.0, 1.0, m_param->vbvBufferEnd / m_param->rc.vbvBufferSize);
+        if (m_param->vbvEndFrameAdjust > 1.)
+            m_param->vbvEndFrameAdjust = x265_clip3(0.0, 1.0, m_param->vbvEndFrameAdjust);
         m_param->rc.vbvBufferInit = x265_clip3(0.0, 1.0, X265_MAX(m_param->rc.vbvBufferInit, m_bufferRate / m_bufferSize));
         m_bufferFillFinal = m_bufferSize * m_param->rc.vbvBufferInit;
         m_bufferFillActual = m_bufferFillFinal;
@@ -732,7 +742,6 @@ void RateControl::reconfigureRC()
     m_bitrate = m_param->rc.bitrate * 1000;
 }
 
-
 void RateControl::initHRD(SPS& sps)
 {
     int vbvBufferSize = m_param->rc.vbvBufferSize * 1000;
@@ -765,6 +774,7 @@ void RateControl::initHRD(SPS& sps)
 
     #undef MAX_DURATION
 }
+
 bool RateControl::analyseABR2Pass(uint64_t allAvailableBits)
 {
     double rateFactor, stepMult;
@@ -1473,6 +1483,7 @@ double RateControl::getDiffLimitedQScale
 
     return q;
 }
+
 double RateControl::countExpectedBits(int startPos, int endPos)
 {
     double expectedBits = 0;
@@ -1484,6 +1495,7 @@ double RateControl::countExpectedBits(in
     }
     return expectedBits;
 }
+
 bool RateControl::findUnderflow(double *fills, int *t0, int *t1, int over, int endPos)
 {
     /* find an interval ending on an overflow or underflow (depending on whether
@@ -1531,6 +1543,7 @@ bool RateControl::fixUnderflow(int t0, i
     }
     return adjusted;
 }
+
 bool RateControl::cuTreeReadFor2Pass(Frame* frame)
 {
     int index = m_encOrder[frame->m_poc];
@@ -1579,24 +1592,24 @@ fail:
 double RateControl::tuneAbrQScaleFromFeedback(double qScale)
 {
     double abrBuffer = 2 * m_rateTolerance * m_bitrate;
-        /* use framesDone instead of POC as poc count is not serial with bframes enabled */
-        double overflow = 1.0;
-        double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
-        double wantedBits = timeDone * m_bitrate;
-        int64_t encodedBits = m_totalBits;
-        if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
-        {
-            abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
-            encodedBits = m_encodedBits;
-        }
+    /* use framesDone instead of POC as poc count is not serial with bframes enabled */
+    double overflow = 1.0;
+    double timeDone = (double)(m_framesDone - m_param->frameNumThreads + 1) * m_frameDuration;
+    double wantedBits = timeDone * m_bitrate;
+    int64_t encodedBits = m_totalBits;
+    if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps)
+    {
+        abrBuffer = m_param->totalFrames * (m_bitrate / m_fps);
+        encodedBits = m_encodedBits;
+    }
 
-        if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames || 
-            m_param->rc.bStrictCbr || m_isGrainEnabled))
-        {
-            abrBuffer *= X265_MAX(1, sqrt(timeDone));
-            overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) / abrBuffer);
-            qScale *= overflow;
-        }
+    if (wantedBits > 0 && encodedBits > 0 && (!m_partialResidualFrames || 
+        m_param->rc.bStrictCbr || m_isGrainEnabled))
+    {
+        abrBuffer *= X265_MAX(1, sqrt(timeDone));
+        overflow = x265_clip3(.5, 2.0, 1.0 + (encodedBits - wantedBits) / abrBuffer);
+        qScale *= overflow;
+    }
     return qScale;
 }
 
@@ -2157,29 +2170,51 @@ double RateControl::clipQscale(Frame* cu
                     curBits = predictSize(&m_pred[predType], frameQ[type], (double)satd);
                     bufferFillCur -= curBits;
                 }
-
-                /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
-                double finalDur = 1;
-                if (m_param->rc.bStrictCbr)
+                if (m_param->vbvBufferEnd && rce->encodeOrder >= m_param->vbvEndFrameAdjust * m_param->totalFrames)
                 {
-                    finalDur = x265_clip3(0.4, 1.0, totalDuration);
+                    bool loopBreak = false;
+                    double bufferDiff = m_param->vbvBufferEnd - (m_bufferFill / m_bufferSize);
+                    targetFill = m_bufferFill + m_bufferSize * (bufferDiff / (m_param->totalFrames - rce->encodeOrder));
+                    if (bufferFillCur < targetFill)
+                    {
+                        q *= 1.01;
+                        loopTerminate |= 1;
+                        loopBreak = true;
+                    }
+                    if (bufferFillCur > m_param->vbvBufferEnd * m_bufferSize)
+                    {
+                        q /= 1.01;
+                        loopTerminate |= 2;
+                        loopBreak = true;
+                    }
+                    if (!loopBreak)
+                        break;
                 }
-                targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5 , m_bufferSize * (1 - 0.5 * finalDur));
-                if (bufferFillCur < targetFill)
+                else
                 {
-                    q *= 1.01;
-                    loopTerminate |= 1;
-                    continue;
+                    /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */
+                    double finalDur = 1;
+                    if (m_param->rc.bStrictCbr)
+                    {
+                        finalDur = x265_clip3(0.4, 1.0, totalDuration);
+                    }
+                    targetFill = X265_MIN(m_bufferFill + totalDuration * m_vbvMaxRate * 0.5, m_bufferSize * (1 - 0.5 * finalDur));
+                    if (bufferFillCur < targetFill)
+                    {
+                        q *= 1.01;
+                        loopTerminate |= 1;
+                        continue;
+                    }
+                    /* Try to get the buffer not more than 80% filled, but don't set an impossible goal. */
+                    targetFill = x265_clip3(m_bufferSize * (1 - 0.2 * finalDur), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
+                    if (m_isCbr && bufferFillCur > targetFill && !m_isSceneTransition)
+                    {
+                        q /= 1.01;
+                        loopTerminate |= 2;
+                        continue;
+                    }
+                    break;
                 }
-                /* Try to get the buffer not more than 80% filled, but don't set an impossible goal. */
-                targetFill = x265_clip3(m_bufferSize * (1 - 0.2 * finalDur), m_bufferSize, m_bufferFill - totalDuration * m_vbvMaxRate * 0.5);
-                if (m_isCbr && bufferFillCur > targetFill && !m_isSceneTransition)
-                {
-                    q /= 1.01;
-                    loopTerminate |= 2;
-                    continue;
-                }
-                break;
             }
             q = X265_MAX(q0 / 2, q);
         }
@@ -2330,17 +2365,18 @@ double RateControl::predictRowsSizeSum(F
     return totalSatdBits + encodedBitsSoFar;
 }
 
-int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv)
+int RateControl::rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv, uint32_t* m_sliceBaseRow, uint32_t sliceId)
 {
     FrameData& curEncData = *curFrame->m_encData;
     double qScaleVbv = x265_qp2qScale(qpVbv);
     uint64_t rowSatdCost = curEncData.m_rowStat[row].rowSatd;
     double encodedBits = curEncData.m_rowStat[row].encodedBits;
+    uint32_t rowInSlice = row - m_sliceBaseRow[sliceId];
 
-    if (m_param->bEnableWavefront && row == 1)
+    if (m_param->bEnableWavefront && rowInSlice == 1)
     {
-        rowSatdCost += curEncData.m_rowStat[0].rowSatd;
-        encodedBits += curEncData.m_rowStat[0].encodedBits;
+        rowSatdCost += curEncData.m_rowStat[row - 1].rowSatd;
+        encodedBits += curEncData.m_rowStat[row - 1].encodedBits;
     }
     rowSatdCost >>= X265_DEPTH - 8;
     updatePredictor(rce->rowPred[0], qScaleVbv, (double)rowSatdCost, encodedBits);
@@ -2350,8 +2386,8 @@ int RateControl::rowVbvRateControl(Frame
         if (qpVbv < refFrame->m_encData->m_rowStat[row].rowQp)
         {
             uint64_t intraRowSatdCost = curEncData.m_rowStat[row].rowIntraSatd;
-            if (m_param->bEnableWavefront && row == 1)
-                intraRowSatdCost += curEncData.m_rowStat[0].rowIntraSatd;
+            if (m_param->bEnableWavefront && rowInSlice == 1)
+                intraRowSatdCost += curEncData.m_rowStat[row - 1].rowIntraSatd;
             intraRowSatdCost >>= X265_DEPTH - 8;
             updatePredictor(rce->rowPred[1], qScaleVbv, (double)intraRowSatdCost, encodedBits);
         }
@@ -2376,7 +2412,7 @@ int RateControl::rowVbvRateControl(Frame
     const SPS& sps = *curEncData.m_slice->m_sps;
     double maxFrameError = X265_MAX(0.05, 1.0 / sps.numCuInHeight);
 
-    if (row < sps.numCuInHeight - 1)
+    if (row < m_sliceBaseRow[sliceId + 1] - 1)
     {
         /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */
         double rcTol = bufferLeftPlanned / m_param->frameNumThreads * m_rateTolerance;
@@ -2693,8 +2729,8 @@ int RateControl::rateControlEnd(Frame* c
             m_encodedBitsWindow[pos % s_slidingWindowFrames] = actualBits;
         if(rce->sliceType != I_SLICE)
         {
-        int qp = int (rce->qpaRc + 0.5);
-        m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ? actualBits : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
+            int qp = int (rce->qpaRc + 0.5);
+            m_qpToEncodedBits[qp] =  m_qpToEncodedBits[qp] == 0 ? actualBits : (m_qpToEncodedBits[qp] + actualBits) * 0.5;
         }
         curFrame->m_rcData->wantedBitsWindow = m_wantedBitsWindow;
         curFrame->m_rcData->cplxrSum = m_cplxrSum;
@@ -2779,7 +2815,8 @@ int RateControl::writeRateControlFrameSt
             curFrame->m_encData->m_frameStats.percent8x8Skip  * m_ncu) < 0)
             goto writeFailure;
     }
-    else{
+    else
+    {
         RPS* rpsWriter = &curFrame->m_encData->m_slice->m_rps;
         int i, num = rpsWriter->numberOfPictures;
         char deltaPOC[128];
--- a/source/encoder/ratecontrol.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/ratecontrol.h	Tue Nov 21 09:50:45 2017 +0530
@@ -244,7 +244,7 @@ public:
     int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
     void rateControlUpdateStats(RateControlEntry* rce);
     int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, int *filler);
-    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
+    int  rowVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv, uint32_t* m_sliceBaseRow, uint32_t sliceId);
     int  rateControlSliceType(int frameNum);
     bool cuTreeReadFor2Pass(Frame* curFrame);
     void hrdFullness(SEIBufferingPeriod* sei);
--- a/source/encoder/search.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/search.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -2162,7 +2162,7 @@ void Search::predInterSearch(Mode& inter
 
         /* Uni-directional prediction */
         if ((m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->analysisReuseLevel > 1 && m_param->analysisReuseLevel != 10)
-            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead))
+            || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bMVType == AVC_INFO))
         {
             for (int list = 0; list < numPredDir; list++)
             {
--- a/source/encoder/slicetype.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/slicetype.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -588,6 +588,7 @@ Lookahead::Lookahead(x265_param *param, 
     m_filled   = false;
     m_outputSignalRequired = false;
     m_isActive = true;
+    m_inputCount = 0;
 
     m_8x8Height = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
     m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
@@ -741,23 +742,21 @@ void Lookahead::destroy()
 /* Called by API thread */
 void Lookahead::addPicture(Frame& curFrame, int sliceType)
 {
-    curFrame.m_lowres.sliceType = sliceType;
-
-    /* determine if the lookahead is (over) filled enough for frames to begin to
-     * be consumed by frame encoders */
-    if (!m_filled)
+    if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead) /* analysis-load with lookahead disabled */
     {
-        if (!m_param->bframes & !m_param->lookaheadDepth)
-            m_filled = true; /* zero-latency */
-        else if (curFrame.m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes)
-            m_filled = true; /* full capacity plus mini-gop lag */
+        if (!m_filled)
+            m_filled = true; /* mark the lookahead as filled immediately */
+        m_outputLock.acquire();
+        m_outputQueue.pushBack(curFrame); /* frame goes straight to the output queue; sliceType is not used on this path */
+        m_outputLock.release();
+        m_inputCount++; /* incremented per queued frame; getDecidedPicture() decrements it */
     }
-
-    m_inputLock.acquire();
-    m_inputQueue.pushBack(curFrame);
-    if (m_pool && m_inputQueue.size() >= m_fullQueueSize)
-        tryWakeOne();
-    m_inputLock.release();
+    else
+    {
+        checkLookaheadQueue(m_inputCount); /* NOTE(review): takes m_inputCount by reference; its body is not visible here */
+        curFrame.m_lowres.sliceType = sliceType;
+        addPicture(curFrame); /* one-argument overload: pushes onto m_inputQueue under m_inputLock */
+    }
 }
 
 void Lookahead::addPicture(Frame& curFrame)
@@ -765,6 +764,7 @@ void Lookahead::addPicture(Frame& curFra
     m_inputLock.acquire();
     m_inputQueue.pushBack(curFrame);
     m_inputLock.release();
+    m_inputCount++;
 }
 
 void Lookahead::checkLookaheadQueue(int &frameCnt)
@@ -793,6 +793,12 @@ void Lookahead::flush()
     m_filled = true;
 }
 
+void Lookahead::setLookaheadQueue() /* clears m_filled and recomputes m_fullQueueSize from lookaheadDepth */
+{
+    m_filled = false; /* flush() sets this to true; here it is cleared again */
+    m_fullQueueSize = X265_MAX(1, m_param->lookaheadDepth); /* at least 1, even when lookaheadDepth is 0 */
+}
+
 void Lookahead::findJob(int /*workerThreadID*/)
 {
     bool doDecide;
@@ -832,7 +838,13 @@ Frame* Lookahead::getDecidedPicture()
         m_outputLock.release();
 
         if (out)
+        {
+            m_inputCount--;
             return out;
+        }
+
+        if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->bDisableLookahead)
+            return NULL;
 
         findJob(-1); /* run slicetypeDecide() if necessary */
 
@@ -843,7 +855,10 @@ Frame* Lookahead::getDecidedPicture()
         if (wait)
             m_outputSignal.wait();
 
-        return m_outputQueue.popFront();
+        out = m_outputQueue.popFront();
+        if (out)
+            m_inputCount--;
+        return out;
     }
     else
         return NULL;
@@ -887,68 +902,68 @@ void Lookahead::getEstimatedPictureCost(
     default:
         return;
     }
+    if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || !m_param->bDisableLookahead)
+    {
+        X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
+        if (m_param->rc.cuTree && !m_param->rc.bStatRead)
+            /* update row satds based on cutree offsets */
+            curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
+        else if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || m_param->scaleFactor)
+        {
+            if (m_param->rc.aqMode)
+                curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
+            else
+                curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
+        }
+        if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
+        {
+            /* aggregate lowres row satds to CTU resolution */
+            curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
+            uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
+            uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
+            uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
+            uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
+            double *qp_offset = 0;
+            /* Factor in qpoffsets based on Aq/Cutree in CU costs */
+            if (m_param->rc.aqMode || m_param->bAQMotion)
+                qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
 
-    X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n")
-
-    if (m_param->rc.cuTree && !m_param->rc.bStatRead)
-        /* update row satds based on cutree offsets */
-        curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b);
-    else if (m_param->analysisReuseMode != X265_ANALYSIS_LOAD || m_param->scaleFactor)
-    {
-        if (m_param->rc.aqMode)
-            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEstAq[b - p0][p1 - b];
-        else
-            curFrame->m_lowres.satdCost = curFrame->m_lowres.costEst[b - p0][p1 - b];
-    }
-
-    if (m_param->rc.vbvBufferSize && m_param->rc.vbvMaxBitrate)
-    {
-        /* aggregate lowres row satds to CTU resolution */
-        curFrame->m_lowres.lowresCostForRc = curFrame->m_lowres.lowresCosts[b - p0][p1 - b];
-        uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0, intraSum = 0;
-        uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE);
-        uint32_t numCuInHeight = (m_param->sourceHeight + m_param->maxCUSize - 1) / m_param->maxCUSize;
-        uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height;
-        double *qp_offset = 0;
-        /* Factor in qpoffsets based on Aq/Cutree in CU costs */
-        if (m_param->rc.aqMode || m_param->bAQMotion)
-            qp_offset = (frames[b]->sliceType == X265_TYPE_B || !m_param->rc.cuTree) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset;
-
-        for (uint32_t row = 0; row < numCuInHeight; row++)
-        {
-            lowresRow = row * scale;
-            for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++)
+            for (uint32_t row = 0; row < numCuInHeight; row++)
             {
-                sum = 0; intraSum = 0;
-                int diff = 0;
-                lowresCuIdx = lowresRow * widthInLowresCu;
-                for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++)
+                lowresRow = row * scale;
+                for (uint32_t cnt = 0; cnt < scale && lowresRow < heightInLowresCu; lowresRow++, cnt++)
                 {
-                    uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
-                    if (qp_offset)
+                    sum = 0; intraSum = 0;
+                    int diff = 0;
+                    lowresCuIdx = lowresRow * widthInLowresCu;
+                    for (lowresCol = 0; lowresCol < widthInLowresCu; lowresCol++, lowresCuIdx++)
                     {
-                        double qpOffset;
-                        if (m_param->rc.qgSize == 8)
-                            qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
-                                        qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
-                                        qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
-                                        qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
-                        else
-                            qpOffset = qp_offset[lowresCuIdx];
-                        lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qpOffset) + 128) >> 8);
-                        int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
-                        curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
+                        uint16_t lowresCuCost = curFrame->m_lowres.lowresCostForRc[lowresCuIdx] & LOWRES_COST_MASK;
+                        if (qp_offset)
+                        {
+                            double qpOffset;
+                            if (m_param->rc.qgSize == 8)
+                                qpOffset = (qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4] +
+                                qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + 1] +
+                                qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+                                qp_offset[lowresCol * 2 + lowresRow * widthInLowresCu * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+                            else
+                                qpOffset = qp_offset[lowresCuIdx];
+                            lowresCuCost = (uint16_t)((lowresCuCost * x265_exp2fix8(qpOffset) + 128) >> 8);
+                            int32_t intraCuCost = curFrame->m_lowres.intraCost[lowresCuIdx];
+                            curFrame->m_lowres.intraCost[lowresCuIdx] = (intraCuCost * x265_exp2fix8(qpOffset) + 128) >> 8;
+                        }
+                        if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
+                            for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
+                                diff += curFrame->m_lowres.intraCost[lowresCuIdx] - lowresCuCost;
+                        curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
+                        sum += lowresCuCost;
+                        intraSum += curFrame->m_lowres.intraCost[lowresCuIdx];
                     }
-                    if (m_param->bIntraRefresh && slice->m_sliceType == X265_TYPE_P)
-                        for (uint32_t x = curFrame->m_encData->m_pir.pirStartCol; x <= curFrame->m_encData->m_pir.pirEndCol; x++)
-                            diff += curFrame->m_lowres.intraCost[lowresCuIdx] - lowresCuCost;
-                    curFrame->m_lowres.lowresCostForRc[lowresCuIdx] = lowresCuCost;
-                    sum += lowresCuCost;
-                    intraSum += curFrame->m_lowres.intraCost[lowresCuIdx];
+                    curFrame->m_encData->m_rowStat[row].satdForVbv += sum;
+                    curFrame->m_encData->m_rowStat[row].satdForVbv += diff;
+                    curFrame->m_encData->m_rowStat[row].intraSatdForVbv += intraSum;
                 }
-                curFrame->m_encData->m_rowStat[row].satdForVbv += sum;
-                curFrame->m_encData->m_rowStat[row].satdForVbv += diff;
-                curFrame->m_encData->m_rowStat[row].intraSatdForVbv += intraSum;
             }
         }
     }
@@ -1036,6 +1051,18 @@ void Lookahead::slicetypeDecide()
          (m_param->lookaheadDepth && m_param->rc.vbvBufferSize)))
     {
         slicetypeAnalyse(frames, false);
+        bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
+        if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->scaleFactor && bIsVbv)
+        {
+            int numFrames;
+            for (numFrames = 0; numFrames < maxSearch; numFrames++)
+            {
+                Lowres *fenc = frames[numFrames + 1];
+                if (!fenc)
+                    break;
+            }
+            vbvLookahead(frames, numFrames, true);
+        }
     }
 
     int bframes, brefs;
@@ -1219,6 +1246,18 @@ void Lookahead::slicetypeDecide()
 
         frames[j + 1] = NULL;
         slicetypeAnalyse(frames, true);
+        bool bIsVbv = m_param->rc.vbvBufferSize > 0 && m_param->rc.vbvMaxBitrate > 0;
+        if (m_param->analysisReuseMode == X265_ANALYSIS_LOAD && m_param->scaleFactor && bIsVbv)
+        {
+            int numFrames;
+            for (numFrames = 0; numFrames < maxSearch; numFrames++)
+            {
+                Lowres *fenc = frames[numFrames + 1];
+                if (!fenc)
+                    break;
+            }
+            vbvLookahead(frames, numFrames, true);
+        }
     }
     m_outputLock.release();
 }
--- a/source/encoder/slicetype.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/encoder/slicetype.h	Tue Nov 21 09:50:45 2017 +0530
@@ -120,6 +120,7 @@ public:
     int           m_cuCount;
     int           m_numCoopSlices;
     int           m_numRowsPerSlice;
+    int           m_inputCount;
     double        m_cuTreeStrength;
 
     bool          m_isActive;
@@ -151,7 +152,7 @@ public:
     Frame*  getDecidedPicture();
 
     void    getEstimatedPictureCost(Frame *pic);
-
+    void    setLookaheadQueue();
 
 protected:
 
--- a/source/input/y4m.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/input/y4m.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -307,23 +307,26 @@ bool Y4MInput::parseHeader()
                         break;
                 }
 
-                switch (csp)
+                if (csp / 100 == ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'))
                 {
-                case ('m'-'0')*100000 + ('o'-'0')*10000 + ('n'-'0')*1000 + ('o'-'0')*100 + 16:
                     colorSpace = X265_CSP_I400;
-                    depth = 16;
-                    break;
+                    d = csp % 100;
+                }
+                else if (csp / 10 == ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'))
+                {
+                    colorSpace = X265_CSP_I400;
+                    d = csp % 10;
+                }
+                else if (csp == ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'))
+                {
+                    colorSpace = X265_CSP_I400;
+                    d = 8;
+                }
+                else
+                    colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
 
-                case ('m'-'0')*1000 + ('o'-'0')*100 + ('n'-'0')*10 + ('o'-'0'):
-                    colorSpace = X265_CSP_I400;
-                    depth = 8;
-                    break;
-                   
-                default:
-                    if (d >= 8 && d <= 16)
-                        depth = d;
-                    colorSpace = (csp == 444) ? X265_CSP_I444 : (csp == 422) ? X265_CSP_I422 : X265_CSP_I420;
-                }
+                if (d >= 8 && d <= 16)
+                    depth = d;
                 break;
 
             default:
--- a/source/test/rate-control-tests.txt	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/test/rate-control-tests.txt	Tue Nov 21 09:50:45 2017 +0530
@@ -25,7 +25,7 @@ BasketballDrive_1920x1080_50.y4m,--prese
 big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --no-wpp --aud --hrd --tune fast-decode
 sita_1920x1080_30.yuv,--preset superfast --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --aud --strict-cbr --no-wpp
 sintel_trailer_2k_480p24.y4m, --preset slow --crf 24 --vbv-bufsize 150 --vbv-maxrate 150 --dynamic-rd 1.53
-
+BasketballDrive_1920x1080_50.y4m,--preset medium --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 11500 --vbv-end 0.9 --vbv-end-fr-adj 0.7
 
 
 # multi-pass rate control tests
--- a/source/test/regression-tests.txt	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/test/regression-tests.txt	Tue Nov 21 09:50:45 2017 +0530
@@ -13,6 +13,7 @@
 
 BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709
 BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp --limit-modes
+BasketballDrive_1920x1080_50.y4m,--preset superfast --tune zerolatency --bitrate 9000 --vbv-maxrate 9000 --vbv-bufsize 9000 -F 1 --slices 2
 BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp
 BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 --slices 3
 BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 --qg-size 16 --cu-lossless --tu-inter-depth 3 --limit-tu 1
@@ -109,6 +110,7 @@ ducks_take_off_420_720p50.y4m,--preset u
 ducks_take_off_444_720p50.y4m,--preset superfast --weightp --limit-refs 2
 ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 --limit-refs 2
 ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40
+ducks_take_off_420_720p50.y4m,--preset fast --tune zerolatency --crf 21 --vbv-maxrate 6000 --vbv-bufsize 6000 -F 1 --slices 2
 ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra
 ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut
 ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40
@@ -159,4 +161,8 @@ CrowdRun_1920x1080_50_10bit_422.yuv,--pr
 #SEA Implementation Test
 silent_cif_420.y4m,--preset veryslow --me sea
 big_buck_bunny_360p24.y4m,--preset superfast --me sea
+
+#low-pass dct test
+720p50_parkrun_ter.y4m,--preset medium --lowpass-dct
+
 # vim: tw=200
--- a/source/test/testharness.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/test/testharness.h	Tue Nov 21 09:50:45 2017 +0530
@@ -68,6 +68,10 @@ protected:
 #include <intrin.h>
 #elif HAVE_RDTSC
 #include <intrin.h>
+#elif (!defined(__APPLE__) && (defined (__GNUC__) && (defined(__x86_64__) || defined(__i386__))))
+#include <x86intrin.h>
+#elif ( !defined(__APPLE__) && defined (__GNUC__) && defined(__ARM_NEON__))
+#include <arm_neon.h>
 #elif defined(__GNUC__)
 /* fallback for older GCC/MinGW */
 static inline uint32_t __rdtsc(void)
--- a/source/x265-extras.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,447 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013-2017 MulticoreWare, Inc
- *
- * Authors: Steve Borho <steve@borho.org>
- *          Selvakumar Nithiyaruban <selvakumar@multicorewareinc.com>
- *          Divya Manivannan <divya@multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "x265.h"
-#include "x265-extras.h"
-#include "param.h"
-#include "common.h"
-
-using namespace X265_NS;
-
-static const char* summaryCSVHeader =
-    "Command, Date/Time, Elapsed Time, FPS, Bitrate, "
-    "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), "
-    "I count, I ave-QP, I kbps, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), "
-    "P count, P ave-QP, P kbps, P-PSNR Y, P-PSNR U, P-PSNR V, P-SSIM (dB), "
-    "B count, B ave-QP, B kbps, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), "
-    "MaxCLL, MaxFALL, Version\n";
-
-FILE* x265_csvlog_open(const x265_param& param, const char* fname, int level)
-{
-    FILE *csvfp = x265_fopen(fname, "r");
-    if (csvfp)
-    {
-        /* file already exists, re-open for append */
-        fclose(csvfp);
-        return x265_fopen(fname, "ab");
-    }
-    else
-    {
-        /* new CSV file, write header */
-        csvfp = x265_fopen(fname, "wb");
-        if (csvfp)
-        {
-            if (level)
-            {
-                fprintf(csvfp, "Encode Order, Type, POC, QP, Bits, Scenecut, ");
-                if (level >= 2)
-                    fprintf(csvfp, "I/P cost ratio, ");
-                if (param.rc.rateControlMode == X265_RC_CRF)
-                    fprintf(csvfp, "RateFactor, ");
-                if (param.rc.vbvBufferSize)
-                    fprintf(csvfp, "BufferFill, ");
-                if (param.bEnablePsnr)
-                    fprintf(csvfp, "Y PSNR, U PSNR, V PSNR, YUV PSNR, ");
-                if (param.bEnableSsim)
-                    fprintf(csvfp, "SSIM, SSIM(dB), ");
-                fprintf(csvfp, "Latency, ");
-                fprintf(csvfp, "List 0, List 1");
-                uint32_t size = param.maxCUSize;
-                for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-                {
-                    fprintf(csvfp, ", Intra %dx%d DC, Intra %dx%d Planar, Intra %dx%d Ang", size, size, size, size, size, size);
-                    size /= 2;
-                }
-                fprintf(csvfp, ", 4x4");
-                size = param.maxCUSize;
-                if (param.bEnableRectInter)
-                {
-                    for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-                    {
-                        fprintf(csvfp, ", Inter %dx%d, Inter %dx%d (Rect)", size, size, size, size);
-                        if (param.bEnableAMP)
-                            fprintf(csvfp, ", Inter %dx%d (Amp)", size, size);
-                        size /= 2;
-                    }
-                }
-                else
-                {
-                    for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-                    {
-                        fprintf(csvfp, ", Inter %dx%d", size, size);
-                        size /= 2;
-                    }
-                }
-                size = param.maxCUSize;
-                for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-                {
-                    fprintf(csvfp, ", Skip %dx%d", size, size);
-                    size /= 2;
-                }
-                size = param.maxCUSize;
-                for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-                {
-                    fprintf(csvfp, ", Merge %dx%d", size, size);
-                    size /= 2;
-                }
-
-                if (level >= 2)
-                {
-                    fprintf(csvfp, ", Avg Luma Distortion, Avg Chroma Distortion, Avg psyEnergy, Avg Residual Energy,"
-                        " Min Luma Level, Max Luma Level, Avg Luma Level");
-
-                    if (param.internalCsp != X265_CSP_I400)
-                        fprintf(csvfp, ", Min Cb Level, Max Cb Level, Avg Cb Level, Min Cr Level, Max Cr Level, Avg Cr Level");
-
-                    /* PU statistics */
-                    size = param.maxCUSize;
-                    for (uint32_t i = 0; i< param.maxLog2CUSize - (uint32_t)g_log2Size[param.minCUSize] + 1; i++)
-                    {
-                        fprintf(csvfp, ", Intra %dx%d", size, size);
-                        fprintf(csvfp, ", Skip %dx%d", size, size);
-                        fprintf(csvfp, ", AMP %d", size);
-                        fprintf(csvfp, ", Inter %dx%d", size, size);
-                        fprintf(csvfp, ", Merge %dx%d", size, size);
-                        fprintf(csvfp, ", Inter %dx%d", size, size / 2);
-                        fprintf(csvfp, ", Merge %dx%d", size, size / 2);
-                        fprintf(csvfp, ", Inter %dx%d", size / 2, size);
-                        fprintf(csvfp, ", Merge %dx%d", size / 2, size);
-                        size /= 2;
-                    }
-
-                    if ((uint32_t)g_log2Size[param.minCUSize] == 3)
-                        fprintf(csvfp, ", 4x4");
-
-                    /* detailed performance statistics */
-                    fprintf(csvfp, ", DecideWait (ms), Row0Wait (ms), Wall time (ms), Ref Wait Wall (ms), Total CTU time (ms),"
-                    "Stall Time (ms), Total frame time (ms), Avg WPP, Row Blocks");
-                }
-                fprintf(csvfp, "\n");
-            }
-            else
-                fputs(summaryCSVHeader, csvfp);
-        }
-        return csvfp;
-    }
-}
-
-// per frame CSV logging
-void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& pic, int level)
-{
-    if (!csvfp)
-        return;
-
-    const x265_frame_stats* frameStats = &pic.frameData;
-    fprintf(csvfp, "%d, %c-SLICE, %4d, %2.2lf, %10d, %d,", frameStats->encoderOrder, frameStats->sliceType, frameStats->poc, 
-                                                           frameStats->qp, (int)frameStats->bits, frameStats->bScenecut);
-    if (level >= 2)
-        fprintf(csvfp, "%.2f,", frameStats->ipCostRatio);
-    if (param.rc.rateControlMode == X265_RC_CRF)
-        fprintf(csvfp, "%.3lf,", frameStats->rateFactor);
-    if (param.rc.vbvBufferSize)
-        fprintf(csvfp, "%.3lf,", frameStats->bufferFill);
-    if (param.bEnablePsnr)
-        fprintf(csvfp, "%.3lf, %.3lf, %.3lf, %.3lf,", frameStats->psnrY, frameStats->psnrU, frameStats->psnrV, frameStats->psnr);
-    if (param.bEnableSsim)
-        fprintf(csvfp, " %.6f, %6.3f,", frameStats->ssim, x265_ssim2dB(frameStats->ssim));
-    fprintf(csvfp, "%d, ", frameStats->frameLatency);
-    if (frameStats->sliceType == 'I' || frameStats->sliceType == 'i')
-        fputs(" -, -,", csvfp);
-    else
-    {
-        int i = 0;
-        while (frameStats->list0POC[i] != -1)
-            fprintf(csvfp, "%d ", frameStats->list0POC[i++]);
-        fprintf(csvfp, ",");
-        if (frameStats->sliceType != 'P')
-        {
-            i = 0;
-            while (frameStats->list1POC[i] != -1)
-                fprintf(csvfp, "%d ", frameStats->list1POC[i++]);
-            fprintf(csvfp, ",");
-        }
-        else
-            fputs(" -,", csvfp);
-    }
-
-    if (level)
-    {
-        for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-            fprintf(csvfp, "%5.2lf%%, %5.2lf%%, %5.2lf%%,", frameStats->cuStats.percentIntraDistribution[depth][0],
-            frameStats->cuStats.percentIntraDistribution[depth][1],
-            frameStats->cuStats.percentIntraDistribution[depth][2]);
-        fprintf(csvfp, "%5.2lf%%", frameStats->cuStats.percentIntraNxN);
-        if (param.bEnableRectInter)
-        {
-            for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-            {
-                fprintf(csvfp, ", %5.2lf%%, %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0],
-                    frameStats->cuStats.percentInterDistribution[depth][1]);
-                if (param.bEnableAMP)
-                    fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][2]);
-            }
-        }
-        else
-        {
-            for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-                fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentInterDistribution[depth][0]);
-        }
-        for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentSkipCu[depth]);
-        for (uint32_t depth = 0; depth <= param.maxCUDepth; depth++)
-            fprintf(csvfp, ", %5.2lf%%", frameStats->cuStats.percentMergeCu[depth]);
-    }
-
-    if (level >= 2)
-    {
-        fprintf(csvfp, ", %.2lf, %.2lf, %.2lf, %.2lf ", frameStats->avgLumaDistortion,
-            frameStats->avgChromaDistortion,
-            frameStats->avgPsyEnergy,
-            frameStats->avgResEnergy);
-
-        fprintf(csvfp, ", %d, %d, %.2lf", frameStats->minLumaLevel, frameStats->maxLumaLevel, frameStats->avgLumaLevel);
-
-        if (param.internalCsp != X265_CSP_I400)
-        {
-            fprintf(csvfp, ", %d, %d, %.2lf", frameStats->minChromaULevel, frameStats->maxChromaULevel, frameStats->avgChromaULevel);
-            fprintf(csvfp, ", %d, %d, %.2lf", frameStats->minChromaVLevel, frameStats->maxChromaVLevel, frameStats->avgChromaVLevel);
-        }
-
-        for (uint32_t i = 0; i < param.maxLog2CUSize - (uint32_t)g_log2Size[param.minCUSize] + 1; i++)
-        {
-            fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentIntraPu[i]);
-            fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentSkipPu[i]);
-            fprintf(csvfp, ",%.2lf%%", frameStats->puStats.percentAmpPu[i]);
-            for (uint32_t j = 0; j < 3; j++)
-            {
-                fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentInterPu[i][j]);
-                fprintf(csvfp, ", %.2lf%%", frameStats->puStats.percentMergePu[i][j]);
-            }
-        }
-        if ((uint32_t)g_log2Size[param.minCUSize] == 3)
-            fprintf(csvfp, ",%.2lf%%", frameStats->puStats.percentNxN);
-
-        fprintf(csvfp, ", %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf, %.1lf,", frameStats->decideWaitTime, frameStats->row0WaitTime,
-                                                                             frameStats->wallTime, frameStats->refWaitWallTime,
-                                                                             frameStats->totalCTUTime, frameStats->stallTime,
-                                                                             frameStats->totalFrameTime);
-
-        fprintf(csvfp, " %.3lf, %d", frameStats->avgWPP, frameStats->countRowBlocks);
-    }
-    fprintf(csvfp, "\n");
-    fflush(stderr);
-}
-
-void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, int padx, int pady, const x265_stats& stats, int level, int argc, char** argv)
-{
-    if (!csvfp)
-        return;
-
-    if (level)
-    {
-        // adding summary to a per-frame csv log file, so it needs a summary header
-        fprintf(csvfp, "\nSummary\n");
-        fputs(summaryCSVHeader, csvfp);
-    }
-
-    // CLI arguments or other
-    if (argc)
-    {
-        fputc('"', csvfp);
-        for (int i = 1; i < argc; i++)
-        {
-            fputc(' ', csvfp);
-            fputs(argv[i], csvfp);
-        }
-        fputc('"', csvfp);
-    }
-    else
-    {
-        const x265_param* paramTemp = &param;
-        char *opts = x265_param2string((x265_param*)paramTemp, padx, pady);
-        if (opts)
-        {
-            fputc('"', csvfp);
-            fputs(opts, csvfp);
-            fputc('"', csvfp);
-        }
-    }
-
-    // current date and time
-    time_t now;
-    struct tm* timeinfo;
-    time(&now);
-    timeinfo = localtime(&now);
-    char buffer[200];
-    strftime(buffer, 128, "%c", timeinfo);
-    fprintf(csvfp, ", %s, ", buffer);
-
-    // elapsed time, fps, bitrate
-    fprintf(csvfp, "%.2f, %.2f, %.2f,",
-        stats.elapsedEncodeTime, stats.encodedPictureCount / stats.elapsedEncodeTime, stats.bitrate);
-
-    if (param.bEnablePsnr)
-        fprintf(csvfp, " %.3lf, %.3lf, %.3lf, %.3lf,",
-        stats.globalPsnrY / stats.encodedPictureCount, stats.globalPsnrU / stats.encodedPictureCount,
-        stats.globalPsnrV / stats.encodedPictureCount, stats.globalPsnr);
-    else
-        fprintf(csvfp, " -, -, -, -,");
-    if (param.bEnableSsim)
-        fprintf(csvfp, " %.6f, %6.3f,", stats.globalSsim, x265_ssim2dB(stats.globalSsim));
-    else
-        fprintf(csvfp, " -, -,");
-
-    if (stats.statsI.numPics)
-    {
-        fprintf(csvfp, " %-6u, %2.2lf, %-8.2lf,", stats.statsI.numPics, stats.statsI.avgQp, stats.statsI.bitrate);
-        if (param.bEnablePsnr)
-            fprintf(csvfp, " %.3lf, %.3lf, %.3lf,", stats.statsI.psnrY, stats.statsI.psnrU, stats.statsI.psnrV);
-        else
-            fprintf(csvfp, " -, -, -,");
-        if (param.bEnableSsim)
-            fprintf(csvfp, " %.3lf,", stats.statsI.ssim);
-        else
-            fprintf(csvfp, " -,");
-    }
-    else
-        fprintf(csvfp, " -, -, -, -, -, -, -,");
-
-    if (stats.statsP.numPics)
-    {
-        fprintf(csvfp, " %-6u, %2.2lf, %-8.2lf,", stats.statsP.numPics, stats.statsP.avgQp, stats.statsP.bitrate);
-        if (param.bEnablePsnr)
-            fprintf(csvfp, " %.3lf, %.3lf, %.3lf,", stats.statsP.psnrY, stats.statsP.psnrU, stats.statsP.psnrV);
-        else
-            fprintf(csvfp, " -, -, -,");
-        if (param.bEnableSsim)
-            fprintf(csvfp, " %.3lf,", stats.statsP.ssim);
-        else
-            fprintf(csvfp, " -,");
-    }
-    else
-        fprintf(csvfp, " -, -, -, -, -, -, -,");
-
-    if (stats.statsB.numPics)
-    {
-        fprintf(csvfp, " %-6u, %2.2lf, %-8.2lf,", stats.statsB.numPics, stats.statsB.avgQp, stats.statsB.bitrate);
-        if (param.bEnablePsnr)
-            fprintf(csvfp, " %.3lf, %.3lf, %.3lf,", stats.statsB.psnrY, stats.statsB.psnrU, stats.statsB.psnrV);
-        else
-            fprintf(csvfp, " -, -, -,");
-        if (param.bEnableSsim)
-            fprintf(csvfp, " %.3lf,", stats.statsB.ssim);
-        else
-            fprintf(csvfp, " -,");
-    }
-    else
-        fprintf(csvfp, " -, -, -, -, -, -, -,");
-
-    fprintf(csvfp, " %-6u, %-6u, %s\n", stats.maxCLL, stats.maxFALL, version);
-}
-
-/* The dithering algorithm is based on Sierra-2-4A error diffusion.
- * We convert planes in place (without allocating a new buffer). */
-static void ditherPlane(uint16_t *src, int srcStride, int width, int height, int16_t *errors, int bitDepth)
-{
-    const int lShift = 16 - bitDepth;
-    const int rShift = 16 - bitDepth + 2;
-    const int half = (1 << (16 - bitDepth + 1));
-    const int pixelMax = (1 << bitDepth) - 1;
-
-    memset(errors, 0, (width + 1) * sizeof(int16_t));
-
-    if (bitDepth == 8)
-    {
-        for (int y = 0; y < height; y++, src += srcStride)
-        {
-            uint8_t* dst = (uint8_t *)src;
-            int16_t err = 0;
-            for (int x = 0; x < width; x++)
-            {
-                err = err * 2 + errors[x] + errors[x + 1];
-                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
-                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
-                dst[x] = (uint8_t)tmpDst;
-            }
-        }
-    }
-    else
-    {
-        for (int y = 0; y < height; y++, src += srcStride)
-        {
-            int16_t err = 0;
-            for (int x = 0; x < width; x++)
-            {
-                err = err * 2 + errors[x] + errors[x + 1];
-                int tmpDst = x265_clip3(0, pixelMax, ((src[x] << 2) + err + half) >> rShift);
-                errors[x] = err = (int16_t)(src[x] - (tmpDst << lShift));
-                src[x] = (uint16_t)tmpDst;
-            }
-        }
-    }
-}
-
-void x265_dither_image(const x265_api& api, x265_picture& picIn, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth)
-{
-    if (sizeof(x265_picture) != api.sizeof_picture)
-    {
-        fprintf(stderr, "extras [error]: structure size skew, unable to dither\n");
-        return;
-    }
-
-    if (picIn.bitDepth <= 8)
-    {
-        fprintf(stderr, "extras [error]: dither support enabled only for input bitdepth > 8\n");
-        return;
-    }
-
-    if (picIn.bitDepth == bitDepth)
-    {
-        fprintf(stderr, "extras[error]: dither support enabled only if encoder depth is different from picture depth\n");
-        return;
-    }
-
-    /* This portion of code is from readFrame in x264. */
-    for (int i = 0; i < x265_cli_csps[picIn.colorSpace].planes; i++)
-    {
-        if (picIn.bitDepth < 16)
-        {
-            /* upconvert non 16bit high depth planes to 16bit */
-            uint16_t *plane = (uint16_t*)picIn.planes[i];
-            uint32_t pixelCount = x265_picturePlaneSize(picIn.colorSpace, picWidth, picHeight, i);
-            int lShift = 16 - picIn.bitDepth;
-
-            /* This loop assumes width is equal to stride which
-             * happens to be true for file reader outputs */
-            for (uint32_t j = 0; j < pixelCount; j++)
-                plane[j] = plane[j] << lShift;
-        }
-
-        int height = (int)(picHeight >> x265_cli_csps[picIn.colorSpace].height[i]);
-        int width = (int)(picWidth >> x265_cli_csps[picIn.colorSpace].width[i]);
-
-        ditherPlane(((uint16_t*)picIn.planes[i]), picIn.stride[i] / 2, width, height, errorBuf, bitDepth);
-    }
-}
--- a/source/x265-extras.h	Tue Sep 05 11:21:24 2017 +0530
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,66 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2013-2017 MulticoreWare, Inc
- *
- * Authors: Steve Borho <steve@borho.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#ifndef X265_EXTRAS_H
-#define X265_EXTRAS_H 1
-
-#include "x265.h"
-
-#include <stdio.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if _WIN32
-#define LIBAPI __declspec(dllexport)
-#else
-#define LIBAPI
-#endif
-
-/* Open a CSV log file. On success it returns a file handle which must be passed
- * to x265_csvlog_frame() and/or x265_csvlog_encode(). The file handle must be
- * closed by the caller using fclose(). If level is 0, then no frame logging
- * header is written to the file. This function will return NULL if it is unable
- * to open the file for write or if it detects a structure size skew */
-LIBAPI FILE* x265_csvlog_open(const x265_param& param, const char* fname, int level);
-
-/* Log frame statistics to the CSV file handle. level should have been non-zero
- * in the call to x265_csvlog_open() if this function is called. */
-LIBAPI void x265_csvlog_frame(FILE* csvfp, const x265_param& param, const x265_picture& pic, int level);
-
-/* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are
- * intended to be command line arguments passed to the encoder. Encode
- * statistics should be queried from the encoder just prior to closing it. */
-LIBAPI void x265_csvlog_encode(FILE* csvfp, const char* version, const x265_param& param, int padx, int pady, const x265_stats& stats, int level, int argc, char** argv);
-
-/* In-place downshift from a bit-depth greater than 8 to a bit-depth of 8, using
- * the residual bits to dither each row. */
-LIBAPI void x265_dither_image(const x265_api& api, x265_picture&, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/source/x265.cpp	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/x265.cpp	Tue Nov 21 09:50:45 2017 +0530
@@ -26,7 +26,6 @@
 #endif
 
 #include "x265.h"
-#include "x265-extras.h"
 #include "x265cli.h"
 
 #include "input/input.h"
@@ -639,7 +638,7 @@ int main(int argc, char **argv)
         {
             if (pic_in->bitDepth > param->internalBitDepth && cliopt.bDither)
             {
-                x265_dither_image(*api, *pic_in, cliopt.input->getWidth(), cliopt.input->getHeight(), errorBuf, param->internalBitDepth);
+                x265_dither_image(pic_in, cliopt.input->getWidth(), cliopt.input->getHeight(), errorBuf, param->internalBitDepth);
                 pic_in->bitDepth = param->internalBitDepth;
             }
             /* Overwrite PTS */
--- a/source/x265.def.in	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/x265.def.in	Tue Nov 21 09:50:45 2017 +0530
@@ -23,3 +23,11 @@ x265_cleanup
 x265_api_get_${X265_BUILD}
 x265_api_query
 x265_encoder_intra_refresh
+x265_encoder_ctu_info
+x265_get_slicetype_poc_and_scenecut
+x265_get_ref_frame_list
+x265_csvlog_open
+x265_csvlog_frame
+x265_csvlog_encode
+x265_dither_image
+x265_set_analysis_data
--- a/source/x265.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/x265.h	Tue Nov 21 09:50:45 2017 +0530
@@ -35,6 +35,10 @@ extern "C" {
  *      opaque handler for encoder */
 typedef struct x265_encoder x265_encoder;
 
+/* x265_picyuv:
+ *      opaque handler for PicYuv */
+typedef struct x265_picyuv x265_picyuv;
+
 /* Application developers planning to link against a shared library version of
  * libx265 from a Microsoft Visual Studio or similar development environment
  * will need to define X265_API_IMPORTS before including this header.
@@ -88,6 +92,21 @@ typedef struct x265_nal
     uint8_t* payload;
 } x265_nal;
 
+#define X265_LOOKAHEAD_MAX 250
+
+typedef struct x265_lookahead_data
+{
+    int64_t   plannedSatd[X265_LOOKAHEAD_MAX + 1];
+    uint32_t  *vbvCost;
+    uint32_t  *intraVbvCost;
+    uint32_t  *satdForVbv;
+    uint32_t  *intraSatdForVbv;
+    int       keyframe;
+    int       lastMiniGopBFrame;
+    int       plannedType[X265_LOOKAHEAD_MAX + 1];
+    int64_t   dts;
+} x265_lookahead_data;
+
 /* Stores all analysis data for a single frame */
 typedef struct x265_analysis_data
 {
@@ -102,6 +121,9 @@ typedef struct x265_analysis_data
     void*            wt;
     void*            interData;
     void*            intraData;
+    uint32_t         numCuInHeight;
+    x265_lookahead_data lookahead;
+    uint8_t*         modeFlag[2];
 } x265_analysis_data;
 
 /* cu statistics */
@@ -202,6 +224,11 @@ typedef enum
     CTU_INFO_CHANGE = 2,
 }CTUInfo;
 
+typedef enum
+{
+    NO_INFO = 0,
+    AVC_INFO = 1,
+}MVRefineType;
 
 /* Arbitrary User SEI
  * Payload size is in bytes and the payload pointer must be non-NULL. 
@@ -523,15 +550,15 @@ typedef struct x265_stats
 /* String values accepted by x265_param_parse() (and CLI) for various parameters */
 static const char * const x265_motion_est_names[] = { "dia", "hex", "umh", "star", "sea", "full", 0 };
 static const char * const x265_source_csp_names[] = { "i400", "i420", "i422", "i444", "nv12", "nv16", 0 };
-static const char * const x265_video_format_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
+static const char * const x265_video_format_names[] = { "component", "pal", "ntsc", "secam", "mac", "unknown", 0 };
 static const char * const x265_fullrange_names[] = { "limited", "full", 0 };
-static const char * const x265_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 };
-static const char * const x265_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100",
+static const char * const x265_colorprim_names[] = { "reserved", "bt709", "unknown", "reserved", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", "smpte428", "smpte431", "smpte432", 0 };
+static const char * const x265_transfer_names[] = { "reserved", "bt709", "unknown", "reserved", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100",
                                                     "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12",
-                                                    "smpte-st-2084", "smpte-st-428", "arib-std-b67", 0 };
-static const char * const x265_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m",
-                                                     "YCgCo", "bt2020nc", "bt2020c", 0 };
-static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11",
+                                                    "smpte2084", "smpte428", "arib-std-b67", 0 };
+static const char * const x265_colmatrix_names[] = { "gbr", "bt709", "unknown", "", "fcc", "bt470bg", "smpte170m", "smpte240m",
+                                                     "ycgco", "bt2020nc", "bt2020c", "smpte2085", "chroma-derived-nc", "chroma-derived-c", "ictcp", 0 };
+static const char * const x265_sar_names[] = { "unknown", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11",
                                                "32:11", "80:33", "18:11", "15:11", "64:33", "160:99", "4:3", "3:2", "2:1", 0 };
 static const char * const x265_interlace_names[] = { "prog", "tff", "bff", 0 };
 static const char * const x265_analysis_names[] = { "off", "save", "load", 0 };
@@ -1479,6 +1506,35 @@ typedef struct x265_param
 
     /* File pointer for csv log */
     FILE*     csvfpt;
+
+    /* Force flushing the frames from encoder */
+    int       forceFlush;
+
+    /* Enable skipping split RD analysis when the sum of split CU rdCost is larger than the non-split CU rdCost for an intra CU */
+    int       bEnableSplitRdSkip;
+
+    /* Disable lookahead */
+    int       bDisableLookahead;
+
+    /* Use low-pass subband DCT approximation.
+    *  This DCT approximation is less computationally intensive and gives results close to standard DCT */
+    int       bLowPassDct;
+
+    /* Sets the portion of the decode buffer that must be available after all the
+    * specified frames have been inserted into the decode buffer. If it is less
+    * than 1, then the final buffer available is vbv-end * vbvBufferSize.  Otherwise,
+    * it is interpreted as the final buffer available in kbits. Default 0 (disabled) */
+    double    vbvBufferEnd;
+    
+    /* Frame from which qp has to be adjusted to hit final decode buffer emptiness.
+    * Specified as a fraction of the total frames. Default 0 */
+    double    vbvEndFrameAdjust;
+
+    /* Reuse MV information obtained through API */
+    int       bMVType;
+
+    /* Allow the encoder to have a copy of the planes of x265_picture in Frame */
+    int       bCopyPicToFrame;
 } x265_param;
 
 /* x265_param_alloc:
@@ -1677,10 +1733,47 @@ int x265_encoder_intra_refresh(x265_enco
  *    the encoder will wait for this copy to complete if enabled.
  */
 int x265_encoder_ctu_info(x265_encoder *, int poc, x265_ctu_info_t** ctu);
+
+/* x265_get_slicetype_poc_and_scenecut:
+ *     get the slice type, poc and scene cut information for the current frame,
+ *     returns negative on error, 0 when access units were output.
+ *     This API must be called after the (poc >= lookaheadDepth + bframes + 2) condition check */
+int x265_get_slicetype_poc_and_scenecut(x265_encoder *encoder, int *slicetype, int *poc, int* sceneCut);
+
+/* x265_get_ref_frame_list:
+ *     returns negative on error, 0 when access units were output.
+ *     This API must be called after the (poc >= lookaheadDepth + bframes + 2) condition check */
+int x265_get_ref_frame_list(x265_encoder *encoder, x265_picyuv**, x265_picyuv**, int, int);
+
+/* x265_set_analysis_data:
+ *     set the analysis data. The incoming analysis_data structure is assumed to be AVC-sized blocks.
+ *     returns negative on error, 0 when access units were output. */
+int x265_set_analysis_data(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
+
 /* x265_cleanup:
  *       release library static allocations, reset configured CTU size */
 void x265_cleanup(void);
 
+/* Open a CSV log file. On success it returns a file handle which must be passed
+ * to x265_csvlog_frame() and/or x265_csvlog_encode(). The file handle must be
+ * closed by the caller using fclose(). If csv-loglevel is 0, then no frame logging
+ * header is written to the file. This function will return NULL if it is unable
+ * to open the file for write or if it detects a structure size skew */
+FILE* x265_csvlog_open(const x265_param *);
+
+/* Log frame statistics to the CSV file handle. csv-loglevel should have been non-zero
+ * in the call to x265_csvlog_open() if this function is called. */
+void x265_csvlog_frame(const x265_param *, const x265_picture *);
+
+/* Log final encode statistics to the CSV file handle. 'argc' and 'argv' are
+ * intended to be command line arguments passed to the encoder. Encode
+ * statistics should be queried from the encoder just prior to closing it. */
+void x265_csvlog_encode(x265_encoder *encoder, const x265_stats *, int argc, char** argv);
+
+/* In-place downshift from a bit-depth greater than 8 to the requested bitDepth, using
+ * the residual bits to dither each row. */
+void x265_dither_image(x265_picture *, int picWidth, int picHeight, int16_t *errorBuf, int bitDepth);
+
 #define X265_MAJOR_VERSION 1
 
 /* === Multi-lib API ===
@@ -1726,6 +1819,13 @@ typedef struct x265_api
     int           sizeof_frame_stats;   /* sizeof(x265_frame_stats) */
     int           (*encoder_intra_refresh)(x265_encoder*);
     int           (*encoder_ctu_info)(x265_encoder*, int, x265_ctu_info_t**);
+    int           (*get_slicetype_poc_and_scenecut)(x265_encoder*, int*, int*, int*);
+    int           (*get_ref_frame_list)(x265_encoder*, x265_picyuv**, x265_picyuv**, int, int);
+    FILE*         (*csvlog_open)(const x265_param*);
+    void          (*csvlog_frame)(const x265_param*, const x265_picture*);
+    void          (*csvlog_encode)(x265_encoder*, const x265_stats*, int, char**);
+    void          (*dither_image)(x265_picture*, int, int, int16_t*, int);
+    int           (*set_analysis_data)(x265_encoder *encoder, x265_analysis_data *analysis_data, int poc, uint32_t cuBytes);
     /* add new pointers to the end, or increment X265_MAJOR_VERSION */
 } x265_api;
 
--- a/source/x265cli.h	Tue Sep 05 11:21:24 2017 +0530
+++ b/source/x265cli.h	Tue Nov 21 09:50:45 2017 +0530
@@ -147,6 +147,8 @@ static const struct option long_options[
     { "vbv-maxrate",    required_argument, NULL, 0 },
     { "vbv-bufsize",    required_argument, NULL, 0 },
     { "vbv-init",       required_argument, NULL, 0 },
+    { "vbv-end",        required_argument, NULL, 0 },
+    { "vbv-end-fr-adj", required_argument, NULL, 0 },
     { "bitrate",        required_argument, NULL, 0 },
     { "qp",             required_argument, NULL, 'q' },
     { "aq-mode",        required_argument, NULL, 0 },
@@ -255,8 +257,7 @@ static const struct option long_options[
     { "analysis-reuse-level", required_argument, NULL, 0 },
     { "scale-factor",   required_argument, NULL, 0 },
     { "refine-intra",   required_argument, NULL, 0 },
-    { "refine-inter",   no_argument, NULL, 0 },
-    { "no-refine-inter",no_argument, NULL, 0 },
+    { "refine-inter",   required_argument, NULL, 0 },
     { "strict-cbr",           no_argument, NULL, 0 },
     { "temporal-layers",      no_argument, NULL, 0 },
     { "no-temporal-layers",   no_argument, NULL, 0 },
@@ -280,6 +281,13 @@ static const struct option long_options[
     { "no-dhdr10-opt",        no_argument, NULL, 0},
     { "refine-mv",            no_argument, NULL, 0 },
     { "no-refine-mv",         no_argument, NULL, 0 },
+    { "force-flush",    required_argument, NULL, 0 },
+    { "splitrd-skip",         no_argument, NULL, 0 },
+    { "no-splitrd-skip",      no_argument, NULL, 0 },
+    { "lowpass-dct",          no_argument, NULL, 0 },
+    { "refine-mv-type", required_argument, NULL, 0 },
+    { "copy-pic",             no_argument, NULL, 0 },
+    { "no-copy-pic",          no_argument, NULL, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
     { 0, 0, 0, 0 },
@@ -333,6 +341,7 @@ static void showHelp(x265_param *param)
     H0("   --seek <integer>              First frame to encode\n");
     H1("   --[no-]interlace <bff|tff>    Indicate input pictures are interlace fields in temporal order. Default progressive\n");
     H1("   --dither                      Enable dither if downscaling to 8 bit pixels. Default disabled\n");
+    H0("   --[no-]copy-pic               Copy buffers of input picture in frame. Default %s\n", OPT(param->bCopyPicToFrame));
     H0("\nQuality reporting metrics:\n");
     H0("   --[no-]ssim                   Enable reporting SSIM metric scores. Default %s\n", OPT(param->bEnableSsim));
     H0("   --[no-]psnr                   Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr));
@@ -374,6 +383,7 @@ static void showHelp(x265_param *param)
     H0("   --[no-]early-skip             Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip));
     H0("   --[no-]rskip                  Enable early exit from recursion. Default %s\n", OPT(param->bEnableRecursionSkip));
     H1("   --[no-]tskip-fast             Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast));
+    H1("   --[no-]splitrd-skip           Enable skipping split RD analysis when sum of split CU rdCost larger than none split CU rdCost for Intra CU. Default %s\n", OPT(param->bEnableSplitRdSkip));
     H1("   --nr-intra <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n");
     H1("   --nr-inter <integer>          An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n");
     H0("   --ctu-info <integer>          Enable receiving ctu information asynchronously and determine reaction to the CTU information (0, 1, 2, 4, 6) Default 0\n"
@@ -415,7 +425,7 @@ static void showHelp(x265_param *param)
     H0("   --rc-lookahead <integer>      Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth);
     H1("   --lookahead-slices <0..16>    Number of slices to use per lookahead cost estimate. Default %d\n", param->lookaheadSlices);
     H0("   --lookahead-threads <integer> Number of threads to be dedicated to perform lookahead only. Default %d\n", param->lookaheadThreads);
-    H0("   --bframes <integer>           Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes);
+    H0("-b/--bframes <0..16>             Maximum number of consecutive b-frames. Default %d\n", param->bframes);
     H1("   --bframe-bias <integer>       Bias towards B frame decisions. Default %d\n", param->bFrameBias);
     H0("   --b-adapt <0..2>              0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive);
     H0("   --[no-]b-pyramid              Use B-frames as references. Default %s\n", OPT(param->bBPyramid));
@@ -423,6 +433,10 @@ static void showHelp(x265_param *param)
     H1("                                 Format of each line: framenumber frametype QP\n");
     H1("                                 QP is optional (none lets x265 choose). Frametypes: I,i,K,P,B,b.\n");
     H1("                                 QPs are restricted by qpmin/qpmax.\n");
+    H1("   --force-flush <integer>       Force the encoder to flush frames. Default %d\n", param->forceFlush);
+    H1("                                 0 - flush the encoder only when all the input pictures are over.\n");
+    H1("                                 1 - flush all the frames even when the input is not over. Slicetype decision may change with this option.\n");
+    H1("                                 2 - flush the slicetype decided frames only.\n");
     H0("\nRate control, Adaptive Quantization:\n");
     H0("   --bitrate <integer>           Target bitrate (kbps) for ABR (implied). Default %d\n", param->rc.bitrate);
     H1("-q/--qp <integer>                QP for P slices in CQP mode (implied). --ipratio and --pbration determine other slice QPs\n");
@@ -435,6 +449,8 @@ static void showHelp(x265_param *param)
     H0("   --vbv-maxrate <integer>       Max local bitrate (kbit/s). Default %d\n", param->rc.vbvMaxBitrate);
     H0("   --vbv-bufsize <integer>       Set size of the VBV buffer (kbit). Default %d\n", param->rc.vbvBufferSize);
     H0("   --vbv-init <float>            Initial VBV buffer occupancy (fraction of bufsize or in kbits). Default %.2f\n", param->rc.vbvBufferInit);
+    H0("   --vbv-end <float>             Final VBV buffer emptiness (fraction of bufsize or in kbits). Default 0 (disabled)\n");
+    H0("   --vbv-end-fr-adj <float>      Frame from which qp has to be adjusted to achieve final decode buffer emptiness. Default 0\n");
     H0("   --pass                        Multi pass rate control.\n"
        "                                   - 1 : First pass, creates stats file\n"
        "                                   - 2 : Last pass, does not overwrite stats file\n"
@@ -448,9 +464,21 @@ static void showHelp(x265_param *param)
     H0("   --analysis-reuse-mode <string|int>  save - Dump analysis info into file, load - Load analysis buffers from the file. Default %d\n", param->analysisReuseMode);
     H0("   --analysis-reuse-file <filename>    Specify file name used for either dumping or reading analysis data. Deault x265_analysis.dat\n");
     H0("   --analysis-reuse-level <1..10>      Level of analysis reuse indicates amount of info stored/reused in save/load mode, 1:least..10:most. Default %d\n", param->analysisReuseLevel);
+    H0("   --refine-mv-type <string>     Reuse MV information received through API call. Supported option is avc. Default disabled - %d\n", param->bMVType);
     H0("   --scale-factor <int>          Specify factor by which input video is scaled down for analysis save mode. Default %d\n", param->scaleFactor);
-    H0("   --refine-intra <int>          Enable intra refinement for load mode. Default %d\n", param->intraRefine);
-    H0("   --[no-]refine-inter           Enable inter refinement for load mode. Default %s\n", OPT(param->interRefine));
+    H0("   --refine-intra <0..3>         Enable intra refinement for encode that uses analysis-reuse-mode=load.\n"
+        "                                    - 0 : Forces both mode and depth from the save encode.\n"
+        "                                    - 1 : Functionality of (0) + evaluate all intra modes at min-cu-size's depth when current depth is one smaller than min-cu-size's depth.\n"
+        "                                    - 2 : Functionality of (1) + irrespective of size evaluate all angular modes when the save encode decides the best mode as angular.\n"
+        "                                    - 3 : Functionality of (1) + irrespective of size evaluate all intra modes.\n"
+        "                                Default:%d\n", param->intraRefine);
+    H0("   --refine-inter <0..3>         Enable inter refinement for encode that uses analysis-reuse-mode=load.\n"
+        "                                    - 0 : Forces both mode and depth from the save encode.\n"
+        "                                    - 1 : Functionality of (0) + evaluate all inter modes at min-cu-size's depth when current depth is one smaller than\n"
+        "                                          min-cu-size's depth. When save encode decides the current block as skip(for all sizes) evaluate skip/merge.\n"
+        "                                    - 2 : Functionality of (1) + irrespective of size restrict the modes evaluated when specific modes are decided as the best mode by the save encode.\n"
+        "                                    - 3 : Functionality of (1) + irrespective of size evaluate all inter modes.\n"
+        "                                Default:%d\n", param->interRefine);
     H0("   --[no-]refine-mv              Enable mv refinement for load mode. Default %s\n", OPT(param->mvRefine));
     H0("   --aq-mode <integer>           Mode for Adaptive Quantization - 0:none 1:uniform AQ 2:auto variance 3:auto variance with bias to dark scenes. Default %d\n", param->rc.aqMode);
     H0("   --aq-strength <float>         Reduces blocking and blurring in flat and textured areas (0 to 3.0). Default %.2f\n", param->rc.aqStrength);
@@ -492,13 +520,13 @@ static void showHelp(x265_param *param)
     H1("   --overscan <string>           Specify whether it is appropriate for decoder to show cropped region: undef, show or crop. Default undef\n");
     H0("   --videoformat <string>        Specify video format from undef, component, pal, ntsc, secam, mac. Default undef\n");
     H0("   --range <string>              Specify black level and range of luma and chroma signals as full or limited Default limited\n");
-    H0("   --colorprim <string>          Specify color primaries from undef, bt709, bt470m, bt470bg, smpte170m,\n");
-    H0("                                 smpte240m, film, bt2020. Default undef\n");
-    H0("   --transfer <string>           Specify transfer characteristics from undef, bt709, bt470m, bt470bg, smpte170m,\n");
+    H0("   --colorprim <string>          Specify color primaries from  bt709, unknown, reserved, bt470m, bt470bg, smpte170m,\n");
+    H0("                                 smpte240m, film, bt2020, smpte428, smpte431, smpte432. Default undef\n");
+    H0("   --transfer <string>           Specify transfer characteristics from bt709, unknown, reserved, bt470m, bt470bg, smpte170m,\n");
     H0("                                 smpte240m, linear, log100, log316, iec61966-2-4, bt1361e, iec61966-2-1,\n");
-    H0("                                 bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428, arib-std-b67. Default undef\n");
+    H0("                                 bt2020-10, bt2020-12, smpte2084, smpte428, arib-std-b67. Default undef\n");
     H1("   --colormatrix <string>        Specify color matrix setting from undef, bt709, fcc, bt470bg, smpte170m,\n");
-    H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n");
+    H1("                                 smpte240m, GBR, YCgCo, bt2020nc, bt2020c, smpte2085, chroma-derived-nc, chroma-derived-c, ictcp. Default undef\n");
     H1("   --chromaloc <integer>         Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField);
     H0("   --master-display <string>     SMPTE ST 2086 master display color volume info SEI (HDR)\n");
     H0("                                    format: G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)\n");
@@ -525,6 +553,7 @@ static void showHelp(x265_param *param)
     H1("-r/--recon <filename>            Reconstructed raw image YUV or Y4M output file name\n");
     H1("   --recon-depth <integer>       Bit-depth of reconstructed raw image file. Defaults to input bit depth, or 8 if Y4M\n");
     H1("   --recon-y4m-exec <string>     pipe reconstructed frames to Y4M viewer, ex:\"ffplay -i pipe:0 -autoexit\"\n");
+    H0("   --lowpass-dct                 Use low-pass subband dct approximation. Default %s\n", OPT(param->bLowPassDct));
     H1("\nExecutable return codes:\n");
     H1("    0 - encode successful\n");
     H1("    1 - unable to parse command line\n");